def __init__(self):
    super(Net, self).__init__()
    # name
    self.name = "DirCNNh5"
    # optimizer
    self.lr = 0.001
    self.optimizer_name = 'Adam-Exp'
    # data
    # self.data_name = "ModelNet10"
    self.data_name = "Geometry"
    self.batch_size = 20
    self.nr_points = 1024
    self.nr_classes = 10 if self.data_name == 'ModelNet10' else 40
    # train_info
    self.max_epochs = 301
    self.save_every = 100
    # model
    self.k = 20
    self.l = 7

    # DD1
    self.in_size = 3
    self.out_size = 64
    layers = []
    layers.append(Linear(self.in_size, 64))
    layers.append(ReLU())
    layers.append(torch.nn.BatchNorm1d(64))
    layers.append(Linear(64, 64))
    layers.append(ReLU())
    layers.append(torch.nn.BatchNorm1d(64))
    layers.append(Linear(64, self.out_size))
    layers.append(ReLU())
    layers.append(torch.nn.BatchNorm1d(self.out_size))
    dense3dnet = Sequential(*layers)
    self.dd = DD(l=self.l,
                 k=self.k,
                 mlp=dense3dnet,
                 conv_p=True,
                 conv_fc=False,
                 conv_fn=False,
                 out_3d=True)

    # DD2
    self.in_size_2 = 64 * 3
    self.out_size_2 = 128
    layers2 = []
    layers2.append(Linear(self.in_size_2, self.out_size_2))
    layers2.append(ReLU())
    layers2.append(torch.nn.BatchNorm1d(self.out_size_2))
    dense3dnet2 = Sequential(*layers2)
    self.dd2 = DD(l=self.l,
                  k=self.k,
                  mlp=dense3dnet2,
                  conv_p=False,
                  conv_fc=False,
                  conv_fn=True,
                  out_3d=False)

    self.nn1 = torch.nn.Linear(self.out_size_2, 1024)
    self.bn1 = torch.nn.BatchNorm1d(1024)
    self.nn2 = torch.nn.Linear(1024, 512)
    self.bn2 = torch.nn.BatchNorm1d(512)
    self.nn3 = torch.nn.Linear(512, 265)
    self.bn3 = torch.nn.BatchNorm1d(265)
    self.nn4 = torch.nn.Linear(265, self.nr_classes)
    self.sm = torch.nn.LogSoftmax(dim=1)
def __init__(self, in_features, out_features):
    super(ComplexLinear, self).__init__()
    self.fc_r = Linear(in_features, out_features)
    self.fc_i = Linear(in_features, out_features)
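# The snippet above only defines the two real-valued sub-layers. A minimal sketch
# of how such a split real/imaginary layer is commonly applied (an assumption on
# my part, not taken from this source): for z = x_r + i*x_i and W = W_r + i*W_i,
# W z = (W_r x_r - W_i x_i) + i*(W_r x_i + W_i x_r).
def forward(self, x_r, x_i):
    out_r = self.fc_r(x_r) - self.fc_i(x_i)  # real part
    out_i = self.fc_r(x_i) + self.fc_i(x_r)  # imaginary part
    return out_r, out_i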
def make_model_and_optim():
    model = Linear(in_dim, 2, bias=False)
    model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9),
                     num_gradients_to_accumulate=accum_steps)
    return model, optim
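# Hypothetical usage sketch (not part of the source): with
# num_gradients_to_accumulate=accum_steps, AdaScale expects accum_steps backward
# passes between optimizer steps. `loader` and the squared-error loss are
# placeholders for illustration only.
model, optim = make_model_and_optim()
for i, (inputs, targets) in enumerate(loader):
    loss = ((model(inputs.cuda()) - targets.cuda()) ** 2).mean()
    loss.backward()
    if (i + 1) % accum_steps == 0:  # step once per accumulation window
        optim.step()
        optim.zero_grad()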
def __init__(
        self,
        embed_dim=None,                  # type: Optional[int]
        num_heads=1,                     # type: int
        dropout=0.0,                     # type: float
        bias=True,                       # type: bool
        add_bias_kv=False,               # type: bool
        add_zero_attn=False,             # type: bool
        kdim=None,                       # type: Optional[int]
        vdim=None,                       # type: Optional[int]
        head_dim=None,                   # type: Optional[int]
        pattern_dim=None,                # type: Optional[int]
        out_dim=None,                    # type: Optional[int]
        disable_out_projection=False,    # type: bool
        key_as_static=False,             # type: bool
        query_as_static=False,           # type: bool
        value_as_static=False,           # type: bool
        value_as_connected=False,        # type: bool
        normalize_pattern=False,         # type: bool
        normalize_pattern_affine=False   # type: bool
):
    super(HopfieldCore, self).__init__()

    assert (type(key_as_static) == bool) and (type(query_as_static) == bool) and (type(value_as_static) == bool)
    self.key_as_static, self.query_as_static, self.value_as_static = key_as_static, query_as_static, value_as_static
    num_non_static = 3 - (self.key_as_static + self.query_as_static + self.value_as_static)
    assert 0 <= num_non_static < 4

    self.value_as_connected = value_as_connected
    self.normalize_pattern, self.normalize_pattern_affine = normalize_pattern, normalize_pattern_affine
    self.disable_out_projection = disable_out_projection

    # In case of static-only execution, check corresponding projections and normalizations.
    self.static_execution = self._check_execution_mode()
    if self.static_execution:
        embed_dim, kdim, vdim = None, None, None
    if embed_dim is None:
        assert self.static_execution, r'static-only execution requires all projections to be deactivated.'

    # Check and set all other properties, conditioned on <static_execution>.
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self._qkv_same_embed_dim = all(
        (self.kdim == embed_dim, self.vdim == embed_dim, pattern_dim is None, not self.value_as_connected))
    assert (not self.value_as_connected) or (self.kdim == self.vdim), r'key and value need to be of same dimension.'

    self.num_heads = num_heads
    self.dropout = dropout
    self.head_dim = None
    self.pattern_dim = pattern_dim
    self.virtual_hopfield_dim = None
    self.virtual_pattern_dim = None
    if not self.static_execution:
        if head_dim is None:
            self.head_dim = embed_dim // num_heads
            assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads."
        else:
            assert head_dim > 0, "dimension of the association space has to be positive."
            self.head_dim = head_dim
        if self.pattern_dim is None:
            self.pattern_dim = self.head_dim
        self.virtual_hopfield_dim = self.num_heads * self.head_dim
        self.virtual_pattern_dim = self.num_heads * self.pattern_dim

    self.out_dim = embed_dim if out_dim is None else out_dim
    assert disable_out_projection or (self.out_dim > 0), "output projection dimension has to be positive."

    if normalize_pattern_affine:
        assert normalize_pattern, "affine pattern normalization without pattern normalization has no effect."
        self.p_norm_weight = Parameter(torch.Tensor(head_dim))
        self.p_norm_bias = Parameter(torch.Tensor(head_dim))
    else:
        self.register_parameter('p_norm_weight', None)
        self.register_parameter('p_norm_bias', None)

    if self._qkv_same_embed_dim is False:
        if query_as_static:
            self.register_parameter('q_proj_weight', None)
        else:
            self.q_proj_weight = Parameter(torch.Tensor(self.virtual_hopfield_dim, embed_dim))
        if key_as_static:
            self.register_parameter('k_proj_weight', None)
        else:
            self.k_proj_weight = Parameter(torch.Tensor(self.virtual_hopfield_dim, self.kdim))
        if value_as_static:
            self.register_parameter('v_proj_weight', None)
        else:
            self.v_proj_weight = Parameter(torch.Tensor(
                self.virtual_pattern_dim,
                self.virtual_hopfield_dim if (value_as_connected and not key_as_static) else self.vdim))
        self.register_parameter('in_proj_weight', None)
    else:
        if num_non_static > 0:
            self.in_proj_weight = Parameter(torch.empty(
                (not query_as_static) * self.virtual_hopfield_dim +
                (not key_as_static) * self.virtual_hopfield_dim +
                (not value_as_static) * self.virtual_pattern_dim, embed_dim))
        else:
            self.register_parameter('in_proj_weight', None)
        self.register_parameter('q_proj_weight', None)
        self.register_parameter('k_proj_weight', None)
        self.register_parameter('v_proj_weight', None)

    if bias and (num_non_static > 0):
        self.in_proj_bias = Parameter(torch.empty(
            (not query_as_static) * self.virtual_hopfield_dim +
            (not key_as_static) * self.virtual_hopfield_dim + self.virtual_pattern_dim))
    else:
        self.register_parameter('in_proj_bias', None)

    if disable_out_projection:
        self.register_parameter('out_proj', None)
    else:
        if bias and _LinearWithBias is not None:
            self.out_proj = _LinearWithBias(self.virtual_pattern_dim, self.out_dim)
        else:
            self.out_proj = Linear(self.virtual_pattern_dim, self.out_dim, bias=bias)

    self.bias_k, self.bias_v = None, None
    if add_bias_kv:
        if not key_as_static:
            self.bias_k = Parameter(torch.empty(1, 1, self.virtual_hopfield_dim))
        if not value_as_static:
            self.bias_v = Parameter(torch.empty(1, 1, self.virtual_hopfield_dim))
        assert not (self.bias_k is None and self.bias_v is None), r'cannot set key/value bias if both are static.'

    self.add_zero_attn = add_zero_attn
    self.reset_parameters()
"""
Compute the gradient with PyTorch and the gradient variance with BackPACK.
"""
from torch.nn import CrossEntropyLoss, Flatten, Linear, Sequential

from backpack import backpack, extend, extensions
from backpack.utils.examples import load_mnist_data

B = 4
X, y = load_mnist_data(B)

print("# Gradient with PyTorch, gradient variance with BackPACK | B =", B)

model = Sequential(
    Flatten(),
    Linear(784, 10),
)
lossfunc = CrossEntropyLoss()

model = extend(model)
lossfunc = extend(lossfunc)

loss = lossfunc(model(X), y)
with backpack(extensions.Variance()):
    loss.backward()

for name, param in model.named_parameters():
    print(name)
    print(".grad.shape:     ", param.grad.shape)
    print(".variance.shape: ", param.variance.shape)
def __init__(self, D_key, D_query):
    super(AttentionLayer, self).__init__()
    self.W_k = Linear(D_key, D_query, bias=False)
    self.W_q = Linear(D_key + D_query, D_query, bias=False)
def __init__(self, state_dim, action_dim):
    super().__init__()
    self.model = Sequential(
        Linear(state_dim + action_dim, 64),
        LeakyReLU(),
        Linear(64, 32),
        LeakyReLU(),
        Linear(32, 1),
    )
import torch
from torch.optim import Adam
from torch.nn.functional import cross_entropy
from collections import OrderedDict
from torch.nn import Linear, ReLU, Sequential


def classify_target(x, y):
    return (y > (x * 3).sin()).long()


mlp = torch.nn.Sequential(OrderedDict([
    ('layer1', Sequential(Linear(2, 20), ReLU())),
    ('layer2', Sequential(Linear(20, 20), ReLU())),
    ('layer3', Sequential(Linear(20, 2))),
]))
mlp.cuda()
optimizer = Adam(mlp.parameters(), lr=0.01)

for iteration in range(1024):
    in_batch = torch.randn(10000, 2, device='cuda')
    target_batch = classify_target(in_batch[:, 0], in_batch[:, 1])
    out_batch = mlp(in_batch)
    loss = cross_entropy(out_batch, target_batch)
    if iteration > 0:
        mlp.zero_grad()
    loss.backward()
    optimizer.step()
    if iteration == 2 ** iteration.bit_length() - 1:
        pred_batch = out_batch.max(1)[1]
        accuracy = (pred_batch == target_batch).float().sum() / len(in_batch)
        print(f'Iteration {iteration} accuracy: {accuracy}')
def __init__(self):
    super(Net, self).__init__()
    self.hidden_layer = Linear(1, 20)
    self.out_layer = Linear(20, 1)
def __init__(self, in_channels):
    super().__init__()
    self.lin_src = Linear(in_channels, in_channels)
    self.lin_dst = Linear(in_channels, in_channels)
    self.lin_final = Linear(in_channels, 1)
print("b shape", b.shape) def forward(x): yhat = w * x + b return yhat x = torch.tensor([[1.0], [2.0], [3.0]]) yhat = forward(x) print("The Prediction: ", yhat) print("Y size", yhat.shape) torch.manual_seed(1) lr = Linear(in_features=1, out_features=1, bias=True) print("Parameters w and b: ", list(lr.parameters())) print("Python Dictionary", lr.state_dict()) print("keys:", lr.state_dict().keys()) print("values:", lr.state_dict().values()) print("weight:", lr.weight) print("bias:", lr.bias) x = torch.tensor([[1.0]]) yhat = lr(x) print("The prediction: ", yhat) x = torch.tensor([[1.0], [2.0]]) yhat = lr(x)
def __init__(self, pretrained="", checkpoint_path=None, freeze_nlayers=0,
             round_at: float = None, demo_mode=False):
    super(S20DeconvToDrySpotEff2, self).__init__()
    self.ct1 = ConvTranspose2d(1, 256, 3, stride=2)
    self.ct2 = ConvTranspose2d(256, 128, 5, stride=2)
    self.ct3 = ConvTranspose2d(128, 64, 10, stride=2)
    self.ct4 = ConvTranspose2d(64, 16, 17, stride=2)
    self.details = Conv2d(16, 8, 5)
    # ^ Pretrained ^
    self.c2 = Conv2d(8, 16, 13)
    self.c3 = Conv2d(16, 64, 7)
    self.c4 = Conv2d(64, 128, 3)
    self.c5 = Conv2d(128, 256, 3)
    self.c6 = Conv2d(256, 512, 3)
    self.c7 = Conv2d(512, 512, 1)
    self.maxpool = nn.MaxPool2d(2, 2)
    self.lin1 = Linear(1024, 256)
    self.lin2 = Linear(256, 1)
    self.dropout = nn.Dropout(0.3)
    # self.bn8 = nn.BatchNorm2d(8)
    # self.bn512 = nn.BatchNorm2d(512)
    self.round_at = round_at
    self.demo_mode = demo_mode

    if pretrained == "deconv_weights":
        logger = logging.getLogger(__name__)
        weights = load_model_layers_from_path(
            path=checkpoint_path,
            layer_names={'ct1', 'ct2', 'ct3', 'ct4', 'details'})
        incomp = self.load_state_dict(weights, strict=False)
        logger.debug(f'All layers: {self.state_dict().keys()}')
        logger.debug(f'Loaded weights but the following: {incomp}')

    if pretrained == "all":
        logger = logging.getLogger(__name__)
        weights = load_model_layers_from_path(
            path=checkpoint_path,
            layer_names={'ct1', 'ct2', 'ct3', 'ct4', 'details',
                         'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'lin1', 'lin2'})
        incomp = self.load_state_dict(weights, strict=False)
        logger.debug(f'All layers: {self.state_dict().keys()}')
        logger.debug(f'Loaded weights but the following: {incomp}')

    if freeze_nlayers == 0:
        return

    for i, c in enumerate(self.children()):
        logger = logging.getLogger(__name__)
        logger.info(f'Freezing: {c}')
        for param in c.parameters():
            param.requires_grad = False
        if i == freeze_nlayers - 1:
            break
def __init__(self,
             input_size: int,
             input_module_class: Callable,
             rnn_module_class: Callable,
             output_size: int,
             option_size: int,
             rnn_size: int,
             intra_option_policy: str,
             intra_option_kwargs: [dict, None] = None,
             input_module_kwargs: [dict, None] = None,
             use_interest: bool = False,
             use_diversity: bool = False,
             use_attention: bool = False,
             baselines_init: bool = True,
             prev_action: np.ndarray = np.ones(5, dtype=bool),
             prev_reward: np.ndarray = np.ones(5, dtype=bool),
             prev_option: np.ndarray = np.zeros(5, dtype=bool),
             NORM_EPS: float = 1e-6):
    super().__init__()
    if input_module_kwargs is None:
        input_module_kwargs = {}  # Assume module has all necessary arguments
    if intra_option_kwargs is None:
        intra_option_kwargs = {}
    input_module_kwargs = {**input_module_kwargs, **{'input_size': input_size}}  # Add input size
    intra_option_kwargs = {**intra_option_kwargs}
    self.use_interest = use_interest
    self.use_diversity = use_diversity
    self.use_attention = use_attention
    self.NORM_EPS = NORM_EPS
    pi_class = DiscreteIntraOptionPolicy if intra_option_policy == 'discrete' else ContinuousIntraOptionPolicy
    # Instantiate independent preprocessors for pi, pi_omega, q (and entropy), interest, and termination heads
    self.pi_proc, self.pi_omega_proc, self.q_proc, self.beta_proc = [
        input_module_class(**input_module_kwargs) for _ in range(4)
    ]
    self.int_proc = input_module_class(**input_module_kwargs) if use_interest else Dummy(option_size)
    if baselines_init:
        self.pi_proc.apply(apply_init)
        self.pi_omega_proc.apply(apply_init)
        self.q_proc.apply(apply_init)
        self.int_proc.apply(apply_init)
        self.beta_proc.apply(apply_init)
    input_size = self.pi_proc.output_size
    rnn_input_sizes = [
        input_size + prev_option[i] * option_size + prev_action[i] * output_size + prev_reward[i]
        for i in range(4)
    ]
    self.pi_rnn, self.beta_rnn, self.q_rnn, self.pi_omega_rnn = [
        rnn_module_class(s, rnn_size) for s in rnn_input_sizes
    ]
    self.int_rnn = rnn_module_class(
        input_size + prev_option[-1] * option_size + prev_action[-1] * output_size + prev_reward[-1],
        rnn_size) if use_interest else None
    if baselines_init:
        lstm_init = partial(apply_init, gain=O_INIT_VALUES['lstm'])
        self.pi_rnn.apply(lstm_init)
        self.pi_omega_rnn.apply(lstm_init)
        self.q_rnn.apply(lstm_init)
        self.beta_rnn.apply(lstm_init)
        if use_interest:
            self.int_rnn.apply(lstm_init)
    self.pi = Sequential(
        nn.ReLU(),
        pi_class(rnn_size, option_size, output_size, ortho_init=baselines_init, **intra_option_kwargs))
    self.beta = Sequential(nn.ReLU(), Linear(rnn_size, option_size), nn.Sigmoid())
    self.q = Sequential(nn.ReLU(), Linear(rnn_size, option_size))
    self.q_ent = Sequential(nn.ReLU(), Linear(rnn_size, option_size)) if use_diversity \
        else Dummy(option_size, out_value=0.)
    self.pi_omega = Sequential(nn.ReLU(), Linear(rnn_size, option_size), nn.Softmax(-1))
    self.interest = Sequential(nn.ReLU(), Linear(rnn_size, option_size), nn.Sigmoid()) \
        if use_interest else Dummy(option_size)
    self.p_a, self.p_o, self.p_r = prev_action, prev_option, prev_reward
    if baselines_init:
        init_v, init_pi = O_INIT_VALUES['v'], O_INIT_VALUES['pi']
        self.beta[1].apply(apply_init)
        self.pi_omega[1].apply(partial(apply_init, gain=init_pi))
        self.q[1].apply(partial(apply_init, gain=init_v))
        if use_interest:
            self.interest[1].apply(apply_init)  # init the Linear layer of the interest head
        if use_diversity:
            self.q_ent.apply(apply_init)
def __init__(self,
             input_size: int,
             input_module_class: Callable,
             output_size: int,
             option_size: int,
             intra_option_policy: str,
             intra_option_kwargs: [dict, None] = None,
             input_module_kwargs: [dict, None] = None,
             use_interest: bool = False,
             use_diversity: bool = False,
             use_attention: bool = False,
             baselines_init: bool = True,
             NORM_EPS: float = 1e-6):
    super().__init__()
    if input_module_kwargs is None:
        input_module_kwargs = {}  # Assume module has all necessary arguments
    if intra_option_kwargs is None:
        intra_option_kwargs = {}
    input_module_kwargs = {**input_module_kwargs, **{'input_size': input_size}}  # Add input size
    intra_option_kwargs = {**intra_option_kwargs}
    self.use_interest = use_interest
    self.use_diversity = use_diversity
    self.use_attention = use_attention
    self.NORM_EPS = NORM_EPS
    pi_class = DiscreteIntraOptionPolicy if intra_option_policy == 'discrete' else ContinuousIntraOptionPolicy
    # Instantiate independent preprocessors for pi, pi_omega, q (and entropy), interest, and termination heads
    pi_proc, pi_omega_proc, q_proc, int_proc, beta_proc = [
        input_module_class(**input_module_kwargs) for _ in range(5)
    ]
    if baselines_init:
        pi_proc.apply(apply_init)
        pi_omega_proc.apply(apply_init)
        q_proc.apply(apply_init)
        int_proc.apply(apply_init)
        beta_proc.apply(apply_init)
    input_size = pi_proc.output_size
    self.pi = Sequential(
        pi_proc,
        pi_class(input_size, option_size, output_size, ortho_init=baselines_init, **intra_option_kwargs))
    self.beta = Sequential(beta_proc, Linear(input_size, option_size), nn.Sigmoid())
    self.q = Sequential(q_proc, Linear(input_size, option_size))
    self.q_ent = Sequential(q_proc, Linear(input_size, option_size)) if use_diversity \
        else Dummy(option_size, out_value=0.)
    self.pi_omega = Sequential(pi_omega_proc, Linear(input_size, option_size), nn.Softmax(-1))
    self.interest = Sequential(int_proc, Linear(input_size, option_size), nn.Sigmoid()) \
        if use_interest else Dummy(option_size)
    if baselines_init:
        init_v, init_pi = O_INIT_VALUES['v'], O_INIT_VALUES['pi']
        self.beta[1].apply(apply_init)
        self.pi_omega[1].apply(partial(apply_init, gain=init_pi))
        self.q[1].apply(partial(apply_init, gain=init_v))
        if use_interest:
            self.interest[1].apply(apply_init)
        if use_diversity:
            self.q_ent[1].apply(apply_init)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)

order = y_train.argsort(axis=0)
y_train = y_train.values[order]
y_train = np.reshape(y_train, newshape=(y_train.shape[0], 1))
x_train = x_train.values[order, :]

x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(y_train)

Net = Sequential(
    # BatchNorm1d(num_features=2),
    Linear(in_features=2, out_features=10),
    ReLU(inplace=True),
    Linear(in_features=10, out_features=1),
)

optimizer = RMSprop(Net.parameters(), lr=0.001)
loss_func = MSELoss()

x_data, y_data = Variable(x_train), Variable(y_train)
bar = ProgressBar(1, STEPS, "train_loss:%.9f")
predict = []
myloss = []

for step in range(STEPS):
    prediction = Net(x_data)
def __init__(self):
    super(LinearClassifier, self).__init__()
    self.fully_connected = Linear(2, 1)
)
from backpack import convert_module_to_backpack
from backpack.custom_module.branching import Parallel

SQRT_GGN_SETTINGS = SECONDORDER_SETTINGS

###############################################################################
#                                  Embedding                                  #
###############################################################################
SQRT_GGN_SETTINGS += [
    {
        "input_fn": lambda: randint(0, 5, (6,)),
        "module_fn": lambda: Sequential(
            Embedding(5, 3),
            Linear(3, 4),
        ),
        "loss_function_fn": lambda: CrossEntropyLoss(reduction="mean"),
        "target_fn": lambda: classification_targets((6,), 4),
    },
    {
        "input_fn": lambda: randint(0, 3, (3, 2, 2)),
        "module_fn": lambda: Sequential(
            Embedding(3, 2),
            Flatten(),
        ),
        "loss_function_fn": lambda: CrossEntropyLoss(reduction="mean"),
        "target_fn": lambda: classification_targets((3,), 2 * 2),
        "seed": 1,
    },
]
def _test_create_supervised_trainer(
    model_device: Optional[str] = None,
    trainer_device: Optional[str] = None,
    trace: bool = False,
    amp_mode: str = None,
    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
):
    model = Linear(1, 1)

    if model_device:
        model.to(model_device)

    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)

    if trace:
        example_input = torch.randn(1, 1)
        model = torch.jit.trace(model, example_input)

    if amp_mode == "apex" and model_device == trainer_device == "cuda":
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    trainer = create_supervised_trainer(
        model,
        optimizer,
        mse_loss,
        device=trainer_device,
        output_transform=lambda x, y, y_pred, loss: (y_pred, loss.item()),
        amp_mode=amp_mode,
        scaler=scaler,
    )

    x = torch.tensor([[0.1], [0.2]])
    y = torch.tensor([[0.3], [0.5]])
    data = [(x, y)]

    assert model.weight.data[0, 0].item() == approx(0.0)
    assert model.bias.item() == approx(0.0)

    if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
        state = trainer.run(data)

        assert state.output[-1] == approx(0.17), state.output[-1]
        assert round(model.weight.data[0, 0].item(), 3) == approx(0.013), model.weight.item()
        assert round(model.bias.item(), 3) == approx(0.08), model.bias.item()

        if amp_mode == "amp":
            assert state.output[0].dtype is torch.half
            if scaler and isinstance(scaler, bool):
                assert hasattr(state, "scaler")
            else:
                assert not hasattr(state, "scaler")
    else:
        if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
            # This is broken in 1.6.0 but will probably be fixed in 1.7.0
            with pytest.raises(RuntimeError, match=r"is on CPU, but expected them to be on GPU"):
                trainer.run(data)
def __init__(self, observable_dim: int, delay: int, latent_dim: int) -> None:
    super().__init__()
    self.linear_embedder = Linear(in_features=observable_dim * delay,
                                  out_features=latent_dim)
def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
    super().__init__()
    self.fc1 = Linear(input_dim, hidden_dim)
    self.fc2 = Linear(hidden_dim, output_dim)
def __init__(self, dim):
    super(NetGIN, self).__init__()

    self.node_attribute_encoder = Sequential(Linear(2 * 13, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                                             Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.type_encoder = Sequential(Linear(3, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                                   Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.edge_encoder = Sequential(Linear(4 + 1, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                                   Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.mlp = Sequential(Linear(3 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                          Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    nn1_1 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    nn1_2 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.conv1_1 = GINConv(nn1_1, train_eps=True)
    self.conv1_2 = GINConv(nn1_2, train_eps=True)
    self.mlp_1 = Sequential(Linear(2 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                            Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    nn2_1 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    nn2_2 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.conv2_1 = GINConv(nn2_1, train_eps=True)
    self.conv2_2 = GINConv(nn2_2, train_eps=True)
    self.mlp_2 = Sequential(Linear(2 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                            Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    nn3_1 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    nn3_2 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.conv3_1 = GINConv(nn3_1, train_eps=True)
    self.conv3_2 = GINConv(nn3_2, train_eps=True)
    self.mlp_3 = Sequential(Linear(2 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                            Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    nn4_1 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    nn4_2 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.conv4_1 = GINConv(nn4_1, train_eps=True)
    self.conv4_2 = GINConv(nn4_2, train_eps=True)
    self.mlp_4 = Sequential(Linear(2 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                            Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    nn5_1 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    nn5_2 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.conv5_1 = GINConv(nn5_1, train_eps=True)
    self.conv5_2 = GINConv(nn5_2, train_eps=True)
    self.mlp_5 = Sequential(Linear(2 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                            Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    nn6_1 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    nn6_2 = Sequential(Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                       Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())
    self.conv6_1 = GINConv(nn6_1, train_eps=True)
    self.conv6_2 = GINConv(nn6_2, train_eps=True)
    self.mlp_6 = Sequential(Linear(2 * dim, dim), torch.nn.BatchNorm1d(dim), ReLU(),
                            Linear(dim, dim), torch.nn.BatchNorm1d(dim), ReLU())

    self.set2set = Set2Set(1 * dim, processing_steps=6)
    self.fc1 = Linear(2 * dim, dim)
    self.fc4 = Linear(dim, 12)
def __init__(self, embedding_size, out_h, out_w):
    super(MobileFaceNet, self).__init__()
    self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
    self.conv2_dw = Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
    self.conv_23 = Depth_Wise(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128)
    self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
    self.conv_34 = Depth_Wise(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
    self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
    self.conv_45 = Depth_Wise(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512)
    self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
    self.conv_6_sep = Conv_block(128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
    # self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0))
    # self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(4, 7), stride=(1, 1), padding=(0, 0))
    self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(out_h, out_w), stride=(1, 1), padding=(0, 0))
    self.conv_6_flatten = Flatten()
    self.linear = Linear(512, embedding_size, bias=False)
    self.bn = BatchNorm1d(embedding_size)
def __init__(self):
    super().__init__()
    self.layer = Linear(4, 4)
def __init__(self, in_channels, hidden_channels, out_channels):
    super(Block, self).__init__()
    self.conv1 = DenseGCNConv(in_channels, hidden_channels)
    self.conv2 = DenseGCNConv(hidden_channels, hidden_channels)
    self.lin = Linear(hidden_channels + hidden_channels, out_channels)
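# The forward pass is not shown above. In dense (DiffPool-style) PyTorch Geometric
# examples, a block like this usually concatenates the outputs of both convolutions
# before the final linear layer, which matches the Linear input width of
# hidden_channels + hidden_channels. A minimal sketch under that assumption, with
# `torch` and `torch.nn.functional as F` assumed to be imported:
def forward(self, x, adj, mask=None):
    x1 = F.relu(self.conv1(x, adj, mask))         # first dense GCN layer
    x2 = F.relu(self.conv2(x1, adj, mask))        # second dense GCN layer
    return self.lin(torch.cat([x1, x2], dim=-1))  # jump connection into the Linear head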
def __init__(self, dim_in, dim_out, dim_ctx):
    super(ConcatSquashLinear, self).__init__()
    self._layer = Linear(dim_in, dim_out)
    self._hyper_bias = Linear(dim_ctx, dim_out, bias=False)
    self._hyper_gate = Linear(dim_ctx, dim_out)
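# How the three layers interact is not shown here. In ConcatSquash-style layers
# (as used in FFJORD-like models), the context typically gates and shifts the main
# linear output. A minimal sketch under that assumption, with `torch` assumed imported:
def forward(self, ctx, x):
    gate = torch.sigmoid(self._hyper_gate(ctx))  # context-dependent gate in (0, 1)
    bias = self._hyper_bias(ctx)                 # context-dependent shift
    return self._layer(x) * gate + bias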
def __init__(self):
    super().__init__()
    self.inner = FSDP(Linear(4, 4), **fsdp_config)
    self.outer = Linear(4, 5)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             use_attention: bool,
             seq2seq_encoder: Seq2SeqEncoder,
             seq2vec_encoder: Seq2VecEncoder,
             span_end_encoder_after: Seq2SeqEncoder,
             use_decoder_trainer: bool,
             decoder_beam_search: BeamSearch,
             kb_configs: dict,
             other_configs: dict,
             initializer: InitializerApplicator) -> None:
    super(ProStructModel, self).__init__(vocab)

    self.text_field_embedder = text_field_embedder
    self.num_actions = len(Action)  # number of actions is hardcoded here.
    # They are defined in the Action enum in propara_dataset_reader.py
    self.other_configs = other_configs

    # kb_coefficient * kb_score + (1 - kb_coefficient) * model_score
    self.kb_coefficient = torch.nn.Parameter(torch.ones(1).mul(kb_configs.get('kb_coefficient', 0.5)))

    self.use_attention = use_attention
    self.use_decoder_trainer = use_decoder_trainer

    if self.use_attention:
        self.seq2seq_encoder = seq2seq_encoder
        self.time_distributed_seq2seq_encoder = TimeDistributed(TimeDistributed(self.seq2seq_encoder))
        self.time_distributed_attention_layer = \
            TimeDistributed(TimeDistributed(
                Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                                 seq2seq_encoder.get_output_dim()),
                          normalize=True)))
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(), self.num_actions)
    else:
        self.seq2vec_encoder = seq2vec_encoder
        self.time_distributed_seq2vec_encoder = TimeDistributed(TimeDistributed(self.seq2vec_encoder))
        self.aggregate_feedforward = Linear(seq2vec_encoder.get_output_dim(), self.num_actions)

    self.span_end_encoder_after = span_end_encoder_after  # per step, per participant
    self.time_distributed_encoder_span_end_after = TimeDistributed(TimeDistributed(self.span_end_encoder_after))

    # Fixme: dimensions
    self._span_start_predictor_after = TimeDistributed(TimeDistributed(
        torch.nn.Linear(2 + 2 * seq2seq_encoder.get_output_dim(), 1)))
    self._span_end_predictor_after = TimeDistributed(TimeDistributed(
        torch.nn.Linear(span_end_encoder_after.get_output_dim(), 1)))

    self._type_accuracy = BooleanAccuracy()
    self._loss = torch.nn.CrossEntropyLoss(ignore_index=-1)  # Fixme: This is less robust. If the masking value
    # Fixme: add a metric for location span strings
    self.span_metric = SquadEmAndF1()

    if self.use_decoder_trainer:
        self.decoder_trainer = MaximumMarginalLikelihood()
        if kb_configs['kb_to_use'] == 'lexicalkb':
            kb = KBLexical(
                lexical_kb_path=kb_configs['lexical_kb_path'],
                fullgrid_prompts_load_path=kb_configs['fullgrid_prompts_load_path']
            )

        # Makeshift arrangement to get the number of participants in tiny.tsv.
        self.commonsense_based_action_generator = CommonsenseBasedActionGenerator(self.num_actions)
        self.rules_activated = [
            int(rule_val.strip()) > 0
            for rule_val in self.other_configs.get('constraint_rules_to_turn_on', '0,0,0,1').split(",")
        ]
        self.rule_2_fraction_participants = self.other_configs.get('rule_2_fraction_participants', 0.5)
        self.rule_3_fraction_steps = self.other_configs.get('rule_3_fraction_steps', 0.5)
        self.commonsense_based_action_generator.set_rules_used(self.rules_activated,
                                                               self.rule_2_fraction_participants,
                                                               self.rule_3_fraction_steps)
        # [self.rules_activated[0],  # C/D/C/D cannot happen
        #  self.rules_activated[1],  # > 1/2 participants
        #  self.rules_activated[2],  # > 1/2 steps cannot change
        #  self.rules_activated[3]   # until mentioned
        #  ])
        self.decoder_step = ProParaDecoderStep(
            KBBasedActionScorer(kb=kb, kb_coefficient=self.kb_coefficient),
            valid_action_generator=self.commonsense_based_action_generator)

    self.beam_search = decoder_beam_search
    initializer(self)
                         shuffle=True)

images, label = next(iter(trainloader))
images.size()

im1 = images[0]
im1.size()
im1_plt = np.squeeze(im1)
plt.imshow(im1_plt)

for image, label in trainloader:
    pass  # Apply your DL on the dataset.

############################################ Linear transformation
from torch.nn import Linear

## linear layer
l1 = Linear(in_features=10, out_features=5, bias=True)

## inputs
inp = Variable(torch.randn(1, 10))

## apply the linear transformation to the inputs
l1(inp).size()

## accessing the trainable parameters
l1.weight
## the weight is stored as (out_features, in_features); the (1x10) input is
## multiplied by its transpose (10x5) so the matmul yields out_features: (1x5)
l1.weight.size()
l1.bias

## super is a shortcut to access a base class without having to know its type or name
## here super is used to pass arguments of the child class to the parent class
## sample network code
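## A minimal sketch of such a sample network (my own illustration, not from the
## original tutorial): the child class hands its setup to nn.Module via super()
## and composes Linear layers.
import torch
import torch.nn as nn


class SampleNet(nn.Module):
    def __init__(self, in_features=10, hidden=5, out_features=2):
        super(SampleNet, self).__init__()  # run nn.Module's __init__ first
        self.fc1 = Linear(in_features, hidden)
        self.fc2 = Linear(hidden, out_features)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


net = SampleNet()
print(net(torch.randn(1, 10)).size())  # torch.Size([1, 2])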
def test_add_param_group(debias_ewma):
    """Test AdaScale supports add_param_group() API."""
    model1 = Linear(2, 2, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic, which is needed for
        # multi-layer models. For them, adascale gain is affected by
        # parameters from other layers.
        model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
        model1.bias.fill_(0.1)
    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
    assert len(optim._hook_handles) == 2

    model2 = Linear(2, 3, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model2.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
        model2.bias.fill_(0.2)
    optim.add_param_group({"params": model2.parameters()})
    assert len(optim._hook_handles) == 4

    # make sure we can run the model.
    model = Sequential(model1, model2).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()

    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure the gains are right and we can step.
    # since this is the first step, debias_ewma doesn't affect the value.
    assert np.allclose(optim.gain(), 1.1440223454935758), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571428571428), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1471258476157762), optim.gain(1)
    optim.step()
    optim.zero_grad()

    # make sure we can add a PG again after stepping.
    model3 = Linear(3, 4, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model3.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
        model3.bias.fill_(0.2)
    optim.add_param_group({"params": model3.parameters()})
    assert len(optim._hook_handles) == 6

    # make sure we can run the model.
    model = Sequential(model1, model2, model3).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()

    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure gains are right and we can step.
    # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
    optim.step()
    optim.zero_grad()
""" Compute the gradient with PyTorch and the KFLR approximation with BackPACK. """ from torch.nn import CrossEntropyLoss, Flatten, Linear, Sequential from backpack import backpack, extend, extensions from backpack.utils.examples import load_mnist_data B = 4 X, y = load_mnist_data(B) print("# Gradient with PyTorch, KFLR approximation with BackPACK | B =", B) model = Sequential(Flatten(), Linear(784, 10),) lossfunc = CrossEntropyLoss() model = extend(model) lossfunc = extend(lossfunc) loss = lossfunc(model(X), y) with backpack(extensions.KFLR()): loss.backward() for name, param in model.named_parameters(): print(name) print(".grad.shape: ", param.grad.shape) print(".kflr (shapes): ", [kflr.shape for kflr in param.kflr])