def __init__(self, input_dim, n_hidden, n_layer, dropout, n_hop):
    """Create the LSTM plus the attention and hop parameters.

    Args:
        input_dim: size of the vectors fed to the LSTM and of ``_init_i``.
        n_hidden: LSTM hidden size; also the width of every attention
            projection created here.
        n_layer: number of stacked LSTM layers.
        dropout: dropout value forwarded to ``nn.LSTM``.
        n_hop: stored on ``self._n_hop``; not otherwise used here.
    """
    super().__init__()

    # Per-layer state parameters and one input-sized vector. Each tensor is
    # allocated uninitialised and filled right away.
    self._init_h = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    init.uniform_(self._init_h, -INI, INI)
    self._init_c = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    init.uniform_(self._init_c, -INI, INI)
    self._init_i = nn.Parameter(torch.Tensor(input_dim))
    init.uniform_(self._init_i, -0.1, 0.1)

    self._lstm = nn.LSTM(
        input_dim,
        n_hidden,
        n_layer,
        bidirectional=False,
        dropout=dropout,
    )
    self._lstm_cell = None

    # attention parameters
    self._attn_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    init.xavier_normal_(self._attn_wm)
    self._attn_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    init.xavier_normal_(self._attn_wq)
    self._attn_v = nn.Parameter(torch.Tensor(n_hidden))
    init.uniform_(self._attn_v, -INI, INI)

    # hop parameters (same shapes as the attention parameters)
    self._hop_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    init.xavier_normal_(self._hop_wm)
    self._hop_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    init.xavier_normal_(self._hop_wq)
    self._hop_v = nn.Parameter(torch.Tensor(n_hidden))
    init.uniform_(self._hop_v, -INI, INI)

    self._n_hop = n_hop
def __init__(self, layers, activations, fl_init, action_dim) -> None:
    """Assemble the critic's linear layers and batch-norm ops.

    Args:
        layers: layer widths; at least three entries, the last one must be 1.
        activations: stored unchanged on ``self.activations``.
        fl_init: half-width of the uniform init applied to the final layer.
        action_dim: extra input width added to the second linear layer.
    """
    super(CriticNet, self).__init__()
    self.layers: nn.ModuleList = nn.ModuleList()
    self.batch_norm_ops: nn.ModuleList = nn.ModuleList()
    self.activations = activations

    assert len(layers) >= 3, "Invalid layer schema {} for critic network".format(
        layers
    )
    assert layers[-1] == 1, "Only one output node for the critic net"

    final_idx = len(layers) - 2
    for idx, (width_in, width_out) in enumerate(zip(layers, layers[1:])):
        # Only the second linear layer is widened by action_dim (actions
        # skip the input layer).
        linear_in = width_in + action_dim if idx == 1 else width_in
        self.layers.append(nn.Linear(linear_in, width_out))

        # Batch norm only applied to pre-action layers (the first two).
        if idx <= 1:
            self.batch_norm_ops.append(nn.BatchNorm1d(width_in))

        current = self.layers[idx]
        if idx == final_idx:
            # Last layer: simple uniform init (as outlined in DDPG paper)
            init.uniform_(current.weight, -fl_init, fl_init)
            init.uniform_(current.bias, -fl_init, fl_init)
        else:
            # Other layers: fan-in uniform init (as outlined in DDPG paper)
            fan_in_init(current.weight, current.bias)
def __init__(self, *args, **kwargs):
    """Run the parent constructor, then add a ``_stop`` parameter.

    The pointer network is taken from the first positional argument, or
    from the ``ptr_net`` keyword when no positional arguments are given.
    """
    super().__init__(*args, **kwargs)
    ptr_net = args[0] if args else kwargs['ptr_net']
    assert isinstance(ptr_net, LSTMPointerNet)

    # The stop vector has the LSTM cell's input size; `_lstm_cell` is
    # presumably populated by the parent constructor -- TODO confirm.
    stop_dim = self._lstm_cell.input_size
    self._stop = nn.Parameter(torch.Tensor(stop_dim))
    init.uniform_(self._stop, -INI, INI)
def __init__(self, input_dim, n_hidden, n_layer, dropout, bidirectional):
    """Create the LSTM and its two state parameters.

    Args:
        input_dim: LSTM input size.
        n_hidden: LSTM hidden size.
        n_layer: number of stacked LSTM layers.
        dropout: dropout value forwarded to ``nn.LSTM``.
        bidirectional: whether the LSTM runs in both directions; doubles
            the number of rows of the state parameters.
    """
    super().__init__()

    n_direction = 2 if bidirectional else 1
    state_rows = n_layer * n_direction

    self._init_h = nn.Parameter(torch.Tensor(state_rows, n_hidden))
    self._init_c = nn.Parameter(torch.Tensor(state_rows, n_hidden))
    for state in (self._init_h, self._init_c):
        init.uniform_(state, -INI, INI)

    self._lstm = nn.LSTM(
        input_dim,
        n_hidden,
        n_layer,
        dropout=dropout,
        bidirectional=bidirectional,
    )
def __init__(self, vocab_size, emb_dim, n_hidden, bidirectional, n_layer,
             dropout=0.0):
    """Build embedding, encoder LSTM, decoder cells, attention and projection.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_dim: embedding width.
        n_hidden: hidden size of the encoder and decoder LSTMs.
        bidirectional: whether the encoder LSTM is bidirectional.
        n_layer: number of layers for both encoder and decoder.
        dropout: dropout value forwarded to both LSTMs.
    """
    super().__init__()
    n_direction = 2 if bidirectional else 1
    state_layer = n_layer * n_direction
    enc_out_dim = n_hidden * n_direction

    # The embedding weight is shared between encoder and decoder and is
    # used as the final projection to vocabulary logits; it can be
    # initialised with pretrained word vectors.
    self._embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
    self._enc_lstm = nn.LSTM(
        emb_dim,
        n_hidden,
        n_layer,
        bidirectional=bidirectional,
        dropout=dropout,
    )

    # initial encoder LSTM states are learned parameters
    self._init_enc_h = nn.Parameter(torch.Tensor(state_layer, n_hidden))
    self._init_enc_c = nn.Parameter(torch.Tensor(state_layer, n_hidden))
    for enc_state in (self._init_enc_h, self._init_enc_c):
        init.uniform_(enc_state, -INIT, INIT)

    # vanilla lstm / LNlstm
    self._dec_lstm = MultiLayerLSTMCells(
        2 * emb_dim, n_hidden, n_layer, dropout=dropout
    )

    # project encoder final states to decoder initial states
    self._dec_h = nn.Linear(enc_out_dim, n_hidden, bias=False)
    self._dec_c = nn.Linear(enc_out_dim, n_hidden, bias=False)

    # multiplicative attention
    self._attn_wm = nn.Parameter(torch.Tensor(enc_out_dim, n_hidden))
    self._attn_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    for attn_weight in (self._attn_wm, self._attn_wq):
        init.xavier_normal_(attn_weight)

    # project decoder output to emb_dim, then apply the weight matrix from
    # the embedding layer
    self._projection = nn.Sequential(
        nn.Linear(2 * n_hidden, n_hidden),
        nn.Tanh(),
        nn.Linear(n_hidden, emb_dim, bias=False),
    )

    # functional object for easier usage
    self._decoder = AttentionalLSTMDecoder(
        self._embedding, self._dec_lstm, self._attn_wq, self._projection
    )
def __init__(self, layers, activations, fl_init) -> None:
    """Assemble the actor's linear layers, each paired with a batch-norm op.

    Args:
        layers: layer widths; at least two entries are required.
        activations: stored unchanged on ``self.activations``.
        fl_init: half-width of the uniform init applied to the final layer.
    """
    super(ActorNet, self).__init__()
    self.layers: nn.ModuleList = nn.ModuleList()
    self.batch_norm_ops: nn.ModuleList = nn.ModuleList()
    self.activations = activations

    assert len(layers) >= 2, "Invalid layer schema {} for actor network".format(
        layers
    )

    final_idx = len(layers) - 2
    for idx, (width_in, width_out) in enumerate(zip(layers, layers[1:])):
        self.layers.append(nn.Linear(width_in, width_out))
        self.batch_norm_ops.append(nn.BatchNorm1d(width_in))

        current = self.layers[idx]
        if idx == final_idx:
            # Last layer: simple uniform init (as outlined in DDPG paper)
            init.uniform_(current.weight, -fl_init, fl_init)
            init.uniform_(current.bias, -fl_init, fl_init)
        else:
            # Other layers: fan-in uniform init (as outlined in DDPG paper)
            fan_in_init(current.weight, current.bias)
def __init__(self, context_dim, state_dim, input_dim, bias=True):
    """Create one weight vector per input kind plus an optional scalar bias.

    Args:
        context_dim: length of the ``_v_c`` vector.
        state_dim: length of the ``_v_s`` vector.
        input_dim: length of the ``_v_i`` vector.
        bias: when true a one-element bias ``_b`` (initialised to zero) is
            created; otherwise ``_b`` is registered as ``None``.
    """
    super().__init__()
    self._v_c = nn.Parameter(torch.Tensor(context_dim))
    self._v_s = nn.Parameter(torch.Tensor(state_dim))
    self._v_i = nn.Parameter(torch.Tensor(input_dim))
    init.uniform_(self._v_c, -INIT, INIT)
    init.uniform_(self._v_s, -INIT, INIT)
    init.uniform_(self._v_i, -INIT, INIT)
    if bias:
        self._b = nn.Parameter(torch.zeros(1))
    else:
        # Registering None keeps `self._b` readable (it evaluates to None)
        # without adding an entry to parameters()/state_dict. The previous
        # call was a misspelled, non-existent method and raised
        # AttributeError whenever bias=False.
        self.register_parameter('_b', None)
def kaiming_bias_init(b, *kwargs):
    """Fill ``b`` uniformly in ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]`` and return it.

    ``fan_in`` is taken from ``self.weight``. Extra positional arguments are
    accepted and ignored.
    """
    # NOTE(review): `self` is not a parameter of this function; presumably
    # it is a closure defined inside a method -- confirm against the caller.
    fan_in = init._calculate_fan_in_and_fan_out(self.weight)[0]
    limit = 1 / math.sqrt(fan_in)
    return init.uniform_(b, -limit, limit)
def glorot(shape):
    """Return a new tensor of ``shape`` with Glorot & Bengio (AISTATS 2010) init.

    Values are drawn uniformly from ``[-r, r]`` with
    ``r = sqrt(6 / (shape[0] + shape[1]))``.
    """
    fan_sum = shape[0] + shape[1]
    init_range = np.sqrt(6.0 / fan_sum)
    # uniform_ fills in place and hands back the same tensor.
    return uniform_(torch.empty(shape), a=-init_range, b=init_range)
def fan_in_init(weight_tensor, bias_tensor) -> None:
    """Fan-in initialization as described in the DDPG paper.

    Both tensors are filled in place, uniformly in ``[-r, r]`` with
    ``r = 1 / sqrt(weight_tensor.size(1))``.
    """
    limit = 1.0 / np.sqrt(weight_tensor.size(1))
    for target in (weight_tensor, bias_tensor):
        init.uniform_(target, -limit, limit)
def reset_parameters(self, dim):
    """Fill ``z0``, ``log_alpha`` and ``beta`` uniformly in ``[-sqrt(1/dim), sqrt(1/dim)]``."""
    limit = math.sqrt(1 / dim)
    for attr in ("z0", "log_alpha", "beta"):
        init.uniform_(getattr(self, attr), -limit, limit)
def __init__(self, args, batchNorm=False, div_flow=20., requires_grad=False):
    """Build the stacked FlowNet2 sub-networks and initialise conv weights.

    Args:
        args: options object; ``args.rgb_max`` and ``args.fp16`` are read
            here and ``args`` is also handed to every sub-network.
        batchNorm: forwarded to every sub-network.
        div_flow: stored on ``self.div_flow``.
        requires_grad: when false, every parameter is frozen at the end.
    """
    super(FlowNet2, self).__init__()
    self.batchNorm = batchNorm
    self.div_flow = div_flow
    self.rgb_max = args.rgb_max
    self.args = args

    def make_resample():
        # In fp16 mode the resampler is wrapped with tofp32/tofp16.
        if args.fp16:
            return nn.Sequential(tofp32(), Resample2d(), tofp16())
        return Resample2d()

    def init_conv(layer):
        # Bias (when present) is drawn first, then the Xavier weights.
        if layer.bias is not None:
            init.uniform_(layer.bias)
        init.xavier_uniform_(layer.weight)

    self.channelnorm = ChannelNorm()

    # First Block (FlowNetC)
    self.flownetc = FlowNetC.FlowNetC(args, batchNorm=self.batchNorm)
    self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')
    self.resample1 = make_resample()

    # Block (FlowNetS1)
    self.flownets_1 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)
    self.upsample2 = nn.Upsample(scale_factor=4, mode='bilinear')
    self.resample2 = make_resample()

    # Block (FlowNetS2)
    self.flownets_2 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)

    # Block (FlowNetSD)
    self.flownets_d = FlowNetSD.FlowNetSD(args, batchNorm=self.batchNorm)
    self.upsample3 = nn.Upsample(scale_factor=4, mode='nearest')
    self.upsample4 = nn.Upsample(scale_factor=4, mode='nearest')
    self.resample3 = make_resample()
    self.resample4 = make_resample()

    # Block (FlowNetFusion)
    self.flownetfusion = FlowNetFusion.FlowNetFusion(
        args, batchNorm=self.batchNorm)

    for module in self.modules():
        if isinstance(module, nn.Conv2d):
            init_conv(module)
        if isinstance(module, nn.ConvTranspose2d):
            init_conv(module)

    if requires_grad:
        return
    for param in self.parameters():
        param.requires_grad = False
def reset_parameters(self):
    """Kaiming-uniform weights; bias (if any) uniform in ``+-1/sqrt(fan_in)``."""
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is None:
        return
    fan_in = init._calculate_fan_in_and_fan_out(self.weight)[0]
    limit = 1 / math.sqrt(fan_in)
    init.uniform_(self.bias, -limit, limit)
def reset_parameters(self):
    """Fill ``self.init_param`` in place, uniformly in ``[-0.5, 0.5]``."""
    half = 1 / 2
    init.uniform_(self.init_param, -half, half)
def reset_parameters(self) -> None:
    """Fill weight and bias (if any) uniformly in ``+-1/sqrt(weight.size(1))``."""
    limit = 1 / math.sqrt(self.weight.size(1))
    init.uniform_(self.weight, -limit, limit)
    if self.bias is None:
        return
    init.uniform_(self.bias, -limit, limit)
def reset_bn_parameters(self):
    """Reset running stats, draw ``gamma`` from U[0, 1) and zero ``beta``."""
    self.reset_running_stats()
    scale, shift = self.gamma, self.beta
    init.uniform_(scale)  # default range of uniform_ is [0, 1)
    init.zeros_(shift)
def fan_in_init(tensor) -> None:
    """Fan-in initialization as described in the DDPG paper.

    Fills ``tensor`` in place, uniformly in ``[-r, r]`` with
    ``r = 1 / sqrt(tensor.size(1))``.
    """
    fan_in = tensor.size(1)
    limit = 1. / np.sqrt(fan_in)
    init.uniform_(tensor, -limit, limit)
def linear_init(model):
    """Fill every parameter of ``model`` in place, uniformly in ``[-0.05, 0.05]``."""
    low, high = -0.05, 0.05
    for param in model.parameters():
        init.uniform_(param, low, high)
def reset_parameters(self):
    """Reset running stats, draw ``weight`` from U[0, 1) and zero ``bias``."""
    self.reset_running_stats()
    scale, shift = self.weight, self.bias
    init.uniform_(scale)  # default range of uniform_ is [0, 1)
    init.zeros_(shift)
def reset_parameters(self):
    """Fill every trainable parameter uniformly in ``+-1/sqrt(hidden_size)``.

    Parameters with ``requires_grad`` set to false are left untouched.
    """
    limit = 1.0 / math.sqrt(self.hidden_size)
    for param in self.parameters():
        if not param.requires_grad:
            continue
        init.uniform_(param, -limit, limit)
def kaiming_uniform_(tensor: Tensor, fan: int, a: float = 0.,
                     nonlinearity: str = 'leaky_relu') -> Tensor:
    """Fill ``tensor`` in place with a Kaiming-style uniform init and return it.

    The range is ``+-gain * sqrt(3 / fan)`` where ``gain`` comes from
    ``calculate_gain(nonlinearity, a)``. The fan is passed in explicitly
    rather than derived from the tensor's shape.
    """
    gain = calculate_gain(nonlinearity, a)
    bound = gain * (3.0 / fan) ** 0.5
    return init.uniform_(tensor, a=-bound, b=+bound)
def xavier_uniform_(tensor: Tensor, fan_in: int, fan_out: int,
                    gain: float = 1.) -> Tensor:
    """Fill ``tensor`` in place with a Xavier-style uniform init and return it.

    The range is ``+-gain * sqrt(6 / (fan_in + fan_out))``. Both fans are
    passed in explicitly rather than derived from the tensor's shape.
    """
    fan_total = fan_in + fan_out
    bound = gain * (6.0 / fan_total) ** 0.5
    return init.uniform_(tensor, a=-bound, b=+bound)
def __init__(
    self,
    emb_size,
    emb_dimension,
    batch_size,
    only_cpu,
    only_gpu,
    only_fst,
    only_snd,
    mix,
    neg_weight,
    negative,
    lr,
    lap_norm,
    fast_neg,
    record_loss,
    async_update,
    num_threads,
):
    """Initialize the embeddings on CPU.

    Parameters
    ----------
    emb_size int : number of nodes
    emb_dimension int : embedding dimension
    batch_size int : number of node sequences in each batch
    only_cpu bool : training with CPU
    only_gpu bool : training with GPU
    only_fst bool : only embedding for first-order proximity
    only_snd bool : only embedding for second-order proximity
    mix bool : mixed training with CPU and GPU
    neg_weight float : negative weight
    negative int : negative samples for each positive node pair
    lr float : initial learning rate
    lap_norm float : weight of laplacian normalization
    fast_neg bool : do negative sampling inside a batch
    record_loss bool : print the loss during training
    async_update : asynchronous training
    num_threads : stored on ``self.num_threads``; not used in this constructor
    """
    super(SkipGramModel, self).__init__()
    self.emb_size = emb_size
    self.batch_size = batch_size
    self.only_cpu = only_cpu
    self.only_gpu = only_gpu

    # With both proximities enabled, each one gets half of the requested
    # dimension.
    if only_fst:
        self.fst, self.snd = True, False
        self.emb_dimension = emb_dimension
    elif only_snd:
        self.fst, self.snd = False, True
        self.emb_dimension = emb_dimension
    else:
        self.fst, self.snd = True, True
        self.emb_dimension = int(emb_dimension / 2)

    self.mixed_train = mix
    self.neg_weight = neg_weight
    self.negative = negative
    self.lr = lr
    self.lap_norm = lap_norm
    self.fast_neg = fast_neg
    self.record_loss = record_loss
    self.async_update = async_update
    self.num_threads = num_threads

    # initialize the device as cpu
    self.device = torch.device("cpu")

    def new_embedding():
        return nn.Embedding(self.emb_size, self.emb_dimension, sparse=True)

    # embedding
    initrange = 1.0 / self.emb_dimension
    if self.fst:
        self.fst_u_embeddings = new_embedding()
        init.uniform_(self.fst_u_embeddings.weight.data, -initrange, initrange)
    if self.snd:
        self.snd_u_embeddings = new_embedding()
        init.uniform_(self.snd_u_embeddings.weight.data, -initrange, initrange)
        self.snd_v_embeddings = new_embedding()
        init.constant_(self.snd_v_embeddings.weight.data, 0)

    # lookup_table is used for fast sigmoid computing; the two end entries
    # are pinned to exactly 0 and 1.
    sigmoid_grid = torch.arange(-6.01, 6.01, 0.01)
    self.lookup_table = torch.sigmoid(sigmoid_grid)
    self.lookup_table[0] = 0.
    self.lookup_table[-1] = 1.
    if self.record_loss:
        self.logsigmoid_table = torch.log(torch.sigmoid(sigmoid_grid))
        self.loss_fst = []
        self.loss_snd = []

    # indexes to select positive/negative node pairs from batch_walks
    self.index_emb_negu, self.index_emb_negv = init_emb2neg_index(
        self.negative, self.batch_size)

    # per-node optimizer state (labelled "adam" in the original code)
    if self.fst:
        self.fst_state_sum_u = torch.zeros(self.emb_size)
    if self.snd:
        self.snd_state_sum_u = torch.zeros(self.emb_size)
        self.snd_state_sum_v = torch.zeros(self.emb_size)
def init_lstm_(lstm, init_weight=0.1):
    """
    Initializes the layer-0 weights of an LSTM in place.

    Weights and the input-hidden bias are drawn from
    uniform(-init_weight, init_weight); the hidden-hidden bias is zeroed.
    For a bidirectional LSTM the reverse direction gets the same treatment.
    Only the ``*_l0`` tensors are touched.

    :param lstm: instance of torch.nn.LSTM
    :param init_weight: range for the uniform initializer
    """
    def init_direction(suffix):
        # PyTorch LSTM has two biases, one for the input-hidden GEMM and one
        # for the hidden-hidden GEMM; only the former is randomised.
        for prefix in ('weight_hh_l0', 'weight_ih_l0', 'bias_ih_l0'):
            tensor = getattr(lstm, prefix + suffix).data
            init.uniform_(tensor, -init_weight, init_weight)
        init.zeros_(getattr(lstm, 'bias_hh_l0' + suffix).data)

    init_direction('')
    if lstm.bidirectional:
        init_direction('_reverse')
def std_uniform_init(W, hidden_size):
    """Fill ``W`` in place, uniformly in ``+-1/sqrt(hidden_size)``, and return it."""
    limit = 1.0 / math.sqrt(hidden_size)
    filled = init.uniform_(W, -limit, limit)
    return filled
def __init__(self, args, batchNorm=True, div_flow=20):
    """Build the FlowNetC layers and initialise all conv/deconv weights.

    Args:
        args: options object; only ``args.fp16`` is read here.
        batchNorm: forwarded to every ``conv(...)`` helper call.
        div_flow: stored on ``self.div_flow``.
    """
    super(FlowNetC, self).__init__()

    self.batchNorm = batchNorm
    self.div_flow = div_flow
    use_bn = self.batchNorm

    def init_conv(layer):
        # Bias (when present) is drawn first, then the Xavier weights.
        if layer.bias is not None:
            init.uniform_(layer.bias)
        init.xavier_uniform_(layer.weight)

    def flow_upsampler():
        return nn.ConvTranspose2d(2, 2, 4, 2, 1, bias=True)

    self.conv1 = conv(use_bn, 3, 64, kernel_size=7, stride=2)
    self.conv2 = conv(use_bn, 64, 128, kernel_size=5, stride=2)
    self.conv3 = conv(use_bn, 128, 256, kernel_size=5, stride=2)
    self.conv_redir = conv(use_bn, 256, 32, kernel_size=1, stride=1)

    # In fp16 mode the correlation layer is wrapped with tofp32/tofp16.
    corr_options = dict(pad_size=20, kernel_size=1, max_displacement=20,
                        stride1=1, stride2=2, corr_multiply=1)
    if args.fp16:
        self.corr = nn.Sequential(
            tofp32(),
            Correlation(**corr_options),
            tofp16())
    else:
        self.corr = Correlation(**corr_options)

    self.corr_activation = nn.LeakyReLU(0.1, inplace=True)
    self.conv3_1 = conv(use_bn, 473, 256)
    self.conv4 = conv(use_bn, 256, 512, stride=2)
    self.conv4_1 = conv(use_bn, 512, 512)
    self.conv5 = conv(use_bn, 512, 512, stride=2)
    self.conv5_1 = conv(use_bn, 512, 512)
    self.conv6 = conv(use_bn, 512, 1024, stride=2)
    self.conv6_1 = conv(use_bn, 1024, 1024)

    self.deconv5 = deconv(1024, 512)
    self.deconv4 = deconv(1026, 256)
    self.deconv3 = deconv(770, 128)
    self.deconv2 = deconv(386, 64)

    self.predict_flow6 = predict_flow(1024)
    self.predict_flow5 = predict_flow(1026)
    self.predict_flow4 = predict_flow(770)
    self.predict_flow3 = predict_flow(386)
    self.predict_flow2 = predict_flow(194)

    self.upsampled_flow6_to_5 = flow_upsampler()
    self.upsampled_flow5_to_4 = flow_upsampler()
    self.upsampled_flow4_to_3 = flow_upsampler()
    self.upsampled_flow3_to_2 = flow_upsampler()

    for module in self.modules():
        if isinstance(module, nn.Conv2d):
            init_conv(module)
        if isinstance(module, nn.ConvTranspose2d):
            init_conv(module)

    # Created after the init loop; nn.Upsample has no weights to initialise.
    self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')
def init(self):
    """Fill the relation, entity and type embedding tables uniformly.

    Every table is drawn from ``[-b, b]`` with
    ``b = sqrt(1 / self.embedding_size)``.
    """
    limit = math.sqrt(1.0 / self.embedding_size)
    for attr in ("emb_relations", "emb_entities", "emb_types"):
        table = getattr(self, attr)
        uniform_(table.weight.data, -limit, limit)
def reset_parameters(self):
    """Fill ``self.initial_param`` in place, uniformly in ``+-sqrt(0.5)``.

    The tensor stays on whatever device it already lives on. The earlier
    trailing ``.cuda()`` only produced a discarded GPU copy and raised on
    machines without CUDA, so it was removed.
    """
    limit = math.sqrt(0.5)
    init.uniform_(self.initial_param, -limit, limit)
def reset_parameters(self):
    """Kaiming-uniform weights; bias uniform in ``[-0.01, 0.01]``.

    Unlike some sibling initialisers, the bias is not checked for ``None``.
    """
    negative_slope = math.sqrt(5)
    init.kaiming_uniform_(self.weight, a=negative_slope)
    init.uniform_(self.bias, -0.01, 0.01)
def reset_parameters(self, dim):
    """Fill ``w``, ``u`` and ``b`` uniformly in ``[-sqrt(1/dim), sqrt(1/dim)]``."""
    limit = math.sqrt(1 / dim)
    for attr in ("w", "u", "b"):
        init.uniform_(getattr(self, attr), -limit, limit)
def __init__(self, input_dim, n_hidden, dropout, side_dim, attention_type):
    """Create a single-layer, single-hop pointer LSTM with optional side attention.

    Args:
        input_dim: size of the vectors fed to the LSTM, of ``_init_i`` and
            of the ``_stop`` vector.
        n_hidden: LSTM hidden size and width of every attention projection.
        dropout: dropout value forwarded to ``nn.LSTM``.
        side_dim: row count of ``side_wm``; only used for ``'seneca'``.
        attention_type: one of ``'seneca'``, ``'bidaf'``, ``'mask'``. Only
            ``'seneca'`` creates extra parameters in this constructor.
    """
    # attention type: seneca, bidaf, mask
    assert attention_type in ['seneca', 'bidaf', 'mask']
    n_layer = 1
    n_hop = 1
    super().__init__()

    self._init_h = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    init.uniform_(self._init_h, -INI, INI)
    self._init_c = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    init.uniform_(self._init_c, -INI, INI)
    self._init_i = nn.Parameter(torch.Tensor(input_dim))
    init.uniform_(self._init_i, -0.1, 0.1)

    self._lstm = nn.LSTM(
        input_dim,
        n_hidden,
        n_layer,
        bidirectional=False,
        dropout=dropout,
    )
    self._lstm_cell = None

    # attention parameters
    self._attn_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    init.xavier_normal_(self._attn_wm)
    self._attn_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    init.xavier_normal_(self._attn_wq)
    self._attn_v = nn.Parameter(torch.Tensor(n_hidden))
    init.uniform_(self._attn_v, -INI, INI)

    # hop parameters
    self._hop_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    init.xavier_normal_(self._hop_wm)
    self._hop_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    init.xavier_normal_(self._hop_wq)
    self._hop_v = nn.Parameter(torch.Tensor(n_hidden))
    init.uniform_(self._hop_v, -INI, INI)
    self._n_hop = n_hop

    # side info attention
    if attention_type == 'seneca':
        self.side_wm = nn.Parameter(torch.Tensor(side_dim, n_hidden))
        init.xavier_normal_(self.side_wm)
        self.side_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        init.xavier_normal_(self.side_wq)
        self.side_v = nn.Parameter(torch.Tensor(n_hidden))
        init.uniform_(self.side_v, -INI, INI)
        self._attn_ws = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        init.xavier_normal_(self._attn_ws)

    # The pad entity now lives in the graph encoder, so no pad parameter
    # is created here.

    # stop token
    self._stop = nn.Parameter(torch.Tensor(input_dim))
    init.uniform_(self._stop, -INI, INI)
def reset_parameters(self):
    """Kaiming-uniform weights; bias (if any) uniform in ``+-1/sqrt(self.C)``."""
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is None:
        return
    limit = 1 / math.sqrt(self.C)
    init.uniform_(self.bias, -limit, limit)
def reset_parameters(self):
    """Draw non-negative weights and a symmetric bias.

    With ``n = self.weight.shape[0]``: weights are uniform in ``[0, 1/n]``
    and the bias (if any) is uniform in ``[-1/sqrt(n), 1/sqrt(n)]``.
    """
    n_rows = self.weight.shape[0]
    init.uniform_(self.weight, 0, 1 / n_rows)
    if self.bias is None:
        return
    bias_limit = 1 / math.sqrt(n_rows)
    init.uniform_(self.bias, -bias_limit, bias_limit)
def uniform(shape, low=-0.1, high=0.1):
    """Return a new tensor of ``shape`` drawn uniformly from ``[low, high]``."""
    # uniform_ fills in place and hands back the same tensor.
    return uniform_(torch.empty(shape), a=low, b=high)
def reset_parameters(self):
    """Reset affine parameters and statistics in place.

    ``weight`` is drawn from U[0, 1), ``bias`` and ``mean`` are zeroed and
    ``var`` is set to one.
    """
    init.uniform_(self.weight)  # default range of uniform_ is [0, 1)
    init.zeros_(self.bias)
    running_mean, running_var = self.mean, self.var
    running_mean.zero_()
    running_var.fill_(1)
def init_param(self, param):
    """Initialise ``param`` in place based on its rank.

    Tensors with fewer than two dimensions are drawn from U[0, 1); all
    others get Xavier-uniform initialisation.
    """
    if param.dim() >= 2:
        init.xavier_uniform_(param)
    else:
        init.uniform_(param)
def __init__(self, in_channel=1, out_channel=(32, 64, 128, 256),
             dropout_prob=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6)):
    """Build the conv and fully connected stages and initialise weights.

    Args:
        in_channel: channel count of the input image.
        out_channel: output channels of the four conv stages (indexed 0-3).
        dropout_prob: dropout probabilities of the six stages (indexed 0-5).

    The sequence defaults are tuples rather than lists so that the default
    objects cannot be mutated across calls; they are only ever indexed.
    """
    super(NaimishNet, self).__init__()

    # Architecture summary
    # Stages 1 to 5 use ELU; stage 6 (fc2) uses Tanh.
    # Dropout increases in steps of 0.1 from 0.1 to 0.6 across stages 1-6
    # with the default dropout_prob.
    #
    # NOTE(review): the shape notes below assume a (1, 224, 224) input and
    # that forward() applies self.pool after each conv stage; forward is
    # not visible here -- confirm.
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    # LAYER 1
    # IN (1, 224, 224) -> conv1 (32, 221, 221) -> maxpool (32, 110, 110)
    self.conv1 = nn.Sequential(
        OrderedDict([('conv1', nn.Conv2d(in_channel, out_channel[0], 4)),
                     ('elu_1', nn.ELU()),
                     ('bn1', nn.BatchNorm2d(out_channel[0])),
                     ('dropout_1', nn.Dropout2d(dropout_prob[0]))]))

    # LAYER 2
    # IN (32, 110, 110) -> conv2 (64, 108, 108) -> maxpool (64, 54, 54)
    self.conv2 = nn.Sequential(
        OrderedDict([('conv2', nn.Conv2d(out_channel[0], out_channel[1], 3)),
                     ('elu_2', nn.ELU()),
                     ('bn2', nn.BatchNorm2d(out_channel[1])),
                     ('dropout_2', nn.Dropout2d(dropout_prob[1]))]))

    # LAYER 3
    # IN (64, 54, 54) -> conv3 (128, 53, 53) -> maxpool (128, 26, 26)
    self.conv3 = nn.Sequential(
        OrderedDict([('conv3', nn.Conv2d(out_channel[1], out_channel[2], 2)),
                     ('elu_3', nn.ELU()),
                     ('bn3', nn.BatchNorm2d(out_channel[2])),
                     ('dropout_3', nn.Dropout2d(dropout_prob[2]))]))

    # LAYER 4
    # IN (128, 26, 26) -> conv4 (256, 26, 26) -> maxpool (256, 13, 13)
    self.conv4 = nn.Sequential(
        OrderedDict([('conv4', nn.Conv2d(out_channel[2], out_channel[3], 1)),
                     ('elu_4', nn.ELU()),
                     ('bn4', nn.BatchNorm2d(out_channel[3])),
                     ('dropout_4', nn.Dropout2d(dropout_prob[3]))]))

    # LAYER 5
    # IN (256, 13, 13), flattened to 256 * 13 * 13 features.
    # NOTE(review): in_features hard-codes 256 channels, so a custom
    # out_channel[3] other than 256 would not match -- confirm intent.
    # NOTE(review): Dropout2d is kept on the fully connected stages to
    # preserve existing behaviour; plain nn.Dropout may have been intended.
    self.fc1 = nn.Sequential(
        OrderedDict([('fc1', nn.Linear(in_features=13 * 13 * 256,
                                       out_features=1000)),
                     ('elu_5', nn.ELU()),
                     ('bn5', nn.BatchNorm1d(1000)),
                     ('dropout_5', nn.Dropout2d(dropout_prob[4]))]))

    # LAYER 6
    self.fc2 = nn.Sequential(
        OrderedDict([('fc2', nn.Linear(in_features=1000, out_features=500)),
                     ('tanh_6', nn.Tanh()),
                     ('bn6', nn.BatchNorm1d(500)),
                     ('dropout_6', nn.Dropout2d(dropout_prob[5]))]))

    # LAYER 7
    # OUT: 136 values (facial keypoint (x, y) coordinates per the original
    # comment).
    self.fc3 = nn.Linear(in_features=500, out_features=136)

    # Custom weights initialization: conv weights from U[0, 1], linear
    # weights Xavier-uniform. Biases keep their default initialisation.
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            m.weight = I.uniform_(m.weight, a=0.0, b=1.0)
        elif isinstance(m, nn.Linear):
            m.weight = I.xavier_uniform_(m.weight, gain=1)
def init_weights(self):
    """Initialise the layer-0 LSTM weights and mark them trainable.

    Input-hidden weights are uniform in ``[-0.01, 0.01]``; hidden-hidden
    weights are orthogonal. Biases are left as they are.
    """
    rnn = self.lstm
    init.uniform_(rnn.weight_ih_l0, a=-0.01, b=0.01)
    init.orthogonal_(rnn.weight_hh_l0)
    for name in ("weight_ih_l0", "weight_hh_l0"):
        getattr(rnn, name).requires_grad = True
) lin_nn_model = nn.Sequential( nn.Linear(d, d_1, bias=False), nn.Linear(d_1, d_2, bias=False) ) ReLU_model = nn.Sequential( nn.Linear(d, d_1), nn.ReLU(), nn.Linear(d_1, d_2) ) loss = nn.MSELoss() iter = lin_nn_model.parameters() w1 = next(iter) w2 = next(iter) init.uniform_(w1, a=0, b=0.01) init.constant_(w2, w1.norm() / 10) # This is definitely true! Compute the gradient! learning_rate = 0.01 time_range = range(2000) for i in range(1): x = data[i, :, :-1] y = data[i, :, -1].unsqueeze(1) r1, r2, r3 = [], [], [] for t in time_range: y_lin_pred = lin_model(x) lin_risk = loss(y_lin_pred, y) y_lin_nn_pred = lin_nn_model(x) lin_nn_risk = loss(y_lin_nn_pred, y) y_ReLU_pred = ReLU_model(x) ReLU_risk = loss(y_ReLU_pred, y)