def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
             num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
             activation: str = 'relu', custom_encoder: Optional[Any] = None,
             custom_decoder: Optional[Any] = None) -> None:
    super(Transformer, self).__init__()

    if custom_encoder is not None:
        self.encoder = custom_encoder
    else:
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

    if custom_decoder is not None:
        self.decoder = custom_decoder
    else:
        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

    self._reset_parameters()

    self.d_model = d_model
    self.nhead = nhead
def __init__(
    self,
    d_model: int,
    nhead: int,
    d_hid: int,
    dropout=0.1,
    no_residual=False,
):
    super(Extractor, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.cross_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

    self.conv1 = Conv1d(d_model, d_hid, 9, padding=4)
    self.conv2 = Conv1d(d_hid, d_model, 1, padding=0)

    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)

    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)

    self.no_residual = no_residual
def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6,
             dim_feedforward=2048, dropout=0.1, activation="relu",
             custom_encoder=None, custom_decoder=None):
    super(Transformer, self).__init__()

    if custom_encoder is not None:
        self.encoder = custom_encoder
    else:
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

    if custom_decoder is not None:
        self.decoder = custom_decoder
    else:
        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

    self._reset_parameters()

    self.d_model = d_model
    self.nhead = nhead
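# A constructor like the one above mirrors torch.nn.Transformer; a minimal
# usage sketch (assuming the stock torch.nn.Transformer, with the default
# batch_first=False layout of (seq_len, batch, d_model)):
import torch
import torch.nn as nn

model = nn.Transformer(d_model=512, nhead=8,
                       num_encoder_layers=6, num_decoder_layers=6)
src = torch.randn(10, 32, 512)   # source sequence: (S, N, E)
tgt = torch.randn(20, 32, 512)   # target sequence: (T, N, E)
out = model(src, tgt)            # -> (20, 32, 512)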
def __init__(self, config):
    Module.__init__(self)
    self.mlp1_inc = config['n_inc'] + config['e_outc']
    self.mlp1_hs1 = config['node_model_mlp1_hidden_sizes'][0]
    self.mlp1_hs2 = config['node_model_mlp1_hidden_sizes'][1]
    self.mlp2_hs1 = config['node_model_mlp2_hidden_sizes'][0]
    self.dim_out = config['n_outc']
    self.g_inc = config['g_inc']
    self.node_mlp_1 = Seq(Linear(self.mlp1_inc, self.mlp1_hs1),
                          LayerNorm(self.mlp1_hs1),
                          ReLU(),
                          Linear(self.mlp1_hs1, self.mlp1_hs2))
    self.mlp2_inc_uncond = config['n_inc'] + self.mlp1_hs2 + config['u_inc']
    self.mlp2_inc_cond = self.mlp2_inc_uncond + self.mlp1_hs2
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
             attention_dropout=0.1, drop_path_rate=0.1):
    super(TransformerEncoderLayer, self).__init__()
    self.pre_norm = LayerNorm(d_model)
    self.self_attn = Attention(dim=d_model, num_heads=nhead,
                               attention_dropout=attention_dropout,
                               projection_dropout=dropout)

    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout1 = Dropout(dropout)
    self.norm1 = LayerNorm(d_model)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.dropout2 = Dropout(dropout)

    self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else Identity()

    self.activation = F.gelu
def __init__(self, config: FSMTConfig):
    super().__init__()
    self.embed_dim = config.d_model

    self.self_attn = Attention(
        embed_dim=self.embed_dim,
        num_heads=config.decoder_attention_heads,
        dropout=config.attention_dropout,
    )
    self.dropout = config.dropout
    self.activation_fn = ACT2FN[config.activation_function]
    self.activation_dropout = config.activation_dropout

    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.encoder_attn = Attention(
        self.embed_dim,
        config.decoder_attention_heads,
        dropout=config.attention_dropout,
        encoder_decoder_attention=True,
    )
    self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
    self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, input_size, hidden_size, bias=True, forget_bias=0):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.ih = Linear(input_size, 4 * hidden_size, bias=bias)
    self.hh = Linear(hidden_size, 4 * hidden_size, bias=bias)

    if bias:
        self.ih.bias.data.fill_(0)
        self.hh.bias.data.fill_(0)
        # forget bias init
        self.ih.bias.data[hidden_size:hidden_size * 2].fill_(forget_bias)
        self.hh.bias.data[hidden_size:hidden_size * 2].fill_(forget_bias)

    self.ln_cell = LayerNorm(hidden_size)
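# Only the parameters are built above; the recurrent step itself is not shown.
# A plausible sketch of how ln_cell would be applied in a layer-normalized LSTM
# step (an assumption about the forward, using the standard i/f/g/o gate order
# that the forget-bias slice [hidden_size:2*hidden_size] suggests):
import torch

def lstm_step(cell, x, h, c):
    gates = cell.ih(x) + cell.hh(h)          # (batch, 4 * hidden_size)
    i, f, g, o = gates.chunk(4, dim=-1)      # input, forget, cell, output gates
    c_next = torch.sigmoid(f) * c + torch.sigmoid(i) * torch.tanh(g)
    # LayerNorm on the cell state before it drives the hidden state
    h_next = torch.sigmoid(o) * torch.tanh(cell.ln_cell(c_next))
    return h_next, c_next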
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)

    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)

    self.activation = _get_activation_fn(activation)
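# The modules above follow the post-norm wiring of torch.nn.TransformerDecoderLayer.
# A sketch of how they are combined in that reference implementation (masks and
# key-padding arguments omitted; `layer` is an instance of the class above):
def decoder_layer_forward(layer, tgt, memory):
    # self-attention block with residual connection and LayerNorm
    tgt2 = layer.self_attn(tgt, tgt, tgt)[0]
    tgt = layer.norm1(tgt + layer.dropout1(tgt2))
    # encoder-decoder (cross) attention block
    tgt2 = layer.multihead_attn(tgt, memory, memory)[0]
    tgt = layer.norm2(tgt + layer.dropout2(tgt2))
    # position-wise feed-forward block
    tgt2 = layer.linear2(layer.dropout(layer.activation(layer.linear1(tgt))))
    return layer.norm3(tgt + layer.dropout3(tgt2))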
def __init__(self, dim_in, dim_out_local, dim_out_global, dim_local, dim_global,
             dim_hidden=20, dim_pre_aggr=20, n_iter=20, n_out_layers=5,
             global_flow=False, class_weight=torch.tensor([1.0, 1.0, 1.0, 1.0])):
    super(N2JNet, self).__init__()
    self.dim_in = dim_in
    self.dim_out_local = dim_out_local
    self.dim_out_global = dim_out_global
    self.dim_hidden = dim_hidden
    self.dim_local = dim_local
    self.dim_global = dim_global
    self.dim_pre_aggr = dim_pre_aggr
    self.n_iter = n_iter
    self.n_out_layers = n_out_layers
    self.global_flow = global_flow
    self.class_weight = class_weight

    # MLP for initially encoding local
    self.mlp_node_init = Seq(Lin(self.dim_in, self.dim_hidden),
                             ReLU(),
                             Lin(self.dim_hidden, self.dim_hidden),
                             ReLU(),
                             Lin(self.dim_hidden, self.dim_local),
                             LayerNorm(self.dim_local))

    # MLPs for encoding local and global
    meta_layers = ModuleList()
    for i in range(self.n_iter):
        node_model = NodeModel(self.dim_local, self.dim_global, self.dim_hidden)
        global_model = GlobalModel(self.dim_local, self.dim_global, self.dim_hidden,
                                   self.dim_pre_aggr)
        meta = CustomMetaLayer(node_model=node_model, global_model=global_model)
        meta_layers.append(meta)
    self.meta_layers = meta_layers

    # Networks for local and global output
    self.net_out_local = Seq(Lin(self.dim_local, self.dim_hidden),
                             ReLU(),
                             Lin(self.dim_hidden, self.dim_hidden),
                             ReLU(),
                             Lin(self.dim_hidden, self.dim_out_local * 2))
    if self.global_flow:
        self.net_out_global = Flow(*[[MAF(self.dim_global, self.dim_out_global, hidden=dim_hidden),
                                      Perm(self.dim_global)][i % 2]
                                     for i in range(self.n_out_layers * 2 + 1)])
    else:
        self.net_out_global = Seq(Lin(self.dim_global, self.dim_hidden),
                                  ReLU(),
                                  Lin(self.dim_hidden, self.dim_hidden),
                                  ReLU(),
                                  Lin(self.dim_hidden, self.dim_out_global * 2))
def __init__(self, num_skills, state_size, num_heads=2, dropout=0.2, infer=False):
    super(student_model, self).__init__()
    self.infer = infer
    self.num_skills = num_skills
    self.state_size = state_size
    # we use the (num_skills * 2 + 1) as key padding_index
    '''
    Embedding: the second argument is the maximum tensor length, the first
    argument is the number of tensors. If num_embeddings is the dictionary
    size, then maybe it should be of size num_skills? For (10, 3): 10 is the
    number of distinct elements and 3 is the number of dimensions the elements
    are embedded in. Since there are 10 elements, the largest element can be 9,
    so num can be len(ex_id_converter) if indices start from 0.
    state_size is 200 by default; that value could be experimented with.
    '''
    self.embedding = nn.Embedding(
        num_embeddings=num_skills * 2,  # changed from *2+1; this embedding holds the questions and the appended answers, so its length is 2n
        embedding_dim=state_size
    )  # is it correct to put question indices and correct/incorrect answers into the same embedding?
    # padding_idx=num_skills*2
    # self.position_embedding = PositionalEncoding(state_size)
    self.position_embedding = nn.Embedding(
        num_embeddings=opt.max_len,  # max_len is the largest number of exercises this can handle; the -1 was removed
        embedding_dim=state_size)
    # we use the (num_skills + 1) as query padding_index
    self.problem_embedding = nn.Embedding(
        num_embeddings=num_skills,  # the +1 was removed
        embedding_dim=state_size)
    # padding_idx=num_skills)
    self.multi_attn = MultiHeadedAttention(h=num_heads,
                                           d_model=state_size,
                                           dropout=dropout,
                                           infer=self.infer)
    self.feedforward1 = nn.Linear(in_features=state_size, out_features=state_size)
    self.feedforward2 = nn.Linear(in_features=state_size, out_features=state_size)
    self.pred_layer = nn.Linear(in_features=state_size, out_features=num_skills)
    self.dropout = nn.Dropout(dropout)
    self.layernorm = LayerNorm(state_size)
def mlp(f_in, f_out):
    """
    Returns a Multi-Layer Perceptron with ReLU non-linearities, one hidden layer
    per entry of the enclosing ``hidden_layer_sizes``, with f_in input features
    and f_out output features, followed by a LayerNorm on the output.
    """
    layers = []
    f1 = f_in
    for f2 in hidden_layer_sizes:
        layers.append(Linear(f1, f2))
        layers.append(ReLU())
        f1 = f2
    layers.append(Linear(f1, f_out))
    # layers.append(ReLU())
    layers.append(LayerNorm(f_out))
    return Sequential(*layers)
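# A usage sketch for the helper above. Note that hidden_layer_sizes is read
# from the enclosing scope, so it must exist before mlp() is called; the sizes
# below are illustrative only:
import torch

hidden_layer_sizes = [64, 64]       # assumed example value
net = mlp(f_in=16, f_out=8)         # Linear/ReLU stack ending in Linear + LayerNorm
y = net(torch.randn(32, 16))        # -> shape (32, 8)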
def __init__(self, cfg):
    super().__init__()
    # Original BERT Embedding
    # self.tok_embed = nn.Embedding(cfg.vocab_size, cfg.hidden)  # token embedding

    # factorized embedding
    self.tok_embed1 = nn.Embedding(cfg.vocab_size, cfg.embedding_size)
    self.tok_embed2 = nn.Linear(cfg.embedding_size, cfg.hidden_size)

    self.pos_embed = nn.Embedding(cfg.max_position_embeddings, cfg.hidden_size)  # position embedding
    # self.seg_embed = nn.Embedding(cfg.n_segments, cfg.hidden)  # segment(token type) embedding

    self.norm = LayerNorm(cfg.hidden_size)
    # self.drop = nn.Dropout(cfg.classifier_dropout_prob)
    self.pos = None
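# The forward pass is not part of the snippet. A plausible ALBERT-style sketch
# of how the factorized embedding above is typically combined (an assumption,
# not the original author's code):
import torch

def embedding_forward(emb, input_ids):
    seq_len = input_ids.size(1)
    pos = torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
    pos = pos.unsqueeze(0).expand_as(input_ids)       # (batch, seq_len)
    # vocab -> small embedding_size -> hidden_size, then add position embeddings
    e = emb.tok_embed2(emb.tok_embed1(input_ids))
    e = e + emb.pos_embed(pos)
    return emb.norm(e)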
def __init__(self, input_size, key_size, heads):
    super().__init__()
    self.input_size = input_size
    self.key_size = key_size
    self.heads = heads
    # bias is set to False so that zero-padded positions do not become non-zero
    self.q_w = Linear(input_size, key_size * heads, bias=False)
    self.k_w = Linear(input_size, key_size * heads, bias=False)
    self.v_w = Linear(input_size, input_size * heads, bias=False)
    # fuse the information from the multiple heads
    self.linear = Linear(input_size * heads, input_size)
    self.layer_norm = LayerNorm(input_size, eps=1e-6)
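# The forward is not shown. A sketch of how the projections above are commonly
# split into heads (an assumption about a batch-first input of shape
# (batch, seq_len, input_size)):
def project_to_heads(attn, x):
    b, t, _ = x.shape
    q = attn.q_w(x).view(b, t, attn.heads, attn.key_size).transpose(1, 2)
    k = attn.k_w(x).view(b, t, attn.heads, attn.key_size).transpose(1, 2)
    v = attn.v_w(x).view(b, t, attn.heads, attn.input_size).transpose(1, 2)
    return q, k, v   # each (batch, heads, seq_len, head_dim)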
def __init__(self, hidden_size, s_size, r_size, t_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.s_size = s_size
    self.r_size = r_size
    self.t_size = t_size
    self.b_size = 1

    write_size = s_size + r_size + t_size + self.b_size
    self.W_write = nn.Linear(hidden_size, 1 * write_size)

    # read
    self.W_read = nn.Linear(hidden_size, s_size + r_size * 3)
    self.ln_read = LayerNorm(t_size, elementwise_affine=False)

    self.reset_parameters()
def define_weights_and_layers(self):
    gnn_layers = []
    use_rels = self.n_relations
    if self.inverse_edges and self.separate_relation_types_for_inverse:
        use_rels *= 2
    for layer in range(self.n_layers):
        gnn_layers.append(RGCNLayer(self.output_dim, self.output_dim, use_rels))
    gnn_layers = torch.nn.ModuleList(gnn_layers)
    self.gnn_layers = gnn_layers
    self.W_input = torch.nn.Sequential(
        Linear(self.input_dim, self.output_dim),
        LayerNorm(self.output_dim),
        ReLU())
def __init__(self, c_in, num_nodes):
    super(SATT_3, self).__init__()
    self.conv1 = Conv2d(c_in * 12, c_in, kernel_size=(1, 1), padding=(0, 0),
                        stride=(1, 1), bias=False)
    self.conv2 = Conv2d(c_in * 12, c_in, kernel_size=(1, 1), padding=(0, 0),
                        stride=(1, 1), bias=False)
    self.bn = LayerNorm([num_nodes, num_nodes, 4])
    self.c_in = c_in
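# LayerNorm([num_nodes, num_nodes, 4]) normalizes over the last three dimensions
# jointly. A small shape check (the value of num_nodes here is arbitrary):
import torch
from torch.nn import LayerNorm

num_nodes = 170
ln = LayerNorm([num_nodes, num_nodes, 4])
x = torch.randn(8, num_nodes, num_nodes, 4)   # (batch, N, N, 4)
y = ln(x)                                     # same shape, normalized per sample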
def test_layer_norm():
    bert = BertModel.from_pretrained("bert-base-cased").cuda().half()
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    test_text = (
        "Hello. How are you? I am fine thank you and you? yes Good. "
        "hi hi hi hi hi hi hi hi hi hi hi hi hi"  # 32
    )

    tokens = tokenizer(
        [test_text] * 4,
        return_tensors="pt",
    )

    # [bsz, seq_len, d_model]
    embedding_output = (bert.embeddings(
        input_ids=tokens["input_ids"].cuda(),
        position_ids=None,
        token_type_ids=tokens["token_type_ids"].cuda(),
        inputs_embeds=None,
        past_key_values_length=0,
    ).cuda().half())

    fused_layernorm_layer = (MixedFusedLayerNorm(
        normalized_shape=embedding_output.size(-1)).cuda().half())
    torch_layernorm_layer = (LayerNorm(
        normalized_shape=embedding_output.size(-1)).cuda().half())

    fused_output = fused_layernorm_layer(embedding_output)
    torch_output = torch_layernorm_layer(embedding_output)

    test_result = (fused_output - torch_output).abs()
    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)
    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(f"\n[Success] test_layer_norm"
              f"\n > mean_difference={diff}"
              f"\n > fused_values={fused_output[-1][-1][:5].tolist()}"
              f"\n > torch_values={torch_output[-1][-1][:5].tolist()}")
    else:
        print(f"\n[Fail] test_layer_norm"
              f"\n > mean_difference={diff}, "
              f"\n > fused_values={fused_output[-1][-1][:5].tolist()}, "
              f"\n > torch_values={torch_output[-1][-1][:5].tolist()}")
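# A smaller, CPU-only sanity check in the same spirit: nn.LayerNorm against the
# explicit formula (x - mean) / sqrt(var + eps) * weight + bias over the last dim.
import torch
from torch.nn import LayerNorm

x = torch.randn(4, 32, 768)
ln = LayerNorm(768)
manual = (x - x.mean(dim=-1, keepdim=True)) / torch.sqrt(
    x.var(dim=-1, unbiased=False, keepdim=True) + ln.eps)
manual = manual * ln.weight + ln.bias
assert torch.allclose(ln(x), manual, atol=1e-5)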
def __init__(
    self,
    cfg: WavLMConfig,
) -> None:
    super().__init__()
    logger.info(f"WavLM Config: {cfg.__dict__}")

    self.cfg = cfg
    feature_enc_layers = eval(cfg.conv_feature_layers)
    self.embed = feature_enc_layers[-1][0]

    self.feature_extractor = ConvFeatureExtractionModel(
        conv_layers=feature_enc_layers,
        dropout=0.0,
        mode=cfg.extractor_mode,
        conv_bias=cfg.conv_bias,
    )

    self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
                              if self.embed != cfg.encoder_embed_dim else None)

    self.mask_prob = cfg.mask_prob
    self.mask_selection = cfg.mask_selection
    self.mask_other = cfg.mask_other
    self.mask_length = cfg.mask_length
    self.no_mask_overlap = cfg.no_mask_overlap
    self.mask_min_space = cfg.mask_min_space

    self.mask_channel_prob = cfg.mask_channel_prob
    self.mask_channel_selection = cfg.mask_channel_selection
    self.mask_channel_other = cfg.mask_channel_other
    self.mask_channel_length = cfg.mask_channel_length
    self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
    self.mask_channel_min_space = cfg.mask_channel_min_space

    self.dropout_input = nn.Dropout(cfg.dropout_input)
    self.dropout_features = nn.Dropout(cfg.dropout_features)

    self.feature_grad_mult = cfg.feature_grad_mult

    self.mask_emb = nn.Parameter(
        torch.FloatTensor(cfg.encoder_embed_dim).uniform_())

    self.encoder = TransformerEncoder(cfg)
    self.layer_norm = LayerNorm(self.embed)
def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout=0.5):
    super(MLP, self).__init__()
    self.node_encoder = Linear(in_channels, hidden_channels)
    self.layers = torch.nn.ModuleList()
    for i in range(1, num_layers + 1):
        conv = Linear(hidden_channels, hidden_channels)
        norm = LayerNorm(hidden_channels, elementwise_affine=True)
        act = ReLU(inplace=True)
        layer = AdaGNNLayer(conv, norm, act, dropout=dropout, lin=True)
        self.layers.append(layer)
    self.lin = Linear(hidden_channels, out_channels)
    self.currenlayer = 1
    self.layers[0].unfix()
    self.num_layers = num_layers
def __init__(self, config):
    super(BertCRFForAttr, self).__init__(config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.t_lstm = nn.LSTM(input_size=config.hidden_size,
                          hidden_size=config.hidden_size // 2,
                          batch_first=True,
                          bidirectional=True)
    self.a_lstm = nn.LSTM(input_size=config.hidden_size,
                          hidden_size=config.hidden_size // 2,
                          batch_first=True,
                          bidirectional=True)
    self.attention = CosAttention()
    self.ln = LayerNorm(config.hidden_size * 2)
    self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels)
    self.crf = CRF(num_tags=config.num_labels, batch_first=True)
    self.init_weights()
def __init__(self, n_in_channels, n_out_channels, n_blocks, n_init_features,
             growth_rate, drop_rate, kernel_sizes, glu_act, bt_f=None):
    super(DenseDeep1D, self).__init__()
    self.n_blocks = n_blocks
    self.features = torch.nn.Sequential(
        OrderedDict([('conv0',
                      Conv1d(n_in_channels, n_init_features,
                             kernel_size=kernel_sizes['conv0'],
                             padding_mode='zeros',
                             padding=int((kernel_sizes['conv0'] - 1) / 2))),
                     ('norm0', LayerNorm([n_init_features, 107])),
                     ('relu0', ReLU(inplace=True))]))
    for k_block in range(n_blocks):
        if bt_f is None:
            self.features.add_module(
                'block_{}'.format(k_block),
                _DenseConvBlock(n_init_features + k_block * growth_rate,
                                growth_rate=growth_rate,
                                drop_rate=drop_rate,
                                kernel_size=kernel_sizes['blocks'],
                                glu_act=glu_act))
        else:
            self.features.add_module(
                'block_{}'.format(k_block),
                _DenserConvBlock_bottleneck(
                    n_init_features + k_block * growth_rate,
                    growth_rate=growth_rate,
                    drop_rate=drop_rate,
                    kernel_size=kernel_sizes['blocks'],
                    bottle_factor=bt_f))
    self.act_final = ReLU(inplace=True)
    self.regression = Linear(n_init_features + n_blocks * growth_rate,
                             n_out_channels)
def __init__(self, vocab_size, num_classes, embedding_dim, nhead=1, num_encoder_layers=2):
    super().__init__()
    d_model = embedding_dim
    dim_feedforward = 2 * d_model
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward)
    encoder_norm = LayerNorm(d_model)
    self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
    self.linear = nn.Linear(d_model, num_classes)
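# A usage sketch for the encoder stack built above (the enclosing class's forward
# is not shown, so the pieces are driven directly; the sizes and the mean-pooling
# readout are illustrative assumptions):
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, LayerNorm

d_model = 128
embedding = nn.Embedding(1000, d_model)
encoder_layer = TransformerEncoderLayer(d_model, nhead=1, dim_feedforward=2 * d_model)
encoder = TransformerEncoder(encoder_layer, num_layers=2, norm=LayerNorm(d_model))
classifier = nn.Linear(d_model, 5)

tokens = torch.randint(0, 1000, (35, 8))   # (seq_len, batch) token ids
hidden = encoder(embedding(tokens))        # -> (35, 8, 128)
logits = classifier(hidden.mean(dim=0))    # mean-pool over time -> (8, 5)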
def __init__(self):
    super(Model, self).__init__()
    self.hidden_size = model_config.hidden_size
    self.embedding = nn.Embedding(config.num_vocab, config.embed_dim)
    self.bilstm = nn.LSTM(input_size=config.embed_dim,
                          hidden_size=self.hidden_size,
                          batch_first=True,
                          num_layers=2,
                          dropout=model_config.dropout,
                          bidirectional=True)
    # self.dropout = SpatialDropout(drop_p)
    self.dropout = nn.Dropout(model_config.dropout)
    self.layer_norm = LayerNorm(self.hidden_size * 2)
    self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
    self.crf = CRF(tagset_size=config.num_labels,
                   tag_dictionary=config.label2id,
                   is_bert=True)
def __init__(self, in_channels, hidden_channels, out_channels, num_layers, gnn_type='GEN'):
    super(WeakGNN, self).__init__()
    self.node_encoder = Linear(in_channels, hidden_channels)
    self.edge_encoder = Linear(in_channels, hidden_channels)
    self.layers = torch.nn.ModuleList()
    self.gnn_type = gnn_type
    for i in range(1, num_layers + 1):
        if gnn_type == 'GEN':
            conv = GENConv(hidden_channels, hidden_channels, aggr='softmax',
                           t=1.0, learn_t=True, num_layers=1, norm='layer')
        elif gnn_type == 'MLP':
            conv = torch.nn.Linear(hidden_channels, hidden_channels)
        elif gnn_type == 'GCN':
            conv = GCNConv(hidden_channels, hidden_channels)
        elif gnn_type == 'SAGE':
            conv = SAGEConv(hidden_channels, hidden_channels)
        elif gnn_type == 'GAT':
            conv = GATConv(hidden_channels, hidden_channels)
        norm = LayerNorm(hidden_channels, elementwise_affine=True)
        act = ReLU(inplace=True)
        if gnn_type == 'MLP':
            layer = AdaGNNLayer(conv, norm, act, dropout=0.1, ckpt_grad=False, lin=True)
        else:
            layer = AdaGNNLayer(conv, norm, act, dropout=0.1, ckpt_grad=False)
        self.layers.append(layer)
    self.lin = Linear(hidden_channels, out_channels)
def __init__(self, d_hid, d_ff, relu_dropout=0.1, residual_dropout=0.1,
             leaky_relu_slope=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_hid, d_ff)
    self.w_2 = nn.Linear(d_ff, d_hid)
    self.layer_norm = LayerNorm(d_hid, eps=1e-6)
    # The t2t code on github uses relu dropout, even though the transformer
    # paper describes residual dropout only. We implement relu dropout
    # because we always have the option to set it to zero.
    self.relu_dropout = FeatureDropout2(relu_dropout)
    self.residual_dropout = FeatureDropout2(residual_dropout)
    self.relu = nn.ReLU() if leaky_relu_slope == 0.0 else nn.LeakyReLU(leaky_relu_slope)
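# The forward is not included. A plausible pre-norm residual wiring of the
# pieces above (an assumption about how the author combines them):
def ffn_forward(ffn, x):
    residual = x
    x = ffn.layer_norm(x)                       # pre-norm
    x = ffn.relu_dropout(ffn.relu(ffn.w_1(x)))  # expand, non-linearity, relu dropout
    x = ffn.w_2(x)                              # project back to d_hid
    return residual + ffn.residual_dropout(x)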
def __init__(self, cell_class, input_size: int, hidden_size: int,
             input_keep_prob: float = 1.0, recurrent_keep_prob: float = 1.0,
             layer_norm=False):
    super(UniDirLSTMLayer, self).__init__()
    self.forward_layer = DynamicRNN(cell_class(input_size, hidden_size),
                                    input_keep_prob,
                                    recurrent_keep_prob,
                                    go_forward=True)
    self.use_layer_norm = layer_norm
    if layer_norm:
        self.layer_norm = LayerNorm(hidden_size)
    else:
        self.layer_norm = Identity()
def __init__(self, hidden_channels, num_layers):
    super(DeeperGCN, self).__init__()
    self.node_encoder = Linear(data.x.size(-1), hidden_channels)
    self.edge_encoder = Linear(data.edge_attr.size(-1), hidden_channels)
    self.layers = torch.nn.ModuleList()
    for i in range(1, num_layers + 1):
        conv = GENConv(hidden_channels, hidden_channels, aggr='softmax',
                       t=1.0, learn_t=True, num_layers=2, norm='layer',
                       msg_norm=True)
        norm = LayerNorm(hidden_channels, elementwise_affine=True)
        act = ReLU(inplace=True)
        layer = DeepGCNLayer(conv, norm, act, block='res+', dropout=0.1,
                             ckpt_grad=i % 3)
        self.layers.append(layer)
    self.lin = Linear(hidden_channels, data.y.size(-1))
def __init__(self, in_features, hidden_dim, num_heads=1, dropout=0.2,
             edge_encoding=EDGE_ENCODING_TYPE.RELATIVE_POSITION):
    super().__init__()
    self.attention = MultiHeadAttention(in_features=in_features,
                                        hidden_dim=hidden_dim,
                                        num_heads=num_heads,
                                        dropout=dropout,
                                        edge_encoding=edge_encoding)
    self.feed_forward = PositionwiseFeedForward(in_features=in_features,
                                                hidden_dim=hidden_dim,
                                                dropout=dropout)
    self.dropout = Dropout()
    self.layer_norm = LayerNorm(in_features)
def __init__(self, config: ModelConfig, data_config: DataConfig, encoder_embeddings):
    super().__init__()
    self.embeddings = Embedding(
        num_embeddings=data_config.output_translation_vocabulary_sizes[0][0],
        embedding_dim=config.encoder_output_size,
        padding_idx=pad_token_index)
    self.positional_encoding = PositionalEncoding(config.encoder_output_size)
    if config.decoder_translation_scale_embeddings:
        self.embeddings_scale = math.sqrt(float(config.encoder_output_size))
    else:
        self.embeddings_scale = None
    self.dropout = Dropout(config.decoder_translation_transformer_dropout)
    if config.decoder_translation_share_encoder_embeddings:
        assert (self.embeddings.weight.shape
                == encoder_embeddings.get_lut_embeddings().weight.shape)
        self.embeddings.weight = encoder_embeddings.get_lut_embeddings().weight
    self.transformer_layers = ModuleList([
        TransformerDecoderLayer(
            d_model=config.encoder_output_size,
            heads=config.decoder_translation_transformer_heads,
            d_ff=config.decoder_translation_transformer_hidden_size,
            dropout=config.decoder_translation_transformer_dropout,
            attention_dropout=config.decoder_translation_transformer_dropout)
        for _ in range(config.decoder_translation_transformer_layers)
    ])
    self.layer_norm = LayerNorm(config.encoder_output_size, eps=1e-6)
    self.linear: Linear = Linear(
        in_features=config.encoder_output_size,
        out_features=data_config.output_translation_vocabulary_sizes[0][0])
    if config.decoder_translation_share_embeddings:
        self.linear.weight = self.embeddings.weight
    self.linear_features = None
    if data_config.output_translation_features > 1:
        self.linear_features = ModuleList([
            Linear(in_features=config.encoder_output_size,
                   out_features=data_config.output_translation_vocabulary_sizes[0][i])
            for i in range(1, data_config.output_translation_features)
        ])
    self.max_seq_out_len = 150
    self.beam_size = 1
    self.state = {}
def __init__(self, args):
    super(BiLSTMForNer, self).__init__()
    self.embedding_size = args.embedding_size
    self.model_type = args.model_type
    self.embedding = nn.Embedding(args.vocab_size, args.embedding_size)
    self.bilstm = nn.LSTM(input_size=args.embedding_size,
                          hidden_size=args.hidden_size,
                          num_layers=2,
                          batch_first=True,
                          dropout=args.drop_p,
                          bidirectional=True)
    self.dropout = SpatialDropout(args.drop_p)
    self.layer_norm = LayerNorm(args.hidden_size * 2)
    self.classifier = nn.Linear(args.hidden_size * 2, args.num_labels)
    self.use_crf = args.use_crf
    self.loss_type = args.loss_type
    self.num_labels = args.num_labels
    if args.use_crf:
        self.crf = CRF(num_tags=args.num_labels, batch_first=True)
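# The forward/loss computation is not shown. A plausible sketch of how the
# modules above chain together to produce emission scores for the CRF
# (an assumption; masking and loss handling omitted):
def bilstm_ner_features(model, input_ids):
    embs = model.embedding(input_ids)      # (batch, seq_len, embedding_size)
    embs = model.dropout(embs)             # SpatialDropout over the embedding channels
    seq_out, _ = model.bilstm(embs)        # (batch, seq_len, 2 * hidden_size)
    seq_out = model.layer_norm(seq_out)
    return model.classifier(seq_out)       # per-token scores over num_labels tags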