def __init__(self, tag_vocabs, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
             after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None):
    super().__init__()
    self.embed = embed
    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    # one output projection and one CRF per tag vocabulary (multi-task heads)
    self.tag_vocabs = []
    self.out_fcs = nn.ModuleList()
    self.crfs = nn.ModuleList()
    for i in range(len(tag_vocabs)):
        self.tag_vocabs.append(tag_vocabs[i])
        out_fc = nn.Linear(1536, len(tag_vocabs[i]))  # input feature size hard-coded to 1536
        self.out_fcs.append(out_fc)
        trans = allowed_transitions(
            tag_vocabs[i], encoding_type='bioes', include_start_end=True)
        crf = ConditionalRandomField(
            len(tag_vocabs[i]), include_start_end_trans=True, allowed_transitions=trans)
        self.crfs.append(crf)

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)
    self.fc_dropout = nn.Dropout(fc_dropout)
def __init__(self, config, data_bundle, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
             after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None):
    """
    :param tag_vocab: fastNLP Vocabulary
    :param embed: fastNLP TokenEmbedding
    :param num_layers: number of self-attention layers
    :param d_model: input size
    :param n_head: number of heads
    :param feedforward_dim: the dimension of ffn
    :param dropout: dropout in self-attention
    :param after_norm: where normalization is applied
    :param attn_type: one of adatrans, naive
    :param rel_pos_embed: type of position embedding; supports sin, fix, None; may be None for relative attention
    :param bi_embed: used in the Chinese scenario
    :param fc_dropout: dropout rate before the fc layer
    """
    super().__init__()
    self.config = config
    self.data_bundle = data_bundle
    tag_vocab = data_bundle.get_vocab('target')

    self.embed = embed
    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)
    self.fc_dropout = nn.Dropout(fc_dropout)
    self.out_fc = nn.Linear(d_model, len(tag_vocab))

    trans = allowed_transitions(tag_vocab, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)
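# The constructors above only wire modules together; below is a minimal,
# self-contained sketch (an assumption, not code from the source) of the forward
# data flow they imply: token embeddings are projected to d_model by in_fc,
# contextualized by a Transformer encoder, then dropped out and mapped to
# per-tag scores that the CRF would consume. torch.nn's stock TransformerEncoder
# stands in for the custom one, and argmax stands in for CRF decoding; all sizes
# are illustrative.
import torch
import torch.nn as nn

class SketchTagger(nn.Module):
    def __init__(self, vocab_size=100, embed_size=64, d_model=128, n_head=4,
                 num_layers=2, feedforward_dim=256, num_tags=9, fc_dropout=0.3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.in_fc = nn.Linear(embed_size, d_model)
        layer = nn.TransformerEncoderLayer(d_model, n_head, feedforward_dim,
                                           batch_first=True)
        self.transformer = nn.TransformerEncoder(layer, num_layers)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.out_fc = nn.Linear(d_model, num_tags)

    def forward(self, tokens, pad_mask):
        x = self.in_fc(self.embed(tokens))                      # [B, T, d_model]
        x = self.transformer(x, src_key_padding_mask=~pad_mask)
        logits = self.out_fc(self.fc_dropout(x))                # [B, T, num_tags]
        return logits.argmax(dim=-1)                            # stand-in for CRF decoding

tokens = torch.randint(0, 100, (2, 7))
pad_mask = torch.ones(2, 7, dtype=torch.bool)   # True marks real (non-pad) tokens
print(SketchTagger()(tokens, pad_mask).shape)   # torch.Size([2, 7])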
def get_network(self):
    return TransformerEncoder(embed_dim=self.embed_dim,
                              num_heads=self.num_heads,
                              layers=self.layers,
                              attn_dropout=self.attn_dropout,
                              relu_dropout=self.relu_dropout,
                              res_dropout=self.res_dropout,
                              attn_mask=self.attn_mask)
def get_encoder_network(self):
    return TransformerEncoder(embed_dim=self.orig_d_l,
                              num_heads=self.num_heads,
                              layers=self.layers,
                              attn_dropout=self.attn_dropout,
                              relu_dropout=self.relu_dropout,
                              res_dropout=self.res_dropout,
                              attn_mask=self.attn_mask,
                              crossmodal=self.crossmodal)
def __init__(
    self,
    src_embedding,
    tgt_embedding,
    embedding_size,
    hidden_size,
    vocab_size,
    start_index,
    end_index,
    padding_index,
    num_heads,
    num_layers=2,
    dropout=0.2,
    learning_position_embedding=False,
    embedding_scale=False,
    num_positions=1024,
):
    super().__init__()
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.start_index = start_index
    self.end_index = end_index
    self.padding_index = padding_index
    self.dropout = dropout
    self.num_layers = num_layers

    self.encoder = TransformerEncoder(
        num_heads, num_layers, embedding_size, src_embedding, hidden_size,
        dropout=dropout,
        learn_position_embedding=learning_position_embedding,
        embedding_scale=embedding_scale,
        num_positions=num_positions)

    output_layer = nn.Sequential(
        nn.LayerNorm(embedding_size),
        nn.Linear(embedding_size, vocab_size))

    self.decoder = TransformerDecoder(
        num_heads, num_layers, embedding_size, hidden_size, tgt_embedding,
        start_index, end_index, output_layer,
        dropout=dropout,
        embedding_scale=embedding_scale,
        learn_positional_embedding=learning_position_embedding,
        num_positions=num_positions)
def get_transformer(self, layers=-1):
    embed_dim, attn_dropout = self.input_feat_dim, self.attn_dropout
    return TransformerEncoder(embed_dim=embed_dim,
                              num_heads=self.num_heads,
                              layers=max(self.layers, layers),
                              attn_dropout=attn_dropout,
                              relu_dropout=self.relu_dropout,
                              res_dropout=self.res_dropout,
                              embed_dropout=self.embed_dropout,
                              attn_mask=self.attn_mask)
def __init__(self, embed_dim, num_heads=4, layers=1):
    super(Transformer, self).__init__()
    self.num_heads = num_heads
    self.attn_dropout = 0.1
    self.relu_dropout = 0.1
    self.res_dropout = 0.1
    self.embed_dropout = 0.1
    self.attn_mask = False
    self.layers = layers
    self.transformer = TransformerEncoder(embed_dim=embed_dim,
                                          num_heads=self.num_heads,
                                          layers=self.layers,
                                          attn_dropout=self.attn_dropout,
                                          relu_dropout=self.relu_dropout,
                                          res_dropout=self.res_dropout,
                                          embed_dropout=self.embed_dropout,
                                          attn_mask=self.attn_mask)
def get_network(self, self_type='l', layers=-1):
    if self_type in ['l', 'vl']:
        embed_dim, attn_dropout = self.d_l, self.attn_dropout
    elif self_type in ['v', 'lv']:
        embed_dim, attn_dropout = self.d_v, self.attn_dropout_v
    elif self_type == 'l_mem':
        embed_dim, attn_dropout = self.d_l, self.attn_dropout
    elif self_type == 'v_mem':
        embed_dim, attn_dropout = self.d_v, self.attn_dropout
    else:
        raise ValueError("Unknown network type")
    return TransformerEncoder(embed_dim=embed_dim,
                              num_heads=self.num_heads,
                              layers=max(self.layers, layers),
                              attn_dropout=attn_dropout,
                              relu_dropout=self.relu_dropout,
                              res_dropout=self.res_dropout,
                              embed_dropout=self.embed_dropout,
                              attn_mask=self.attn_mask)
def __init__(self,
             emb_size: int,
             n_hidden: int,
             ff_size: int,
             n_head: int,
             n_block: int,
             dropout: float,
             beam_size: int,
             max_decoding_step: int,
             minimum_length: int,
             label_smoothing: float,
             share: bool,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.vocab = vocab
    self.vocab_size = vocab.get_vocab_size('tokens')
    self.beam_size = beam_size
    self.max_decoding_step = max_decoding_step
    self.minimum_length = minimum_length
    self.label_smoothing = label_smoothing
    self._bos = self.vocab.get_token_index(START_SYMBOL)
    self._eos = self.vocab.get_token_index(END_SYMBOL)

    if share:
        self.src_embedding = nn.Sequential(
            Embeddings(emb_size, self.vocab_size),
            PositionalEncoding(n_hidden, dropout))
        self.tgt_embedding = self.src_embedding
    else:
        src_vocab_size = vocab.get_vocab_size('src_tokens')
        self.src_embedding = nn.Sequential(
            Embeddings(emb_size, src_vocab_size),
            PositionalEncoding(n_hidden, dropout))
        self.tgt_embedding = nn.Sequential(
            Embeddings(emb_size, self.vocab_size),
            PositionalEncoding(n_hidden, dropout))

    self.encoder = TransformerEncoder(n_hidden, ff_size, n_head, dropout, n_block)
    self.decoder = TransformerDecoder(n_hidden, ff_size, n_head, dropout, n_block)
    self.generator = nn.Linear(n_hidden, self.vocab_size)
    self.accuracy = SequenceAccuracy()
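# A minimal sketch (not from the source) of the `share` branch above: with a
# shared vocabulary, the very same embedding module object is reused for the
# source and target sides, so their weights are tied; otherwise two independent
# embedding tables are created. All sizes are illustrative.
import torch.nn as nn

def build_embeddings(share, emb_size=32, vocab_size=50, src_vocab_size=40):
    if share:
        src_embedding = nn.Embedding(vocab_size, emb_size)
        tgt_embedding = src_embedding              # same object -> tied parameters
    else:
        src_embedding = nn.Embedding(src_vocab_size, emb_size)
        tgt_embedding = nn.Embedding(vocab_size, emb_size)
    return src_embedding, tgt_embedding

src_emb, tgt_emb = build_embeddings(share=True)
assert src_emb.weight is tgt_emb.weight            # one parameter tensor is shared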
def __init__(self, cfgs):
    super().__init__()
    self.cfgs = cfgs
    feature_enc_layers = eval(cfgs.conv_feature_layers)
    self.embed = feature_enc_layers[-1][0]

    self.feature_extractor = ConvFeatureExtraction(
        conv_layers=feature_enc_layers,
        in_d=1 if cfgs.dataset == 'mitbih' else 12,
        dropout=0.0,
        mode=cfgs.extractor_mode,
        conv_bias=cfgs.conv_bias)

    self.post_extract_proj = (nn.Linear(self.embed, cfgs.embed_dim)
                              if self.embed != cfgs.embed_dim and not cfgs.quantize_input
                              else None)

    self.mask_prob = cfgs.mask_prob
    self.mask_selection = cfgs.mask_selection
    self.mask_other = cfgs.mask_other
    self.mask_length = cfgs.mask_length
    self.no_mask_overlap = cfgs.no_mask_overlap
    self.mask_min_space = cfgs.mask_min_space

    self.mask_channel_prob = cfgs.mask_channel_prob
    self.mask_channel_selection = cfgs.mask_channel_selection
    self.mask_channel_other = cfgs.mask_channel_other
    self.mask_channel_length = cfgs.mask_channel_length
    self.no_mask_channel_overlap = cfgs.no_mask_channel_overlap
    self.mask_channel_min_space = cfgs.mask_channel_min_space  # XXX

    self.dropout_input = nn.Dropout(cfgs.dropout_input)
    self.dropout_features = nn.Dropout(cfgs.dropout_features)

    self.quantizer = None
    self.input_quantizer = None

    self.n_negatives = cfgs.num_negatives
    self.cross_sample_negatives = cfgs.cross_sample_negatives
    self.codebook_negatives = cfgs.codebook_negatives
    self.negatives_from_everywhere = cfgs.negatives_from_everywhere

    self.logit_temp = cfgs.logit_temp
    self.feature_grad_mult = cfgs.feature_grad_mult

    final_dim = cfgs.final_dim if cfgs.final_dim > 0 else cfgs.embed_dim
    if cfgs.quantize_targets:
        vq_dim = cfgs.latent_dim if cfgs.latent_dim > 0 else final_dim
        self.quantizer = GumbelVectorQuantizer(
            dim=self.embed,
            num_vars=cfgs.latent_vars,
            temp=cfgs.latent_temp,
            groups=cfgs.latent_groups,
            combine_groups=False,
            vq_dim=vq_dim,
            time_first=True,
        )
        self.project_q = nn.Linear(vq_dim, final_dim)
    else:
        self.project_q = nn.Linear(self.embed, final_dim)

    if cfgs.quantize_input:
        if cfgs.same_quantizer and self.quantizer is not None:
            vq_dim = final_dim
            self.input_quantizer = self.quantizer
        else:
            vq_dim = cfgs.latent_dim if cfgs.latent_dim > 0 else cfgs.embed_dim
            self.input_quantizer = GumbelVectorQuantizer(
                dim=self.embed,
                num_vars=cfgs.latent_vars,
                temp=cfgs.latent_temp,
                groups=cfgs.latent_groups,
                combine_groups=False,
                vq_dim=vq_dim,
                time_first=True,
            )
        # project quantized inputs back to the encoder dimension
        self.project_inp = nn.Linear(vq_dim, cfgs.embed_dim)

    self.mask_emb = nn.Parameter(torch.FloatTensor(cfgs.embed_dim).uniform_())

    self.encoder = TransformerEncoder(cfgs)
    self.layer_norm = LayerNorm(self.embed)

    self.target_glu = None
    if cfgs.target_glu:
        self.target_glu = nn.Sequential(
            nn.Linear(final_dim, final_dim * 2),
            nn.GLU())

    self.final_proj = nn.Linear(cfgs.embed_dim, final_dim)
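# A minimal sketch (not from the source) of how a learned mask embedding such as
# `self.mask_emb` above is typically used: feature vectors at masked timesteps
# are replaced by the single learned vector before the features are fed to the
# Transformer encoder. Shapes and the masking rate are illustrative.
import torch
import torch.nn as nn

embed_dim = 16
mask_emb = nn.Parameter(torch.FloatTensor(embed_dim).uniform_())

features = torch.randn(2, 10, embed_dim)      # [batch, time, embed_dim]
mask_indices = torch.rand(2, 10) < 0.2        # True where a timestep is masked
features[mask_indices] = mask_emb             # broadcast the learned vector there
print(features.shape)                         # torch.Size([2, 10, 16])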
def __init__(self, tag_vocab, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
             after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None,
             use_knowledge=False, feature2count=None, vocab_size=None, feature_vocab_size=None,
             kv_attn_type="dot", memory_dropout=0.2, fusion_dropout=0.2, fusion_type='concat',
             highway_layer=0, key_embed_dropout=0.2, knowledge_type="all",
             use_zen=False, zen_model=None):
    """
    :param tag_vocab: fastNLP Vocabulary
    :param embed: fastNLP TokenEmbedding
    :param num_layers: number of self-attention layers
    :param d_model: input size
    :param n_head: number of heads
    :param feedforward_dim: the dimension of ffn
    :param dropout: dropout in self-attention
    :param after_norm: where normalization is applied
    :param attn_type: one of adatrans, naive
    :param rel_pos_embed: type of position embedding; supports sin, fix, None; may be None for relative attention
    :param bi_embed: used in the Chinese scenario
    :param fc_dropout: dropout rate before the fc layer
    :param use_knowledge: whether to use knowledge from Stanford CoreNLP
    :param feature2count: dict of the form {"gram2count": dict, "pos_tag2count": dict,
        "chunk_tag2count": dict, "dep_tag2count": dict}
    """
    super().__init__()
    self.use_knowledge = use_knowledge
    self.feature2count = feature2count
    self.vocab_size = vocab_size
    self.feature_vocab_size = feature_vocab_size

    # add ZEN
    self.use_zen = use_zen

    self.embed = embed
    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)

    self.kv_memory = KeyValueMemoryNetwork(vocab_size=vocab_size,
                                           feature_vocab_size=feature_vocab_size,
                                           attn_type=kv_attn_type,
                                           emb_size=d_model,
                                           scaled=True,
                                           key_embed_dropout=key_embed_dropout,
                                           knowledge_type=knowledge_type)

    self.output_dim = d_model * _dim_map[fusion_type]
    self.fusion = FusionModule(fusion_type=fusion_type,
                               layer=highway_layer,
                               input_size=d_model,
                               output_size=self.output_dim,
                               dropout=fusion_dropout)
    self.memory_dropout = nn.Dropout(p=memory_dropout)

    self.out_fc = nn.Linear(self.output_dim, len(tag_vocab))
    self.fc_dropout = nn.Dropout(fc_dropout)

    trans = allowed_transitions(tag_vocab, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)
def __init__(self, tag_vocab, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
             after_norm=True, attn_type='adatrans', bi_embed=None, fc_dropout=0.3,
             pos_embed=None, scale=False, dropout_attn=None,
             use_knowledge=False, multi_att_dropout=0.3, use_ngram=False, gram2id=None,
             cat_num=5, device=None):
    """
    :param tag_vocab: fastNLP Vocabulary
    :param embed: fastNLP TokenEmbedding
    :param num_layers: number of self-attention layers
    :param d_model: input size
    :param n_head: number of heads
    :param feedforward_dim: the dimension of ffn
    :param dropout: dropout in self-attention
    :param after_norm: where normalization is applied
    :param attn_type: one of adatrans, naive
    :param rel_pos_embed: type of position embedding; supports sin, fix, None; may be None for relative attention
    :param bi_embed: used in the Chinese scenario
    :param fc_dropout: dropout rate before the fc layer
    :param use_knowledge: whether to use knowledge from Stanford CoreNLP
    """
    super().__init__()
    self.use_knowledge = use_knowledge
    self.use_ngram = use_ngram
    self.gram2id = gram2id
    self.embed = embed

    # new add
    self.cat_num = cat_num
    self.use_attention = use_ngram

    embed_size = self.embed.embed_size
    self.bi_embed = None
    if bi_embed is not None:
        self.bi_embed = bi_embed
        embed_size += self.bi_embed.embed_size

    # self.ngram_embeddings = BertWordEmbeddings(hidden_size=embed_size)

    self.in_fc = nn.Linear(embed_size, d_model)
    self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                          after_norm=after_norm, attn_type=attn_type,
                                          scale=scale, dropout_attn=dropout_attn,
                                          pos_embed=pos_embed)
    self.hidden_size = d_model

    if self.use_attention:
        print("use multi_attention")
        self.multi_attention = MultiChannelAttention(len(self.gram2id), self.hidden_size, self.cat_num)
        self.attention_fc = nn.Linear(self.hidden_size * self.cat_num, self.hidden_size, bias=False)
        self.multi_att_dropout = nn.Dropout(multi_att_dropout)
        self.out_fc = nn.Linear(self.hidden_size * 2, len(tag_vocab), bias=False)
        self.gate = GateConcMechanism(hidden_size=self.hidden_size)
        # self.gete_dropout = nn.Dropout(gate_dropout)
    else:
        self.multi_attention = None
        self.out_fc = nn.Linear(self.hidden_size, len(tag_vocab), bias=False)
        # self.out_fc = nn.Linear(d_model, len(tag_vocab))

    # print("len(tag_vocab): ", len(tag_vocab))
    self.fc_dropout = nn.Dropout(fc_dropout)

    trans = allowed_transitions(tag_vocab, include_start_end=True)
    self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True,
                                      allowed_transitions=trans)