def __init__(self, input_vocab_size, opt_vocab_size, d_model, nhead, num_encoder_layers,
             dim_feedforward, position_embed_size=300, utter_n_layer=2, dropout=0.3,
             sos=0, pad=0, teach_force=1):
    super(Transformer, self).__init__()
    self.d_model = d_model
    self.hidden_size = d_model
    self.embed_src = nn.Embedding(input_vocab_size, d_model)
    # positional encoding; max_len is position_embed_size (300 by default)
    self.pos_enc = PositionEmbedding(d_model, dropout=dropout, max_len=position_embed_size)
    self.input_vocab_size = input_vocab_size
    self.utter_n_layer = utter_n_layer
    self.opt_vocab_size = opt_vocab_size
    self.pad, self.sos = pad, sos
    self.teach_force = teach_force
    encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                               dim_feedforward=dim_feedforward,
                                               dropout=dropout, activation='gelu')
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
    self.decoder = Decoder(d_model, d_model, opt_vocab_size, n_layers=utter_n_layer,
                           dropout=dropout, nhead=nhead)
def __init__(self, device):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    self.bert_model = BertModel.from_pretrained("bert-base-uncased")
    self.bert_feature_dim = self.bert_model.pooler.dense.in_features
    # Construct the Parameter directly on the target device so it stays registered;
    # calling .to(device) on a Parameter returns an unregistered tensor copy.
    self.indicator_vector = torch.nn.Parameter(
        torch.rand(self.bert_feature_dim, dtype=torch.double, device=device),
        requires_grad=True)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.bert_feature_dim, nhead=8)
    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
    self.norm_layer = nn.LayerNorm(self.bert_feature_dim)
    self.projection_layer = nn.Linear(self.bert_feature_dim, 2)
    self.device = device
    # self.loss = nn.NLLLoss(weight=torch.tensor([0.17105, 1]), reduction='mean', ignore_index=-1)
    self.loss = nn.NLLLoss(weight=torch.tensor([1.0, 1.0]), reduction='mean', ignore_index=-1)
    self.lsm = nn.LogSoftmax(dim=2)
def __init__(self, conf):
    super().__init__()
    self.save_hyperparameters(conf)
    self.ke = nn.Embedding(4, 2, max_norm=1.)
    self.conv1 = ConvBlock(3, 256, 13, stride=3, padding=6)
    self.conv2 = ConvBlock(256, 256, 7, stride=1, padding=3)
    self.conv3 = ConvBlock(256, 256, 3, stride=2, padding=1)
    self.pos_encoder = PositionalEncoding(256, self.hparams.dropout)
    encoder_layer = nn.TransformerEncoderLayer(256, self.hparams.nhead, self.hparams.dim_ff,
                                               self.hparams.dropout, activation='gelu')
    self.encoder = nn.TransformerEncoder(encoder_layer, self.hparams.nlayers)
    self.fc1 = nn.Linear(256, 1)
    self.train_acc = Accuracy()
    self.val_acc = Accuracy(compute_on_step=False)
def __init__(self, id2char, model_para):
    super(Transformer, self).__init__()
    self.idim = model_para['encoder']['idim']
    # FIXME: need to remove these hardcoded things later
    self.odim = len(id2char) + 2
    self.sos_id = len(id2char) + 1
    self.eos_id = len(id2char) + 1
    self.blank_id = 0
    self.space_id = -1  # FIXME: what is this
    self.vgg_ch_dim = 128
    self.feat_extractor = nn.Sequential(
        nn.Conv2d(1, 64, 3, stride=1, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3, stride=1, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, stride=2),
        nn.Conv2d(64, 128, 3, stride=1, padding=1),
        nn.ReLU(),
        nn.Conv2d(self.vgg_ch_dim, self.vgg_ch_dim, 3, stride=1, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, stride=2),
    )
    self.vgg_o_dim = self.vgg_ch_dim * floor(self.idim / 4)
    self.vgg2enc = nn.Linear(self.vgg_o_dim, model_para['encoder']['d_model'])
    self.pos_encoder = PositionalEncoding(model_para['encoder']['d_model'],
                                          model_para['encoder']['dropout'])
    encoder_layer = nn.TransformerEncoderLayer(model_para['encoder']['d_model'],    # 512
                                               model_para['encoder']['nhead'],      # 2
                                               model_para['encoder']['dim_inner'],  # 2048
                                               model_para['encoder']['dropout'])    # 0.1
    self.encoder = nn.TransformerEncoder(encoder_layer, model_para['encoder']['nlayers'])
def __init__(self, config: BertConfig):
    super(Bert, self).__init__()
    self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    self.embedding_layer_norm = nn.LayerNorm(config.hidden_size)
    self.embedding_dropout = nn.Dropout(p=config.hidden_dropout_prob)
    self.encoders = nn.TransformerEncoder(
        encoder_layer=nn.TransformerEncoderLayer(
            d_model=config.hidden_size,
            nhead=config.num_attention_heads,
            dim_feedforward=config.intermediate_size,
            dropout=config.attention_probs_dropout_prob,
            activation=config.hidden_act,
        ),
        num_layers=config.num_hidden_layers,
    )
    self.pooler_layer = nn.Linear(config.hidden_size, config.hidden_size)
    self.pooled_output_activate = nn.Tanh()
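# Padding-mask sketch (illustrative and standalone; the pad id 0 and the tiny tensors below are
# assumptions, not taken from the snippet above): nn.TransformerEncoder expects a boolean
# src_key_padding_mask with True at PAD positions, which is the opposite of the 1/0
# attention_mask convention used by HuggingFace-style BERT inputs, and it defaults to
# sequence-first activations.
import torch
import torch.nn as nn

pad_token_id = 0
input_ids = torch.tensor([[101, 2023, 102, 0, 0]])                  # (batch=1, seq_len=5)
src_key_padding_mask = input_ids.eq(pad_token_id)                   # (1, 5), True where padded
encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=768, nhead=12), num_layers=2)
hidden_states = torch.randn(5, 1, 768)                              # (seq_len, batch, hidden)
encoded = encoder(hidden_states, src_key_padding_mask=src_key_padding_mask)  # (5, 1, 768)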
def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6,
             dim_feedforward=2048, dropout=0.1, activation="relu",
             custom_encoder=None, custom_decoder=None):
    super().__init__()
    if custom_encoder is not None:
        self.encoder = custom_encoder
    else:
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
    if custom_decoder is not None:
        self.decoder = custom_decoder
    else:
        decoder_layer = TransformerDecoderLayerWithFastDecode(d_model, nhead, dim_feedforward,
                                                              dropout, activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoderWithFastDecode(decoder_layer, num_decoder_layers, decoder_norm)
    self._reset_parameters()
    self.d_model = d_model
    self.nhead = nhead
def __init__(self, config):
    super(TransformerLM, self).__init__()
    vocabSize = config.data.vocabSize
    self.nemd = config.model.transformer.nemd
    emd_drop_ratio = config.model.transformer.emd_drop_ratio
    hidden_drop_ratio = config.model.transformer.hidden_drop_ratio
    nhead = config.model.transformer.nhead
    nhid = config.model.transformer.nhid
    nlayer = config.model.transformer.nlayer
    self.src_mask = None
    tie_weight = config.model.transformer.tie_weight
    self.embedding = nn.Embedding(vocabSize, self.nemd)
    self.pos_encoder = PositionalEncoding(config)
    self.dropout = nn.Dropout(emd_drop_ratio)
    encoder_layers = nn.TransformerEncoderLayer(self.nemd, nhead, nhid, hidden_drop_ratio)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayer)
    self.out = nn.Linear(self.nemd, vocabSize)
    # if not pretrained embedding
    self.init_weights()
    if tie_weight:
        self.out.weight = self.embedding.weight
def __init__(self, input_dim, hidden_dim, num_layers=1, dropout=0, use_categories=False):
    super().__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.dropout = dropout
    self.use_categories = use_categories
    enc_layer = nn.TransformerEncoderLayer(hidden_dim, 2, dim_feedforward=hidden_dim * 4,
                                           dropout=dropout, activation='gelu')
    self.enc = nn.TransformerEncoder(enc_layer, num_layers)
    self.input_fc = nn.Linear(input_dim, hidden_dim)
    if use_categories:
        self.dropout_mod = nn.Dropout(dropout)
        self.stroke_cat_fc = nn.Linear(input_dim + hidden_dim, hidden_dim)
def __init__(self, args):
    '''
    transformer encoder for language, frames and action inputs
    '''
    super(EncoderVL, self).__init__()
    # transformer layers
    encoder_layer = nn.TransformerEncoderLayer(args.demb, args.encoder_heads, args.demb,
                                               args.dropout['transformer']['encoder'])
    self.enc_transformer = nn.TransformerEncoder(encoder_layer, args.encoder_layers)
    # how many last actions to attend to
    self.num_input_actions = args.num_input_actions
    # encodings
    self.enc_pos = PosEncoding(args.demb) if args.enc['pos'] else None
    self.enc_pos_learn = PosLearnedEncoding(args.demb) if args.enc['pos_learn'] else None
    self.enc_token = TokenLearnedEncoding(args.demb) if args.enc['token'] else None
    self.enc_layernorm = nn.LayerNorm(args.demb)
    self.enc_dropout = nn.Dropout(args.dropout['emb'], inplace=True)
def __init__(self, input_size, emb_size, hidden_size, num_layer, max_len=64):
    super().__init__()
    self.emb_size = emb_size
    self.hidden_size = hidden_size
    self.num_layer = num_layer
    self.scale = math.sqrt(emb_size)
    self.embedding = nn.Embedding(input_size, emb_size)
    # additional length for sos and eos
    self.pos_encoder = PositionEncoder(max_len + 10, emb_size)
    encoder_layer = nn.TransformerEncoderLayer(d_model=emb_size, nhead=8,
                                               dim_feedforward=hidden_size,
                                               dropout=0.1, activation='gelu')
    encoder_norm = nn.LayerNorm(emb_size)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layer, norm=encoder_norm)
def __init__(
    self,
    d_model: int,
    nhead: int,
    vocab_size: int,
    max_len: int,
    num_encoder_layers: int = 6,
    num_decoder_layers: int = 6,
) -> None:
    super(Transformer2, self).__init__()
    encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
    decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)
    self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers)
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.positional_encoding = PositionalEncoding(d_model, max_len)
    self.output_bias = Parameter(torch.Tensor(vocab_size))
    self._init_bias()
def __init__(self, num_slots, slot_dim, hidden_dim, num_heads, num_layers, **kwargs):
    super().__init__()
    self.num_slots = num_slots
    self.slot_dim = slot_dim
    self.hidden_dim = hidden_dim
    self.num_heads = num_heads
    # TODO: for now we simply embed the slots to higher dim, create custom transformer layer
    # TODO: non-relational Transformer is not directly comparable to non-relational GNN
    self.slot_encode = nn.Linear(slot_dim, hidden_dim)
    self.slot_decode = nn.Linear(hidden_dim, slot_dim)
    transformer_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads)
    self.transformer = nn.TransformerEncoder(encoder_layer=transformer_layer, num_layers=num_layers)
def __init__(self, input_window, output_window, num_layers=1, dropout=0.1, longueur_serie=23):
    """
    Init.

    Parameters
    ----------
    input_window: int
        Number of days in the input sequence.
        Input sequence length: 24 * input_window
    output_window: int
        Number of hours in the output sequence.
        Output sequence length: output_window
    """
    super(Transformer, self).__init__()
    self.name_model = 'Transformer'
    self.input_window = input_window
    self.output_window = output_window
    self.feature_size = self.output_window * 4
    self.src_mask = None
    self.pos_encoder = PositionalEncoding(self.feature_size)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.feature_size,
                                                    dim_feedforward=self.feature_size * 4,
                                                    nhead=self.output_window,
                                                    dropout=dropout)
    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers).float()
    self.decoder = nn.Linear(self.feature_size, 1)
    self.init_weights()
def __init__(self, numberTokens, embeddingSize, maxLength, numberEncoderLayers, numberDecoderLayers,
             attentionHeadCount, transformerHiddenDenseSize, batch_size=32):
    # Based on https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    super(Transformer, self).__init__()
    self.batch_size = batch_size
    self.model_type = 'Transformer'
    self.embeddingSize = embeddingSize
    self.numberTokens = numberTokens
    self.encoderEmbedding = nn.Embedding(numberTokens, embeddingSize)
    self.maxLength = maxLength
    encoderLayer = nn.TransformerEncoderLayer(embeddingSize, attentionHeadCount, transformerHiddenDenseSize)
    self.encoder = nn.TransformerEncoder(encoderLayer, numberEncoderLayers)
    self.decoderEmbedding = nn.Embedding(numberTokens, embeddingSize)
    decoderLayer = nn.TransformerDecoderLayer(embeddingSize, attentionHeadCount, transformerHiddenDenseSize)
    self.decoder = nn.TransformerDecoder(decoderLayer, numberDecoderLayers)
    self.decoderLinear = nn.Linear(embeddingSize, numberTokens)
    self.decoderSoftmax = nn.Softmax(dim=2)
def __init__(self, d_model, seq_len, nhead, dim_feedforward, dropout, num_layers):
    super(Encoder, self).__init__()
    # Input projection for the time-series features
    self.embedding_layer = nn.Linear(51, d_model)
    # Positional encoding
    self.positionalencoding_layer = PositionalEncoding(d_model, seq_len)
    # Dropout
    self.dropout_layer = nn.Dropout(p=dropout)
    # Transformer encoder stack (default dim_feedforward = 2048)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                                    dim_feedforward=dim_feedforward, dropout=dropout)
    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers)
    # Linear layer producing the input to the Discriminator
    self.linear_layer_dis = nn.Linear(seq_len * d_model, d_model)
def __init__(self, input_dim, rnn_hidden_dims, max_ponder=3, epsilon=0.05, last_relu=True,
             act_steps=3, act_fixed=False):
    super(ATC_TFencoder, self).__init__()
    self.rnn_hidden_dim = rnn_hidden_dims[-1]
    self.epsilon = epsilon
    # self.rnn_cell = GRUEXND(input_dim, rnn_hidden_dims, last_relu)
    self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=50, nhead=2, dim_feedforward=150)
    self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=1)
    # TransformerEncoder(source_dims=13, k_dims=16, v_dims=16, n_heads=3, layer_cnt=1)
    self.transition_layer = nn.Linear(13, rnn_hidden_dims[-1])
    self.max_ponder = max_ponder
    self.ponder_linear = nn.Linear(rnn_hidden_dims[-1], 1)
    self.act_fixed = act_fixed
    self.act_steps = act_steps
def __init__(self, n_skill, max_seq=100, embed_dim=128, num_heads=8, dropout=0.2):
    super(SAKTModel, self).__init__()
    self.n_skill = n_skill
    self.embed_dim = embed_dim
    # NOTE: the effective embedding width used below overrides the constructor argument;
    # self.embed_dim keeps the original value.
    embed_dim = 32 * 6 + 256
    self.embedding = nn.Embedding(4, 32)
    self.user_answer_embedding = nn.Embedding(6, 32)
    self.prior_question_had_explanation_embedding = nn.Embedding(4, 32)
    self.e_embedding = nn.Embedding(n_skill + 1, 256)
    self.part_embedding = nn.Embedding(8, 32)
    self.elapsed_time_embedding = nn.Embedding(302, 32)
    self.duration_previous_content_embedding = nn.Embedding(302, 32)
    encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
    self.transformer_enc = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=4)
    self.gru = nn.GRU(input_size=embed_dim, hidden_size=embed_dim)
    self.continuous_embedding = nn.Sequential(nn.BatchNorm1d(99), nn.Linear(1, embed_dim // 2),
                                              nn.LayerNorm(embed_dim // 2))
    self.cat_embedding = nn.Sequential(nn.Linear(embed_dim, embed_dim // 2),
                                       nn.LayerNorm(embed_dim // 2))
    self.layer_normal = nn.LayerNorm(embed_dim)
    self.ffn = FFN(embed_dim)
    self.dropout = nn.Dropout(dropout / 2)
    self.pred = nn.Linear(embed_dim, 1)
def __init__(self, d_model: int = 300, nhead: int = 6, num_encoder_layers: int = 4,
             num_decoder_layers: int = 4, dim_feedforward: int = 1024, dropout: float = 0.1,
             activation: str = "relu", words_num: int = 0) -> None:
    super(Transformer, self).__init__()
    self.source_embedding = nn.Embedding(words_num, 300)
    self.pos_encoder = PositionalEncoding(d_model=d_model, dropout=dropout, max_len=75)
    self.pos_decoder = PositionalEncoding(d_model=d_model, dropout=dropout, max_len=74)
    encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
    encoder_norm = nn.LayerNorm(d_model)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
    self.target_embedding = nn.Embedding(words_num, 300)
    decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
    decoder_norm = nn.LayerNorm(d_model)
    self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
    self.out = nn.Linear(d_model, words_num)
    self._reset_parameters()
    self.d_model = d_model
    self.nhead = nhead
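# Causal-mask sketch (illustrative only; tgt_len mirrors the max_len=74 of pos_decoder above):
# the decoder needs an additive mask so each target position attends only to earlier positions.
import torch

tgt_len = 74
tgt_mask = torch.triu(torch.full((tgt_len, tgt_len), float('-inf')), diagonal=1)
# typically passed along as: self.decoder(tgt_emb, memory, tgt_mask=tgt_mask)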
def __init__(self, input_size, encoder_size, n_head, feedforward_size, n_layers, dropout=0):
    '''
    input_size:
    '''
    super(EncoderTransformer, self).__init__()
    # self.model_type = 'Transformer'
    # self.src_mask = None
    # self.pos_encoder = PositionalEncoding(ninp, dropout)
    encoder_layers = nn.TransformerEncoderLayer(encoder_size, n_head, feedforward_size, dropout=dropout)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
def __init__(self):
    super(sequence_encoder, self).__init__()
    self.embedding_layer = nn.Embedding(5, 512)
    self.encoder_layer = nn.TransformerEncoderLayer(512, 8, 1024)
    self.encoder = nn.TransformerEncoder(self.encoder_layer, 5)
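# Usage sketch (assumes sequence_encoder above is an nn.Module subclass, as the super() call
# implies, and that token id 0 marks padding): the default nn.TransformerEncoder is sequence-first.
import torch

model = sequence_encoder()
tokens = torch.randint(0, 5, (20, 4))                    # (seq_len=20, batch=4), ids in [0, 5)
pad_mask = tokens.eq(0).transpose(0, 1)                  # (batch, seq_len), True where padded
out = model.encoder(model.embedding_layer(tokens), src_key_padding_mask=pad_mask)
print(out.shape)                                         # torch.Size([20, 4, 512])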
def __init__(self, config: Config):
    super().__init__(config)
    conv_H = config.experiment.cmax_h
    conv_W = config.experiment.cmax_w
    conv_layers = []
    in_channels = 1
    for index, filters in enumerate(config.experiment.cnn_filters):
        out_channels = filters
        conv_layers.extend([
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=(3, 3), stride=(2, 2), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=out_channels),
        ])
        if index != len(config.experiment.cnn_filters) - 1:
            conv_layers.append(nn.Dropout(config.experiment.dropout))
        conv_W = math.ceil(conv_W / 2)
        conv_H = math.ceil(conv_H / 2)
        in_channels = out_channels
    self.conv = nn.Sequential(
        *conv_layers,
        nn.Flatten(),
        nn.Linear(in_features=conv_W * conv_H * out_channels,
                  out_features=conv_W * conv_H * out_channels))
    self.conv_time_distributed = TimeDistributed(self.conv)
    self.embed_dim = (self.features_len * (config.experiment.time2vec_embedding_size + 1)
                      + conv_W * conv_H * out_channels)
    if config.experiment.use_all_gfs_as_input:
        self.time_2_vec_time_distributed = TimeDistributed(
            Time2Vec(self.features_len
                     + len(process_config(config.experiment.train_parameters_config_file)),
                     config.experiment.time2vec_embedding_size),
            batch_first=True)
        self.embed_dim += (len(process_config(config.experiment.train_parameters_config_file))
                           * (config.experiment.time2vec_embedding_size + 1))
    self.pos_encoder = PositionalEncoding(self.embed_dim, self.dropout, self.sequence_length)
    encoder_layer = nn.TransformerEncoderLayer(d_model=self.embed_dim,
                                               nhead=config.experiment.transformer_attention_heads,
                                               dim_feedforward=config.experiment.transformer_ff_dim,
                                               dropout=config.experiment.dropout,
                                               batch_first=True)
    encoder_norm = nn.LayerNorm(self.embed_dim)
    self.encoder = nn.TransformerEncoder(encoder_layer,
                                         config.experiment.transformer_attention_layers,
                                         encoder_norm)
    dense_layers = []
    features = self.embed_dim + 1
    for neurons in config.experiment.transformer_head_dims:
        dense_layers.append(nn.Linear(in_features=features, out_features=neurons))
        features = neurons
    dense_layers.append(nn.Linear(in_features=features, out_features=1))
    self.classification_head = nn.Sequential(*dense_layers)
    self.classification_head_time_distributed = TimeDistributed(self.classification_head,
                                                                batch_first=True)
def __init__(self):
    super(TextTansformer, self).__init__()
    # TODO: nn.TransformerEncoder / nn.TransformerDecoder cannot be constructed without an
    # encoder/decoder layer and a layer count; the original stub passed no arguments, which
    # raises a TypeError. Placeholder sizes (d_model=512, nhead=8, 6 layers) are assumed here.
    encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
    decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
    self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
def __init__(self, vocab, feature_dim=(1024, 14, 14), stem_num_layers=2, stem_batchnorm=False,
             module_dim=128, text_dim=1, module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0,
             verbose=True):
    super(ModuleNet, self).__init__()
    self.stem = build_stem(feature_dim[0], module_dim, num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    self.char_lstm = nn.LSTM(input_size=28, hidden_size=98, bidirectional=True, batch_first=True)
    encoder_layer = nn.TransformerEncoderLayer(d_model=28, nhead=7)
    self.char_transformer = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=3)
    self.char_linear = nn.Linear(28, 196)
    num_answers = len(vocab['answer_idx_to_token'])
    module_H, module_W = feature_dim[1], feature_dim[2]
    self.classifier = build_classifier(module_dim + text_dim, module_H, module_W, num_answers,
                                       classifier_fc_layers, classifier_proj_dim,
                                       classifier_downsample, with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    self.module_list = []
    for idx, fn_str in enumerate(vocab['program_token_to_idx']):
        num_inputs = iep.programs.get_num_inputs(fn_str)
        self.function_modules_num_inputs[fn_str] = num_inputs
        if fn_str == 'scene' or num_inputs == 1:
            mod = ResidualBlock(module_dim + text_dim, with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            mod = ConcatBlock(module_dim + text_dim, with_residual=module_residual,
                              with_batchnorm=module_batchnorm)
        self.add_module(fn_str, mod)
        self.module_list.append(mod)
        self.function_modules[fn_str] = idx
    self.module_list = nn.ModuleList(self.module_list)
    self.save_module_outputs = False
def __init__(self):
    super().__init__()  # required so the layers below are registered on the nn.Module
    layer = nn.TransformerEncoderLayer(128, 8)
    self.encoder = nn.TransformerEncoder(layer, 8)
    self.linear = nn.Linear(128, 2048)
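# Shape sketch (standalone, mirrors the layers above): the encoder maps (seq_len, batch, 128)
# to the same shape, and the final linear projects the last dimension from 128 to 2048.
import torch
import torch.nn as nn

encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(128, 8), 8)
linear = nn.Linear(128, 2048)
y = linear(encoder(torch.randn(10, 2, 128)))             # torch.Size([10, 2, 2048])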
def __init__(self, char_embedding_dim: int, out_dim: int, image_feature_dim: int = 512,
             nheaders: int = 8, nlayers: int = 6, feedforward_dim: int = 2048,
             dropout: float = 0.1, max_len: int = 100, image_encoder: str = 'resnet50',
             roi_pooling_mode: str = 'roi_align', roi_pooling_size: Tuple[int, int] = (7, 7)):
    '''
    convert image segments and text segments to node embedding.
    :param char_embedding_dim:
    :param out_dim:
    :param image_feature_dim:
    :param nheaders:
    :param nlayers:
    :param feedforward_dim:
    :param dropout:
    :param max_len:
    :param image_encoder:
    :param roi_pooling_mode:
    :param roi_pooling_size:
    '''
    super().__init__()
    self.dropout = dropout
    assert roi_pooling_mode in ['roi_align', 'roi_pool'], \
        'roi pooling mode: {} is not supported.'.format(roi_pooling_mode)
    self.roi_pooling_mode = roi_pooling_mode
    assert roi_pooling_size and len(roi_pooling_size) == 2, 'roi_pooling_size is not set properly.'
    self.roi_pooling_size = tuple(roi_pooling_size)  # (h, w)
    transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=char_embedding_dim,
                                                           nhead=nheaders,
                                                           dim_feedforward=feedforward_dim,
                                                           dropout=dropout)
    self.transformer_encoder = nn.TransformerEncoder(transformer_encoder_layer, num_layers=nlayers)
    if image_encoder == 'resnet18':
        self.cnn = resnet.resnet18(output_channels=out_dim)
    elif image_encoder == 'resnet34':
        self.cnn = resnet.resnet34(output_channels=out_dim)
    elif image_encoder == 'resnet50':
        self.cnn = resnet.resnet50(output_channels=out_dim)
    elif image_encoder == 'resnet101':
        self.cnn = resnet.resnet101(output_channels=out_dim)
    elif image_encoder == 'resnet152':
        self.cnn = resnet.resnet152(output_channels=out_dim)
    else:
        raise NotImplementedError()
    self.conv = nn.Conv2d(image_feature_dim, out_dim, self.roi_pooling_size)
    self.bn = nn.BatchNorm2d(out_dim)
    self.projection = nn.Linear(2 * out_dim, out_dim)
    self.norm = nn.LayerNorm(out_dim)
    # Compute the positional encodings once in log space.
    position_embedding = torch.zeros(max_len, char_embedding_dim)
    position = torch.arange(0, max_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, char_embedding_dim, 2).float()
                         * -(math.log(10000.0) / char_embedding_dim))
    position_embedding[:, 0::2] = torch.sin(position * div_term)
    position_embedding[:, 1::2] = torch.cos(position * div_term)
    position_embedding = position_embedding.unsqueeze(0).unsqueeze(0)  # 1, 1, max_len, char_embedding_dim
    self.register_buffer('position_embedding', position_embedding)
    self.pe_droput = nn.Dropout(self.dropout)
def __init__(self, input_dim=40, hidden_dim=128, output_dim=2, num_layers=2):
    super(LSTMTrans1_deep, self).__init__()
    self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=0.2, batch_first=True)
    encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=2)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
    self.hidden2out = nn.Linear(hidden_dim, output_dim)
def __init__(self, hidden_size=20, num_heads=4):
    # NOTE: d_model must be divisible by nhead; the original default of num_heads=16 does not
    # divide hidden_size=20 and fails when the encoder layer is built.
    super(TransformerModel, self).__init__()
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads)
    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=4)
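# Constraint sketch (assumes TransformerModel is the class whose __init__ appears above):
# nn.MultiheadAttention inside each encoder layer requires d_model % nhead == 0, so
# hidden_size and num_heads must be chosen together.
import torch

model = TransformerModel(hidden_size=32, num_heads=16)   # 32 % 16 == 0, so this is valid
out = model.transformer_encoder(torch.randn(50, 8, 32))  # (seq_len, batch, d_model)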
def __init__(self, hps, obs_config):
    super(TransformerPolicy7, self).__init__()
    assert obs_config.drones > 0 or obs_config.minerals > 0, \
        'Must have at least one mineral or drones observation'
    assert obs_config.drones >= obs_config.allies
    assert not hps.use_privileged or (hps.nmineral > 0 and hps.nally > 0
                                      and (hps.nenemy > 0 or hps.ally_enemy_same))
    assert hps.nally == obs_config.allies
    assert hps.nenemy == obs_config.drones - obs_config.allies
    assert hps.nmineral == obs_config.minerals
    assert hps.ntile == obs_config.tiles

    self.version = 'transformer_v7'
    self.kwargs = dict(hps=hps, obs_config=obs_config)
    self.hps = hps
    self.obs_config = obs_config
    self.agents = hps.agents
    self.nally = hps.nally
    self.nenemy = hps.nenemy
    self.nmineral = hps.nmineral
    self.nconstant = hps.nconstant
    self.ntile = hps.ntile
    self.nitem = hps.nally + hps.nenemy + hps.nmineral + hps.nconstant + hps.ntile
    self.fp16 = hps.fp16
    self.d_agent = hps.d_agent
    self.d_item = hps.d_item
    self.naction = hps.objective.naction() + obs_config.extra_actions()

    if hasattr(obs_config, 'global_drones'):
        self.global_drones = obs_config.global_drones
    else:
        self.global_drones = 0

    if hps.norm == 'none':
        norm_fn = lambda x: nn.Sequential()
    elif hps.norm == 'batchnorm':
        norm_fn = lambda n: nn.BatchNorm2d(n)
    elif hps.norm == 'layernorm':
        norm_fn = lambda n: nn.LayerNorm(n)
    else:
        raise Exception(f'Unexpected normalization layer {hps.norm}')

    endglobals = self.obs_config.endglobals()
    endallies = self.obs_config.endallies()
    endenemies = self.obs_config.endenemies()
    endmins = self.obs_config.endmins()
    endtiles = self.obs_config.endtiles()
    endallenemies = self.obs_config.endallenemies()

    self.agent_embedding = ItemBlock(
        obs_config.dstride() + obs_config.global_features(),
        hps.d_agent,
        hps.d_agent * hps.dff_ratio,
        norm_fn,
        True,
        mask_feature=7,  # Feature 7 is hitpoints
    )
    self.relpos_net = ItemBlock(3, hps.d_item // 2, hps.d_item // 2 * hps.dff_ratio,
                                norm_fn, hps.item_ff)

    self.item_nets = nn.ModuleList()
    if hps.ally_enemy_same:
        self.item_nets.append(
            PosItemBlock(
                obs_config.dstride(),
                hps.d_item // 2,
                hps.d_item // 2 * hps.dff_ratio,
                norm_fn,
                hps.item_ff,
                mask_feature=7,  # Feature 7 is hitpoints
                count=obs_config.drones,
                start=endglobals,
                end=endenemies,
            ))
    else:
        if self.nally > 0:
            self.item_nets.append(
                PosItemBlock(
                    obs_config.dstride(),
                    hps.d_item // 2,
                    hps.d_item // 2 * hps.dff_ratio,
                    norm_fn,
                    hps.item_ff,
                    mask_feature=7,  # Feature 7 is hitpoints
                    count=obs_config.allies,
                    start=endglobals,
                    end=endallies,
                ))
        if self.nenemy > 0:
            self.item_nets.append(
                PosItemBlock(
                    obs_config.dstride(),
                    hps.d_item // 2,
                    hps.d_item // 2 * hps.dff_ratio,
                    norm_fn,
                    hps.item_ff,
                    mask_feature=7,  # Feature 7 is hitpoints
                    count=obs_config.drones - self.obs_config.allies,
                    start=endallies,
                    end=endenemies,
                    start_privileged=endtiles if hps.use_privileged else None,
                    end_privileged=endallenemies if hps.use_privileged else None,
                ))
    if hps.nmineral > 0:
        self.item_nets.append(
            PosItemBlock(
                obs_config.mstride(),
                hps.d_item // 2,
                hps.d_item // 2 * hps.dff_ratio,
                norm_fn,
                hps.item_ff,
                mask_feature=2,  # Feature 2 is size
                count=obs_config.minerals,
                start=endenemies,
                end=endmins,
            ))
    if hps.ntile > 0:
        self.item_nets.append(
            PosItemBlock(
                obs_config.tstride(),
                hps.d_item // 2,
                hps.d_item // 2 * hps.dff_ratio,
                norm_fn,
                hps.item_ff,
                mask_feature=2,  # Feature 2 is time elapsed since last visited
                count=obs_config.tiles,
                start=endmins,
                end=endtiles,
            ))
    if hps.nconstant > 0:
        self.constant_items = nn.Parameter(torch.normal(0, 1, (hps.nconstant, hps.d_item)))

    if hps.item_item_attn_layers > 0:
        encoder_layer = nn.TransformerEncoderLayer(d_model=hps.d_item, nhead=8)
        self.item_item_attn = nn.TransformerEncoder(encoder_layer,
                                                    num_layers=hps.item_item_attn_layers)
    else:
        self.item_item_attn = None

    self.multihead_attention = MultiheadAttention(
        embed_dim=hps.d_agent,
        kdim=hps.d_item,
        vdim=hps.d_item,
        num_heads=hps.nhead,
        dropout=hps.dropout,
    )
    self.linear1 = nn.Linear(hps.d_agent, hps.d_agent * hps.dff_ratio)
    self.linear2 = nn.Linear(hps.d_agent * hps.dff_ratio, hps.d_agent)
    self.norm1 = nn.LayerNorm(hps.d_agent)
    self.norm2 = nn.LayerNorm(hps.d_agent)

    self.map_channels = hps.d_agent // (hps.nm_nrings * hps.nm_nrays)
    map_item_channels = self.map_channels - 2 if self.hps.map_embed_offset else self.map_channels
    self.downscale = nn.Linear(hps.d_item, map_item_channels)
    self.norm_map = norm_fn(map_item_channels)
    self.conv1 = spatial.ZeroPaddedCylindricalConv2d(self.map_channels,
                                                     hps.dff_ratio * self.map_channels,
                                                     kernel_size=3)
    self.conv2 = spatial.ZeroPaddedCylindricalConv2d(hps.dff_ratio * self.map_channels,
                                                     self.map_channels,
                                                     kernel_size=3)
    self.norm_conv = norm_fn(self.map_channels)

    final_width = hps.d_agent
    if hps.nearby_map:
        final_width += hps.d_agent
    self.final_layer = nn.Sequential(
        nn.Linear(final_width, hps.d_agent * hps.dff_ratio),
        nn.ReLU(),
    )

    self.policy_head = nn.Linear(hps.d_agent * hps.dff_ratio, self.naction)
    if hps.small_init_pi:
        self.policy_head.weight.data *= 0.01
        self.policy_head.bias.data.fill_(0.0)

    if hps.use_privileged:
        self.value_head = nn.Linear(hps.d_agent * hps.dff_ratio + hps.d_item, 1)
    else:
        self.value_head = nn.Linear(hps.d_agent * hps.dff_ratio, 1)
    if hps.zero_init_vf:
        self.value_head.weight.data.fill_(0.0)
        self.value_head.bias.data.fill_(0.0)

    self.epsilon = 1e-4 if hps.fp16 else 1e-8
def __init__(self, early_fusion, d_model, n_head, dim_feedforward, dropout, num_layers,
             layer_norm, embed_dropout, output_dim, out_dropout, multimodal_transformer=True):
    super(Transformer, self).__init__()
    self.multimodal_transformer = multimodal_transformer
    if self.multimodal_transformer:
        self.d_mult = d_model
        self.mult = mult_model.MULTModel(
            orig_d_l=300,
            orig_d_a=74,
            orig_d_v=35,
            d_l=self.d_mult,  # different from MulT
            d_a=self.d_mult,  # different from MulT
            d_v=self.d_mult,  # different from MulT
            vonly=True,
            aonly=True,
            lonly=True,
            num_heads=n_head,
            layers=num_layers,
            attn_dropout=0.1,
            attn_dropout_a=0.0,
            attn_dropout_v=0.0,
            relu_dropout=0.1,
            res_dropout=0.1,
            out_dropout=out_dropout,
            embed_dropout=embed_dropout,
            attn_mask=True)
        self.t_in_dim = self.v_in_dim = self.a_in_dim = self.d_mult * 2
        combined_dim = 6 * d_model
    else:
        self.early_fusion = early_fusion
        # Only late fusion is implemented; early fusion will be implemented later.
        # Late fusion
        if not self.early_fusion:
            self.orig_d_t = 300
            self.orig_d_a = 74
            self.orig_d_v = 35
            self.d_t = self.d_a = self.d_v = d_model
            # Temporal convolutional layers
            self.proj_t = nn.Conv1d(self.orig_d_t, self.d_t, kernel_size=1, padding=0, bias=False)
            self.proj_a = nn.Conv1d(self.orig_d_a, self.d_a, kernel_size=1, padding=0, bias=False)
            self.proj_v = nn.Conv1d(self.orig_d_v, self.d_v, kernel_size=1, padding=0, bias=False)
            # Transformer layers
            self.encoder_layer_t = nn.TransformerEncoderLayer(d_model, nhead=n_head,
                                                              dim_feedforward=dim_feedforward)
            self.encoder_layer_a = nn.TransformerEncoderLayer(d_model, nhead=n_head,
                                                              dim_feedforward=dim_feedforward)
            self.encoder_layer_v = nn.TransformerEncoderLayer(d_model, nhead=n_head,
                                                              dim_feedforward=dim_feedforward)
            # Remember to implement the layer norm option here.
            if layer_norm:
                print("layer norm not implemented yet for vanilla transformer")
                assert False
            else:
                self.transformer_encoder_t = nn.TransformerEncoder(self.encoder_layer_t,
                                                                   num_layers=num_layers)
                self.transformer_encoder_a = nn.TransformerEncoder(self.encoder_layer_a,
                                                                   num_layers=num_layers)
                self.transformer_encoder_v = nn.TransformerEncoder(self.encoder_layer_v,
                                                                   num_layers=num_layers)
            self.embed_dropout = embed_dropout
            # Change here for other datasets since the number of modalities might differ.
            combined_dim = 3 * d_model
    self.proj1 = nn.Linear(combined_dim, combined_dim)
    self.proj2 = nn.Linear(combined_dim, combined_dim)
    self.out_layer = nn.Linear(combined_dim, output_dim)
    self.out_dropout = out_dropout
def __init__(self, config):
    super(MISA, self).__init__()
    self.config = config
    self.text_size = config.embedding_size
    self.visual_size = config.visual_size
    self.acoustic_size = config.acoustic_size
    self.input_sizes = input_sizes = [self.text_size, self.visual_size, self.acoustic_size]
    self.hidden_sizes = hidden_sizes = [int(self.text_size), int(self.visual_size), int(self.acoustic_size)]
    self.output_size = output_size = config.num_classes
    self.dropout_rate = dropout_rate = config.dropout
    self.activation = self.config.activation()
    self.tanh = nn.Tanh()

    rnn = nn.LSTM if self.config.rnncell == "lstm" else nn.GRU

    # defining modules - two layer bidirectional LSTM with layer norm in between
    if self.config.use_bert:
        # Initializing a BERT bert-base-uncased style configuration
        bertconfig = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.bertmodel = BertModel.from_pretrained('bert-base-uncased', config=bertconfig)
    else:
        self.embed = nn.Embedding(len(config.word2id), input_sizes[0])
        self.trnn1 = rnn(input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = rnn(2 * hidden_sizes[0], hidden_sizes[0], bidirectional=True)

    self.vrnn1 = rnn(input_sizes[1], hidden_sizes[1], bidirectional=True)
    self.vrnn2 = rnn(2 * hidden_sizes[1], hidden_sizes[1], bidirectional=True)
    self.arnn1 = rnn(input_sizes[2], hidden_sizes[2], bidirectional=True)
    self.arnn2 = rnn(2 * hidden_sizes[2], hidden_sizes[2], bidirectional=True)

    ##########################################
    # mapping modalities to same sized space
    ##########################################
    if self.config.use_bert:
        self.project_t = nn.Sequential()
        self.project_t.add_module('project_t',
                                  nn.Linear(in_features=768, out_features=config.hidden_size))
        self.project_t.add_module('project_t_activation', self.activation)
        self.project_t.add_module('project_t_layer_norm', nn.LayerNorm(config.hidden_size))
    else:
        self.project_t = nn.Sequential()
        self.project_t.add_module('project_t',
                                  nn.Linear(in_features=hidden_sizes[0] * 4,
                                            out_features=config.hidden_size))
        self.project_t.add_module('project_t_activation', self.activation)
        self.project_t.add_module('project_t_layer_norm', nn.LayerNorm(config.hidden_size))

    self.project_v = nn.Sequential()
    self.project_v.add_module('project_v',
                              nn.Linear(in_features=hidden_sizes[1] * 4,
                                        out_features=config.hidden_size))
    self.project_v.add_module('project_v_activation', self.activation)
    self.project_v.add_module('project_v_layer_norm', nn.LayerNorm(config.hidden_size))

    self.project_a = nn.Sequential()
    self.project_a.add_module('project_a',
                              nn.Linear(in_features=hidden_sizes[2] * 4,
                                        out_features=config.hidden_size))
    self.project_a.add_module('project_a_activation', self.activation)
    self.project_a.add_module('project_a_layer_norm', nn.LayerNorm(config.hidden_size))

    ##########################################
    # private encoders
    ##########################################
    self.private_t = nn.Sequential()
    self.private_t.add_module('private_t_1',
                              nn.Linear(in_features=config.hidden_size,
                                        out_features=config.hidden_size))
    self.private_t.add_module('private_t_activation_1', nn.Sigmoid())

    self.private_v = nn.Sequential()
    self.private_v.add_module('private_v_1',
                              nn.Linear(in_features=config.hidden_size,
                                        out_features=config.hidden_size))
    self.private_v.add_module('private_v_activation_1', nn.Sigmoid())

    self.private_a = nn.Sequential()
    self.private_a.add_module('private_a_3',
                              nn.Linear(in_features=config.hidden_size,
                                        out_features=config.hidden_size))
    self.private_a.add_module('private_a_activation_3', nn.Sigmoid())

    ##########################################
    # shared encoder
    ##########################################
    self.shared = nn.Sequential()
    self.shared.add_module('shared_1',
                           nn.Linear(in_features=config.hidden_size,
                                     out_features=config.hidden_size))
    self.shared.add_module('shared_activation_1', nn.Sigmoid())

    ##########################################
    # reconstruct
    ##########################################
    self.recon_t = nn.Sequential()
    self.recon_t.add_module('recon_t_1',
                            nn.Linear(in_features=config.hidden_size,
                                      out_features=config.hidden_size))
    self.recon_v = nn.Sequential()
    self.recon_v.add_module('recon_v_1',
                            nn.Linear(in_features=config.hidden_size,
                                      out_features=config.hidden_size))
    self.recon_a = nn.Sequential()
    self.recon_a.add_module('recon_a_1',
                            nn.Linear(in_features=config.hidden_size,
                                      out_features=config.hidden_size))

    ##########################################
    # shared space adversarial discriminator
    ##########################################
    if not self.config.use_cmd_sim:
        self.discriminator = nn.Sequential()
        self.discriminator.add_module('discriminator_layer_1',
                                      nn.Linear(in_features=config.hidden_size,
                                                out_features=config.hidden_size))
        self.discriminator.add_module('discriminator_layer_1_activation', self.activation)
        self.discriminator.add_module('discriminator_layer_1_dropout', nn.Dropout(dropout_rate))
        self.discriminator.add_module('discriminator_layer_2',
                                      nn.Linear(in_features=config.hidden_size,
                                                out_features=len(hidden_sizes)))

    ##########################################
    # shared-private collaborative discriminator
    ##########################################
    self.sp_discriminator = nn.Sequential()
    self.sp_discriminator.add_module('sp_discriminator_layer_1',
                                     nn.Linear(in_features=config.hidden_size, out_features=4))

    self.fusion = nn.Sequential()
    self.fusion.add_module('fusion_layer_1',
                           nn.Linear(in_features=self.config.hidden_size * 6,
                                     out_features=self.config.hidden_size * 3))
    self.fusion.add_module('fusion_layer_1_dropout', nn.Dropout(dropout_rate))
    self.fusion.add_module('fusion_layer_1_activation', self.activation)
    self.fusion.add_module('fusion_layer_3',
                           nn.Linear(in_features=self.config.hidden_size * 3,
                                     out_features=output_size))

    self.tlayer_norm = nn.LayerNorm((hidden_sizes[0] * 2,))
    self.vlayer_norm = nn.LayerNorm((hidden_sizes[1] * 2,))
    self.alayer_norm = nn.LayerNorm((hidden_sizes[2] * 2,))

    encoder_layer = nn.TransformerEncoderLayer(d_model=self.config.hidden_size, nhead=2)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)