Example #1
    def __init__(self,
                 device,
                 tag_to_ix,
                 n_layers,
                 hidden_dim,
                 hidden_dim_pp,
                 char_cnn,
                 n_chars,
                 char_cnn_filters,
                 pairwise_gate,
                 train_type="sequence",
                 normalization="weight",
                 elmo_dropout_ratio=0.,
                 dropout_ratio=0.,
                 shared_lstm=False,
                 inp_config="full",
                 pairwise_query_type='mul',
                 bilinear_dim=300,
                 elmo_dim=1024,
                 attn='multi',
                 all_test=False,
                 gate_bias=-1.,
                 monitor=None,
                 logger=None):
        super(CRF_FB, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.hidden_dim_pp = hidden_dim_pp
        self.bilinear_dim = bilinear_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.monitor = monitor
        self.embedding_dim = elmo_dim
        self.normalization = normalization
        self.elmo_dropout_ratio = elmo_dropout_ratio
        self.dropout_ratio = dropout_ratio
        self.train_type = train_type.lower()
        self.n_layers = n_layers
        self.char_cnn = char_cnn
        self.pairwise_gate = pairwise_gate
        self.bilinear_inp_dim = self.embedding_dim
        self.bilinear_out_dim = hidden_dim
        self.char_cnn_highway_bias = -1.
        self.query_dim = hidden_dim
        self.attn_dim = hidden_dim
        self.inp_config = inp_config
        self.shared_lstm = shared_lstm
        self.pairwise_query_type = pairwise_query_type
        self.pairwise_bilinear_pooling = True
        self.all_test = all_test
        self.logger = logger
        self.logger.info("Pairwise Type = {}".format(self.pairwise_query_type))

        if self.inp_config != "w2v":
            self.elmo = Elmo(ELMO_OPTIONS_FILE,
                             ELMO_WEIGHT_FILE,
                             1,
                             requires_grad=False,
                             dropout=self.elmo_dropout_ratio)
            self.elmo.to(self.device)

        self.act = nn.ELU()

        self.layer_norm = nn.LayerNorm(self.embedding_dim)

        if self.train_type != "no_unary":
            self.logger.info("Unary Config")
            self.lstm = nn.LSTM(self.embedding_dim,
                                self.hidden_dim,
                                num_layers=self.n_layers,
                                dropout=self.dropout_ratio,
                                bidirectional=True).to(device=device)

            self.unary_fc = weight_norm(nn.Linear(2 * hidden_dim,
                                                  2 * hidden_dim,
                                                  bias=True).to(device=device),
                                        dim=None)
            self.init_parameters(self.unary_fc, 'relu')

            self.out_dropout_u_fc = nn.Dropout(self.dropout_ratio)
            self.out_dropout_u_skip = nn.Dropout(self.dropout_ratio)

            self.hidden2tag = weight_norm(nn.Linear(
                2 * hidden_dim, self.tagset_size).to(device=device),
                                          dim=None)
            self.init_parameters(self.hidden2tag, 'linear')

            tran_init = torch.empty(self.tagset_size,
                                    self.tagset_size,
                                    dtype=torch.float,
                                    requires_grad=True)
            torch.nn.init.normal_(tran_init, mean=0.0, std=1.)
            self.transitions = nn.Parameter(tran_init.to(device=device))
            self.transitions.data[:, tag_to_ix[DatasetPreprosessed.
                                               __START_TAG__]] = -100.
            self.transitions.data[
                tag_to_ix[DatasetPreprosessed.__STOP_TAG__], :] = -100.

        if self.train_type != "no_pairwise":
            self.logger.info("Pairwise Config")
            if not self.shared_lstm:
                self.logger.info("Separate LSTMs")
                self.lstm_pairwise = nn.LSTM(
                    self.embedding_dim,
                    self.hidden_dim,
                    num_layers=self.n_layers,
                    dropout=self.dropout_ratio,
                    bidirectional=True).to(device=device)
            else:
                self.logger.info("Shared LSTM")
            self.U = weight_norm(nn.Linear(
                2 * self.hidden_dim, self.hidden_dim_pp).to(device=device),
                                 dim=None)
            self.init_parameters(self.U, 'relu')
            self.V = weight_norm(nn.Linear(
                2 * self.hidden_dim, self.hidden_dim_pp).to(device=device),
                                 dim=None)
            self.init_parameters(self.V, 'relu')
            self.P = weight_norm(nn.Linear(
                self.hidden_dim_pp, self.bilinear_dim).to(device=device),
                                 dim=None)
            self.init_parameters(self.P, 'relu')
            self.pairwise_fc = weight_norm(
                nn.Linear(self.bilinear_dim, self.bilinear_dim,
                          bias=True).to(device=device),
                dim=None)
            self.init_parameters(self.pairwise_fc, 'relu')
            self.dropout_p_mul = nn.Dropout(self.dropout_ratio)
            self.out_dropout_p_fc = nn.Dropout(self.dropout_ratio)
            self.out_dropout_p_skip = nn.Dropout(self.dropout_ratio)
            self.hidden2tag_pp = weight_norm(nn.Linear(
                self.bilinear_dim, self.tagset_size**2).to(device=device),
                                             dim=None)
            self.init_parameters(self.hidden2tag_pp, 'linear')

        self.__start__ = torch.tensor(
            self.tag_to_ix[DatasetPreprosessed.__START_TAG__],
            dtype=torch.long).to(device=device)
        self.__stop__ = torch.tensor(
            self.tag_to_ix[DatasetPreprosessed.__STOP_TAG__],
            dtype=torch.long).to(device=device)
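
The unary and pairwise heads above wrap their linear layers in weight_norm(..., dim=None). As a small, self-contained sketch of what that reparameterization does (the layer below is only illustrative), dim=None computes the norm over the whole weight tensor, so the layer learns a single scalar magnitude:

import torch.nn as nn
from torch.nn.utils import weight_norm

# weight is re-parameterized as g * v / ||v||; with dim=None the norm ||v||
# is taken over the entire tensor instead of per output unit.
fc = weight_norm(nn.Linear(4, 3), dim=None)
print(fc.weight_g.shape, fc.weight_v.shape)  # magnitude g and direction v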
Example #2
 def __init__(self, config):
     super().__init__()
     inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
     self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
     self.attn = GPTJAttention(config)
     self.mlp = GPTJMLP(inner_dim, config)
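
Example #2 only constructs the submodules of a GPT-J style block; in that architecture a single pre-LayerNorm feeds both the attention and the MLP, and both outputs are added to the same residual. A hedged sketch of such a forward (argument handling simplified, not the exact Hugging Face signature):

 def forward(self, hidden_states):
     # Parallel residual: one shared LayerNorm feeds both attention and MLP.
     residual = hidden_states
     hidden_states = self.ln_1(hidden_states)
     attn_output = self.attn(hidden_states)
     mlp_output = self.mlp(hidden_states)
     return attn_output + mlp_output + residual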
Example #3
 def forward(self, inputs):
     residual = inputs
     output = nn.ReLU()(self.w_1(inputs))
     output = self.w_2(output)
     # Constructing nn.LayerNorm inside forward would re-initialize its
     # parameters on every call, and d_model is not defined in this scope;
     # the norm is assumed to be built once in __init__ as
     # self.layer_norm = nn.LayerNorm(d_model).
     return self.layer_norm(output + residual)
Example #4
 def __init__(self, layer, num_layers):
     super(Decoder, self).__init__()
     self.layers = clones(layer, num_layers)
     self.norm = nn.LayerNorm(layer.size)
Example #5
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()

        bert_config = BertConfig(
            vocab_size=config["vocab_size"],
            hidden_size=config["hidden_size"],
            num_hidden_layers=config["num_layers"],
            num_attention_heads=config["num_heads"],
            intermediate_size=config["hidden_size"] * config["mlp_ratio"],
            max_position_embeddings=config["max_text_len"],
            hidden_dropout_prob=config["drop_rate"],
            attention_probs_dropout_prob=config["drop_rate"],
        )
        self.tempeture_max_OT = config['tempeture_max_OT']
        self.text_embeddings = BertEmbeddings(bert_config)
        self.text_embeddings.apply(objectives.init_weights)

        self.token_type_embeddings = nn.Embedding(2, config["hidden_size"])
        self.token_type_embeddings.apply(objectives.init_weights)

        import vilt.modules.vision_transformer as vit

        if self.hparams.config["load_path"] == "":
            self.transformer = getattr(vit, self.hparams.config["vit"])(
                pretrained=config["pretrained_flag"], config=self.hparams.config)
        else:
            self.transformer = getattr(vit, self.hparams.config["vit"])(
                pretrained=False, config=self.hparams.config
            )

        self.pooler = heads.Pooler(config["hidden_size"])
        self.pooler.apply(objectives.init_weights)

        if config["loss_names"]["mlm"] > 0:
            self.mlm_score = heads.MLMHead(bert_config)
            self.mlm_score.apply(objectives.init_weights)

        if config["loss_names"]["itm"] > 0:
            self.itm_score = heads.ITMHead(config["hidden_size"])
            self.itm_score.apply(objectives.init_weights)

        if config["loss_names"]["mpp"] > 0:
            self.mpp_score = heads.MPPHead(bert_config)
            self.mpp_score.apply(objectives.init_weights)

        # ===================== Downstream ===================== #
        if (
            self.hparams.config["load_path"] != ""
            and not self.hparams.config["test_only"]
        ):
            ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
            state_dict = ckpt["state_dict"]
            self.load_state_dict(state_dict, strict=False)
            print(f'Loading checkpoint from {self.hparams.config["load_path"]}')

        hs = self.hparams.config["hidden_size"]

        if self.hparams.config["loss_names"]["vqa"] > 0:
            vs = self.hparams.config["vqav2_label_size"]
            self.vqa_classifier = nn.Sequential(
                nn.Linear(hs, hs * 2),
                nn.LayerNorm(hs * 2),
                nn.GELU(),
                nn.Linear(hs * 2, vs),
            )
            self.vqa_classifier.apply(objectives.init_weights)

        if self.hparams.config["loss_names"]["nlvr2"] > 0:
            self.nlvr2_classifier = nn.Sequential(
                nn.Linear(hs * 2, hs * 2),
                nn.LayerNorm(hs * 2),
                nn.GELU(),
                nn.Linear(hs * 2, 2),
            )
            self.nlvr2_classifier.apply(objectives.init_weights)
            emb_data = self.token_type_embeddings.weight.data
            self.token_type_embeddings = nn.Embedding(3, hs)
            self.token_type_embeddings.apply(objectives.init_weights)
            self.token_type_embeddings.weight.data[0, :] = emb_data[0, :]
            self.token_type_embeddings.weight.data[1, :] = emb_data[1, :]
            self.token_type_embeddings.weight.data[2, :] = emb_data[1, :]

        if self.hparams.config["loss_names"]["irtr"] > 0:
            self.rank_output = nn.Linear(hs, 1)
            self.rank_output.weight.data = self.itm_score.fc.weight.data[1:, :]
            self.rank_output.bias.data = self.itm_score.fc.bias.data[1:]
            self.margin = 0.2
            for p in self.itm_score.parameters():
                p.requires_grad = False

        vilt_utils.set_metrics(self)
        self.current_tasks = list()

        # ===================== load downstream (test_only) ======================

        if self.hparams.config["load_path"] != "" and self.hparams.config["test_only"]:
            ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
            state_dict = ckpt["state_dict"]
            self.load_state_dict(state_dict, strict=False)
            print(f'Loading checkpoint from {self.hparams.config["load_path"]}')
Example #6
 def block(inp, out, activation, block_device):
     return nn.Sequential(
         nn.Linear(inp, out, bias=False),
         nn.LayerNorm(out),  # Recommended by Gulrajani et al 2017
         activation(),
     ).to(block_device)
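
A quick usage note for the block() helper above: stacking a few of these gives a LayerNorm-regularized MLP (the sizes and device string below are arbitrary).

net = nn.Sequential(
    block(128, 64, nn.ReLU, 'cpu'),
    block(64, 32, nn.ReLU, 'cpu'),
    nn.Linear(32, 1),
)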
Example #7
 def __init__(self, dim_model):
     super(SublayerConnection, self).__init__()
     self.norm = nn.LayerNorm(dim_model)
Example #8
 def __init__(self, size, dropout):
     super(SublayerConnection, self).__init__()
     self.norm = nn.LayerNorm(size)
     self.dropout = nn.Dropout(dropout)
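
Examples #7 and #8 show only the constructor of this residual wrapper; in the Annotated-Transformer style it is usually completed with a pre-norm residual forward along these lines (a sketch, assuming that convention):

 def forward(self, x, sublayer):
     # Normalize first, run the wrapped sublayer, then dropout and add back.
     return x + self.dropout(sublayer(self.norm(x)))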
Example #9
 def __init__(self, backbone, hidden_size=2560, class_num=168 * 11 * 7):
     super(BengalModel, self).__init__()
     self.backbone = backbone
     self._avg_pooling = nn.AdaptiveAvgPool2d(1)
     self.fc = nn.Linear(hidden_size, class_num)
     self.ln = nn.LayerNorm(hidden_size)
Example #10
 def __init__(self, hidden_dim: int, sublayer: nn.Module):
     super(AddAndNorm, self).__init__()
     self.norm = nn.LayerNorm(hidden_dim)
     self.sublayer = sublayer
     return
Example #11
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = nn.LayerNorm(layer.size)

        self.count = nn.Embedding(200, 256)
Example #12
    def __init__(self, classifier_dims, num_classes,
                 gaussian_noise, dropout,
                 internal_dims, n_layers,
                 featurizer, final_layer_builder,
                 n_tokens_in=64, n_tokens_out=16,
                 use_as_super=False, **kwargs):
        embedding_dims = 768
        super(AlbertClassifer, self).__init__(classifier_dims, num_classes, embedding_dims, gaussian_noise, dropout,
                                              internal_dims, n_layers,
                                              featurizer, final_layer_builder,
                                              n_tokens_in, n_tokens_out, True, **kwargs)
        self.word_masking_proba = kwargs["word_masking_proba"] if "word_masking_proba" in kwargs else 0.0
        self.need_fasttext = "fasttext_vector_config" in kwargs
        if "fasttext_vector_config" in kwargs:
            import fasttext
            ftvc = kwargs["fasttext_vector_config"]
            gru_layers = ftvc.pop("gru_layers", 0)
            fasttext_crawl = fasttext.load_model("crawl-300d-2M-subword.bin")
            fasttext_wiki = fasttext.load_model("wiki-news-300d-1M-subword.bin")
            bpe = BPEmb(dim=200)
            cngram = CharNGram()
            self.word_vectorizers = dict(fasttext_crawl=fasttext_crawl, fasttext_wiki=fasttext_wiki, bpe=bpe, cngram=cngram)
            crawl_nn = ExpandContract(900, embedding_dims, dropout,
                                      use_layer_norm=True, unit_norm=False, groups=(4, 4))
            self.crawl_nn = crawl_nn
            n_tokens_in = n_tokens_in + (8 * int(self.n_tokens_in/(8*1.375) + 1))
            if gru_layers > 0:
                lstm = nn.Sequential(GaussianNoise(gaussian_noise),
                                     nn.GRU(embedding_dims, int(embedding_dims / 2), gru_layers, batch_first=True, bidirectional=True, dropout=dropout))
                pre_query_layer = nn.Sequential(lstm, LambdaLayer(lambda x: x[0]), nn.LayerNorm(embedding_dims))
            else:
                pre_query_layer = nn.LayerNorm(embedding_dims)
            self.pre_query_layer = pre_query_layer

        if not use_as_super:
            model = kwargs["model"] if "model" in kwargs else 'albert-base-v2'
            global_dir = get_global("models_dir")
            model = os.path.join(global_dir, model) if model in os.listdir(global_dir) else model
            self.tokenizer = AutoTokenizer.from_pretrained(model)
            self.model = AutoModel.from_pretrained(model)
            print("Pick stored Model", model, "Model Class = ", type(self.model), "Tokenizer Class = ", type(self.tokenizer))
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                  classifier_dims, internal_dims, n_layers, gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims, n_tokens_out, classifier_dims,
                                                internal_dims, n_layers, gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers, gaussian_noise, dropout)
            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs["attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                        classifier_dims,
                                                        internal_dims, n_encoders, n_decoders,
                                                        gaussian_noise, dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = final_layer_builder(classifier_dims, n_tokens_out, num_classes, dropout, **kwargs)
        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.reg_layers = [(c, c.p if hasattr(c, "p") else c.sigma) for c in self.children() if c.__class__ == GaussianNoise or c.__class__ == nn.Dropout]
Example #13
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    return m
Example #14
 def __init__(self, d_in, d_hid, dropout):
     super().__init__()
     self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise
     self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise
     self.layer_norm = nn.LayerNorm(d_in)
     self.dropout = nn.Dropout(dropout)
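
Example #14 builds the position-wise feed-forward block from 1x1 convolutions, so the forward has to move the feature dimension into the channel position. A hedged sketch of the usual residual forward for this layout (assuming inputs of shape [batch, seq_len, d_in] and the standard torch import):

 def forward(self, x):
     residual = x
     # Conv1d expects [batch, channels, seq_len], so transpose in and out.
     out = self.w_2(torch.relu(self.w_1(x.transpose(1, 2)))).transpose(1, 2)
     out = self.dropout(out)
     return self.layer_norm(out + residual)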
Example #15
    def __init__(self, params, dico, is_encoder, with_output):
        """
        Transformer model (encoder or decoder).
        """
        super().__init__()

        # encoder / decoder, output layer
        self.is_encoder = is_encoder
        self.is_decoder = not is_encoder
        self.with_output = with_output

        # dictionary / languages
        self.n_langs = params.n_langs
        self.n_words = params.n_words
        self.eos_index = params.eos_index
        self.pad_index = params.pad_index
        self.dico = dico
        self.id2lang = params.id2lang
        self.lang2id = params.lang2id
        self.use_lang_emb = getattr(params, 'use_lang_emb', True)
        assert len(self.dico) == self.n_words
        assert len(self.id2lang) == len(self.lang2id) == self.n_langs

        # model parameters
        self.dim = params.emb_dim_encoder if is_encoder else params.emb_dim_decoder  # 512 by default
        self.hidden_dim = self.dim * 4  # 2048 by default
        self.n_heads = params.n_heads  # 8 by default
        self.n_layers = params.n_layers_encoder if is_encoder else params.n_layers_decoder
        self.dropout = params.dropout
        self.attention_dropout = params.attention_dropout
        assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'

        # embeddings
        self.position_embeddings = Embedding(N_MAX_POSITIONS, self.dim)
        if params.sinusoidal_embeddings:
            create_sinusoidal_embeddings(N_MAX_POSITIONS,
                                         self.dim,
                                         out=self.position_embeddings.weight)
        if params.n_langs > 1 and self.use_lang_emb:
            self.lang_embeddings = Embedding(self.n_langs, self.dim)
        self.embeddings = Embedding(self.n_words,
                                    self.dim,
                                    padding_idx=self.pad_index)
        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)

        # transformer layers
        self.attentions = nn.ModuleList()
        self.layer_norm1 = nn.ModuleList()
        self.ffns = nn.ModuleList()
        self.layer_norm2 = nn.ModuleList()
        if self.is_decoder:
            self.layer_norm15 = nn.ModuleList()
            self.encoder_attn = nn.ModuleList()

        self.cache = None

        for layer_id in range(self.n_layers):
            self.attentions.append(
                MultiHeadAttention(self.n_heads,
                                   self.dim,
                                   dropout=self.attention_dropout))
            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
            if self.is_decoder:
                self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
                self.encoder_attn.append(
                    MultiHeadAttention(self.n_heads,
                                       self.dim,
                                       dim_encoder=params.emb_dim_encoder,
                                       dropout=self.attention_dropout))
            self.ffns.append(
                TransformerFFN(self.dim,
                               self.hidden_dim,
                               self.dim,
                               dropout=self.dropout,
                               gelu_activation=params.gelu_activation))
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))

        # output layer
        if self.with_output:
            self.pred_layer = PredLayer(params)
            if params.share_inout_emb:
                self.pred_layer.proj.weight = self.embeddings.weight
Example #16
    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__(input_size, hidden_size, bias)

        self.ln_ih = nn.LayerNorm(4 * hidden_size)
        self.ln_hh = nn.LayerNorm(4 * hidden_size)
        self.ln_ho = nn.LayerNorm(hidden_size)
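
Example #16 subclasses an LSTM cell and adds three norms but omits the step function. A typical layer-normalized cell applies ln_ih and ln_hh to the gate pre-activations and ln_ho to the cell state before the output gate; the sketch below assumes the parent class is nn.LSTMCell (so weight_ih, weight_hh and the biases exist) and the usual torch / torch.nn.functional as F imports:

    def forward(self, input, state):
        hx, cx = state
        # Layer-normalize the input-to-hidden and hidden-to-hidden pre-activations.
        gates = (self.ln_ih(F.linear(input, self.weight_ih, self.bias_ih))
                 + self.ln_hh(F.linear(hx, self.weight_hh, self.bias_hh)))
        i, f, g, o = gates.chunk(4, dim=1)
        i, f, g, o = i.sigmoid(), f.sigmoid(), g.tanh(), o.sigmoid()
        cy = f * cx + i * g
        # Normalize the cell state before applying the output gate.
        hy = o * torch.tanh(self.ln_ho(cy))
        return hy, cy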
Example #17
    def __init__(self, n_features):
        super(LayerNorm, self).__init__()

        self.layer_norm = nn.LayerNorm(n_features)
Example #18
 def __init__(self, dim, fn, context_dim=None):
     super().__init__()
     self.fn = fn
     self.norm = nn.LayerNorm(dim)
     self.norm_context = nn.LayerNorm(context_dim) if exists(
         context_dim) else None
Example #19
 def build(self):
     """
     Construct Submodule And Prepare Parameters
     """
     self.state_ndim = len(self.state_shape)
     self.state_size = np.prod(self.state_shape)
     last_size = self.state_size
     # fully connected before LSTM
     if (self.fc_config_before_lstm is not None and
             len(self.fc_config_before_lstm) > 0):
         submodule = OrderedDict()
         for i_layer, layer_config in enumerate(
                 self.fc_config_before_lstm):
             num_hidden_unit, add_bias, activation = layer_config[:3]
             normalization_config = (layer_config[3]
                     if len(layer_config) > 3 else None)
             add_bias = (add_bias and (normalization_config is None))
             last_layer = submodule['fc%d' % i_layer] = nn.Linear(
                     last_size, num_hidden_unit, bias = add_bias)
             nn.init.xavier_uniform_(last_layer.weight,
                     calculate_gain_from_activation(activation))
             if (add_bias):
                 nn.init.constant_(last_layer.bias, 0)
             last_size = num_hidden_unit
             if (normalization_config is not None
                     and normalization_config == 'layernorm'):
                 submodule['fc_layernorm%d'
                         % i_layer] = nn.LayerNorm([last_size])
             if (activation is not None):
                 activation_type, activation_module = get_activation(
                         activation)
                 submodule[activation_type
                         + str(i_layer)] = activation_module
         self.fc_function_before_lstm = nn.Sequential(submodule)
     else:
         self.fc_function_before_lstm = None
     # LSTM
     self.lstm_list = []
     if (self.contain_lstm()):
         module_id = 'lstm'
         self.lstm_list.append(module_id)
         self.__setattr__(module_id, nn.LSTM(last_size, self.lstm_h_size))
         nn.init.xavier_uniform_(self.__getattr__(module_id).weight_hh_l0)
         nn.init.xavier_uniform_(self.__getattr__(module_id).weight_ih_l0)
         nn.init.constant_(self.__getattr__(module_id).bias_ih_l0, 0)
         nn.init.constant_(self.__getattr__(module_id).bias_hh_l0, 0)
         last_size = self.lstm_h_size
     # fully connected after LSTM
     if (self.fc_config_after_lstm is not None and
             len(self.fc_config_after_lstm) > 0):
         submodule = OrderedDict()
         for i_layer, layer_config in enumerate(
                 self.fc_config_after_lstm):
             num_hidden_unit, add_bias, activation = layer_config[:3]
             normalization_config = (layer_config[3]
                     if len(layer_config) > 3 else None)
             add_bias = (add_bias and (normalization_config is None))
             last_layer = submodule['fc%d' % i_layer] = nn.Linear(
                     last_size, num_hidden_unit, bias = add_bias)
             nn.init.xavier_uniform_(last_layer.weight,
                     calculate_gain_from_activation(activation))
             if (add_bias):
                 nn.init.constant_(last_layer.bias, 0)
             last_size = num_hidden_unit
             if (normalization_config is not None
                     and normalization_config == 'layernorm'):
                 submodule['fc_layernorm%d'
                         % i_layer] = nn.LayerNorm([last_size])
             if (activation is not None):
                 activation_type, activation_module = get_activation(
                         activation)
                 submodule[activation_type
                         + str(i_layer)] = activation_module
         self.fc_function_after_lstm = nn.Sequential(submodule)
     else:
         self.fc_function_after_lstm = None
     # policy and value
     self.policy_branch = nn.Linear(last_size, self.n_action,
                                     bias = False)
     nn.init.xavier_uniform_(self.policy_branch.weight)
     self.value_branch = nn.Linear(last_size, 1,
                                     bias = False)
     nn.init.xavier_uniform_(self.value_branch.weight)
     self.policy_softmax = nn.Softmax(dim = -1)
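
The build() routine above consumes per-layer config tuples of the form (num_hidden_units, add_bias, activation[, normalization]). A purely hypothetical config, shaped to match that parsing (the concrete values and activation names are made up):

fc_config_before_lstm = [
    (256, True, 'relu', 'layernorm'),  # bias is dropped because a normalization is set
    (128, True, 'relu'),
]
fc_config_after_lstm = [(64, True, 'tanh', 'layernorm')]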
Example #20
 def __init__(self, dim_model, dim_ff, dropout=0.1):
     super(PositionwiseFeedForward, self).__init__()
     self.linear_1 = nn.Linear(dim_model, dim_ff)
     self.linear_2 = nn.Linear(dim_ff, dim_model)
     self.dropout = nn.Dropout(dropout)
     self.layer_norm = nn.LayerNorm(dim_model)
Example #21
 def __init__(self, layer, num_layers):
     super(Encoder, self).__init__()
     self.layers = clones(layer, num_layers)
     self.norm = nn.LayerNorm(layer.dim_model)
Example #22
 def __init__(self, dim_model, dim_hidden, dropout=0.1):
     super(PositionwiseFeedForwardWithConv, self).__init__()
     self.conv_1 = nn.Conv1d(dim_model, dim_hidden, 1)
     self.conv_2 = nn.Conv1d(dim_hidden, dim_model, 1)
     self.dropout = nn.Dropout(dropout)
     self.layer_norm = nn.LayerNorm(dim_model)
Example #23
 def __init__(self, channels):
     super().__init__()
     self.ln = nn.LayerNorm(channels)
Example #24
    def __init__(self,
                 num_inputs,
                 num_actions,
                 hidden_size,
                 action_range=1.,
                 init_w=3e-3,
                 log_std_min=-20,
                 log_std_max=2):
        super(PolicyNetwork, self).__init__()

        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        # self.linear1 = nn.Linear(num_inputs, hidden_size)
        # self.linear2 = nn.Linear(hidden_size, hidden_size)
        # self.linear3 = nn.Linear(hidden_size, hidden_size)
        # self.linear4 = nn.Linear(hidden_size, hidden_size)

        # self.tcn = TemporalConvNet(input_channels, num_channels, kernel_size=kernel_size, dropout=dropout)
        # self.tcn1 = nn.Conv1d(input_channels, out_channels = 256, kernel_size = kernel_size, stride=1, padding=0, dilation=1)
        # self.tcn2 = nn.Conv1d(256, out_channels = 256, kernel_size = kernel_size, stride=1, padding=0, dilation=1)
        # torch.nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')
        # self.fc1 = nn.Linear(num_channels[-1], hidden_size)
        # self.linear1 = nn.Linear(num_channels[-1], hidden_size)

        # self.conv1d1 = nn.Conv1d(input_channels, out_channels = hidden_size, kernel_size = kernel_size, stride=1, padding=0, dilation=1)
        # self.conv1d2 = nn.Conv1d(hidden_size, out_channels = hidden_size, kernel_size = kernel_size, stride=1, padding=0, dilation=1)
        # self.conv1d3 = nn.Conv1d(hidden_size, out_channels = hidden_size, kernel_size = kernel_size, stride=1, padding=0, dilation=1)

        # self.LN1 = nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True)
        self.model_conv = nn.Sequential(
            nn.Conv1d(input_channels,
                      out_channels=hidden_size,
                      kernel_size=kernel_size,
                      stride=1,
                      padding=0,
                      dilation=1),  # 10-dim input, 20-dim hidden layer
            # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
            nn.ReLU(),  # activation function
            nn.Conv1d(hidden_size,
                      out_channels=hidden_size,
                      kernel_size=kernel_size,
                      stride=1,
                      padding=0,
                      dilation=1),  # 10-dim input, 20-dim hidden layer
            # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
            nn.ReLU(),
            nn.Conv1d(hidden_size,
                      out_channels=hidden_size,
                      kernel_size=kernel_size,
                      stride=1,
                      padding=0,
                      dilation=1),  # 10-dim input, 20-dim hidden layer
            # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
            nn.ReLU(),
        )

        # self.model_lstm = nn.Sequential(
        #             nn.LSTM(hidden_size, hidden_size , 2),          # 10-dim input, 20-dim hidden layer
        #             # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
        #             # nn.ReLU(),                     # activation function
        #             nn.LSTM(10, 20, 2)           # 10-dim input, 20-dim hidden layer
        #             # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
        #             # nn.ReLU(),

        #         )

        # self.lstm = nn.LSTM(hidden_size, hidden_size , 1, batch_first = True)

        self.model = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),  # 10-dim input, 20-dim hidden layer
            nn.LayerNorm(hidden_size, elementwise_affine=True),
            nn.ReLU(),  # activation function
            nn.Linear(hidden_size, hidden_size),  # 10-dim input, 20-dim hidden layer
            nn.LayerNorm(hidden_size, elementwise_affine=True),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),  # 10-dim input, 20-dim hidden layer
            nn.LayerNorm(hidden_size, elementwise_affine=True),
            nn.ReLU(),
        )

        # self.model = nn.Sequential(
        #     nn.Linear(num_inputs, hidden_size),           # 10-dim input, 20-dim hidden layer
        #     nn.LayerNorm(hidden_size, elementwise_affine=True),
        #     nn.ReLU(),                     # activation function

        # )

        self.mean_linear = nn.Linear(hidden_size, num_actions)
        self.mean_linear.weight.data.uniform_(-init_w, init_w)
        self.mean_linear.bias.data.uniform_(-init_w, init_w)

        self.log_std_linear = nn.Linear(hidden_size, num_actions)
        self.log_std_linear.weight.data.uniform_(-init_w, init_w)
        self.log_std_linear.bias.data.uniform_(-init_w, init_w)

        self.action_range = action_range
        self.num_actions = num_actions

        print('#########')
Example #25
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    nn.init.constant_(m.weight, 1)
    nn.init.constant_(m.bias, 0)
    return m
Example #26
    def __init__(self, params, id2word, is_encoder, with_output):
        """
        Transformer model (encoder or decoder).
        """
        super().__init__()

        # encoder / decoder, output layer
        self.is_encoder = is_encoder
        self.is_decoder = not is_encoder
        self.with_output = with_output

        # dictionary
        self.n_words = params.n_words
        self.eos_index = params.eos_index
        self.pad_index = params.pad_index
        self.id2word = id2word
        assert len(self.id2word) == self.n_words

        # model parameters
        self.dim = params.emb_dim  # 512 by default
        self.hidden_dim = self.dim * 4  # 2048 by default
        self.n_heads = params.n_heads  # 8 by default
        self.n_layers = params.n_enc_layers if is_encoder else params.n_dec_layers
        self.dropout = params.dropout
        self.attention_dropout = params.attention_dropout
        self.nb_features = params.nb_features
        assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'

        # embeddings
        self.position_embeddings = Embedding(N_MAX_POSITIONS, self.dim)
        if params.sinusoidal_embeddings:
            create_sinusoidal_embeddings(N_MAX_POSITIONS,
                                         self.dim,
                                         out=self.position_embeddings.weight)
        self.embeddings = Embedding(self.n_words,
                                    self.dim,
                                    padding_idx=self.pad_index)
        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)

        # transformer layers
        self.attentions = nn.ModuleList()
        self.layer_norm1 = nn.ModuleList()
        self.ffns = nn.ModuleList()
        self.layer_norm2 = nn.ModuleList()
        if self.is_decoder:
            self.layer_norm15 = nn.ModuleList()
            self.encoder_attn = nn.ModuleList()

        for layer_id in range(self.n_layers):
            self.attentions.append(
                MultiHeadAttention(self.n_heads,
                                   self.dim,
                                   dropout=self.attention_dropout,
                                   nb_features=self.nb_features,
                                   causal=False))
            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
            if self.is_decoder:
                self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
                self.encoder_attn.append(
                    MultiHeadAttention(self.n_heads,
                                       self.dim,
                                       dropout=self.attention_dropout,
                                       nb_features=self.nb_features,
                                       causal=True))
            self.ffns.append(
                TransformerFFN(self.dim,
                               self.hidden_dim,
                               self.dim,
                               dropout=self.dropout))
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))

        # output layer
        if self.with_output:
            self.proj = nn.Linear(self.dim, params.n_words, bias=True)
            if params.share_inout_emb:
                self.proj.weight = self.embeddings.weight
Example #27

def layer_norm(batch_x, gamma, beta, eps=1e-5):
    # Manual implementation: normalize each row of batch_x over its features.
    sample_mean = batch_x.mean(dim=1, keepdim=True)
    sample_var = batch_x.var(dim=1, unbiased=False, keepdim=True)
    std = torch.sqrt(sample_var + eps)
    x_centered = batch_x - sample_mean

    x_norm = x_centered / std
    out = gamma * x_norm + beta

    cache = (x_norm, x_centered, std, gamma)

    return out, cache


x = torch.rand(2, 3)
print(x)
x_norm, cache = layer_norm(x, gamma=0.02, beta=0.01)
print(x_norm)
print(cache[0])

# PyTorch implementation
# With/Without Learnable Parameters
model = nn.LayerNorm(normalized_shape=3)
output = model(x)
print(output)
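
To confirm that the manual routine matches the built-in module, the affine parameters of nn.LayerNorm can be set to the same gamma and beta; a small check along these lines (not part of the original snippet):

ref = nn.LayerNorm(normalized_shape=3)
with torch.no_grad():
    ref.weight.fill_(0.02)  # gamma
    ref.bias.fill_(0.01)    # beta
manual_out, _ = layer_norm(x, gamma=0.02, beta=0.01)
print(torch.allclose(manual_out, ref(x), atol=1e-6))  # expected: True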
Example #28
 def __init__(self, dim, fn):
     super().__init__()
     self.norm = nn.LayerNorm(dim)
     self.fn = fn
Example #29
    def __init__(
        self,
        input_width,
        input_height,
        input_channels,
        output_size,
        kernel_sizes,
        n_channels,
        strides,
        paddings,
        hidden_sizes=None,
        added_fc_input_size=0,
        conv_normalization_type='none',
        fc_normalization_type='none',
        init_w=1e-4,
        hidden_init=nn.init.xavier_uniform_,
        hidden_activation=nn.ReLU(),
        output_activation=identity,
        output_conv_channels=False,
        pool_type='none',
        pool_sizes=None,
        pool_strides=None,
        pool_paddings=None,
        image_augmentation=False,
        image_augmentation_padding=4,
    ):
        if hidden_sizes is None:
            hidden_sizes = []
        assert len(kernel_sizes) == \
               len(n_channels) == \
               len(strides) == \
               len(paddings)
        assert conv_normalization_type in {'none', 'batch', 'layer'}
        assert fc_normalization_type in {'none', 'batch', 'layer'}
        assert pool_type in {'none', 'max2d'}
        if pool_type == 'max2d':
            assert len(pool_sizes) == len(pool_strides) == len(pool_paddings)
        super().__init__()

        self.hidden_sizes = hidden_sizes
        self.input_width = input_width
        self.input_height = input_height
        self.input_channels = input_channels
        self.output_size = output_size
        self.output_activation = output_activation
        self.hidden_activation = hidden_activation
        self.conv_normalization_type = conv_normalization_type
        self.fc_normalization_type = fc_normalization_type
        self.added_fc_input_size = added_fc_input_size
        self.conv_input_length = self.input_width * self.input_height * self.input_channels
        self.output_conv_channels = output_conv_channels
        self.pool_type = pool_type
        self.image_augmentation = image_augmentation
        self.image_augmentation_padding = image_augmentation_padding

        self.conv_layers = nn.ModuleList()
        self.conv_norm_layers = nn.ModuleList()
        self.pool_layers = nn.ModuleList()
        self.fc_layers = nn.ModuleList()
        self.fc_norm_layers = nn.ModuleList()

        for i, (out_channels, kernel_size, stride, padding) in enumerate(
                zip(n_channels, kernel_sizes, strides, paddings)):
            conv = nn.Conv2d(input_channels,
                             out_channels,
                             kernel_size,
                             stride=stride,
                             padding=padding)
            hidden_init(conv.weight)
            conv.bias.data.fill_(0)

            conv_layer = conv
            self.conv_layers.append(conv_layer)
            input_channels = out_channels

            if pool_type == 'max2d':
                if pool_sizes[i] > 1:
                    self.pool_layers.append(
                        nn.MaxPool2d(
                            kernel_size=pool_sizes[i],
                            stride=pool_strides[i],
                            padding=pool_paddings[i],
                        ))

        # use torch rather than ptu because initially the model is on CPU
        test_mat = torch.zeros(
            1,
            self.input_channels,
            self.input_width,
            self.input_height,
        )
        # find output dim of conv_layers by trial and add norm conv layers
        for i, conv_layer in enumerate(self.conv_layers):
            test_mat = conv_layer(test_mat)
            if self.conv_normalization_type == 'batch':
                self.conv_norm_layers.append(nn.BatchNorm2d(test_mat.shape[1]))
            if self.conv_normalization_type == 'layer':
                self.conv_norm_layers.append(nn.LayerNorm(test_mat.shape[1:]))
            if self.pool_type != 'none' and len(self.pool_layers) > i:
                test_mat = self.pool_layers[i](test_mat)

        self.conv_output_flat_size = int(np.prod(test_mat.shape))
        if self.output_conv_channels:
            self.last_fc = None
        else:
            fc_input_size = self.conv_output_flat_size
            # used only for injecting input directly into fc layers
            fc_input_size += added_fc_input_size
            for idx, hidden_size in enumerate(hidden_sizes):
                fc_layer = nn.Linear(fc_input_size, hidden_size)
                fc_input_size = hidden_size

                fc_layer.weight.data.uniform_(-init_w, init_w)
                fc_layer.bias.data.uniform_(-init_w, init_w)

                self.fc_layers.append(fc_layer)

                if self.fc_normalization_type == 'batch':
                    self.fc_norm_layers.append(nn.BatchNorm1d(hidden_size))
                if self.fc_normalization_type == 'layer':
                    self.fc_norm_layers.append(nn.LayerNorm(hidden_size))

            self.last_fc = nn.Linear(fc_input_size, output_size)
            self.last_fc.weight.data.uniform_(-init_w, init_w)
            self.last_fc.bias.data.uniform_(-init_w, init_w)

        if self.image_augmentation:
            self.augmentation_transform = RandomCrop(
                input_height, self.image_augmentation_padding, device='cuda')
Example #30
 def __init__(self, size, dropout, layer_norm_rescale=True):
     super(SublayerConnection, self).__init__()
     self.norm = nn.LayerNorm(size, elementwise_affine=layer_norm_rescale)
     self.dropout = nn.Dropout(dropout)