Example #1
    def __init__(self,
                 ntoken,
                 ninp,
                 nhid,
                 nout,
                 nlevels,
                 kernel_size=2,
                 dilation=[1],
                 dropout=0.0,
                 dropouti=0.0,
                 dropouth=0.0,
                 dropoutl=0.0,
                 emb_dropout=0.0,
                 wdrop=0.0,
                 temporalwdrop=True,
                 tie_weights=True,
                 repack=False,
                 wnorm=True,
                 aux=True,
                 aux_frequency=20,
                 n_experts=0):
        """
        A deep sequence model based on TrellisNet

        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you want to build
                     a TrellisNet with hidden size 1000 and output size 400, you should set nhid = 1000-400 = 600.
                     (The motivation for separating the two comes from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout  # Rate for dropping the final output
        self.dropouti = dropouti  # Rate for dropping the embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()

        self.encoder = nn.Embedding(ntoken, ninp)
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts

        network = TrellisNet
        self.network = network(ninp,
                               nhid,
                               nout=nout,
                               nlevels=nlevels,
                               kernel_size=kernel_size,
                               dropouth=dropouth,
                               wnorm=wnorm,
                               aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.decoder = nn.Linear(nhid, ntoken)
        self.network = nn.ModuleList([self.network])
        self.init_weights()

        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError(
                    'When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts,
                                      ntoken,
                                      nlasthid=nout,
                                      ninp=ninp,
                                      decoder=self.decoder,
                                      dropoutl=dropoutl)
            self.network.append(self.mixsoft)
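A minimal instantiation sketch for the constructor above. All hyperparameter values are hypothetical, and the sketch assumes TrellisNetModel and its repository dependencies (TrellisNet, WeightDrop, VariationalDropout, MixSoftmax) are importable.

# Hypothetical word-level language-model configuration. Per the docstring,
# a model with total hidden size 1000 and output size 400 uses nhid = 1000 - 400 = 600.
model = TrellisNetModel(ntoken=10000,   # vocabulary size
                        ninp=400,       # embedding dimension
                        nhid=600,       # hidden units, excluding the output dimension
                        nout=400,       # output dimension (== ninp so the weights can be tied)
                        nlevels=55,     # number of TrellisNet layers
                        kernel_size=2,
                        dropout=0.1, dropouti=0.1, dropouth=0.1,
                        wdrop=0.2,
                        tie_weights=True,
                        n_experts=0)    # 0 disables Mixture of Softmax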
Example #2
    def __init__(self, track_weight, ntoken, ninp, nhid, nout, nlevels, kernel_size=2, dilation=[1],
                 dropout=0.0, dropouti=0.0, dropouth=0.0, dropoutl=0.0, emb_dropout=0.0, wdrop=0.0,
                 temporalwdrop=True, tie_weights=True, repack=False, wnorm=True, aux=True, aux_frequency=20, n_experts=0,
                 load=""):
        """
        A deep sequence model based on TrellisNet
        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you want to build
                     a TrellisNet with hidden size 1000 and output size 400, you should set nhid = 1000-400 = 600.
                     (The motivation for separating the two comes from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        :param load: The path to the pickled weight file (the weights/biases should be in numpy format)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout  # Rate for dropping the final output
        self.dropouti = dropouti  # Rate for dropping the embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()
        
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts
        self.tie_weights = tie_weights
        self.wnorm = wnorm
        
        # 1) Set up encoder and decoder (embeddings)
        self.encoder = nn.Embedding.from_pretrained(track_weight, freeze=True)
        self.decoder = nn.Linear(nhid, nout)
        self.init_weights()
        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError('When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        # 2) Set up TrellisNet
        tnet = TrellisNet
        self.tnet = tnet(ninp, nhid, nout=nout, nlevels=nlevels, kernel_size=kernel_size,
                         dropouth=dropouth, wnorm=wnorm, aux_frequency=aux_frequency, dilation=dilation)
        
        # 3) Set up MoS, if needed
        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts, ntoken, nlasthid=nout, ninp=ninp, decoder=self.decoder,
                                      dropoutl=dropoutl)
            
        # 4) Apply weight dropout (DropConnect). If weight normalization is used, we apply the dropout to its "direction" instead of its "scale"
        reg_term = '_v' if wnorm else ''
        self.tnet = WeightDrop(self.tnet,
                               [['full_conv', 'weight1' + reg_term],
                                ['full_conv', 'weight2' + reg_term]],
                                dropout=wdrop,
                                temporal=temporalwdrop)
        self.network = nn.ModuleList([self.tnet])
        if n_experts > 0: self.network.append(self.mixsoft)
            
            
        # 5) Load model, if path specified
        if len(load) > 0:
            params_dict = torch.load(open(load, 'rb'))
            self.load_weights(params_dict)
            print("Model loaded successfully from {0}".format(load))
Example #3
class TrellisNetModel(nn.Module):
    def __init__(self,
                 ntoken,
                 ninp,
                 nhid,
                 nout,
                 nlevels,
                 kernel_size=2,
                 dilation=[1],
                 dropout=0.0,
                 dropouti=0.0,
                 dropouth=0.0,
                 dropoutl=0.0,
                 emb_dropout=0.0,
                 wdrop=0.0,
                 temporalwdrop=True,
                 tie_weights=True,
                 repack=False,
                 wnorm=True,
                 aux=True,
                 aux_frequency=20,
                 n_experts=0):
        """
        A deep sequence model based on TrellisNet

        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you want to build
                     a TrellisNet with hidden size 1000 and output size 400, you should set nhid = 1000-400 = 600.
                     (The motivation for separating the two comes from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout  # Rate for dropping the final output
        self.dropouti = dropouti  # Rate for dropping the embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()

        self.encoder = nn.Embedding(ntoken, ninp)
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts

        network = TrellisNet
        self.network = network(ninp,
                               nhid,
                               nout=nout,
                               nlevels=nlevels,
                               kernel_size=kernel_size,
                               dropouth=dropouth,
                               wnorm=wnorm,
                               aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.decoder = nn.Linear(nhid, ntoken)
        self.network = nn.ModuleList([self.network])
        self.init_weights()

        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError(
                    'When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts,
                                      ntoken,
                                      nlasthid=nout,
                                      ninp=ninp,
                                      decoder=self.decoder,
                                      dropoutl=dropoutl)
            self.network.append(self.mixsoft)

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, decode=True):
        """
        Execute the forward pass of the deep network

        :param input: The input sequence, with dimension (N, L)
        :param hidden: The initial hidden state (h, c)
        :param decode: Whether to use decoder
        :return: The predicted sequence
        """
        emb = embedded_dropout(self.encoder, input,
                               self.emb_dropout if self.training else 0)
        emb = self.var_drop(emb, self.dropouti)
        emb = emb.transpose(1, 2)

        trellisnet = self.network[0]
        raw_output, hidden, all_raw_outputs = trellisnet(emb, hidden)
        output = self.var_drop(raw_output, self.dropout)
        all_outputs = self.var_drop(
            all_raw_outputs, self.dropout,
            dim=4) if self.aux else None  # N x M x L x C
        decoded, all_decoded = None, None

        if self.n_experts > 0 and not decode:
            raise ValueError(
                "Mixture of softmax involves decoding phase. Must set decode=True"
            )

        if self.n_experts > 0:
            decoded = torch.log(self.mixsoft(output).add_(1e-8))
            all_decoded = torch.log(
                self.mixsoft(all_outputs).add_(1e-8)) if self.aux else None

        if decode:
            decoded = decoded if self.n_experts > 0 else self.decoder(output)
            if self.aux:
                all_decoded = all_decoded if self.n_experts > 0 else self.decoder(
                    all_outputs)  # N x M x L x C
            return (raw_output, output, decoded), hidden, all_decoded

        return (raw_output, output, output), hidden, all_outputs

    def init_hidden(self, bsz):
        h_size = self.nhid + self.nout
        weight = next(self.parameters()).data
        return (Variable(weight.new(bsz, h_size, 1).zero_()),
                Variable(weight.new(bsz, h_size, 1).zero_()))
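A hypothetical forward-pass sketch for the full model above: token IDs of shape (N, L) go in together with a zero-initialized hidden state, and the decoded predictions plus the updated hidden state come back. Hyperparameters and batch dimensions are placeholders, and the repository's classes are assumed importable.

import torch

model = TrellisNetModel(ntoken=10000, ninp=400, nhid=600, nout=400, nlevels=55)
batch_size, seq_len = 16, 70
tokens = torch.randint(0, 10000, (batch_size, seq_len))  # input of dimension (N, L)
hidden = model.init_hidden(batch_size)                   # zero (h, c), each of size nhid + nout
(raw_output, output, decoded), hidden, all_decoded = model(tokens, hidden, decode=True)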
Example #4
    def __init__(self,
                 ninp,
                 nhid,
                 nout,
                 nlevels,
                 kernel_size=2,
                 dilation=[1],
                 dropout=0.0,
                 dropouti=0.0,
                 dropouth=0.0,
                 wdrop=0.0,
                 temporalwdrop=True,
                 wnorm=True,
                 aux=False,
                 aux_frequency=1e4):
        """
        A sequence model using TrellisNet (on sequential MNIST & CIFAR-10). Note that this is different from
        the models in other tasks (e.g. word-level PTB) because: 1) there is no more embedding; 2) we only need
        one output at the end for classification of the pixel stream; and 3) the input and output features are
        very low-dimensional (e.g., 3 channels).

        :param ninp: The number of input channels of the pixels
        :param nhid: The number of hidden units in TrellisNet (excluding the output size)
        :param nout: The number of output channels (which should agree with the number of classes)
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output dropout
        :param dropouti: Input dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        """
        super(TrellisNetModel, self).__init__()
        self.nout = nout  # Should be the number of classes
        self.nhid = nhid
        self.dropout = dropout
        self.dropouti = dropouti
        self.aux = aux

        network = TrellisNet
        self.network = network(ninp,
                               nhid,
                               nout=nout,
                               nlevels=nlevels,
                               kernel_size=kernel_size,
                               dropouth=dropouth,
                               wnorm=wnorm,
                               aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.linear = nn.Linear(nout, nout)
        self.network = nn.ModuleList([self.network])
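A hypothetical instantiation sketch for this pixel-stream variant, e.g. sequential CIFAR-10 with 3 input channels and 10 classes. All values below are placeholders.

model = TrellisNetModel(ninp=3,       # RGB channels of the pixel stream
                        nhid=120,     # hidden units, excluding the output size
                        nout=10,      # one output channel per class
                        nlevels=8,
                        kernel_size=2,
                        dropout=0.1, dropouti=0.1, dropouth=0.1,
                        wdrop=0.0, wnorm=True,
                        aux=False)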