Example #1
    def __init__(self,
                 ntoken,
                 ninp,
                 nhid,
                 nout,
                 nlevels,
                 kernel_size=2,
                 dilation=[1],
                 dropout=0.0,
                 dropouti=0.0,
                 dropouth=0.0,
                 dropoutl=0.0,
                 emb_dropout=0.0,
                 wdrop=0.0,
                 temporalwdrop=True,
                 tie_weights=True,
                 repack=False,
                 wnorm=True,
                 aux=True,
                 aux_frequency=20,
                 n_experts=0):
        """
        A deep sequence model based on TrellisNet

        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you want to build
                     a TrellisNet with hidden size 1000 and output size 400, you should set nhid = 1000-400 = 600.
                     (The motivation for separating the two comes from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout  # Rate for dropping the final output
        self.dropouti = dropouti  # Rate for dropping the embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()

        self.encoder = nn.Embedding(ntoken, ninp)
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts

        network = TrellisNet
        self.network = network(ninp,
                               nhid,
                               nout=nout,
                               nlevels=nlevels,
                               kernel_size=kernel_size,
                               dropouth=dropouth,
                               wnorm=wnorm,
                               aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.decoder = nn.Linear(nhid, ntoken)
        self.network = nn.ModuleList([self.network])
        self.init_weights()

        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError(
                    'When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts,
                                      ntoken,
                                      nlasthid=nout,
                                      ninp=ninp,
                                      decoder=self.decoder,
                                      dropoutl=dropoutl)
            self.network.append(self.mixsoft)
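A minimal instantiation sketch for the constructor above. All hyperparameter values are hypothetical, and the sketch assumes TrellisNetModel and its repository dependencies (TrellisNet, WeightDrop, VariationalDropout, MixSoftmax) are importable.

# Hypothetical word-level language-model configuration. Per the docstring,
# a model with total hidden size 1000 and output size 400 uses nhid = 1000 - 400 = 600.
model = TrellisNetModel(ntoken=10000,   # vocabulary size
                        ninp=400,       # embedding dimension
                        nhid=600,       # hidden units, excluding the output dimension
                        nout=400,       # output dimension (== ninp so the weights can be tied)
                        nlevels=55,     # number of TrellisNet layers
                        kernel_size=2,
                        dropout=0.1, dropouti=0.1, dropouth=0.1,
                        wdrop=0.2,
                        tie_weights=True,
                        n_experts=0)    # 0 disables Mixture of Softmax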
Example #2
    def __init__(self, track_weight, ntoken, ninp, nhid, nout, nlevels, kernel_size=2, dilation=[1],
                 dropout=0.0, dropouti=0.0, dropouth=0.0, dropoutl=0.0, emb_dropout=0.0, wdrop=0.0,
                 temporalwdrop=True, tie_weights=True, repack=False, wnorm=True, aux=True, aux_frequency=20, n_experts=0,
                 load=""):
        """
        A deep sequence model based on TrellisNet
        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you want to build
                     a TrellisNet with hidden size 1000 and output size 400, you should set nhid = 1000-400 = 600.
                     (The motivation for separating the two comes from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        :param load: The path to the pickled weight file (the weights/biases should be in numpy format)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout  # Rate for dropping the final output
        self.dropouti = dropouti  # Rate for dropping the embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()
        
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts
        self.tie_weights = tie_weights
        self.wnorm = wnorm
        
        # 1) Set up encoder and decoder (embeddings)
        self.encoder = nn.Embedding.from_pretrained(track_weight, freeze=True)
        self.decoder = nn.Linear(nhid, nout)
        self.init_weights()
        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError('When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        # 2) Set up TrellisNet
        tnet = TrellisNet
        self.tnet = tnet(ninp, nhid, nout=nout, nlevels=nlevels, kernel_size=kernel_size,
                         dropouth=dropouth, wnorm=wnorm, aux_frequency=aux_frequency, dilation=dilation)
        
        # 3) Set up MoS, if needed
        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts, ntoken, nlasthid=nout, ninp=ninp, decoder=self.decoder,
                                      dropoutl=dropoutl)
            
        # 4) Apply weight dropout (DropConnect). If weight normalization is used, we apply the dropout to its "direction" instead of its "scale"
        reg_term = '_v' if wnorm else ''
        self.tnet = WeightDrop(self.tnet,
                               [['full_conv', 'weight1' + reg_term],
                                ['full_conv', 'weight2' + reg_term]],
                                dropout=wdrop,
                                temporal=temporalwdrop)
        self.network = nn.ModuleList([self.tnet])
        if n_experts > 0: self.network.append(self.mixsoft)
            
            
        # 5) Load model, if path specified
        if len(load) > 0:
            params_dict = torch.load(open(load, 'rb'))
            self.load_weights(params_dict)
            print("Model loaded successfully from {0}".format(load))
Example #3
class TrellisNetModel(nn.Module):
    def __init__(self,
                 ntoken,
                 ninp,
                 nhid,
                 nout,
                 nlevels,
                 kernel_size=2,
                 dilation=[1],
                 dropout=0.0,
                 dropouti=0.0,
                 dropouth=0.0,
                 dropoutl=0.0,
                 emb_dropout=0.0,
                 wdrop=0.0,
                 temporalwdrop=True,
                 tie_weights=True,
                 repack=False,
                 wnorm=True,
                 aux=True,
                 aux_frequency=20,
                 n_experts=0):
        """
        A deep sequence model based on TrellisNet

        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you want to build
                     a TrellisNet with hidden size 1000 and output size 400, you should set nhid = 1000-400 = 600.
                     (The motivation for separating the two comes from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout  # Rate for dropping the final output
        self.dropouti = dropouti  # Rate for dropping the embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()

        self.encoder = nn.Embedding(ntoken, ninp)
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts

        network = TrellisNet
        self.network = network(ninp,
                               nhid,
                               nout=nout,
                               nlevels=nlevels,
                               kernel_size=kernel_size,
                               dropouth=dropouth,
                               wnorm=wnorm,
                               aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.decoder = nn.Linear(nhid, ntoken)
        self.network = nn.ModuleList([self.network])
        self.init_weights()

        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError(
                    'When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts,
                                      ntoken,
                                      nlasthid=nout,
                                      ninp=ninp,
                                      decoder=self.decoder,
                                      dropoutl=dropoutl)
            self.network.append(self.mixsoft)

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, decode=True):
        """
        Execute the forward pass of the deep network

        :param input: The input sequence, with dimension (N, L)
        :param hidden: The initial hidden state (h, c)
        :param decode: Whether to use decoder
        :return: The predicted sequence
        """
        emb = embedded_dropout(self.encoder, input,
                               self.emb_dropout if self.training else 0)
        emb = self.var_drop(emb, self.dropouti)
        emb = emb.transpose(1, 2)

        trellisnet = self.network[0]
        raw_output, hidden, all_raw_outputs = trellisnet(emb, hidden)
        output = self.var_drop(raw_output, self.dropout)
        all_outputs = self.var_drop(
            all_raw_outputs, self.dropout,
            dim=4) if self.aux else None  # N x M x L x C
        decoded, all_decoded = None, None

        if self.n_experts > 0 and not decode:
            raise ValueError(
                "Mixture of softmax involves decoding phase. Must set decode=True"
            )

        if self.n_experts > 0:
            decoded = torch.log(self.mixsoft(output).add_(1e-8))
            all_decoded = torch.log(
                self.mixsoft(all_outputs).add_(1e-8)) if self.aux else None

        if decode:
            decoded = decoded if self.n_experts > 0 else self.decoder(output)
            if self.aux:
                all_decoded = all_decoded if self.n_experts > 0 else self.decoder(
                    all_outputs)  # N x M x L x C
            return (raw_output, output, decoded), hidden, all_decoded

        return (raw_output, output, output), hidden, all_outputs

    def init_hidden(self, bsz):
        h_size = self.nhid + self.nout
        weight = next(self.parameters()).data
        return (Variable(weight.new(bsz, h_size, 1).zero_()),
                Variable(weight.new(bsz, h_size, 1).zero_()))
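A hypothetical forward-pass sketch for the full model above: token IDs of shape (N, L) go in together with a zero-initialized hidden state, and the decoded predictions plus the updated hidden state come back. Hyperparameters and batch dimensions are placeholders, and the repository's classes are assumed importable.

import torch

model = TrellisNetModel(ntoken=10000, ninp=400, nhid=600, nout=400, nlevels=55)
batch_size, seq_len = 16, 70
tokens = torch.randint(0, 10000, (batch_size, seq_len))  # input of dimension (N, L)
hidden = model.init_hidden(batch_size)                   # zero (h, c), each of size nhid + nout
(raw_output, output, decoded), hidden, all_decoded = model(tokens, hidden, decode=True)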
Example #4
    def __init__(self,
                 ninp,
                 nhid,
                 nout,
                 nlevels,
                 kernel_size=2,
                 dilation=[1],
                 dropout=0.0,
                 dropouti=0.0,
                 dropouth=0.0,
                 wdrop=0.0,
                 temporalwdrop=True,
                 wnorm=True,
                 aux=False,
                 aux_frequency=1e4):
        """
        A sequence model using TrellisNet (on sequential MNIST & CIFAR-10). Note that this is different from
        the models in other tasks (e.g. word-level PTB) because: 1) there is no more embedding; 2) we only need
        one output at the end for classification of the pixel stream; and 3) the input and output features are
        very low-dimensional (e.g., 3 channels).

        :param ninp: The number of input channels of the pixels
        :param nhid: The number of hidden units in TrellisNet (excluding the output size)
        :param nout: The number of output channels (which should agree with the number of classes)
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output dropout
        :param dropouti: Input dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        """
        super(TrellisNetModel, self).__init__()
        self.nout = nout  # Should be the number of classes
        self.nhid = nhid
        self.dropout = dropout
        self.dropouti = dropouti
        self.aux = aux

        network = TrellisNet
        self.network = network(ninp,
                               nhid,
                               nout=nout,
                               nlevels=nlevels,
                               kernel_size=kernel_size,
                               dropouth=dropouth,
                               wnorm=wnorm,
                               aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.linear = nn.Linear(nout, nout)
        self.network = nn.ModuleList([self.network])
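A hypothetical instantiation sketch for this pixel-stream variant, e.g. sequential CIFAR-10 with 3 input channels and 10 classes. All values below are placeholders.

model = TrellisNetModel(ninp=3,       # RGB channels of the pixel stream
                        nhid=120,     # hidden units, excluding the output size
                        nout=10,      # one output channel per class
                        nlevels=8,
                        kernel_size=2,
                        dropout=0.1, dropouti=0.1, dropouth=0.1,
                        wdrop=0.0, wnorm=True,
                        aux=False)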