Example #1
class GD_FF(nn.Module):
    def __init__(self, input_size, hidden_size, output_size,
                 output_activation):
        super(GD_FF, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        if output_activation == 'sigmoid':
            self.output_activation = F.sigmoid
        elif output_activation == 'tanh':
            self.output_activation = F.tanh
        else:
            self.output_activation = None

        # Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size),
                               requires_grad=1)

        # Output weights
        self.w_hid_out = Parameter(torch.rand(output_size, hidden_size),
                                   requires_grad=1)

        for param in self.parameters():
            # torch.nn.init.xavier_normal(param)
            # torch.nn.init.orthogonal(param)
            # torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

    def reset(self, batch_size):
        return

    def graph_compute(self, input):
        # Compute hidden activations
        hidden_act = F.sigmoid(
            self.w_inp.mm(input))  # + self.w_block_input_bias)

        #Compute Output
        output = self.w_hid_out.mm(hidden_act)
        if self.output_activation != None:
            output = self.output_activation(output)

        return output

    def forward(self, input):
        self.out = self.graph_compute(input)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True
            param.volatile = False

    def turn_grad_off(self):
        for param in self.parameters():
            param.requires_grad = False
            param.volatile = True
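A minimal usage sketch for GD_FF (hypothetical sizes; it assumes the module-level imports the class relies on, i.e. torch, nn, F and Parameter, and an older PyTorch where the deprecated F.sigmoid / kaiming_normal calls above still exist). Note the column-major convention: inputs are shaped (input_size, batch_size) because weights are applied as W.mm(input).

net = GD_FF(input_size=8, hidden_size=16, output_size=2,
            output_activation='sigmoid')
x = torch.rand(8, 4)   # (input_size, batch_size)
y = net(x)             # (output_size, batch_size) == (2, 4)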
Example #2
class Single_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size,
                 n_vocab):
        super(Single_MMU, self).__init__()

        #Define model (embedding_dim is expected to be a module-level constant)
        self.embeddings = nn.Embedding(n_vocab + 1, embedding_dim)
        self.mmu = mod.GD_MMU(embedding_dim, hidden_size, memory_size,
                              hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.w_out = Parameter(torch.rand(output_size, hidden_size),
                               requires_grad=1)

        for param in self.parameters():
            # torch.nn.init.xavier_normal(param)
            # torch.nn.init.orthogonal(param)
            # torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

    def forward(self, input):
        embeds = self.embeddings(input)
        mmu_out = self.mmu.forward(torch.t(embeds))
        mmu_out = self.dropout(mmu_out)
        out = self.w_out.mm(mmu_out)
        out = F.log_softmax(torch.t(out))
        return out

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.mmu.reset(batch_size)
Example #3
class PPCA_Variational(Distribution):
    def __init__(self, ppca):
        super().__init__()
        self.K = ppca.K
        self.D = ppca.D
        self.W = Parameter(torch.Tensor(ppca.K, ppca.D).float())
        self.noise = ppca.noise
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.W, a=math.sqrt(5))

    def sample(self, X, compute_logprob=False):
        dist = Normal(F.linear(X, self.W),
                      self.noise * torch.eye(self.K),
                      learnable=False)
        z = dist.sample(1).squeeze(0)
        if compute_logprob:
            return z, dist.log_prob(z)
        return z

    def log_prob(self, z, X):
        # Evaluate the same distribution that sample() draws from
        dist = Normal(F.linear(X, self.W),
                      self.noise * torch.eye(self.K),
                      learnable=False)
        return dist.log_prob(z)
Example #4
class Stacked_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size,
                 n_vocab):
        super(Stacked_MMU, self).__init__()

        #Define model
        #self.poly = mod.GD_polynet(input_size, hidden_size, hidden_size, hidden_size, None)
        self.embedding = Parameter(torch.rand(input_size, n_vocab),
                                   requires_grad=1)
        # The embedded input fed to mmu1 has input_size rows
        self.mmu1 = mod.GD_MMU(input_size, hidden_size, memory_size,
                               hidden_size)
        self.mmu2 = mod.GD_MMU(hidden_size, hidden_size, memory_size,
                               hidden_size)
        self.mmu3 = mod.GD_MMU(hidden_size, hidden_size, memory_size,
                               hidden_size)

        self.w_out1 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)
        self.w_out2 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)
        self.w_out3 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)

        for param in self.parameters():
            # torch.nn.init.xavier_normal(param)
            # torch.nn.init.orthogonal(param)
            # torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

    def forward(self, input):
        input = self.embedding.mm(input)
        mmu1_out = self.mmu1.forward(input)
        mmu2_out = self.mmu2.forward(mmu1_out)
        mmu3_out = self.mmu3.forward(mmu2_out)

        out = self.w_out3.mm(
            mmu3_out)  # + self.w_out2.mm(mmu2_out) + self.w_out1.mm(mmu1_out)
        out = F.log_softmax(torch.t(out))
        return torch.t(out)

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.mmu1.reset(batch_size)
        self.mmu2.reset(batch_size)
        self.mmu3.reset(batch_size)
Example #5
class GD_polynet(nn.Module):
    def __init__(self, input_size, h1, h2, output_size, output_activation):
        super(GD_polynet, self).__init__()

        self.input_size = input_size
        self.output_size = output_size
        if output_activation == 'sigmoid': self.output_activation = F.sigmoid
        elif output_activation == 'tanh': self.output_activation = F.tanh
        else: self.output_activation = None

        #Weights
        self.w1 = Parameter(torch.rand(h1, input_size), requires_grad=1)
        self.w_poly = Parameter(torch.rand(h2, h1), requires_grad=1)
        self.w2 = Parameter(torch.rand(output_size, h2), requires_grad=1)

        #Initialize weights; poly weights are additionally shifted by +1 below
        for param in self.parameters():
            #torch.nn.init.xavier_normal(param)
            #torch.nn.init.orthogonal(param)
            #torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)
        #self.w_poly = Parameter(torch.ones(h2, h1), requires_grad=1) +
        self.w_poly.data += 1.0

    def forward(self, input):
        first_out = F.threshold(
            self.w1.mm(input), 0.01, 0.01
        )  #First dense layer with thresholding activation (values <= 0.01 are clamped to 0.01)

        #Polynomial operation
        poly1 = torch.t(first_out).pow(self.w_poly)
        poly_out = torch.sum(poly1, 1).unsqueeze(1)

        #Output dense layer
        output = self.w2.mm(poly_out)
        if self.output_activation != None:
            output = self.output_activation(output)
        return output

    #TODO Batch Process for GD_Polynet
    def reset(self, batch_size):
        return
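A single-sample usage sketch for GD_polynet (hypothetical sizes, same import assumptions as above; batching is still listed as a TODO, so the input is a single column vector):

net = GD_polynet(input_size=4, h1=8, h2=6, output_size=3,
                 output_activation='tanh')
x = torch.rand(4, 1)   # one column vector
y = net(x)             # (output_size, 1) == (3, 1)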
Example #6
class Encoder(Module):
    """
    Encodes a node's using 'convolutional' GraphSage approach
    """
    def __init__(
            self,
            features,
            feature_dim,
            embed_dim,
            adj_lists,
            aggregator,
            num_sample=10,
            base_model=None,
            gcn=False,  #cuda=False, 
            feature_transform=False):
        super().__init__()

        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggregator = aggregator
        self.num_sample = num_sample
        if base_model != None:
            self.base_model = base_model

        self.gcn = gcn
        self.embed_dim = embed_dim
        #self.cuda = cuda
        #self.aggregator.cuda = cuda
        weight_dim_y = self.feat_dim if self.gcn else 2 * self.feat_dim
        self.weight = Parameter(torch.empty(embed_dim, weight_dim_y))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.

        nodes     -- list of nodes
        """
        to_neighs = [self.adj_lists[int(node)] for node in nodes]
        neigh_feats = self.aggregator.forward(nodes, to_neighs,
                                              self.num_sample)
        if not self.gcn:
            # Concatenate the node's own features with the aggregated
            # neighbour features (self.weight expects 2 * feat_dim columns)
            #if self.cuda:
            #    self_feats = self.features(torch.LongTensor(nodes).cuda())
            #else:
            #    self_feats = self.features(torch.LongTensor(nodes))
            self_feats = self.features(torch.tensor(nodes))
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        else:
            combined = neigh_feats
        combined = relu(self.weight.mm(combined.t()))
        return combined
Example #7
class SupervisedGraphSage(Module):
    def __init__(self, num_classes, enc):
        super().__init__()
        self.enc = enc
        self.xent = CrossEntropyLoss()

        self.weight = Parameter(torch.empty(num_classes, enc.embed_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds)
        return scores.t()

    def loss(self, nodes, labels):
        scores = self.forward(nodes)
        return self.xent(scores, labels.squeeze())
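Wiring Encoder and SupervisedGraphSage together needs a feature lookup, adjacency lists and an aggregator, none of which are shown here. Below is a hedged, self-invented sketch with a naive mean aggregator (MeanAggregator, the toy graph and all sizes are illustrative assumptions, not part of the original code; it also assumes the bare Module, Parameter, init, relu and CrossEntropyLoss names are imported from torch.nn / torch.nn.functional, as the classes above expect):

class MeanAggregator(Module):
    # Toy aggregator: mean of up to num_sample neighbour feature vectors
    def __init__(self, features):
        super().__init__()
        self.features = features

    def forward(self, nodes, to_neighs, num_sample):
        feats = []
        for neighs in to_neighs:
            idx = torch.tensor(list(neighs)[:num_sample])
            feats.append(self.features(idx).mean(dim=0))
        return torch.stack(feats)

features = torch.nn.Embedding(4, 8)               # 4 nodes, 8-dim features
adj = {0: {1, 2}, 1: {0}, 2: {0, 3}, 3: {2}}      # adjacency lists
agg = MeanAggregator(features)
enc = Encoder(features, feature_dim=8, embed_dim=16, adj_lists=adj,
              aggregator=agg, gcn=False)
model = SupervisedGraphSage(num_classes=3, enc=enc)
scores = model(list(range(4)))                    # (4, 3) class scores
loss = model.loss(list(range(4)), torch.tensor([[0], [1], [2], [0]]))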
Example #8
class Single_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(Single_LSTM, self).__init__()

        #Define model
        self.lstm = mod.GD_LSTM(input_size, hidden_size, memory_size,
                                hidden_size)
        self.w_out = Parameter(torch.rand(output_size, hidden_size),
                               requires_grad=1)

    def forward(self, input):
        lstm_out = self.lstm.forward(input)
        out = self.w_out.mm(lstm_out)
        out = F.log_softmax(out)
        return out

    def reset(self, batch_size):
        self.lstm.reset(batch_size)
Example #9
class Single_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(Single_MMU, self).__init__()

        #Define model
        self.mmu = mod.GD_MMU(input_size, hidden_size, memory_size,
                              hidden_size)
        self.w_out = Parameter(torch.rand(output_size, hidden_size),
                               requires_grad=1)

    def forward(self, input):
        #input = self.poly.forward(input)
        mmu_out = self.mmu.forward(input)
        out = self.w_out.mm(mmu_out)
        out = F.log_softmax(out)
        return out

    def reset(self, batch_size):
        self.mmu.reset(batch_size)
Example #10
class Stacked_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size,
                 n_vocab):
        super(Stacked_LSTM, self).__init__()

        #Define model
        #self.poly = mod.GD_polynet(input_size, hidden_size, hidden_size, hidden_size, None)
        self.embeddings = nn.Embedding(n_vocab + 1, embedding_dim)
        self.lstm1 = mod.GD_LSTM(embedding_dim, hidden_size, memory_size,
                                 hidden_size)
        self.lstm2 = mod.GD_LSTM(hidden_size, hidden_size, memory_size,
                                 hidden_size)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)
        #self.bnorm1 = nn.BatchNorm2d(hidden_size)
        #self.w_out1 = Parameter(torch.rand(output_size, hidden_size), requires_grad=1)
        self.w_out2 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)

        for param in self.parameters():
            # torch.nn.init.xavier_normal(param)
            # torch.nn.init.orthogonal(param)
            # torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

    def forward(self, input):
        embeds = self.embeddings(input)
        lstm1_out = self.lstm1.forward(torch.t(embeds))
        lstm1_out = self.dropout1(lstm1_out)
        #lstm1_out = self.bnorm1(lstm1_out)
        lstm2_out = self.lstm2.forward(lstm1_out)
        lstm2_out = self.dropout2(lstm2_out)

        out = self.w_out2.mm(
            lstm2_out
        )  # + self.w_out2.mm(lstm2_out) + self.w_out1.mm(lstm1_out)
        out = F.log_softmax(torch.t(out))
        return out

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.lstm1.reset(batch_size)
        self.lstm2.reset(batch_size)
Example #11
class Stacked_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(Stacked_LSTM, self).__init__()

        #Define model
        #self.poly = mod.GD_polynet(input_size, hidden_size, hidden_size, hidden_size, None)
        self.lstm1 = mod.GD_LSTM(input_size, hidden_size, memory_size,
                                 hidden_size)
        self.lstm2 = mod.GD_LSTM(hidden_size, hidden_size, memory_size,
                                 hidden_size)
        self.lstm3 = mod.GD_LSTM(hidden_size, hidden_size, memory_size,
                                 hidden_size)

        self.w_out1 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)
        self.w_out2 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)
        self.w_out3 = Parameter(torch.rand(output_size, hidden_size),
                                requires_grad=1)

    def forward(self, input):
        #input = self.poly.forward(input)
        lstm1_out = self.lstm1.forward(input)
        lstm2_out = self.lstm2.forward(lstm1_out)
        lstm3_out = self.lstm3.forward(lstm2_out)

        out = self.w_out3.mm(
            lstm3_out
        )  # + self.w_out2.mm(lstm2_out) + self.w_out1.mm(lstm1_out)
        out = F.log_softmax(torch.t(out))
        return torch.t(out)

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.lstm1.reset(batch_size)
        self.lstm2.reset(batch_size)
        self.lstm3.reset(batch_size)
Example #12
class GRUPoem(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128):
        super(GRUPoem, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        input_dim = hidden_dim + embedding_dim

        #init weight
        self.Wr = Parameter(
            torch.rand(hidden_dim, input_dim) *
            np.sqrt(2 / (input_dim + hidden_dim)))
        self.Br = Parameter(torch.rand(hidden_dim, 1))
        self.Wz = Parameter(
            torch.rand(hidden_dim, input_dim) *
            np.sqrt(2 / (input_dim + hidden_dim)))
        self.Bz = Parameter(torch.rand(hidden_dim, 1))
        self.Wh = Parameter(
            torch.rand(hidden_dim, input_dim) *
            np.sqrt(2 / (input_dim + hidden_dim)))
        self.Bh = Parameter(torch.rand(hidden_dim, 1))
        self.W = Parameter(
            torch.rand(vocab_size, hidden_dim) *
            np.sqrt(2 / (vocab_size + hidden_dim)))
        self.b = Parameter(torch.rand(vocab_size, 1))

    def forward(self, x, hidden=None):
        #         x:    seq_len * batch_size
        seq_len, batch_size = x.size()

        if hidden is None:
            Ht = x.data.new(self.hidden_dim, batch_size).fill_(0).float()
        else:
            Ht = hidden

        embeds = self.embedding(x)
        # seq * batch * embedding

        output = []

        for i in range(len(embeds)):

            xTmp = embeds[i].transpose(1, 0).contiguous()
            x_h = torch.cat((xTmp, Ht), 0).cuda()

            Rt = torch.sigmoid(self.Wr.mm(x_h) + self.Br)
            Zt = torch.sigmoid(self.Wz.mm(x_h) + self.Bz)
            Ht_ = torch.mul(Ht, Rt)
            x_h_ = torch.cat((xTmp, Ht_), 0).cuda()
            H_ = torch.tanh(self.Wh.mm(x_h_) + self.Bh)

            Ht = torch.mul(Zt, Ht) + torch.mul(1 - Zt, H_)
            y = self.W.mm(Ht) + self.b
            # no softmax: included in cross entropy loss
            y = y.transpose(1, 0).contiguous()
            # y:  batch_size, vocab
            output.append(y)

        output = torch.cat(output, 0)
        output = output.view(seq_len * batch_size, -1)
        #output:    (seq * batchsize, vocab)
        return output, Ht
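A hedged usage sketch for GRUPoem (the forward pass calls .cuda() on intermediate tensors, so a CUDA device is assumed; vocabulary size, sequence length and targets below are illustrative):

model = GRUPoem(vocab_size=1000).cuda()
x = torch.randint(0, 1000, (20, 16)).cuda()   # (seq_len, batch_size)
logits, h = model(x)                          # logits: (20 * 16, 1000)
targets = torch.randint(0, 1000, (20 * 16,)).cuda()
loss = nn.CrossEntropyLoss()(logits, targets)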
Example #13
class lstmPoem(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim=128,
                 hidden_dim=128,
                 cell_dim=128):
        super(lstmPoem, self).__init__()
        self.hidden_dim = hidden_dim
        self.cell_dim = cell_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        input_dim = hidden_dim + embedding_dim

        #init weight
        self.Wc = Parameter(
            torch.rand(cell_dim, input_dim) * np.sqrt(2 /
                                                      (input_dim + cell_dim)))
        self.Bc = Parameter(torch.rand(cell_dim, 1))
        self.Wf = Parameter(
            torch.rand(cell_dim, input_dim) * np.sqrt(2 /
                                                      (input_dim + cell_dim)))
        self.Bf = Parameter(torch.rand(cell_dim, 1))
        self.Wi = Parameter(
            torch.rand(cell_dim, input_dim) * np.sqrt(2 /
                                                      (input_dim + cell_dim)))
        self.Bi = Parameter(torch.rand(cell_dim, 1))
        self.Wo = Parameter(
            torch.rand(cell_dim, input_dim) * np.sqrt(2 /
                                                      (input_dim + cell_dim)))
        self.Bo = Parameter(torch.rand(cell_dim, 1))
        self.W = Parameter(
            torch.rand(vocab_size, hidden_dim) *
            np.sqrt(2 / (vocab_size + hidden_dim)))
        self.b = Parameter(torch.rand(vocab_size, 1))


#         self.gate = nn.Linear(input_dim, cell_dim)
#         self.output = nn.Linear(hidden_dim, vocab_size)
#         self.sigmoid = nn.Sigmoid()
#         self.tanh = nn.Tanh()

    def forward(self, x, hidden=None, cell=None):

        #         x:    seq_len * batch_size
        seq_len, batch_size = x.size()

        if hidden is None:
            Ht = x.data.new(self.hidden_dim, batch_size).fill_(0).float()
        else:
            Ht = hidden

        if cell is None:
            Ct = x.data.new(self.cell_dim, batch_size).fill_(0).float()
        else:
            Ct = cell

        embeds = self.embedding(x)
        # seq * batch * embedding

        output = []

        for i in range(len(embeds)):

            # self.Bx: cell_dim * 1
            # Wx:      cell_dim * input_dim
            # x_h:     input_dim * batch_size
            # C:       cell_dim * batch_size
            # H:       hidden_dim * batch_size

            xTmp = embeds[i].transpose(1, 0).contiguous()
            x_h = torch.cat((xTmp, Ht), 0).cuda()

            Ft = torch.sigmoid(self.Wf.mm(x_h) + self.Bf)
            It = torch.sigmoid(self.Wi.mm(x_h) + self.Bi)
            Ot = torch.sigmoid(self.Wo.mm(x_h) + self.Bo)
            Ct_ = torch.tanh(self.Wc.mm(x_h) + self.Bc)

            Ct = torch.add(torch.mul(Ft, Ct), torch.mul(It, Ct_))
            Ht = torch.mul(torch.tanh(Ct), Ot)
            y = self.W.mm(Ht) + self.b
            # no softmax: included in cross entropy loss
            y = y.transpose(1, 0).contiguous()
            # y:  batch_size, vocab
            output.append(y)

        output = torch.cat(output, 0)
        output = output.view(seq_len * batch_size, -1)
        #output:    (seq * batchsize, vocab)
        return output, Ht, Ct
Example #14
class GD_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size,
                 output_activation):
        super(GD_MMU, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size
        if output_activation == 'sigmoid': self.output_activation = F.sigmoid
        elif output_activation == 'tanh': self.output_activation = F.tanh
        else: self.output_activation = None

        #Input gate
        self.w_inpgate = Parameter(torch.rand(hidden_size, input_size),
                                   requires_grad=1)
        self.w_rec_inpgate = Parameter(torch.rand(hidden_size, output_size),
                                       requires_grad=1)
        self.w_mem_inpgate = Parameter(torch.rand(hidden_size, memory_size),
                                       requires_grad=1)

        #Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size),
                               requires_grad=1)
        self.w_rec_inp = Parameter(torch.rand(hidden_size, output_size),
                                   requires_grad=1)

        #Read Gate
        self.w_readgate = Parameter(torch.rand(memory_size, input_size),
                                    requires_grad=1)
        self.w_rec_readgate = Parameter(torch.rand(memory_size, output_size),
                                        requires_grad=1)
        self.w_mem_readgate = Parameter(torch.rand(memory_size, memory_size),
                                        requires_grad=1)

        #Memory Decoder
        self.w_decoder = Parameter(torch.rand(hidden_size, memory_size),
                                   requires_grad=1)

        #Write Gate
        self.w_writegate = Parameter(torch.rand(memory_size, input_size),
                                     requires_grad=1)
        self.w_rec_writegate = Parameter(torch.rand(memory_size, output_size),
                                         requires_grad=1)
        self.w_mem_writegate = Parameter(torch.rand(memory_size, memory_size),
                                         requires_grad=1)

        #Memory Encoder
        self.w_encoder = Parameter(torch.rand(memory_size, hidden_size),
                                   requires_grad=1)

        #Output weights
        self.w_hid_out = Parameter(torch.rand(output_size, hidden_size),
                                   requires_grad=1)

        #Biases
        self.w_input_gate_bias = Parameter(torch.zeros(hidden_size, 1),
                                           requires_grad=1)
        self.w_block_input_bias = Parameter(torch.zeros(hidden_size, 1),
                                            requires_grad=1)
        self.w_readgate_bias = Parameter(torch.zeros(memory_size, 1),
                                         requires_grad=1)
        self.w_writegate_bias = Parameter(torch.zeros(memory_size, 1),
                                          requires_grad=1)

        # Adaptive components
        self.mem = Variable(torch.zeros(self.memory_size, 1),
                            requires_grad=1).cuda()
        self.out = Variable(torch.zeros(self.output_size, 1),
                            requires_grad=1).cuda()

        for param in self.parameters():
            #torch.nn.init.xavier_normal(param)
            #torch.nn.init.orthogonal(param)
            #torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

    def reset(self, batch_size):
        # Adaptive components
        self.mem = Variable(torch.zeros(self.memory_size, batch_size),
                            requires_grad=1).cuda()
        self.out = Variable(torch.zeros(self.output_size, batch_size),
                            requires_grad=1).cuda()

    def graph_compute(self, input, rec_output, mem):
        #Block Input
        block_inp = F.sigmoid(
            self.w_inp.mm(input) +
            self.w_rec_inp.mm(rec_output))  # + self.w_block_input_bias)

        #Input gate
        inp_gate = F.sigmoid(
            self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) +
            self.w_rec_inpgate.mm(rec_output))  # + self.w_input_gate_bias)

        #Input out
        inp_out = block_inp * inp_gate

        #Read gate
        read_gate_out = F.sigmoid(
            self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
            self.w_mem_readgate.mm(mem))  # + self.w_readgate_bias) * mem

        #Compute hidden activation
        decoded_mem = self.w_decoder.mm(read_gate_out * mem)
        hidden_act = inp_out + decoded_mem

        #Write gate
        write_gate_out = F.sigmoid(
            self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) +
            self.w_rec_writegate.mm(rec_output))  # + self.w_writegate_bias)

        #Update memory
        encoded_update = F.tanh(self.w_encoder.mm(hidden_act))
        mem = mem + write_gate_out * encoded_update

        output = self.w_hid_out.mm(hidden_act)
        if self.output_activation != None:
            output = self.output_activation(output)

        return output, mem

    def bgraph_compute(self, input, rec_output, mem):
        #Block Input
        block_inp = F.sigmoid(
            self.w_inp.mm(input) +
            self.w_rec_inp.mm(rec_output))  # + self.w_block_input_bias)

        #Input gate
        inp_gate = F.sigmoid(
            self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) +
            self.w_rec_inpgate.mm(rec_output))  # + self.w_input_gate_bias)

        #Input out
        inp_out = block_inp * inp_gate

        #Read gate
        read_gate_out = F.sigmoid(
            self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
            self.w_mem_readgate.mm(mem))  # + self.w_readgate_bias) * mem

        #Compute hidden activation
        hidden_act = inp_out + read_gate_out * mem

        #Write gate
        write_gate_out = F.sigmoid(
            self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) +
            self.w_rec_writegate.mm(rec_output))  # + self.w_writegate_bias)

        #Update memory
        mem = mem + write_gate_out * F.tanh(hidden_act)

        output = self.w_hid_out.mm(hidden_act)
        if self.output_activation != None:
            output = self.output_activation(output)

        return output, mem

    def forward(self, input):
        self.out, self.mem = self.graph_compute(input, self.out, self.mem)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True
            param.volatile = False

    def turn_grad_off(self):
        for param in self.parameters():
            param.requires_grad = False
            param.volatile = True
Example #15
class GD_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(GD_MMU, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size

        #Input gate
        self.w_inpgate = Parameter(torch.rand(hidden_size, input_size+1), requires_grad=1)
        self.w_rec_inpgate = Parameter(torch.rand( hidden_size, output_size+1), requires_grad=1)
        self.w_mem_inpgate = Parameter(torch.rand(hidden_size, memory_size), requires_grad=1)

        #Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size+1), requires_grad=1)
        self.w_rec_inp = Parameter(torch.rand(hidden_size, output_size+1), requires_grad=1)

        #Read Gate
        self.w_readgate = Parameter(torch.rand(memory_size, input_size+1), requires_grad=1)
        self.w_rec_readgate = Parameter(torch.rand(memory_size, output_size+1), requires_grad=1)
        self.w_mem_readgate = Parameter(torch.rand(memory_size, memory_size), requires_grad=1)

        #Memory Decoder
        self.w_decoder = Parameter(torch.rand(hidden_size, memory_size), requires_grad=1)

        #Write Gate
        self.w_writegate = Parameter(torch.rand(memory_size, input_size+1), requires_grad=1)
        self.w_rec_writegate = Parameter(torch.rand(memory_size, output_size+1), requires_grad=1)
        self.w_mem_writegate = Parameter(torch.rand(memory_size, memory_size), requires_grad=1)

        #Memory Encoder
        self.w_encoder = Parameter(torch.rand(memory_size, hidden_size), requires_grad=1)

        #Memory init
        self.w_mem_init = Parameter(torch.rand(memory_size, 1), requires_grad=1)

        # Adaptive components
        self.mem = Variable(torch.ones(1, 1), requires_grad=1).cuda()
        self.out = Variable(torch.zeros(self.output_size, 1), requires_grad=1).cuda()

        for param in self.parameters():
            #torch.nn.init.xavier_normal(param)
            #torch.nn.init.orthogonal(param)
            #torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

        #Gates to 1
        # self.w_writegate = Parameter(torch.ones(memory_size, input_size), requires_grad=1)
        # self.w_rec_writegate = Parameter(torch.ones(memory_size, output_size), requires_grad=1)
        # self.w_mem_writegate = Parameter(torch.ones(memory_size, memory_size), requires_grad=1)
        # self.w_readgate = Parameter(torch.ones(memory_size, input_size), requires_grad=1)
        # self.w_rec_readgate = Parameter(torch.ones(memory_size, output_size), requires_grad=1)
        # self.w_mem_readgate = Parameter(torch.ones(memory_size, memory_size), requires_grad=1)
        # self.w_inpgate = Parameter(torch.ones(hidden_size, input_size), requires_grad=1)
        # self.w_rec_inpgate = Parameter(torch.ones(hidden_size, output_size), requires_grad=1)
        # self.w_mem_inpgate = Parameter(torch.ones(hidden_size, memory_size), requires_grad=1)


    def reset(self, batch_size):
        # Adaptive components
        self.mem = self.w_mem_init.mm(Variable(torch.zeros(1, batch_size), requires_grad=1).cuda())
        self.out = Variable(torch.zeros(self.output_size, batch_size), requires_grad=1).cuda()

    def prep_bias(self, mat, batch_size):
        return Variable(torch.cat((mat.cpu().data, torch.ones(1, batch_size))).cuda())

    def bgraph_compute(self, input, rec_output, memory, batch_size):
        #Reshape add 1 for bias
        input = self.prep_bias(input, batch_size)
        rec_output = self.prep_bias(rec_output, batch_size)
        mem = self.prep_bias(memory, batch_size)

        #Input process
        block_inp = F.tanh(self.w_inp.mm(input) + self.w_rec_inp.mm(rec_output)) #Block Input
        inp_gate = F.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) + self.w_rec_inpgate.mm(rec_output)) #Input gate


        #Read from memory
        read_gate_out = F.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) + self.w_mem_readgate.mm(mem))
        decoded_mem = self.w_decoder.mm(read_gate_out * memory)

        # Compute hidden activation
        hidden_act = block_inp * inp_gate + decoded_mem

        #Update memory
        write_gate_out = F.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) + self.w_rec_writegate.mm(rec_output)) # #Write gate
        encoded_update = F.tanh(self.w_encoder.mm(hidden_act))
        memory = memory + write_gate_out * encoded_update

        return hidden_act, memory

    def graph_compute(self, input, rec_output, memory, batch_size):

        #Reshape add 1 for bias
        input = self.prep_bias(input, batch_size)
        rec_output = self.prep_bias(rec_output, batch_size)

        #Input process
        block_inp = F.tanh(self.w_inp.mm(input) + self.w_rec_inp.mm(rec_output)) #Block Input
        inp_gate = F.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(memory) + self.w_rec_inpgate.mm(rec_output)) #Input gate


        #Read from memory
        read_gate_out = F.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) + self.w_mem_readgate.mm(memory))
        decoded_mem = self.w_decoder.mm(read_gate_out * memory)

        # Compute hidden activation
        hidden_act = block_inp * inp_gate + decoded_mem

        #Update memory
        write_gate_out = F.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(memory) + self.w_rec_writegate.mm(rec_output)) # #Write gate
        encoded_update = F.tanh(self.w_encoder.mm(hidden_act))
        memory = memory + write_gate_out * encoded_update

        return hidden_act, memory


    def forward(self, input):
        batch_size = input.data.shape[-1]
        self.out, self.mem = self.graph_compute(input, self.out, self.mem, batch_size)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True
            param.volatile = False

    def turn_grad_off(self):
        for param in self.parameters():
            param.requires_grad = False
            param.volatile = True
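A hedged usage sketch for this bias-augmented GD_MMU (a CUDA device is assumed because the recurrent state and prep_bias use .cuda(); output_size is set equal to hidden_size because the recurrent weights expect the previous output to be hidden-sized; sizes are illustrative, and the deprecated Variable / F.sigmoid / kaiming_normal APIs above must still be available):

mmu = GD_MMU(input_size=8, hidden_size=16, memory_size=12,
             output_size=16).cuda()
mmu.reset(batch_size=4)
x = torch.rand(8, 4)   # (input_size, batch_size); prep_bias moves it to the GPU
h = mmu(x)             # (hidden_size, batch_size) == (16, 4)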
Example #16
class GD_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(GD_LSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size

        # Input gate
        self.w_inpgate = Parameter(torch.rand(hidden_size, input_size+1), requires_grad=1)
        self.w_rec_inpgate = Parameter(torch.rand(hidden_size, output_size+1), requires_grad=1)
        self.w_mem_inpgate = Parameter(torch.rand(hidden_size, memory_size), requires_grad=1)

        # Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size+1), requires_grad=1)
        self.w_rec_inp = Parameter(torch.rand(hidden_size, output_size+1), requires_grad=1)

        # Read Gate
        self.w_readgate = Parameter(torch.rand(memory_size, input_size+1), requires_grad=1)
        self.w_rec_readgate = Parameter(torch.rand(memory_size, output_size+1), requires_grad=1)
        self.w_mem_readgate = Parameter(torch.rand(memory_size, memory_size), requires_grad=1)

        # Write Gate
        self.w_writegate = Parameter(torch.rand(memory_size, input_size+1), requires_grad=1)
        self.w_rec_writegate = Parameter(torch.rand(memory_size, output_size+1), requires_grad=1)
        self.w_mem_writegate = Parameter(torch.rand(memory_size, memory_size), requires_grad=1)



        # Adaptive components
        self.mem = Variable(torch.zeros(self.memory_size, 1), requires_grad=1).cuda()
        self.out = Variable(torch.zeros(self.output_size, 1), requires_grad=1).cuda()

        for param in self.parameters():
            # torch.nn.init.xavier_normal(param)
            # torch.nn.init.orthogonal(param)
            # torch.nn.init.sparse(param, sparsity=0.5)
            torch.nn.init.kaiming_normal(param)

            # Gates to 1
            # self.w_writegate = Parameter(torch.ones(memory_size, input_size), requires_grad=1)
            # self.w_rec_writegate = Parameter(torch.ones(memory_size, output_size), requires_grad=1)
            # self.w_mem_writegate = Parameter(torch.ones(memory_size, memory_size), requires_grad=1)
            # self.w_readgate = Parameter(torch.ones(memory_size, input_size), requires_grad=1)
            # self.w_rec_readgate = Parameter(torch.ones(memory_size, output_size), requires_grad=1)
            # self.w_mem_readgate = Parameter(torch.ones(memory_size, memory_size), requires_grad=1)
            # self.w_inpgate = Parameter(torch.ones(hidden_size, input_size), requires_grad=1)
            # self.w_rec_inpgate = Parameter(torch.ones(hidden_size, output_size), requires_grad=1)
            # self.w_mem_inpgate = Parameter(torch.ones(hidden_size, memory_size), requires_grad=1)

    def prep_bias(self, mat, batch_size):
        return Variable(torch.cat((mat.cpu().data, torch.ones(1, batch_size))).cuda())

    def reset(self, batch_size):
        # Adaptive components
        self.mem = Variable(torch.zeros(self.memory_size, batch_size), requires_grad=1).cuda()
        self.out = Variable(torch.zeros(self.output_size, batch_size), requires_grad=1).cuda()

    def graph_compute(self, input, rec_output, mem, batch_size):
        #Reshape add 1 for bias
        input = self.prep_bias(input, batch_size)
        rec_output = self.prep_bias(rec_output, batch_size)

        # Block Input
        block_inp = F.tanh(self.w_inp.mm(input) + self.w_rec_inp.mm(rec_output))  # + self.w_block_input_bias)

        # Input gate
        inp_gate = F.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) + self.w_rec_inpgate.mm(rec_output))  # + self.w_input_gate_bias)

        # Input out
        inp_out = block_inp * inp_gate

        # Read gate
        read_gate_out = F.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) + self.w_mem_readgate.mm(mem))  # + self.w_readgate_bias) * mem

        # Output gate
        out_gate = F.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) + self.w_rec_writegate.mm(rec_output))  # + self.w_writegate_bias)

        # Compute new mem
        mem = inp_out + read_gate_out * mem
        out = out_gate * mem


        return out, mem


    def forward(self, input):
        batch_size = input.data.shape[-1]
        self.out, self.mem = self.graph_compute(input, self.out, self.mem, batch_size)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True
            param.volatile = False

    def turn_grad_off(self):
        for param in self.parameters():
            param.requires_grad = False
            param.volatile = True
Example #17
class MMU(nn.Module):
    def __init__(self, input_dim, hid_dim, mem_dim, out_dim):
        super(MMU, self).__init__()

        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.mem_dim = mem_dim
        self.out_dim = out_dim

        # Input gate
        self.w_inpgate = nn.Linear(input_dim, hid_dim)
        self.w_rec_inpgate = nn.Linear(out_dim, hid_dim)
        self.w_mem_inpgate = nn.Linear(mem_dim, hid_dim)

        # Block Input
        self.w_inp = nn.Linear(input_dim, hid_dim)
        self.w_rec_inp = nn.Linear(out_dim, hid_dim)

        # Read Gate
        self.w_readgate = nn.Linear(input_dim, mem_dim)
        self.w_rec_readgate = nn.Linear(out_dim, mem_dim)
        self.w_mem_readgate = nn.Linear(mem_dim, mem_dim)

        # Memory Decoder
        self.w_decoder = nn.Linear(mem_dim, hid_dim)  # decodes memory into the hidden space

        # Write Gate
        self.w_writegate = nn.Linear(input_dim, mem_dim)
        self.w_rec_writegate = nn.Linear(out_dim, mem_dim)
        self.w_mem_writegate = nn.Linear(mem_dim, mem_dim)

        # Memory Encoder
        self.w_encoder = nn.Linear(hid_dim, mem_dim)  # encodes the hidden activation into memory space

        #Adaptive components
        self.mem = None
        self.out = None

        #Output weights
        self.w_hid_out = Parameter(torch.rand(out_dim, hid_dim),
                                   requires_grad=True)

        # History for RRN
        self.hist_steps = 5
        self.rnn_history = []  #np.zeros([5,20,20])

    def reset(self, batch_size):
        # Adaptive components
        self.mem = Variable(torch.zeros(batch_size, self.mem_dim),
                            requires_grad=True)  #.cuda()
        self.out = Variable(torch.zeros(batch_size, self.out_dim),
                            requires_grad=True)  #.cuda()
        self.rnn_history = []  #Variable(torch.zeros())

    def predict(self, input):
        return self.forward(input)

    def graph_compute(self, input, rec_output, memory):

        # Input process
        #block_inp = F.sigmoid(self.w_inp(input) + self.w_rec_inp(rec_output))  # Block Input
        block_inp = torch.sigmoid(
            self.w_inp(torch.t(input)) + self.w_rec_inp(rec_output))
        inp_gate = torch.sigmoid(
            self.w_inpgate(torch.t(input)) + self.w_mem_inpgate(memory) +
            self.w_rec_inpgate(rec_output))  #Input gate

        # Read from memory
        read_gate_out = torch.sigmoid(
            self.w_readgate(torch.t(input)) + self.w_mem_readgate(memory) +
            self.w_rec_readgate(rec_output))
        decoded_mem = self.w_decoder(read_gate_out * memory)

        # Compute hidden activation
        hidden_act = decoded_mem + block_inp * inp_gate

        # Update memory
        write_gate_out = torch.sigmoid(
            self.w_writegate(torch.t(input)) + self.w_mem_writegate(memory) +
            self.w_rec_writegate(rec_output))  # #Write gate
        encoded_update = torch.tanh(self.w_encoder(hidden_act))
        memory = (1 -
                  write_gate_out) * memory + write_gate_out * encoded_update
        #memory = memory + encoded_update

        return hidden_act, memory

    def forward(self, input):
        # Adaptive components
        self.mem = Variable(torch.zeros(input.shape[1], self.mem_dim),
                            requires_grad=True)  #.cuda()
        self.out = Variable(torch.zeros(input.shape[1], self.out_dim),
                            requires_grad=True)  #.cuda()

        #print(self.out.shape)
        '''Create history of n time-steps and loop graph_compute n times to generate final output'''
        if not torch.is_tensor(self.rnn_history):
            self.rnn_history = torch.Tensor(
                np.zeros([input.shape[0], self.hist_steps, input.shape[1]
                          ]))  #control_inputs, history, batch_size

        # Shift the history and update to latest input
        for i in range(self.hist_steps - 1):
            self.rnn_history[:, i, :] = self.rnn_history[:, i + 1, :]
        '''Trying to fix batch size change'''
        # the last batch_size can be different
        if input.shape != self.rnn_history[:, -1, :].shape:
            temp = copy.deepcopy(input)
            input = torch.Tensor(
                np.zeros(
                    [self.rnn_history.shape[0], self.rnn_history.shape[2]]))
            input[0:temp.shape[0], 0:temp.shape[1]] = temp

        self.rnn_history[:, -1, :] = input

        #print(self.out.shape)
        # Loop to generate final output
        for i in range(self.hist_steps):
            out, mem = self.graph_compute(self.rnn_history[:, i, :], self.out,
                                          self.mem)
            self.out, self.mem = out, mem
            self.out = self.w_hid_out.mm(torch.t(self.out))
            self.out = torch.t(self.out)
        '''Old working code without history'''
        #self.out, self.mem = self.graph_compute(input, self.out, self.mem)
        # Till here, "out" is the hidden_act
        #self.out = self.w_hid_out.mm(torch.t(self.out))
        #self.out = torch.t(self.out)

        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True
            param.volatile = False

    def turn_grad_off(self):
        for param in self.parameters():
            param.requires_grad = False
            param.volatile = True
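A hedged usage sketch for MMU (hypothetical sizes; numpy as np and copy are assumed to be imported at module level, and hid_dim and mem_dim are set equal here for simplicity). Inputs are column-major, (input_dim, batch_size), and the forward pass internally loops over a 5-step input history:

mmu = MMU(input_dim=6, hid_dim=12, mem_dim=12, out_dim=4)
mmu.reset(batch_size=5)
x = torch.rand(6, 5)   # (input_dim, batch_size)
y = mmu(x)             # (batch_size, out_dim) == (5, 4)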