import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, Module, Parameter, init
from torch.nn.functional import relu

import mod  # project-local module providing the GD_MMU / GD_LSTM building blocks referenced below


class GD_FF(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, output_activation):
        super(GD_FF, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        if output_activation == 'sigmoid':
            self.output_activation = torch.sigmoid
        elif output_activation == 'tanh':
            self.output_activation = torch.tanh
        else:
            self.output_activation = None

        # Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size), requires_grad=True)

        # Output weights
        self.w_hid_out = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

        for param in self.parameters():
            # torch.nn.init.xavier_normal_(param)
            # torch.nn.init.orthogonal_(param)
            # torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

    def reset(self, batch_size):
        return

    def graph_compute(self, input):
        # Compute hidden activations
        hidden_act = torch.sigmoid(self.w_inp.mm(input))  # + self.w_block_input_bias)

        # Compute output
        output = self.w_hid_out.mm(hidden_act)
        if self.output_activation is not None:
            output = self.output_activation(output)
        return output

    def forward(self, input):
        self.out = self.graph_compute(input)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True

    def turn_grad_off(self):
        # For inference, wrap calls in torch.no_grad(); the old Variable.volatile
        # flag has been removed from PyTorch.
        for param in self.parameters():
            param.requires_grad = False
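# Usage sketch for GD_FF (illustrative only; sizes are arbitrary). Note the
# column-major convention used throughout this file: activations are
# (features x batch) columns, not (batch x features) rows.
def _demo_gd_ff():
    net = GD_FF(input_size=8, hidden_size=16, output_size=4, output_activation='sigmoid')
    x = torch.rand(8, 32)   # (input_size, batch)
    return net(x)           # (output_size, batch)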
class Single_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size, n_vocab,
                 embedding_dim=128):  # embedding_dim was an undeclared global; exposed here as an argument (default assumed)
        super(Single_MMU, self).__init__()
        # Define model
        self.embeddings = nn.Embedding(n_vocab + 1, embedding_dim)
        self.mmu = mod.GD_MMU(embedding_dim, hidden_size, memory_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.w_out = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

        for param in self.parameters():
            # torch.nn.init.xavier_normal_(param)
            # torch.nn.init.orthogonal_(param)
            # torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

    def forward(self, input):
        embeds = self.embeddings(input)
        mmu_out = self.mmu.forward(torch.t(embeds))
        mmu_out = self.dropout(mmu_out)
        out = self.w_out.mm(mmu_out)
        out = F.log_softmax(torch.t(out), dim=1)
        return out

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.mmu.reset(batch_size)
# Distribution and Normal (note the learnable=... kwarg) come from the
# project's probability library; their import is not shown in the source.
class PPCA_Variational(Distribution):
    def __init__(self, ppca):
        super().__init__()
        self.K = ppca.K
        self.D = ppca.D
        self.W = Parameter(torch.Tensor(ppca.K, ppca.D).float())
        self.noise = ppca.noise
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.W, a=math.sqrt(5))

    def sample(self, X, compute_logprob=False):
        dist = Normal(F.linear(X, self.W), self.noise * torch.eye(self.K),
                      learnable=False)
        z = dist.sample(1).squeeze(0)
        if compute_logprob:
            return z, dist.log_prob(z)
        return z

    def log_prob(self, z, X):
        # z is K-dimensional: mean X W^T, covariance noise * I_K, consistent
        # with the distribution used in sample()
        dist = Normal(F.linear(X, self.W), self.noise * torch.eye(self.K),
                      learnable=False)
        return dist.log_prob(z)
class Stacked_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size, n_vocab):
        super(Stacked_MMU, self).__init__()
        # Define model
        #self.poly = mod.GD_polynet(input_size, hidden_size, hidden_size, hidden_size, None)
        self.embedding = Parameter(torch.rand(input_size, n_vocab), requires_grad=True)
        # The embedding projects one-hot columns down to input_size features,
        # so the first MMU consumes input_size, not n_vocab.
        self.mmu1 = mod.GD_MMU(input_size, hidden_size, memory_size, hidden_size)
        self.mmu2 = mod.GD_MMU(hidden_size, hidden_size, memory_size, hidden_size)
        self.mmu3 = mod.GD_MMU(hidden_size, hidden_size, memory_size, hidden_size)
        self.w_out1 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)
        self.w_out2 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)
        self.w_out3 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

        for param in self.parameters():
            # torch.nn.init.xavier_normal_(param)
            # torch.nn.init.orthogonal_(param)
            # torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

    def forward(self, input):
        input = self.embedding.mm(input)
        mmu1_out = self.mmu1.forward(input)
        mmu2_out = self.mmu2.forward(mmu1_out)
        mmu3_out = self.mmu3.forward(mmu2_out)
        out = self.w_out3.mm(mmu3_out)  # + self.w_out2.mm(mmu2_out) + self.w_out1.mm(mmu1_out)
        out = F.log_softmax(torch.t(out), dim=1)
        return torch.t(out)

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.mmu1.reset(batch_size)
        self.mmu2.reset(batch_size)
        self.mmu3.reset(batch_size)
class GD_polynet(nn.Module):
    def __init__(self, input_size, h1, h2, output_size, output_activation):
        super(GD_polynet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        if output_activation == 'sigmoid':
            self.output_activation = torch.sigmoid
        elif output_activation == 'tanh':
            self.output_activation = torch.tanh
        else:
            self.output_activation = None

        #Weights
        self.w1 = Parameter(torch.rand(h1, input_size), requires_grad=True)
        self.w_poly = Parameter(torch.rand(h2, h1), requires_grad=True)
        self.w2 = Parameter(torch.rand(output_size, h2), requires_grad=True)

        #Initialize weights, except for the poly weights, which are shifted toward 1
        for param in self.parameters():
            #torch.nn.init.xavier_normal_(param)
            #torch.nn.init.orthogonal_(param)
            #torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)
        #self.w_poly = Parameter(torch.ones(h2, h1), requires_grad=True)
        self.w_poly.data += 1.0

    def forward(self, input):
        # First dense layer with a thresholding activation
        # (ReLU, except the flat region is clamped to 0.01 rather than 0)
        first_out = F.threshold(self.w1.mm(input), 0.01, 0.01)

        #Polynomial operation
        poly1 = torch.t(first_out).pow(self.w_poly)
        poly_out = torch.sum(poly1, 1).unsqueeze(1)

        #Output dense layer
        output = self.w2.mm(poly_out)
        if self.output_activation is not None:
            output = self.output_activation(output)
        return output

    #TODO Batch Process for GD_Polynet
    def reset(self, batch_size):
        return
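# Single-sample sketch for GD_polynet (batch processing is an open TODO in the
# class itself). The thresholded activations are strictly positive, so the
# element-wise pow() against w_poly is well defined.
def _demo_polynet():
    net = GD_polynet(input_size=8, h1=16, h2=12, output_size=4, output_activation='tanh')
    x = torch.rand(8, 1)  # one column vector: (input_size, 1)
    return net(x)         # (output_size, 1)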
class Encoder(Module):
    """
    Encodes a node's features using the 'convolutional' GraphSAGE approach.
    """

    def __init__(
            self,
            features,
            feature_dim,
            embed_dim,
            adj_lists,
            aggregator,
            num_sample=10,
            base_model=None,
            gcn=False,
            #cuda=False,
            feature_transform=False):
        super().__init__()
        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggregator = aggregator
        self.num_sample = num_sample
        if base_model is not None:
            self.base_model = base_model
        self.gcn = gcn
        self.embed_dim = embed_dim
        #self.cuda = cuda
        #self.aggregator.cuda = cuda
        # GCN-style aggregation uses neighbor features only; otherwise the
        # node's own features are concatenated, doubling the width.
        weight_dim_y = self.feat_dim if self.gcn else 2 * self.feat_dim
        self.weight = Parameter(torch.empty(embed_dim, weight_dim_y))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.

        nodes -- list of nodes
        """
        to_neighs = [self.adj_lists[int(node)] for node in nodes]
        neigh_feats = self.aggregator.forward(nodes, to_neighs, self.num_sample)
        if not self.gcn:
            # if self.cuda:
            #     self_feats = self.features(torch.LongTensor(nodes).cuda())
            # else:
            #     self_feats = self.features(torch.LongTensor(nodes))
            self_feats = self.features(torch.tensor(nodes))
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        else:
            combined = neigh_feats
        combined = relu(self.weight.mm(combined.t()))
        return combined
class SupervisedGraphSage(Module):
    def __init__(self, num_classes, enc):
        super().__init__()
        self.enc = enc
        self.xent = CrossEntropyLoss()
        self.weight = Parameter(torch.empty(num_classes, enc.embed_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds)
        return scores.t()

    def loss(self, nodes, labels):
        scores = self.forward(nodes)
        return self.xent(scores, labels.squeeze())
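# Training-step sketch for SupervisedGraphSage. The encoder's features /
# adj_lists / aggregator objects are project-specific and not defined in this
# file, so `enc` is assumed to be a fully constructed Encoder; `nodes` is a
# list of node ids and `labels` a LongTensor of class indices. The class count
# and learning rate are placeholders.
def _demo_graphsage_step(enc, nodes, labels, num_classes=7, lr=0.7):
    model = SupervisedGraphSage(num_classes, enc)
    optimizer = torch.optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    optimizer.zero_grad()
    loss = model.loss(nodes, labels)
    loss.backward()
    optimizer.step()
    return loss.item()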
class Single_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(Single_LSTM, self).__init__()
        # Define model
        self.lstm = mod.GD_LSTM(input_size, hidden_size, memory_size, hidden_size)
        self.w_out = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

    def forward(self, input):
        lstm_out = self.lstm.forward(input)
        out = self.w_out.mm(lstm_out)
        out = F.log_softmax(out, dim=0)  # classes lie along dim 0 (column-major layout)
        return out

    def reset(self, batch_size):
        self.lstm.reset(batch_size)
class Single_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(Single_MMU, self).__init__()
        # Define model
        self.mmu = mod.GD_MMU(input_size, hidden_size, memory_size, hidden_size)
        self.w_out = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

    def forward(self, input):
        #input = self.poly.forward(input)
        mmu_out = self.mmu.forward(input)
        out = self.w_out.mm(mmu_out)
        out = F.log_softmax(out, dim=0)  # classes lie along dim 0 (column-major layout)
        return out

    def reset(self, batch_size):
        self.mmu.reset(batch_size)
class Stacked_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size, n_vocab,
                 embedding_dim=128):  # embedding_dim was an undeclared global; exposed here as an argument (default assumed)
        super(Stacked_LSTM, self).__init__()
        # Define model
        #self.poly = mod.GD_polynet(input_size, hidden_size, hidden_size, hidden_size, None)
        self.embeddings = nn.Embedding(n_vocab + 1, embedding_dim)
        self.lstm1 = mod.GD_LSTM(embedding_dim, hidden_size, memory_size, hidden_size)
        self.lstm2 = mod.GD_LSTM(hidden_size, hidden_size, memory_size, hidden_size)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)
        #self.bnorm1 = nn.BatchNorm2d(hidden_size)
        #self.w_out1 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)
        self.w_out2 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

        for param in self.parameters():
            # torch.nn.init.xavier_normal_(param)
            # torch.nn.init.orthogonal_(param)
            # torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

    def forward(self, input):
        embeds = self.embeddings(input)
        lstm1_out = self.lstm1.forward(torch.t(embeds))
        lstm1_out = self.dropout1(lstm1_out)
        #lstm1_out = self.bnorm1(lstm1_out)
        lstm2_out = self.lstm2.forward(lstm1_out)
        lstm2_out = self.dropout2(lstm2_out)
        out = self.w_out2.mm(lstm2_out)  # + self.w_out1.mm(lstm1_out)
        out = F.log_softmax(torch.t(out), dim=1)
        return out

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.lstm1.reset(batch_size)
        self.lstm2.reset(batch_size)
class Stacked_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(Stacked_LSTM, self).__init__()
        # Define model
        #self.poly = mod.GD_polynet(input_size, hidden_size, hidden_size, hidden_size, None)
        self.lstm1 = mod.GD_LSTM(input_size, hidden_size, memory_size, hidden_size)
        self.lstm2 = mod.GD_LSTM(hidden_size, hidden_size, memory_size, hidden_size)
        self.lstm3 = mod.GD_LSTM(hidden_size, hidden_size, memory_size, hidden_size)
        self.w_out1 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)
        self.w_out2 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)
        self.w_out3 = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

    def forward(self, input):
        #input = self.poly.forward(input)
        lstm1_out = self.lstm1.forward(input)
        lstm2_out = self.lstm2.forward(lstm1_out)
        lstm3_out = self.lstm3.forward(lstm2_out)
        out = self.w_out3.mm(lstm3_out)  # + self.w_out2.mm(lstm2_out) + self.w_out1.mm(lstm1_out)
        out = F.log_softmax(torch.t(out), dim=1)
        return torch.t(out)

    def reset(self, batch_size):
        #self.poly.reset(batch_size)
        self.lstm1.reset(batch_size)
        self.lstm2.reset(batch_size)
        self.lstm3.reset(batch_size)
class GRUPoem(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128):
        super(GRUPoem, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        input_dim = hidden_dim + embedding_dim

        # Initialize weights (uniform, scaled Glorot-style by the fan sizes)
        self.Wr = Parameter(torch.rand(hidden_dim, input_dim) * np.sqrt(2 / (input_dim + hidden_dim)))
        self.Br = Parameter(torch.rand(hidden_dim, 1))
        self.Wz = Parameter(torch.rand(hidden_dim, input_dim) * np.sqrt(2 / (input_dim + hidden_dim)))
        self.Bz = Parameter(torch.rand(hidden_dim, 1))
        self.Wh = Parameter(torch.rand(hidden_dim, input_dim) * np.sqrt(2 / (input_dim + hidden_dim)))
        self.Bh = Parameter(torch.rand(hidden_dim, 1))
        self.W = Parameter(torch.rand(vocab_size, hidden_dim) * np.sqrt(2 / (vocab_size + hidden_dim)))
        self.b = Parameter(torch.rand(vocab_size, 1))

    def forward(self, x, hidden=None):
        # x: seq_len * batch_size
        seq_len, batch_size = x.size()
        if hidden is None:
            Ht = x.new_zeros(self.hidden_dim, batch_size, dtype=torch.float)
        else:
            Ht = hidden
        embeds = self.embedding(x)  # seq * batch * embedding
        output = []
        for i in range(len(embeds)):
            xTmp = embeds[i].transpose(1, 0).contiguous()
            x_h = torch.cat((xTmp, Ht), 0).cuda()
            Rt = torch.sigmoid(self.Wr.mm(x_h) + self.Br)
            Zt = torch.sigmoid(self.Wz.mm(x_h) + self.Bz)
            Ht_ = torch.mul(Ht, Rt)
            x_h_ = torch.cat((xTmp, Ht_), 0).cuda()
            H_ = torch.tanh(self.Wh.mm(x_h_) + self.Bh)
            Ht = torch.mul(Zt, Ht) + torch.mul(1 - Zt, H_)
            y = self.W.mm(Ht) + self.b
            # no softmax: included in cross entropy loss
            y = y.transpose(1, 0).contiguous()  # y: batch_size, vocab
            output.append(y)
        output = torch.cat(output, 0)
        output = output.view(seq_len * batch_size, -1)
        #output: (seq * batchsize, vocab)
        return output, Ht
class lstmPoem(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, cell_dim=128):
        super(lstmPoem, self).__init__()
        self.hidden_dim = hidden_dim
        self.cell_dim = cell_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        input_dim = hidden_dim + embedding_dim

        # Initialize weights (uniform, scaled Glorot-style by the fan sizes)
        self.Wc = Parameter(torch.rand(cell_dim, input_dim) * np.sqrt(2 / (input_dim + cell_dim)))
        self.Bc = Parameter(torch.rand(cell_dim, 1))
        self.Wf = Parameter(torch.rand(cell_dim, input_dim) * np.sqrt(2 / (input_dim + cell_dim)))
        self.Bf = Parameter(torch.rand(cell_dim, 1))
        self.Wi = Parameter(torch.rand(cell_dim, input_dim) * np.sqrt(2 / (input_dim + cell_dim)))
        self.Bi = Parameter(torch.rand(cell_dim, 1))
        self.Wo = Parameter(torch.rand(cell_dim, input_dim) * np.sqrt(2 / (input_dim + cell_dim)))
        self.Bo = Parameter(torch.rand(cell_dim, 1))
        self.W = Parameter(torch.rand(vocab_size, hidden_dim) * np.sqrt(2 / (vocab_size + hidden_dim)))
        self.b = Parameter(torch.rand(vocab_size, 1))
        # self.gate = nn.Linear(input_dim, cell_dim)
        # self.output = nn.Linear(hidden_dim, vocab_size)
        # self.sigmoid = nn.Sigmoid()
        # self.tanh = nn.Tanh()

    def forward(self, x, hidden=None, cell=None):
        # x: seq_len * batch_size
        seq_len, batch_size = x.size()
        if hidden is None:
            Ht = x.new_zeros(self.hidden_dim, batch_size, dtype=torch.float)
        else:
            Ht = hidden
        if cell is None:
            Ct = x.new_zeros(self.cell_dim, batch_size, dtype=torch.float)
        else:
            Ct = cell
        embeds = self.embedding(x)  # seq * batch * embedding
        output = []
        for i in range(len(embeds)):
            # self.Bx: cell_dim * 1
            # Wx: cell_dim * input_dim
            # x_h: input_dim * batch_size
            # C: cell_dim * batch_size
            # H: hidden_dim * batch_size
            xTmp = embeds[i].transpose(1, 0).contiguous()
            x_h = torch.cat((xTmp, Ht), 0).cuda()
            Ft = torch.sigmoid(self.Wf.mm(x_h) + self.Bf)
            It = torch.sigmoid(self.Wi.mm(x_h) + self.Bi)
            Ot = torch.sigmoid(self.Wo.mm(x_h) + self.Bo)
            Ct_ = torch.tanh(self.Wc.mm(x_h) + self.Bc)
            Ct = torch.add(torch.mul(Ft, Ct), torch.mul(It, Ct_))
            Ht = torch.mul(torch.tanh(Ct), Ot)
            y = self.W.mm(Ht) + self.b
            # no softmax: included in cross entropy loss
            y = y.transpose(1, 0).contiguous()  # y: batch_size, vocab
            output.append(y)
        output = torch.cat(output, 0)
        output = output.view(seq_len * batch_size, -1)
        #output: (seq * batchsize, vocab)
        return output, Ht, Ct
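# Usage sketch for lstmPoem (illustrative sizes; assumes a CUDA device, since
# the recurrence concatenates onto .cuda() tensors). The flattened output
# lines up with targets reshaped to (seq_len * batch,) for CrossEntropyLoss.
def _demo_lstm_poem(vocab_size=1000, seq_len=12, batch=4):
    model = lstmPoem(vocab_size).cuda()
    x = torch.randint(0, vocab_size, (seq_len, batch)).cuda()  # token ids
    out, Ht, Ct = model(x)  # out: (seq_len * batch, vocab_size)
    return out.shape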
class GD_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size, output_activation):
        super(GD_MMU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size
        if output_activation == 'sigmoid':
            self.output_activation = torch.sigmoid
        elif output_activation == 'tanh':
            self.output_activation = torch.tanh
        else:
            self.output_activation = None

        #Input gate
        self.w_inpgate = Parameter(torch.rand(hidden_size, input_size), requires_grad=True)
        self.w_rec_inpgate = Parameter(torch.rand(hidden_size, output_size), requires_grad=True)
        self.w_mem_inpgate = Parameter(torch.rand(hidden_size, memory_size), requires_grad=True)

        #Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size), requires_grad=True)
        self.w_rec_inp = Parameter(torch.rand(hidden_size, output_size), requires_grad=True)

        #Read Gate
        self.w_readgate = Parameter(torch.rand(memory_size, input_size), requires_grad=True)
        self.w_rec_readgate = Parameter(torch.rand(memory_size, output_size), requires_grad=True)
        self.w_mem_readgate = Parameter(torch.rand(memory_size, memory_size), requires_grad=True)

        #Memory Decoder
        self.w_decoder = Parameter(torch.rand(hidden_size, memory_size), requires_grad=True)

        #Write Gate
        self.w_writegate = Parameter(torch.rand(memory_size, input_size), requires_grad=True)
        self.w_rec_writegate = Parameter(torch.rand(memory_size, output_size), requires_grad=True)
        self.w_mem_writegate = Parameter(torch.rand(memory_size, memory_size), requires_grad=True)

        #Memory Encoder
        self.w_encoder = Parameter(torch.rand(memory_size, hidden_size), requires_grad=True)

        #Output weights
        self.w_hid_out = Parameter(torch.rand(output_size, hidden_size), requires_grad=True)

        #Biases
        self.w_input_gate_bias = Parameter(torch.zeros(hidden_size, 1), requires_grad=True)
        self.w_block_input_bias = Parameter(torch.zeros(hidden_size, 1), requires_grad=True)
        self.w_readgate_bias = Parameter(torch.zeros(memory_size, 1), requires_grad=True)
        self.w_writegate_bias = Parameter(torch.zeros(memory_size, 1), requires_grad=True)

        # Adaptive components (recurrent state; kept on the GPU)
        self.mem = torch.zeros(self.memory_size, 1).cuda()
        self.out = torch.zeros(self.output_size, 1).cuda()

        for param in self.parameters():
            #torch.nn.init.xavier_normal_(param)
            #torch.nn.init.orthogonal_(param)
            #torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

    def reset(self, batch_size):
        # Adaptive components
        self.mem = torch.zeros(self.memory_size, batch_size).cuda()
        self.out = torch.zeros(self.output_size, batch_size).cuda()

    def graph_compute(self, input, rec_output, mem):
        #Block Input
        block_inp = torch.sigmoid(self.w_inp.mm(input) +
                                  self.w_rec_inp.mm(rec_output))  # + self.w_block_input_bias)

        #Input gate
        inp_gate = torch.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) +
                                 self.w_rec_inpgate.mm(rec_output))  # + self.w_input_gate_bias)

        #Input out
        inp_out = block_inp * inp_gate

        #Read gate
        read_gate_out = torch.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
                                      self.w_mem_readgate.mm(mem))  # + self.w_readgate_bias)

        #Compute hidden activation
        decoded_mem = self.w_decoder.mm(read_gate_out * mem)
        hidden_act = inp_out + decoded_mem

        #Write gate
        write_gate_out = torch.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) +
                                       self.w_rec_writegate.mm(rec_output))  # + self.w_writegate_bias)

        #Update memory
        encoded_update = torch.tanh(self.w_encoder.mm(hidden_act))
        mem = mem + write_gate_out * encoded_update

        output = self.w_hid_out.mm(hidden_act)
        if self.output_activation is not None:
            output = self.output_activation(output)
        return output, mem

    def bgraph_compute(self, input, rec_output, mem):
        # Alternate compute path without the memory encoder/decoder
        # projections (requires hidden_size == memory_size)
        #Block Input
        block_inp = torch.sigmoid(self.w_inp.mm(input) +
                                  self.w_rec_inp.mm(rec_output))  # + self.w_block_input_bias)

        #Input gate
        inp_gate = torch.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) +
                                 self.w_rec_inpgate.mm(rec_output))  # + self.w_input_gate_bias)

        #Input out
        inp_out = block_inp * inp_gate

        #Read gate
        read_gate_out = torch.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
                                      self.w_mem_readgate.mm(mem))  # + self.w_readgate_bias)

        #Compute hidden activation
        hidden_act = inp_out + read_gate_out * mem

        #Write gate
        write_gate_out = torch.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) +
                                       self.w_rec_writegate.mm(rec_output))  # + self.w_writegate_bias)

        #Update memory
        mem = mem + write_gate_out * torch.tanh(hidden_act)

        output = self.w_hid_out.mm(hidden_act)
        if self.output_activation is not None:
            output = self.output_activation(output)
        return output, mem

    def forward(self, input):
        self.out, self.mem = self.graph_compute(input, self.out, self.mem)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True

    def turn_grad_off(self):
        # For inference, wrap calls in torch.no_grad(); the old Variable.volatile
        # flag has been removed from PyTorch.
        for param in self.parameters():
            param.requires_grad = False
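# Rollout sketch for this GD_MMU variant (arbitrary sizes; requires CUDA
# because the recurrent state tensors live on the GPU). reset() must be called
# before the first forward() so mem/out match the batch size.
def _demo_gd_mmu(batch=32, steps=10):
    cell = GD_MMU(input_size=8, hidden_size=16, memory_size=12, output_size=4,
                  output_activation=None).cuda()
    cell.reset(batch)
    outs = [cell(torch.rand(8, batch).cuda()) for _ in range(steps)]
    return outs[-1].shape  # (output_size, batch)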
class GD_MMU(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(GD_MMU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size

        # The +1 columns fold the bias into each weight matrix; prep_bias()
        # appends the matching row of ones to the activations.
        #Input gate
        self.w_inpgate = Parameter(torch.rand(hidden_size, input_size + 1), requires_grad=True)
        self.w_rec_inpgate = Parameter(torch.rand(hidden_size, output_size + 1), requires_grad=True)
        self.w_mem_inpgate = Parameter(torch.rand(hidden_size, memory_size), requires_grad=True)

        #Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size + 1), requires_grad=True)
        self.w_rec_inp = Parameter(torch.rand(hidden_size, output_size + 1), requires_grad=True)

        #Read Gate
        self.w_readgate = Parameter(torch.rand(memory_size, input_size + 1), requires_grad=True)
        self.w_rec_readgate = Parameter(torch.rand(memory_size, output_size + 1), requires_grad=True)
        self.w_mem_readgate = Parameter(torch.rand(memory_size, memory_size), requires_grad=True)

        #Memory Decoder
        self.w_decoder = Parameter(torch.rand(hidden_size, memory_size), requires_grad=True)

        #Write Gate
        self.w_writegate = Parameter(torch.rand(memory_size, input_size + 1), requires_grad=True)
        self.w_rec_writegate = Parameter(torch.rand(memory_size, output_size + 1), requires_grad=True)
        self.w_mem_writegate = Parameter(torch.rand(memory_size, memory_size), requires_grad=True)

        #Memory Encoder
        self.w_encoder = Parameter(torch.rand(memory_size, hidden_size), requires_grad=True)

        #Memory init (learned initial memory column, tiled across the batch in reset)
        self.w_mem_init = Parameter(torch.rand(memory_size, 1), requires_grad=True)

        # Adaptive components (recurrent state; kept on the GPU; reset() gives
        # them their proper batch shape)
        self.mem = torch.ones(1, 1).cuda()
        self.out = torch.zeros(self.output_size, 1).cuda()

        for param in self.parameters():
            #torch.nn.init.xavier_normal_(param)
            #torch.nn.init.orthogonal_(param)
            #torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

        #Gates to 1
        # self.w_writegate = Parameter(torch.ones(memory_size, input_size), requires_grad=True)
        # self.w_rec_writegate = Parameter(torch.ones(memory_size, output_size), requires_grad=True)
        # self.w_mem_writegate = Parameter(torch.ones(memory_size, memory_size), requires_grad=True)
        # self.w_readgate = Parameter(torch.ones(memory_size, input_size), requires_grad=True)
        # self.w_rec_readgate = Parameter(torch.ones(memory_size, output_size), requires_grad=True)
        # self.w_mem_readgate = Parameter(torch.ones(memory_size, memory_size), requires_grad=True)
        # self.w_inpgate = Parameter(torch.ones(hidden_size, input_size), requires_grad=True)
        # self.w_rec_inpgate = Parameter(torch.ones(hidden_size, output_size), requires_grad=True)
        # self.w_mem_inpgate = Parameter(torch.ones(hidden_size, memory_size), requires_grad=True)

    def reset(self, batch_size):
        # Adaptive components: tile the learned initial memory across the batch
        # (ones, not zeros, so w_mem_init actually reaches the state)
        self.mem = self.w_mem_init.mm(torch.ones(1, batch_size).cuda())
        self.out = torch.zeros(self.output_size, batch_size).cuda()

    def prep_bias(self, mat, batch_size):
        # Append a row of ones so the +1 bias column of each weight matrix applies
        return torch.cat((mat, torch.ones(1, batch_size, device=mat.device)))

    def bgraph_compute(self, input, rec_output, memory, batch_size):
        # Alternate compute path kept from the source; the w_mem_* matrices
        # carry no bias column, so memory is used without a bias row
        #Reshape: append the bias row of ones
        input = self.prep_bias(input, batch_size)
        rec_output = self.prep_bias(rec_output, batch_size)

        #Input process
        block_inp = torch.tanh(self.w_inp.mm(input) + self.w_rec_inp.mm(rec_output))  #Block Input
        inp_gate = torch.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(memory) +
                                 self.w_rec_inpgate.mm(rec_output))  #Input gate

        #Read from memory
        read_gate_out = torch.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
                                      self.w_mem_readgate.mm(memory))
        decoded_mem = self.w_decoder.mm(read_gate_out * memory)

        # Compute hidden activation
        hidden_act = block_inp * inp_gate + decoded_mem

        #Update memory
        write_gate_out = torch.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(memory) +
                                       self.w_rec_writegate.mm(rec_output))  #Write gate
        encoded_update = torch.tanh(self.w_encoder.mm(hidden_act))
        memory = memory + write_gate_out * encoded_update

        return hidden_act, memory

    def graph_compute(self, input, rec_output, memory, batch_size):
        #Reshape: append the bias row of ones
        input = self.prep_bias(input, batch_size)
        rec_output = self.prep_bias(rec_output, batch_size)

        #Input process
        block_inp = torch.tanh(self.w_inp.mm(input) + self.w_rec_inp.mm(rec_output))  #Block Input
        inp_gate = torch.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(memory) +
                                 self.w_rec_inpgate.mm(rec_output))  #Input gate

        #Read from memory
        read_gate_out = torch.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
                                      self.w_mem_readgate.mm(memory))
        decoded_mem = self.w_decoder.mm(read_gate_out * memory)

        # Compute hidden activation
        hidden_act = block_inp * inp_gate + decoded_mem

        #Update memory
        write_gate_out = torch.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(memory) +
                                       self.w_rec_writegate.mm(rec_output))  #Write gate
        encoded_update = torch.tanh(self.w_encoder.mm(hidden_act))
        memory = memory + write_gate_out * encoded_update

        return hidden_act, memory

    def forward(self, input):
        batch_size = input.shape[-1]
        self.out, self.mem = self.graph_compute(input, self.out, self.mem, batch_size)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True

    def turn_grad_off(self):
        # For inference, wrap calls in torch.no_grad(); the old Variable.volatile
        # flag has been removed from PyTorch.
        for param in self.parameters():
            param.requires_grad = False
class GD_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, memory_size, output_size):
        super(GD_LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size

        # The +1 columns fold the bias into each weight matrix (see prep_bias).
        # Input gate
        self.w_inpgate = Parameter(torch.rand(hidden_size, input_size + 1), requires_grad=True)
        self.w_rec_inpgate = Parameter(torch.rand(hidden_size, output_size + 1), requires_grad=True)
        self.w_mem_inpgate = Parameter(torch.rand(hidden_size, memory_size), requires_grad=True)

        # Block Input
        self.w_inp = Parameter(torch.rand(hidden_size, input_size + 1), requires_grad=True)
        self.w_rec_inp = Parameter(torch.rand(hidden_size, output_size + 1), requires_grad=True)

        # Read Gate
        self.w_readgate = Parameter(torch.rand(memory_size, input_size + 1), requires_grad=True)
        self.w_rec_readgate = Parameter(torch.rand(memory_size, output_size + 1), requires_grad=True)
        self.w_mem_readgate = Parameter(torch.rand(memory_size, memory_size), requires_grad=True)

        # Write Gate
        self.w_writegate = Parameter(torch.rand(memory_size, input_size + 1), requires_grad=True)
        self.w_rec_writegate = Parameter(torch.rand(memory_size, output_size + 1), requires_grad=True)
        self.w_mem_writegate = Parameter(torch.rand(memory_size, memory_size), requires_grad=True)

        # Adaptive components (recurrent state; kept on the GPU)
        self.mem = torch.zeros(self.memory_size, 1).cuda()
        self.out = torch.zeros(self.output_size, 1).cuda()

        for param in self.parameters():
            # torch.nn.init.xavier_normal_(param)
            # torch.nn.init.orthogonal_(param)
            # torch.nn.init.sparse_(param, sparsity=0.5)
            torch.nn.init.kaiming_normal_(param)

        # Gates to 1
        # self.w_writegate = Parameter(torch.ones(memory_size, input_size), requires_grad=True)
        # self.w_rec_writegate = Parameter(torch.ones(memory_size, output_size), requires_grad=True)
        # self.w_mem_writegate = Parameter(torch.ones(memory_size, memory_size), requires_grad=True)
        # self.w_readgate = Parameter(torch.ones(memory_size, input_size), requires_grad=True)
        # self.w_rec_readgate = Parameter(torch.ones(memory_size, output_size), requires_grad=True)
        # self.w_mem_readgate = Parameter(torch.ones(memory_size, memory_size), requires_grad=True)
        # self.w_inpgate = Parameter(torch.ones(hidden_size, input_size), requires_grad=True)
        # self.w_rec_inpgate = Parameter(torch.ones(hidden_size, output_size), requires_grad=True)
        # self.w_mem_inpgate = Parameter(torch.ones(hidden_size, memory_size), requires_grad=True)

    def prep_bias(self, mat, batch_size):
        # Append a row of ones so the +1 bias column of each weight matrix applies
        return torch.cat((mat, torch.ones(1, batch_size, device=mat.device)))

    def reset(self, batch_size):
        # Adaptive components
        self.mem = torch.zeros(self.memory_size, batch_size).cuda()
        self.out = torch.zeros(self.output_size, batch_size).cuda()

    def graph_compute(self, input, rec_output, mem, batch_size):
        # Reshape: append the bias row of ones
        input = self.prep_bias(input, batch_size)
        rec_output = self.prep_bias(rec_output, batch_size)

        # Block Input
        block_inp = torch.tanh(self.w_inp.mm(input) + self.w_rec_inp.mm(rec_output))

        # Input gate
        inp_gate = torch.sigmoid(self.w_inpgate.mm(input) + self.w_mem_inpgate.mm(mem) +
                                 self.w_rec_inpgate.mm(rec_output))

        # Input out
        inp_out = block_inp * inp_gate

        # Read gate
        read_gate_out = torch.sigmoid(self.w_readgate.mm(input) + self.w_rec_readgate.mm(rec_output) +
                                      self.w_mem_readgate.mm(mem))

        # Output gate
        out_gate = torch.sigmoid(self.w_writegate.mm(input) + self.w_mem_writegate.mm(mem) +
                                 self.w_rec_writegate.mm(rec_output))

        # Compute new mem
        mem = inp_out + read_gate_out * mem
        out = out_gate * mem
        return out, mem

    def forward(self, input):
        batch_size = input.shape[-1]
        self.out, self.mem = self.graph_compute(input, self.out, self.mem, batch_size)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True

    def turn_grad_off(self):
        # For inference, wrap calls in torch.no_grad(); the old Variable.volatile
        # flag has been removed from PyTorch.
        for param in self.parameters():
            param.requires_grad = False
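# Rollout sketch for GD_LSTM (arbitrary sizes; requires CUDA). The cell update
# mem = inp_out + read_gate_out * mem and out = out_gate * mem only
# type-checks when hidden_size == memory_size == output_size, as in the
# Stacked_LSTM usages above.
def _demo_gd_lstm(batch=32):
    cell = GD_LSTM(input_size=8, hidden_size=16, memory_size=16, output_size=16).cuda()
    cell.reset(batch)
    y = cell(torch.rand(8, batch).cuda())
    return y.shape  # (output_size, batch)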
class MMU(nn.Module):
    def __init__(self, input_dim, hid_dim, mem_dim, out_dim):
        super(MMU, self).__init__()
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.mem_dim = mem_dim
        self.out_dim = out_dim
        # Note: the encoder/decoder arithmetic and the output recurrence below
        # assume hid_dim == mem_dim == out_dim.

        # Input gate
        self.w_inpgate = nn.Linear(input_dim, hid_dim)
        self.w_rec_inpgate = nn.Linear(out_dim, hid_dim)
        self.w_mem_inpgate = nn.Linear(mem_dim, hid_dim)

        # Block Input
        self.w_inp = nn.Linear(input_dim, hid_dim)
        self.w_rec_inp = nn.Linear(out_dim, hid_dim)

        # Read Gate
        self.w_readgate = nn.Linear(input_dim, mem_dim)
        self.w_rec_readgate = nn.Linear(out_dim, mem_dim)
        self.w_mem_readgate = nn.Linear(mem_dim, mem_dim)

        # Memory Decoder
        self.w_decoder = nn.Linear(hid_dim, mem_dim)

        # Write Gate
        self.w_writegate = nn.Linear(input_dim, mem_dim)
        self.w_rec_writegate = nn.Linear(out_dim, mem_dim)
        self.w_mem_writegate = nn.Linear(mem_dim, mem_dim)

        # Memory Encoder
        self.w_encoder = nn.Linear(mem_dim, hid_dim)

        #Adaptive components
        self.mem = None
        self.out = None

        #Output weights (maps the final hidden activation to out_dim)
        self.w_hid_out = Parameter(torch.rand(out_dim, hid_dim), requires_grad=True)

        # History for RRN
        self.hist_steps = 5
        self.rnn_history = []  #np.zeros([5,20,20])

    def reset(self, batch_size):
        # Adaptive components
        self.mem = torch.zeros(batch_size, self.mem_dim, requires_grad=True)  #.cuda()
        self.out = torch.zeros(batch_size, self.out_dim, requires_grad=True)  #.cuda()
        self.rnn_history = []  #Variable(torch.zeros())

    def predict(self, input):
        return self.forward(input)

    def graph_compute(self, input, rec_output, memory):
        # Input process
        #block_inp = torch.sigmoid(self.w_inp(input) + self.w_rec_inp(rec_output))
        # Block Input
        block_inp = torch.sigmoid(self.w_inp(torch.t(input)) + self.w_rec_inp(rec_output))
        inp_gate = torch.sigmoid(self.w_inpgate(torch.t(input)) + self.w_mem_inpgate(memory) +
                                 self.w_rec_inpgate(rec_output))  #Input gate

        # Read from memory
        read_gate_out = torch.sigmoid(self.w_readgate(torch.t(input)) + self.w_mem_readgate(memory) +
                                      self.w_rec_readgate(rec_output))
        decoded_mem = self.w_decoder(read_gate_out * memory)

        # Compute hidden activation
        hidden_act = decoded_mem + block_inp * inp_gate

        # Update memory (gated interpolation between old memory and new content)
        write_gate_out = torch.sigmoid(self.w_writegate(torch.t(input)) + self.w_mem_writegate(memory) +
                                       self.w_rec_writegate(rec_output))  #Write gate
        encoded_update = torch.tanh(self.w_encoder(hidden_act))
        memory = (1 - write_gate_out) * memory + write_gate_out * encoded_update
        #memory = memory + encoded_update
        return hidden_act, memory

    def forward(self, input):
        # Adaptive components
        self.mem = torch.zeros(input.shape[1], self.mem_dim, requires_grad=True)  #.cuda()
        self.out = torch.zeros(input.shape[1], self.out_dim, requires_grad=True)  #.cuda()

        '''Create a history of n time-steps and loop graph_compute n times to
        generate the final output'''
        if not torch.is_tensor(self.rnn_history):
            self.rnn_history = torch.zeros(input.shape[0], self.hist_steps,
                                           input.shape[1])  #control_inputs, history, batch_size

        # Shift the history and update to the latest input
        for i in range(self.hist_steps - 1):
            self.rnn_history[:, i, :] = self.rnn_history[:, i + 1, :]

        '''Handle a batch-size change: the last batch can be smaller,
        so zero-pad it to the history's batch dimension'''
        if input.shape != self.rnn_history[:, -1, :].shape:
            temp = copy.deepcopy(input)
            input = torch.zeros(self.rnn_history.shape[0], self.rnn_history.shape[2])
            input[0:temp.shape[0], 0:temp.shape[1]] = temp
        self.rnn_history[:, -1, :] = input

        # Loop over the history to generate the final output
        for i in range(self.hist_steps):
            out, mem = self.graph_compute(self.rnn_history[:, i, :], self.out, self.mem)
            self.out, self.mem = out, mem

        self.out = self.w_hid_out.mm(torch.t(self.out))
        self.out = torch.t(self.out)

        '''Old working code without history'''
        #self.out, self.mem = self.graph_compute(input, self.out, self.mem)
        # Till here, "out" is the hidden_act
        #self.out = self.w_hid_out.mm(torch.t(self.out))
        #self.out = torch.t(self.out)
        return self.out

    def turn_grad_on(self):
        for param in self.parameters():
            param.requires_grad = True

    def turn_grad_off(self):
        for param in self.parameters():
            param.requires_grad = False
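# Usage sketch for the nn.Linear-based MMU (CPU-friendly; the .cuda() calls
# are commented out in the class). The recurrence assumes
# hid_dim == mem_dim == out_dim; inputs follow the (input_dim, batch) layout
# that graph_compute transposes internally.
def _demo_mmu(batch=32):
    net = MMU(input_dim=8, hid_dim=16, mem_dim=16, out_dim=16)
    net.reset(batch)
    y = net(torch.rand(8, batch))
    return y.shape  # (batch, out_dim)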