import unittest
import torch
import torch.testing as tt
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# Project-local imports; module paths are assumed from the repository layout.
from deepblast.language_model import BiLM, pretrained_language_models
from deepblast.dataset.alphabet import UniprotTokenizer


class TestBiLM(unittest.TestCase):

    def setUp(self):
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer()

    def test_bilm(self):
        toks = torch.Tensor(self.tokenizer(b'ABC')).long().unsqueeze(0)
        res = self.embedding(toks)
        self.assertEqual(res.shape, (1, 3, 21))

    @unittest.skip('something is misbehaving here.')
    def test_bilm_batch(self):
        toks = torch.Tensor([[0, 20, 4, 3],
                             [0, 20, 4, 0]]).long()
        lens = torch.Tensor([4, 3]).long()
        idx = pack_padded_sequence(toks, lens, batch_first=True)
        res = self.embedding(idx.data)
        x, xlen = pad_packed_sequence(res)
        tt.assert_allclose(xlen, lens)
        tt.assert_allclose(x, toks)
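# A minimal usage sketch (not part of the test suite): embed a single
# protein with the pretrained BiLM, mirroring setUp/test_bilm above. The
# sequence b'MKTAYIAK' is an arbitrary example; per the assertion in
# test_bilm, the model returns one 21-dimensional vector per residue.
def _example_bilm_embedding():
    tokenizer = UniprotTokenizer()
    lm = BiLM()
    lm.load_state_dict(torch.load(pretrained_language_models['bilstm']))
    lm.eval()
    # Build a 1 x L batch of token ids.
    toks = torch.Tensor(tokenizer(b'MKTAYIAK')).long().unsqueeze(0)
    with torch.no_grad():
        return lm(toks)  # shape 1 x L x 21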
import torch
import torch.nn as nn
import torch.nn.functional as F
# Project-local imports; module paths are assumed from the repository layout.
from deepblast.language_model import BiLM, pretrained_language_models
from deepblast.embedding import StackedRNN, EmbedLinear
from deepblast.dataset.utils import unpack_sequences
from deepblast.nw_cuda import NeedlemanWunschDecoder as NWDecoderCUDA


class NeedlemanWunschAligner(nn.Module):

    def __init__(self, n_alpha, n_input, n_units, n_embed,
                 n_layers=2, lm=None, device='gpu'):
        """ NeedlemanWunsch alignment model.

        Parameters
        ----------
        n_alpha : int
            Size of the alphabet (default 22).
        n_input : int
            Input dimensions.
        n_units : int
            Number of hidden units in the RNN.
        n_embed : int
            Embedding dimension.
        n_layers : int
            Number of RNN layers.
        lm : BiLM
            Pretrained language model (optional); if None, the default
            pretrained BiLM is loaded.
        device : str
            Either 'gpu' or 'cpu'; only the CUDA decoder is implemented
            at the moment (see the TODO below).
        """
        super(NeedlemanWunschAligner, self).__init__()
        if lm is None:
            path = pretrained_language_models['bilstm']
            lm = BiLM()
            lm.load_state_dict(torch.load(path))
            lm.eval()
        self.lm = lm
        if n_layers > 1:
            self.match_embedding = StackedRNN(
                n_alpha, n_input, n_units, n_embed, n_layers, lm=self.lm)
            self.gap_embedding = StackedRNN(
                n_alpha, n_input, n_units, n_embed, n_layers, lm=self.lm)
        else:
            self.match_embedding = EmbedLinear(
                n_alpha, n_input, n_embed, lm=self.lm)
            self.gap_embedding = EmbedLinear(
                n_alpha, n_input, n_embed, lm=self.lm)
        # TODO: make a CPU-compatible version
        # if device == 'cpu':
        #     self.nw = NWDecoderCPU(operator='softmax')
        # else:
        self.nw = NWDecoderCUDA(operator='softmax')

    def forward(self, x, order):
        """ Generate the alignment matrix.

        Parameters
        ----------
        x : PackedSequence
            Packed sequence object of proteins to align.
        order : np.array
            The original order of the sequences.

        Returns
        -------
        aln : torch.Tensor
            Alignment matrix (dim B x N x M).
        theta : torch.Tensor
            Match scores (dim B x N x M).
        A : torch.Tensor
            Gap scores (dim B x N x M).
        """
        with torch.enable_grad():
            zx, _, zy, _ = unpack_sequences(self.match_embedding(x), order)
            gx, _, gy, _ = unpack_sequences(self.gap_embedding(x), order)
            # Obtain theta through an inner product across latent dimensions
            theta = F.softplus(torch.einsum('bid,bjd->bij', zx, zy))
            A = F.logsigmoid(torch.einsum('bid,bjd->bij', gx, gy))
            aln = self.nw.decode(theta, A)
            return aln, theta, A

    def traceback(self, x, order):
        """ Decode an alignment traceback for each pair in the batch,
        yielding one (decoded, aln) tuple per batch element. """
        # dim B x N x D
        with torch.enable_grad():
            zx, _, zy, _ = unpack_sequences(self.match_embedding(x), order)
            gx, xlen, gy, ylen = unpack_sequences(self.gap_embedding(x), order)
            match = F.softplus(torch.einsum('bid,bjd->bij', zx, zy))
            gap = F.logsigmoid(torch.einsum('bid,bjd->bij', gx, gy))
            B, _, _ = match.shape
            for b in range(B):
                # Trim padding before decoding each pairwise alignment.
                aln = self.nw.decode(
                    match[b, :xlen[b], :ylen[b]].unsqueeze(0),
                    gap[b, :xlen[b], :ylen[b]].unsqueeze(0)
                )
                decoded = self.nw.traceback(aln.squeeze())
                yield decoded, aln
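# A minimal usage sketch (not part of the module): align a pair of
# sequences end to end. UniprotTokenizer and pack_sequences are imported
# locally with assumed module paths, and a CUDA device is required since
# only NWDecoderCUDA is implemented.
def _example_alignment():
    from deepblast.dataset.alphabet import UniprotTokenizer  # path assumed
    from deepblast.dataset.utils import pack_sequences       # path assumed
    tokenizer = UniprotTokenizer(pad_ends=False)
    aligner = NeedlemanWunschAligner(22, 1024, 1024, 1024).cuda()
    x = torch.Tensor(tokenizer(b'ARNDCQ')).long().cuda()
    y = torch.Tensor(tokenizer(b'ARNDCQE')).long().cuda()
    seq, order = pack_sequences([x], [y])
    aln, theta, A = aligner(seq, order)  # B x N x M score matrices
    # traceback is a generator: one (decoded, aln) pair per batch element.
    decoded, _ = next(aligner.traceback(seq, order))
    return aln, decoded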
# The aligner tests below additionally rely on these helpers
# (module paths are assumed from the repository layout).
from deepblast.dataset.utils import pack_sequences
from deepblast.dataset.dataset import collate_f


class TestAlignmentModel(unittest.TestCase):

    def setUp(self):
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer(pad_ends=False)
        nalpha, ninput, nunits, nembed = 22, 1024, 1024, 1024
        self.aligner = NeedlemanWunschAligner(nalpha, ninput, nunits, nembed)

    @unittest.skipUnless(torch.cuda.is_available(), "No GPU detected")
    def test_alignment(self):
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        x = torch.Tensor(
            self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda()
        y = torch.Tensor(
            self.tokenizer(b'ARNDCQEGHILKARNDCQMFPSTWYVXOUBZ')).long().cuda()
        N, M = x.shape[0], y.shape[0]
        M = max(N, M)
        seq, order = pack_sequences([x], [y])
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (1, M, M))

    @unittest.skipUnless(torch.cuda.is_available(), "No GPU detected")
    def test_batch_alignment(self):
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        length = len('ARNDCQEGHILKMFPSTWYVXOUBZ')
        x1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
        x2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWY')
        y1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
        y2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYV')
        x = [torch.Tensor(x1).cuda().long(), torch.Tensor(x2).cuda().long()]
        y = [torch.Tensor(y1).cuda().long(), torch.Tensor(y2).cuda().long()]
        seq, order = pack_sequences(x, y)
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (2, length, length))
        self.assertEqual(theta.shape, (2, length, length))

    @unittest.skipUnless(torch.cuda.is_available(), "No GPU detected")
    def test_collate_alignment(self):
        M = 5
        x1 = torch.Tensor(self.tokenizer(b'NDCQ')).long()
        x2 = torch.Tensor(self.tokenizer(b'NDC')).long()
        y1 = torch.Tensor(self.tokenizer(b'ND')).long()
        y2 = torch.Tensor(self.tokenizer(b'NDCQE')).long()
        s1 = torch.Tensor([1, 1, 1, 0]).long()
        s2 = torch.Tensor([1, 1, 2, 2, 2]).long()
        A1 = torch.ones((len(x1), len(y1))).long()
        A2 = torch.ones((len(x2), len(y2))).long()
        P1 = torch.ones((len(x1), len(y1))).long()
        P2 = torch.ones((len(x2), len(y2))).long()
        G1 = torch.ones((len(x1), len(y1))).long()
        G2 = torch.ones((len(x2), len(y2))).long()
        batch = [(x1, y1, s1, A1, P1, G1), (x2, y2, s2, A2, P2, G2)]
        gene_codes, other_codes, states, dm, p, g = collate_f(batch)
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        seq, order = pack_sequences(gene_codes, other_codes)
        seq = seq.cuda()
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (2, M, M))
        self.assertEqual(theta.shape, (2, M, M))


if __name__ == '__main__':
    unittest.main()