Example #1
 def test_step(self, batch, batch_idx):
     genes, others, s, A, P, G, gene_names, other_names = batch
     seq, order = pack_sequences(genes, others)
     predA, theta, gap = self.aligner(seq, order)
     x, xlen, y, ylen = unpack_sequences(seq, order)
     loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
     assert not torch.isnan(loss).item()
     # Obtain alignment statistics + visualizations
     gen = self.aligner.traceback(seq, order)
     # TODO: compare the traceback and the forward
     statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A, predA,
                                        theta, gap, batch_idx)
     assert len(statistics) > 0, (batch_idx, s)
     # Decode token tensors back to sequence strings.
     genes = [
         self.tokenizer.alphabet.decode(
             g.detach().cpu().numpy()).decode("utf-8") for g in genes
     ]
     others = [
         self.tokenizer.alphabet.decode(
             o.detach().cpu().numpy()).decode("utf-8") for o in others
     ]
     statistics = pd.DataFrame(statistics,
                               columns=[
                                   'test_tp', 'test_fp', 'test_fn',
                                   'test_perc_id', 'test_ppv', 'test_fnr',
                                   'test_fdr'
                               ])
     statistics['query_name'] = gene_names
     statistics['key_name'] = other_names
     return statistics
Example #2
 def test_pack_sequences(self):
     X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
     Y = [
         torch.Tensor([21, 10, 12, 2, 4, 5]),
         torch.Tensor([1, 4, 11, 13, 14])
     ]
     res, order = pack_sequences(X, Y)
     npt.assert_allclose(order, np.array([2, 3, 1, 0]))
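
The asserted permutation is consistent with pack_sequences ordering all packed sequences (X followed by Y) by descending length, as torch's packed-sequence utilities expect. A minimal sketch of that invariant, using only NumPy (variable names are illustrative):

import numpy as np

# Lengths of X + Y in the test above: [3, 4, 6, 5].
lengths = np.array([3, 4, 6, 5])
# Sorting indices by descending length reproduces the asserted order.
print(np.argsort(-lengths))  # [2 3 1 0]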
Example #3
 def align(self, x, y):
     """Align two raw sequence strings; returns the predicted state string."""
     x_code = torch.Tensor(self.tokenizer(str.encode(x))).long()
     y_code = torch.Tensor(self.tokenizer(str.encode(y))).long()
     x_code = x_code.to(self.device)
     y_code = y_code.to(self.device)
     seq, order = pack_sequences([x_code], [y_code])
     gen = self.aligner.traceback(seq, order)
     decoded, _ = next(gen)
     pred_x, pred_y, pred_states = zip(*decoded)
     s = ''.join(list(map(revstate_f, pred_states)))
     return s
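
A minimal usage sketch for the method above; model is assumed to be a loaded LightningAligner (see Example #8 for checkpoint loading), and the sequences are illustrative:

x = 'ARNDCQEGHILKMFPSTWYV'
y = 'ARNDCQMFPSTWYV'
states = model.align(x, y)
print(states)  # one state symbol per alignment column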
Example #4
 def test_alignment(self):
     self.embedding = self.embedding.cuda()
     self.aligner = self.aligner.cuda()
     x = torch.Tensor(
         self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda()
     y = torch.Tensor(
         self.tokenizer(b'ARNDCQEGHILKARNDCQMFPSTWYVXOUBZ')).long().cuda()
     N, M = x.shape[0], y.shape[0]
     M = max(N, M)  # the alignment matrix is padded to the longer length
     seq, order = pack_sequences([x], [y])
     aln, theta, A = self.aligner(seq, order)
     self.assertEqual(aln.shape, (1, M, M))
Example #5
 def test_unpack_sequences(self):
     X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
     Y = [
         torch.Tensor([21, 10, 12, 2, 4, 5]),
         torch.Tensor([1, 4, 11, 13, 14])
     ]
     z, order = pack_sequences(X, Y)
     resX, xlen, resY, ylen = unpack_sequences(z, order)
     tt.assert_allclose(xlen, torch.Tensor([3, 4]).long())
     tt.assert_allclose(ylen, torch.Tensor([6, 5]).long())
     # Unpacked tensors return to input order, zero-padded to the batch max.
     expX = torch.Tensor([[6, 4, 5, 0, 0, 0], [1, 4, 5, 7, 0, 0]])
     expY = torch.Tensor([[21, 10, 12, 2, 4, 5], [1, 4, 11, 13, 14, 0]])
     tt.assert_allclose(expX, resX)
     tt.assert_allclose(expY, resY)
Example #6
 def training_step(self, batch, batch_idx):
     self.aligner.train()
     genes, others, s, A, P, G = batch
     seq, order = pack_sequences(genes, others)
     predA, theta, gap = self.aligner(seq, order)
     _, xlen, _, ylen = unpack_sequences(seq, order)
     loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
     assert not torch.isnan(loss).item()
     if len(self.trainer.lr_schedulers) >= 1:
         scheduler = self.trainer.lr_schedulers[0]['scheduler']
         current_lr = scheduler.get_last_lr()[0]
     else:
         current_lr = self.hparams.learning_rate
     # Log the loss and the current learning rate to TensorBoard.
     tensorboard_logs = {'train_loss': loss, 'lr': current_lr}
     return {'loss': loss, 'log': tensorboard_logs}
Example #7
 def test_batch_alignment(self):
     self.embedding = self.embedding.cuda()
     self.aligner = self.aligner.cuda()
     length = len('ARNDCQEGHILKMFPSTWYVXOUBZ')
     x1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
     x2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWY')
     y1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
     y2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYV')
     x = [torch.Tensor(x1).cuda().long(), torch.Tensor(x2).cuda().long()]
     y = [torch.Tensor(y1).cuda().long(), torch.Tensor(y2).cuda().long()]
     seq, order = pack_sequences(x, y)
     aln, theta, A = self.aligner(seq, order)
     self.assertEqual(aln.shape, (2, length, length))
     self.assertEqual(theta.shape, (2, length, length))
Example #8
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned

    The model on its own takes between 740MiB (Titan X, torch 1.5) and 1284MiB (RTX 8000, torch 1.7)

    Note that the batch size has much less of an impact for DeepBLAST than for the embedders
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(list(map(revstate_f, pred_states)))
            x_aligned, y_aligned = states2alignment(pred_alignment,
                                                    query_by_id[query],
                                                    target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
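
A hypothetical driver for the function above; the checkpoint path, IDs, and sequences are placeholders rather than values from the source:

pairings = [('q1', 't1')]
query_by_id = {'q1': 'ARNDCQEGHILKMFPSTWYV'}
target_by_id = {'t1': 'ARNDCQMFPSTWYV'}
alignments = deepblast_align(
    pairings, query_by_id, target_by_id,
    model_file='deepblast-checkpoint.ckpt',  # placeholder path
    device=torch.device('cpu'),
    batch_size=1,
)
for query, target, x_aligned, y_aligned in alignments:
    print(query, target)
    print(x_aligned)
    print(y_aligned)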
Example #9
 def validation_step(self, batch, batch_idx):
     genes, others, s, A, P, G = batch
     seq, order = pack_sequences(genes, others)
     predA, theta, gap = self.aligner(seq, order)
     x, xlen, y, ylen = unpack_sequences(seq, order)
     loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
     assert not torch.isnan(loss).item()
     # Obtain alignment statistics + visualizations
     gen = self.aligner.traceback(seq, order)
     # TODO: compare the traceback and the forward
     statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A, predA,
                                        theta, gap, batch_idx)
     statistics = pd.DataFrame(statistics,
                               columns=[
                                   'val_tp', 'val_fp', 'val_fn',
                                   'val_perc_id', 'val_ppv', 'val_fnr',
                                   'val_fdr'
                               ])
     statistics = statistics.mean(axis=0).to_dict()
     tensorboard_logs = {'valid_loss': loss}
     tensorboard_logs = {**tensorboard_logs, **statistics}
     return {'validation_loss': loss, 'log': tensorboard_logs}
Example #10
    def test_collate_alignment(self):
        M = 5
        x1 = torch.Tensor(self.tokenizer(b'NDCQ')).long()
        x2 = torch.Tensor(self.tokenizer(b'NDC')).long()
        y1 = torch.Tensor(self.tokenizer(b'ND')).long()
        y2 = torch.Tensor(self.tokenizer(b'NDCQE')).long()
        s1 = torch.Tensor([1, 1, 1, 0]).long()
        s2 = torch.Tensor([1, 1, 2, 2, 2]).long()
        A1 = torch.ones((len(x1), len(y1))).long()
        A2 = torch.ones((len(x2), len(y2))).long()
        P1 = torch.ones((len(x1), len(y1))).long()
        P2 = torch.ones((len(x2), len(y2))).long()
        G1 = torch.ones((len(x1), len(y1))).long()
        G2 = torch.ones((len(x2), len(y2))).long()

        batch = [(x1, y1, s1, A1, P1, G1), (x2, y2, s2, A2, P2, G2)]
        gene_codes, other_codes, states, dm, p, g = collate_f(batch)
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        seq, order = pack_sequences(gene_codes, other_codes)
        seq = seq.cuda()
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (2, M, M))
        self.assertEqual(theta.shape, (2, M, M))
Example #11
 def score(self, x_code, y_code):
     """Score lists of pre-tokenized sequences with the aligner."""
     seq, order = pack_sequences(x_code, y_code)
     seq = seq.to(self.device)
     A = self.aligner.score(seq, order)
     return A
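
A sketch of calling score with pre-tokenized inputs, mirroring the tokenization in Example #3 (model is again assumed to be a loaded LightningAligner):

x_code = torch.Tensor(model.tokenizer(b'ARNDCQEGHILK')).long()
y_code = torch.Tensor(model.tokenizer(b'ARNDCQMFPSTW')).long()
# pack_sequences expects lists of tensors, hence the single-element lists.
A = model.score([x_code], [y_code])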
Example #12
# Read sequences from a FASTA file into a dict keyed by record ID.
# (Loop header and args.input reconstructed; the snippet began mid-loop.)
seqs = {}
with open(args.input) as handle:
	for i in handle:
		i = i.rstrip()
		if i.startswith('>'):
			ID = i[1:]
			continue
		seqs[ID] = seqs.get(ID, '') + i

# Align the first two records in the file.
keys_list = list(seqs)
x = seqs[keys_list[0]]
y = seqs[keys_list[1]]
pred_alignment = model.align(x, y)

x_aligned, y_aligned = states2alignment(pred_alignment, x, y)

# Write the aligned pair in FASTA format.
with open(args.output, "w") as out:
	out.write(">%s\n%s\n" % (keys_list[0], x_aligned))
	out.write(">%s\n%s\n" % (keys_list[1], y_aligned))

print(x_aligned)
print(pred_alignment)
print(y_aligned)

x_ = torch.Tensor(model.tokenizer(str.encode(x))).long()
y_ = torch.Tensor(model.tokenizer(str.encode(y))).long()

seq, order = pack_sequences([x_], [y_])

# seq stays on the CPU here; move it to the model's device when the model
# runs on a GPU (compare Example #11).
score = model.aligner.score(seq, order).item()
print('Score', score)