Example #1
 def test_step(self, batch, batch_idx):
     genes, others, s, A, P, G, gene_names, other_names = batch
     seq, order = pack_sequences(genes, others)
     predA, theta, gap = self.aligner(seq, order)
     x, xlen, y, ylen = unpack_sequences(seq, order)
     loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
     assert not torch.isnan(loss).item()
     # Obtain alignment statistics + visualizations
     gen = self.aligner.traceback(seq, order)
     # TODO: compare the traceback and the forward
     statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A, predA,
                                        theta, gap, batch_idx)
     assert len(statistics) > 0, (batch_idx, s)
     # Decode token tensors back to sequence strings.
     genes = [
         self.tokenizer.alphabet.decode(
             g.detach().cpu().numpy()).decode("utf-8") for g in genes
     ]
     others = [
         self.tokenizer.alphabet.decode(
             o.detach().cpu().numpy()).decode("utf-8") for o in others
     ]
     statistics = pd.DataFrame(statistics,
                               columns=[
                                   'test_tp', 'test_fp', 'test_fn',
                                   'test_perc_id', 'test_ppv', 'test_fnr',
                                   'test_fdr'
                               ])
     statistics['query_name'] = gene_names
     statistics['key_name'] = other_names
     return statistics
Example #2
 def test_pack_sequences(self):
     X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
     Y = [
         torch.Tensor([21, 10, 12, 2, 4, 5]),
         torch.Tensor([1, 4, 11, 13, 14])
     ]
     res, order = pack_sequences(X, Y)
     npt.assert_allclose(order, np.array([2, 3, 1, 0]))
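
The asserted permutation is consistent with pack_sequences ordering all packed sequences (X followed by Y) by descending length, as torch's packed-sequence utilities expect. A minimal sketch of that invariant, using only NumPy (variable names are illustrative):

import numpy as np

# Lengths of X + Y in the test above: [3, 4, 6, 5].
lengths = np.array([3, 4, 6, 5])
# Sorting indices by descending length reproduces the asserted order.
print(np.argsort(-lengths))  # [2 3 1 0]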
Example #3
 def align(self, x, y):
     """Align two raw sequence strings; returns the predicted state string."""
     x_code = torch.Tensor(self.tokenizer(str.encode(x))).long()
     y_code = torch.Tensor(self.tokenizer(str.encode(y))).long()
     x_code = x_code.to(self.device)
     y_code = y_code.to(self.device)
     seq, order = pack_sequences([x_code], [y_code])
     gen = self.aligner.traceback(seq, order)
     decoded, _ = next(gen)
     pred_x, pred_y, pred_states = zip(*decoded)
     s = ''.join(list(map(revstate_f, pred_states)))
     return s
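
A minimal usage sketch for the method above; model is assumed to be a loaded LightningAligner (see Example #8 for checkpoint loading), and the sequences are illustrative:

x = 'ARNDCQEGHILKMFPSTWYV'
y = 'ARNDCQMFPSTWYV'
states = model.align(x, y)
print(states)  # one state symbol per alignment column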
Example #4
 def test_alignment(self):
     self.embedding = self.embedding.cuda()
     self.aligner = self.aligner.cuda()
     x = torch.Tensor(
         self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')).long().cuda()
     y = torch.Tensor(
         self.tokenizer(b'ARNDCQEGHILKARNDCQMFPSTWYVXOUBZ')).long().cuda()
     N, M = x.shape[0], y.shape[0]
     M = max(N, M)  # the alignment matrix is padded to the longer length
     seq, order = pack_sequences([x], [y])
     aln, theta, A = self.aligner(seq, order)
     self.assertEqual(aln.shape, (1, M, M))
Example #5
 def test_unpack_sequences(self):
     X = [torch.Tensor([6, 4, 5]), torch.Tensor([1, 4, 5, 7])]
     Y = [
         torch.Tensor([21, 10, 12, 2, 4, 5]),
         torch.Tensor([1, 4, 11, 13, 14])
     ]
     z, order = pack_sequences(X, Y)
     resX, xlen, resY, ylen = unpack_sequences(z, order)
     tt.assert_allclose(xlen, torch.Tensor([3, 4]).long())
     tt.assert_allclose(ylen, torch.Tensor([6, 5]).long())
     # Unpacked tensors return to input order, zero-padded to the batch max.
     expX = torch.Tensor([[6, 4, 5, 0, 0, 0], [1, 4, 5, 7, 0, 0]])
     expY = torch.Tensor([[21, 10, 12, 2, 4, 5], [1, 4, 11, 13, 14, 0]])
     tt.assert_allclose(expX, resX)
     tt.assert_allclose(expY, resY)
Example #6
 def training_step(self, batch, batch_idx):
     self.aligner.train()
     genes, others, s, A, P, G = batch
     seq, order = pack_sequences(genes, others)
     predA, theta, gap = self.aligner(seq, order)
     _, xlen, _, ylen = unpack_sequences(seq, order)
     loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
     assert not torch.isnan(loss).item()
     if len(self.trainer.lr_schedulers) >= 1:
         scheduler = self.trainer.lr_schedulers[0]['scheduler']
         current_lr = scheduler.get_last_lr()[0]
     else:
         current_lr = self.hparams.learning_rate
     # Log the loss and the current learning rate to TensorBoard.
     tensorboard_logs = {'train_loss': loss, 'lr': current_lr}
     return {'loss': loss, 'log': tensorboard_logs}
Example #7
 def test_batch_alignment(self):
     self.embedding = self.embedding.cuda()
     self.aligner = self.aligner.cuda()
     length = len('ARNDCQEGHILKMFPSTWYVXOUBZ')
     x1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
     x2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWY')
     y1 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
     y2 = self.tokenizer(b'ARNDCQEGHILKMFPSTWYV')
     x = [torch.Tensor(x1).cuda().long(), torch.Tensor(x2).cuda().long()]
     y = [torch.Tensor(y1).cuda().long(), torch.Tensor(y2).cuda().long()]
     seq, order = pack_sequences(x, y)
     aln, theta, A = self.aligner(seq, order)
     self.assertEqual(aln.shape, (2, length, length))
     self.assertEqual(theta.shape, (2, length, length))
Example #8
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned

    The model on its own takes between 740MiB (Titan X, torch 1.5) and 1284MiB (RTX 8000, torch 1.7)

    Note that the batch size has much less of an impact for DeepBLAST than for the embedders
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(list(map(revstate_f, pred_states)))
            x_aligned, y_aligned = states2alignment(pred_alignment,
                                                    query_by_id[query],
                                                    target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
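
A hypothetical driver for the function above; the checkpoint path, IDs, and sequences are placeholders rather than values from the source:

pairings = [('q1', 't1')]
query_by_id = {'q1': 'ARNDCQEGHILKMFPSTWYV'}
target_by_id = {'t1': 'ARNDCQMFPSTWYV'}
alignments = deepblast_align(
    pairings, query_by_id, target_by_id,
    model_file='deepblast-checkpoint.ckpt',  # placeholder path
    device=torch.device('cpu'),
    batch_size=1,
)
for query, target, x_aligned, y_aligned in alignments:
    print(query, target)
    print(x_aligned)
    print(y_aligned)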
Example #9
 def validation_step(self, batch, batch_idx):
     genes, others, s, A, P, G = batch
     seq, order = pack_sequences(genes, others)
     predA, theta, gap = self.aligner(seq, order)
     x, xlen, y, ylen = unpack_sequences(seq, order)
     loss = self.compute_loss(xlen, ylen, predA, A, P, G, theta)
     assert not torch.isnan(loss).item()
     # Obtain alignment statistics + visualizations
     gen = self.aligner.traceback(seq, order)
     # TODO: compare the traceback and the forward
     statistics = self.validation_stats(x, y, xlen, ylen, gen, s, A, predA,
                                        theta, gap, batch_idx)
     statistics = pd.DataFrame(statistics,
                               columns=[
                                   'val_tp', 'val_fp', 'val_fn',
                                   'val_perc_id', 'val_ppv', 'val_fnr',
                                   'val_fdr'
                               ])
     statistics = statistics.mean(axis=0).to_dict()
     tensorboard_logs = {'valid_loss': loss}
     tensorboard_logs = {**tensorboard_logs, **statistics}
     return {'validation_loss': loss, 'log': tensorboard_logs}
Example #10
    def test_collate_alignment(self):
        M = 5
        x1 = torch.Tensor(self.tokenizer(b'NDCQ')).long()
        x2 = torch.Tensor(self.tokenizer(b'NDC')).long()
        y1 = torch.Tensor(self.tokenizer(b'ND')).long()
        y2 = torch.Tensor(self.tokenizer(b'NDCQE')).long()
        s1 = torch.Tensor([1, 1, 1, 0]).long()
        s2 = torch.Tensor([1, 1, 2, 2, 2]).long()
        A1 = torch.ones((len(x1), len(y1))).long()
        A2 = torch.ones((len(x2), len(y2))).long()
        P1 = torch.ones((len(x1), len(y1))).long()
        P2 = torch.ones((len(x2), len(y2))).long()
        G1 = torch.ones((len(x1), len(y1))).long()
        G2 = torch.ones((len(x2), len(y2))).long()

        batch = [(x1, y1, s1, A1, P1, G1), (x2, y2, s2, A2, P2, G2)]
        gene_codes, other_codes, states, dm, p, g = collate_f(batch)
        self.embedding = self.embedding.cuda()
        self.aligner = self.aligner.cuda()
        seq, order = pack_sequences(gene_codes, other_codes)
        seq = seq.cuda()
        aln, theta, A = self.aligner(seq, order)
        self.assertEqual(aln.shape, (2, M, M))
        self.assertEqual(theta.shape, (2, M, M))
Example #11
 def score(self, x_code, y_code):
     """Score lists of pre-tokenized sequences with the aligner."""
     seq, order = pack_sequences(x_code, y_code)
     seq = seq.to(self.device)
     A = self.aligner.score(seq, order)
     return A
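
A sketch of calling score with pre-tokenized inputs, mirroring the tokenization in Example #3 (model is again assumed to be a loaded LightningAligner):

x_code = torch.Tensor(model.tokenizer(b'ARNDCQEGHILK')).long()
y_code = torch.Tensor(model.tokenizer(b'ARNDCQMFPSTW')).long()
# pack_sequences expects lists of tensors, hence the single-element lists.
A = model.score([x_code], [y_code])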
Example #12
# Read sequences from a FASTA file into a dict keyed by record ID.
# (Loop header and args.input reconstructed; the snippet began mid-loop.)
seqs = {}
with open(args.input) as handle:
	for i in handle:
		i = i.rstrip()
		if i.startswith('>'):
			ID = i[1:]
			continue
		seqs[ID] = seqs.get(ID, '') + i

# Align the first two records in the file.
keys_list = list(seqs)
x = seqs[keys_list[0]]
y = seqs[keys_list[1]]
pred_alignment = model.align(x, y)

x_aligned, y_aligned = states2alignment(pred_alignment, x, y)

# Write the aligned pair in FASTA format.
with open(args.output, "w") as out:
	out.write(">%s\n%s\n" % (keys_list[0], x_aligned))
	out.write(">%s\n%s\n" % (keys_list[1], y_aligned))

print(x_aligned)
print(pred_alignment)
print(y_aligned)

x_ = torch.Tensor(model.tokenizer(str.encode(x))).long()
y_ = torch.Tensor(model.tokenizer(str.encode(y))).long()

seq, order = pack_sequences([x_], [y_])

# seq stays on the CPU here; move it to the model's device when the model
# runs on a GPU (compare Example #11).
score = model.aligner.score(seq, order).item()
print('Score', score)