Example #1
    def setUp(self):
        # Load the pretrained bidirectional language model and put it in eval mode
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer(pad_ends=False)
        # Alphabet size and input/hidden/embedding dimensions for the aligner
        nalpha, ninput, nunits, nembed = 22, 1024, 1024, 1024
        self.aligner = NeedlemanWunschAligner(nalpha, ninput, nunits, nembed)
Example #2
    def test_tokenizer_encode_no_padding(self):
        tokenizer = UniprotTokenizer(pad_ends=False)
        x = 'ARNDCQEGHILKMFPSTWYVXOUBZ'
        x = str.encode(x)
        res = tokenizer(x)
        # The 20 standard residues map to indices 0-19 and X maps to 20; the
        # non-standard residues O, U, B, Z map to 11, 4, 20, 20 respectively.
        exp = np.array([
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 11, 4, 20, 20
        ])
        npt.assert_allclose(exp, res)
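The tokenizer operates on raw bytes and returns an array of integer residue indices; downstream code such as deepblast_align (see Example #6 below) wraps that result in a long tensor before packing. A minimal sketch of this conversion follows; the import path is an assumption and the sequence is an arbitrary example.

import torch
from deepblast.dataset.alphabet import UniprotTokenizer  # import path is an assumption

tokenizer = UniprotTokenizer(pad_ends=False)
# The tokenizer expects raw bytes and returns an array of residue indices
tokens = tokenizer(b'ARNDCQEGHILKMFPSTWYV')
# Downstream code wraps the result in a long tensor before packing
x = torch.Tensor(tokens).long()
print(x.shape)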
Example #3
    def __init__(self, pairs, tokenizer=UniprotTokenizer()):
        """ Read in pairs of proteins

        Parameters
        ----------
        pairs: np.array of str
            Pairs of proteins that are aligned.  This includes gaps
            and requires that the proteins have the same length
        tokenizer: UniprotTokenizer
            Converts residues to one-hot encodings
        """
        self.pairs = pairs
        self.tokenizer = tokenizer
Example #4
    def __init__(self, args):
        super(LightningAligner, self).__init__()
        self.tokenizer = UniprotTokenizer(pad_ends=False)
        self.hparams = args
        self.initialize_aligner()
        # Choose the alignment loss based on the `loss` hyperparameter
        if self.hparams.loss == 'sse':
            self.loss_func = SoftAlignmentLoss()
        elif self.hparams.loss == 'cross_entropy':
            self.loss_func = MatrixCrossEntropy()
        elif self.hparams.loss == 'path':
            self.loss_func = SoftPathLoss()
        else:
            raise ValueError(f'`{args.loss}` is not implemented.')
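The if/elif chain above could equivalently be written as a table lookup. A sketch of that alternative, reusing the three loss classes from the snippet; their module path is an assumption.

from deepblast.losses import (  # module path is an assumption
    SoftAlignmentLoss, MatrixCrossEntropy, SoftPathLoss)

# Map the `loss` hyperparameter to its implementation
LOSSES = {
    'sse': SoftAlignmentLoss,
    'cross_entropy': MatrixCrossEntropy,
    'path': SoftPathLoss,
}

def build_loss(name):
    # Unknown names raise the same error as the original if/elif chain
    if name not in LOSSES:
        raise ValueError(f'`{name}` is not implemented.')
    return LOSSES[name]()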
Example #5
    def __init__(self, query_file, db_file, tokenizer=UniprotTokenizer()):
        """ Read in query and database protein sequences.

        Parameters
        ----------
        query_file : path
            Path to query protein sequences.
        db_file : path
            Path to database protein sequences.
        tokenizer : UniprotTokenizer
            Converts residues to one-hot encodings
        """
        self.tokenizer = tokenizer
        self.query_file = query_file
        self.db_file = db_file
Example #6
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned

    The model on its own takes between 740MiB (Titan X, torch 1.5) and 1284MiB (RTX 8000, torch 1.7)

    Note that the batch size has much less of an impact for DeepBLAST than for the embedders
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(list(map(revstate_f, pred_states)))
            x_aligned, y_aligned = states2alignment(pred_alignment,
                                                    query_by_id[query],
                                                    target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
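A hedged usage sketch of the function above: the sequence ids, sequences, and checkpoint path are illustrative placeholders, and the snippet assumes deepblast_align and a trained LightningAligner checkpoint are available in the current environment.

import torch

# Illustrative inputs only: ids, sequences, and checkpoint path are placeholders
query_by_id = {'q1': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'}
target_by_id = {'t1': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'}
pairings = [('q1', 't1')]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
alignments = deepblast_align(pairings, query_by_id, target_by_id,
                             model_file='aligner.ckpt',  # hypothetical checkpoint
                             device=device, batch_size=1)
for query, target, x_aligned, y_aligned in alignments:
    print(query, target)
    print(x_aligned)
    print(y_aligned)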
Example #7
    def __init__(self,
                 path,
                 tokenizer=UniprotTokenizer(),
                 tm_threshold=0.4,
                 max_len=1024,
                 pad_ends=False,
                 clip_ends=True,
                 mask_gaps=True,
                 return_names=False,
                 construct_paths=False):
        """ Read in pairs of proteins.


        This assumes that columns are labeled as
        | chain1_name | chain2_name | tmscore1 | tmscore2 | rmsd |
        | chain1 | chain2 | alignment |

        Parameters
        ----------
        path: path
            Data path to aligned protein pairs.  This includes gaps
            and require that the proteins have the same length
        tokenizer: UniprotTokenizer
            Converts residues to one-hot encodings
        tm_threshold: float
            Minimum threshold to investigate alignments
        max_len : float
            Maximum sequence length to be aligned
        pad_ends : bool
            Specifies if the ends of the sequences should be padded or not.
        clip_ends : bool
            Specifies if the ends of the alignments should be clipped or not.
        mask_gaps : bool
            Specifies if the mask for the gaps should be constructed.
        return_names : bool
            Specifies if the names of the proteins should be returned.
        construct_paths : bool
            Specifies if path distances should be calculated.

        Notes
        -----
        There are start/stop tokens that are incorporated into the
        alignment. The needleman-wunsch algorithm assumes this to be true.
        """
        self.tokenizer = tokenizer
        self.tm_threshold = tm_threshold
        self.max_len = max_len
        self.pairs = pd.read_table(path, header=None)
        self.construct_paths = construct_paths
        cols = [
            'chain1_name', 'chain2_name', 'tmscore1', 'tmscore2', 'rmsd',
            'chain1', 'chain2', 'alignment'
        ]
        self.pairs.columns = cols
        self.pairs['tm'] = np.maximum(self.pairs['tmscore1'],
                                      self.pairs['tmscore2'])
        self.pairs['length'] = self.pairs.apply(
            lambda x: max(len(x['chain1']), len(x['chain2'])), axis=1)
        idx = np.logical_and(self.pairs['tm'] > self.tm_threshold,
                             self.pairs['length'] < self.max_len)
        self.pairs = self.pairs.loc[idx]
        # TODO: pad_ends needs to be documented properly
        self.pad_ends = pad_ends
        self.clip_ends = clip_ends
        self.mask_gaps = mask_gaps
        self.return_names = return_names
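To make the filtering in the constructor concrete, here is a self-contained sketch of the same TM-score/length filter applied to a toy two-row table; the rows and alignment strings are invented purely for illustration.

import numpy as np
import pandas as pd

# Toy table with the column layout described in the docstring
cols = ['chain1_name', 'chain2_name', 'tmscore1', 'tmscore2', 'rmsd',
        'chain1', 'chain2', 'alignment']
pairs = pd.DataFrame([
    ['a1', 'b1', 0.75, 0.70, 1.2, 'MKT', 'MKV', ':::'],
    ['a2', 'b2', 0.25, 0.30, 4.8, 'GHILK', 'GHI', '111::'],
], columns=cols)

tm_threshold, max_len = 0.4, 1024
pairs['tm'] = np.maximum(pairs['tmscore1'], pairs['tmscore2'])
pairs['length'] = pairs.apply(
    lambda x: max(len(x['chain1']), len(x['chain2'])), axis=1)
idx = np.logical_and(pairs['tm'] > tm_threshold, pairs['length'] < max_len)
# Only the first row survives: its best TM-score (0.75) exceeds the threshold
print(pairs.loc[idx, ['chain1_name', 'chain2_name', 'tm', 'length']])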
Example #8
    def __init__(self, pairs, tokenizer=UniprotTokenizer()):
        self.tokenizer = tokenizer
        self.pairs = pairs
Example #9
    def setUp(self):
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer()
Example #10
    def setUp(self):
        self.data_path = get_data_path('test_tm_align.tab')
        self.tokenizer = UniprotTokenizer(pad_ends=False)
Example #11
    def test_tokenizer(self):
        tokenizer = UniprotTokenizer(pad_ends=True)
        res = tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
        # Need to account for padding and offset
        exp = np.array([20] + list(range(0, 21)) + [11, 4, 20, 20] + [20])
        npt.assert_allclose(res, exp)
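The two tokenizer tests (Examples #2 and #11) differ only in pad_ends. A short sketch contrasting the two settings, derived from the expected arrays in those tests; the import path is again an assumption.

from deepblast.dataset.alphabet import UniprotTokenizer  # import path is an assumption

seq = b'ARNDCQEGHILKMFPSTWYVXOUBZ'
no_pad = UniprotTokenizer(pad_ends=False)(seq)
padded = UniprotTokenizer(pad_ends=True)(seq)
# pad_ends=True prepends and appends the padding index 20, as the two tests show
assert len(padded) == len(no_pad) + 2
assert padded[0] == 20 and padded[-1] == 20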