Example #1
    def setUp(self):
        # Load the pretrained bidirectional language model and put it in eval mode
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer(pad_ends=False)
        # Alphabet size and input/hidden/embedding dimensions for the aligner
        nalpha, ninput, nunits, nembed = 22, 1024, 1024, 1024
        self.aligner = NeedlemanWunschAligner(nalpha, ninput, nunits, nembed)
Example #2
    def test_tokenizer_encode_no_padding(self):
        tokenizer = UniprotTokenizer(pad_ends=False)
        x = 'ARNDCQEGHILKMFPSTWYVXOUBZ'
        x = str.encode(x)
        res = tokenizer(x)
        # The 20 standard residues map to indices 0-19 and X maps to 20; the
        # non-standard residues O, U, B, Z map to 11, 4, 20, 20 respectively.
        exp = np.array([
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 11, 4, 20, 20
        ])
        npt.assert_allclose(exp, res)
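The tokenizer operates on raw bytes and returns an array of integer residue indices; downstream code such as deepblast_align (see Example #6 below) wraps that result in a long tensor before packing. A minimal sketch of this conversion follows; the import path is an assumption and the sequence is an arbitrary example.

import torch
from deepblast.dataset.alphabet import UniprotTokenizer  # import path is an assumption

tokenizer = UniprotTokenizer(pad_ends=False)
# The tokenizer expects raw bytes and returns an array of residue indices
tokens = tokenizer(b'ARNDCQEGHILKMFPSTWYV')
# Downstream code wraps the result in a long tensor before packing
x = torch.Tensor(tokens).long()
print(x.shape)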
Example #3
    def __init__(self, pairs, tokenizer=UniprotTokenizer()):
        """ Read in pairs of proteins

        Parameters
        ----------
        pairs: np.array of str
            Pairs of proteins that are aligned.  This includes gaps
            and requires that the proteins have the same length
        tokenizer: UniprotTokenizer
            Converts residues to one-hot encodings
        """
        self.pairs = pairs
        self.tokenizer = tokenizer
Example #4
    def __init__(self, args):
        super(LightningAligner, self).__init__()
        self.tokenizer = UniprotTokenizer(pad_ends=False)
        self.hparams = args
        self.initialize_aligner()
        # Choose the alignment loss based on the `loss` hyperparameter
        if self.hparams.loss == 'sse':
            self.loss_func = SoftAlignmentLoss()
        elif self.hparams.loss == 'cross_entropy':
            self.loss_func = MatrixCrossEntropy()
        elif self.hparams.loss == 'path':
            self.loss_func = SoftPathLoss()
        else:
            raise ValueError(f'`{args.loss}` is not implemented.')
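The if/elif chain above could equivalently be written as a table lookup. A sketch of that alternative, reusing the three loss classes from the snippet; their module path is an assumption.

from deepblast.losses import (  # module path is an assumption
    SoftAlignmentLoss, MatrixCrossEntropy, SoftPathLoss)

# Map the `loss` hyperparameter to its implementation
LOSSES = {
    'sse': SoftAlignmentLoss,
    'cross_entropy': MatrixCrossEntropy,
    'path': SoftPathLoss,
}

def build_loss(name):
    # Unknown names raise the same error as the original if/elif chain
    if name not in LOSSES:
        raise ValueError(f'`{name}` is not implemented.')
    return LOSSES[name]()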
Example #5
    def __init__(self, query_file, db_file, tokenizer=UniprotTokenizer()):
        """ Read in query and database protein sequences.

        Parameters
        ----------
        query_file : path
            Path to query protein sequences.
        db_file : path
            Path to database protein sequences.
        tokenizer : UniprotTokenizer
            Converts residues to one-hot encodings
        """
        self.tokenizer = tokenizer
        self.query_file = query_file
        self.db_file = db_file
Example #6
def deepblast_align(
    pairings: List[Tuple[str, str]],
    query_by_id: Dict[str, str],
    target_by_id: Dict[str, str],
    model_file: str,
    device: torch.device,
    batch_size: int,
) -> List[Tuple[str, str, str, str]]:
    """Aligns the given pairings using DeepBLAST

    Returns a list of query id, target id, query aligned, target aligned

    The model on its own takes between 740MiB (Titan X, torch 1.5) and 1284MiB (RTX 8000, torch 1.7)

    Note that the batch size has much less of an impact for DeepBLAST than for the embedders
    """
    model = LightningAligner.load_from_checkpoint(model_file).to(device)
    tokenizer = UniprotTokenizer()
    alignments = []
    # Naive batching
    batches = numpy.array_split(pairings,
                                math.ceil(len(pairings) / batch_size))
    for batch in tqdm(batches):
        # noinspection PyArgumentList
        queries = [
            torch.Tensor(tokenizer(query_by_id[query].encode())).long()
            for query, _ in batch
        ]
        # noinspection PyArgumentList
        targets = [
            torch.Tensor(tokenizer(target_by_id[target].encode())).long()
            for _, target in batch
        ]
        seqs, order = pack_sequences(queries, targets)
        gen = model.aligner.traceback(seqs.to(device), order)
        for (decoded, _), (query, target) in zip(gen, batch):
            pred_x, pred_y, pred_states = zip(*decoded)
            pred_alignment = "".join(list(map(revstate_f, pred_states)))
            x_aligned, y_aligned = states2alignment(pred_alignment,
                                                    query_by_id[query],
                                                    target_by_id[target])
            alignments.append((query, target, x_aligned, y_aligned))
    return alignments
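A hedged usage sketch of the function above: the sequence ids, sequences, and checkpoint path are illustrative placeholders, and the snippet assumes deepblast_align and a trained LightningAligner checkpoint are available in the current environment.

import torch

# Illustrative inputs only: ids, sequences, and checkpoint path are placeholders
query_by_id = {'q1': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'}
target_by_id = {'t1': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'}
pairings = [('q1', 't1')]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
alignments = deepblast_align(pairings, query_by_id, target_by_id,
                             model_file='aligner.ckpt',  # hypothetical checkpoint
                             device=device, batch_size=1)
for query, target, x_aligned, y_aligned in alignments:
    print(query, target)
    print(x_aligned)
    print(y_aligned)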
Example #7
    def __init__(self,
                 path,
                 tokenizer=UniprotTokenizer(),
                 tm_threshold=0.4,
                 max_len=1024,
                 pad_ends=False,
                 clip_ends=True,
                 mask_gaps=True,
                 return_names=False,
                 construct_paths=False):
        """ Read in pairs of proteins.


        This assumes that columns are labeled as
        | chain1_name | chain2_name | tmscore1 | tmscore2 | rmsd |
        | chain1 | chain2 | alignment |

        Parameters
        ----------
        path: path
            Data path to aligned protein pairs.  This includes gaps
            and require that the proteins have the same length
        tokenizer: UniprotTokenizer
            Converts residues to one-hot encodings
        tm_threshold: float
            Minimum threshold to investigate alignments
        max_len : float
            Maximum sequence length to be aligned
        pad_ends : bool
            Specifies if the ends of the sequences should be padded or not.
        clip_ends : bool
            Specifies if the ends of the alignments should be clipped or not.
        mask_gaps : bool
            Specifies if the mask for the gaps should be constructed.
        return_names : bool
            Specifies if the names of the proteins should be returned.
        construct_paths : bool
            Specifies if path distances should be calculated.

        Notes
        -----
        There are start/stop tokens that are incorporated into the
        alignment. The needleman-wunsch algorithm assumes this to be true.
        """
        self.tokenizer = tokenizer
        self.tm_threshold = tm_threshold
        self.max_len = max_len
        self.pairs = pd.read_table(path, header=None)
        self.construct_paths = construct_paths
        cols = [
            'chain1_name', 'chain2_name', 'tmscore1', 'tmscore2', 'rmsd',
            'chain1', 'chain2', 'alignment'
        ]
        self.pairs.columns = cols
        self.pairs['tm'] = np.maximum(self.pairs['tmscore1'],
                                      self.pairs['tmscore2'])
        self.pairs['length'] = self.pairs.apply(
            lambda x: max(len(x['chain1']), len(x['chain2'])), axis=1)
        idx = np.logical_and(self.pairs['tm'] > self.tm_threshold,
                             self.pairs['length'] < self.max_len)
        self.pairs = self.pairs.loc[idx]
        # TODO: pad_ends needs to be documented properly
        self.pad_ends = pad_ends
        self.clip_ends = clip_ends
        self.mask_gaps = mask_gaps
        self.return_names = return_names
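To make the filtering in the constructor concrete, here is a self-contained sketch of the same TM-score/length filter applied to a toy two-row table; the rows and alignment strings are invented purely for illustration.

import numpy as np
import pandas as pd

# Toy table with the column layout described in the docstring
cols = ['chain1_name', 'chain2_name', 'tmscore1', 'tmscore2', 'rmsd',
        'chain1', 'chain2', 'alignment']
pairs = pd.DataFrame([
    ['a1', 'b1', 0.75, 0.70, 1.2, 'MKT', 'MKV', ':::'],
    ['a2', 'b2', 0.25, 0.30, 4.8, 'GHILK', 'GHI', '111::'],
], columns=cols)

tm_threshold, max_len = 0.4, 1024
pairs['tm'] = np.maximum(pairs['tmscore1'], pairs['tmscore2'])
pairs['length'] = pairs.apply(
    lambda x: max(len(x['chain1']), len(x['chain2'])), axis=1)
idx = np.logical_and(pairs['tm'] > tm_threshold, pairs['length'] < max_len)
# Only the first row survives: its best TM-score (0.75) exceeds the threshold
print(pairs.loc[idx, ['chain1_name', 'chain2_name', 'tm', 'length']])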
Example #8
    def __init__(self, pairs, tokenizer=UniprotTokenizer()):
        self.tokenizer = tokenizer
        self.pairs = pairs
Example #9
    def setUp(self):
        path = pretrained_language_models['bilstm']
        self.embedding = BiLM()
        self.embedding.load_state_dict(torch.load(path))
        self.embedding.eval()
        self.tokenizer = UniprotTokenizer()
Example #10
    def setUp(self):
        self.data_path = get_data_path('test_tm_align.tab')
        self.tokenizer = UniprotTokenizer(pad_ends=False)
Example #11
    def test_tokenizer(self):
        tokenizer = UniprotTokenizer(pad_ends=True)
        res = tokenizer(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
        # Need to account for padding and offset
        exp = np.array([20] + list(range(0, 21)) + [11, 4, 20, 20] + [20])
        npt.assert_allclose(res, exp)
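The two tokenizer tests (Examples #2 and #11) differ only in pad_ends. A short sketch contrasting the two settings, derived from the expected arrays in those tests; the import path is again an assumption.

from deepblast.dataset.alphabet import UniprotTokenizer  # import path is an assumption

seq = b'ARNDCQEGHILKMFPSTWYVXOUBZ'
no_pad = UniprotTokenizer(pad_ends=False)(seq)
padded = UniprotTokenizer(pad_ends=True)(seq)
# pad_ends=True prepends and appends the padding index 20, as the two tests show
assert len(padded) == len(no_pad) + 2
assert padded[0] == 20 and padded[-1] == 20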