Example #1
    def __init__(self,
                 device: Union[None, str, torch.device] = None,
                 **kwargs):
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")
        raw_model = CPCProtModel().to(self._device)
        state_dict = dict(
            torch.load(self._options["model_file"], map_location=self._device))
        for i in list(state_dict.keys()):
            if i.startswith("module."):
                state_dict[i[7:]] = state_dict[i]
                del state_dict[i]
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())
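
The for-loop over state_dict.keys() handles a checkpoint saved from a model wrapped in torch.nn.DataParallel, which prefixes every parameter name with "module."; stripping the prefix lets the weights load into an unwrapped CPCProtModel. The same cleanup as a standalone sketch, written as a dict comprehension over a made-up stand-in dict rather than a real checkpoint:

# Sketch only: the state_dict below is a stand-in, not a real CPCProt checkpoint.
state_dict = {"module.encoder.weight": 1, "encoder.bias": 2}
cleaned = {
    (key[len("module."):] if key.startswith("module.") else key): value
    for key, value in state_dict.items()
}
assert cleaned == {"encoder.weight": 1, "encoder.bias": 2}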
Example #2
    def __init__(self,
                 data_path: Union[str, Path],
                 in_memory: bool = False,
                 min_len: int = 0,
                 max_len: int = sys.maxsize,
                 scramble: bool = False):
        super().__init__()
        self.tokenizer = Tokenizer()
        self._min_len = min_len
        self._max_len = max_len
        self._scramble = scramble

        self.num_too_short = 0
        self.num_too_long = 0

        data_path = Path(data_path)
        # data_file = f'pfam/pfam_{split}.lmdb'
        # self.data = dataset_factory(data_path / data_file, in_memory)
        self.data = dataset_factory(data_path, in_memory)
Example #3
def fasta_to_padded_data(fasta_fpath: str):
    tokenizer = Tokenizer()
    families = []
    clans = []
    seqs = []

    with open(fasta_fpath) as f:
        for line in f:
            if line[0] == ">":
                line = line.rstrip().split("_")
                families.append(int(line[1]))
                clans.append(int(line[3]))
            else:
                seq = tokenizer.convert_tokens_to_ids(line.rstrip())
                seqs.append(np.array(seq))

    families = np.array(families)
    clans = np.array(clans)
    # The encoded sequences have different lengths, so hand pad_sequences the
    # list directly rather than forcing it into a ragged numpy array.
    seqs = pad_sequences(seqs)
    return families, clans, seqs
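
The header parsing here (and in Example #6 below) assumes the labels are packed into underscore-separated fields of the FASTA header. A short sketch with a purely hypothetical header, just to show which fields the indices pick out; the real files may label the fields differently:

# Hypothetical header layout consistent with the indices used in these examples:
# field 1 -> family (Example #3), field 3 -> clan, field 5 -> pseudolabel (Example #6).
header = ">seq0_101_clan_7_pseudolabel_3"
fields = header.rstrip().split("_")
family = int(fields[1])       # 101
clan = int(fields[3])         # 7
pseudolabel = int(fields[5])  # 3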
Example #4
    def __init__(self,
                 device: Union[None, str, torch.device] = None,
                 **kwargs):
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")
        # If we don't do this here, CPCProtModel will end up on the GPU if one
        # is available, even if we passed the CPU as device. As far as we know,
        # this is the best way to derive a new config from DEFAULT_CONFIG.
        dict_cfg = DEFAULT_CONFIG.to_dict()
        dict_cfg["use_cuda"] = self._device.type == "cuda"
        raw_model = CPCProtModel(cfg=CPCProtConfig.from_dict(dict_cfg)).to(
            self._device)
        state_dict = dict(
            torch.load(self._options["model_file"], map_location=self._device))
        for i in list(state_dict.keys()):
            if i.startswith("module."):
                state_dict[i[7:]] = state_dict[i]
                del state_dict[i]
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())
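
The comment at the top of this constructor points at a subtle default: without an explicit config, CPCProtModel puts itself on the GPU whenever one is available, even if the caller asked for the CPU, so the constructor derives use_cuda from the device it was given. A minimal sketch of that derivation as a hypothetical helper (the name resolve_device and its signature are ours, not part of the library):

import torch

def resolve_device(device=None):
    # Fall back to CUDA only when it is actually available, then derive the
    # use_cuda flag the CPCProt config expects from the resolved device.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    return device, device.type == "cuda"

device, use_cuda = resolve_device("cpu")
assert use_cuda is False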
Example #5
class CPCProtEmbedder(EmbedderInterface):
    """CPCProt Embedder

    Lu, Amy X., et al. "Self-supervised contrastive learning of protein
    representations by mutual information maximization." bioRxiv (2020).
    https://doi.org/10.1101/2020.09.04.283929
    """

    name = "cpcprot"
    embedding_dimension = 512
    number_of_layers = 1

    _necessary_files = ["model_file"]

    def __init__(self,
                 device: Union[None, str, torch.device] = None,
                 **kwargs):
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")
        # If we don't do this here, CPCProtModel will end up on the GPU if one
        # is available, even if we passed the CPU as device. As far as we know,
        # this is the best way to derive a new config from DEFAULT_CONFIG.
        dict_cfg = DEFAULT_CONFIG.to_dict()
        dict_cfg["use_cuda"] = self._device.type == "cuda"
        raw_model = CPCProtModel(cfg=CPCProtConfig.from_dict(dict_cfg)).to(
            self._device)
        state_dict = dict(
            torch.load(self._options["model_file"], map_location=self._device))
        for i in list(state_dict.keys()):
            if i.startswith("module."):
                state_dict[i[7:]] = state_dict[i]
                del state_dict[i]
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())

    def embed(self, sequence: str) -> ndarray:
        [embedding] = self.embed_batch([sequence])
        return embedding

    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        """See https://github.com/amyxlu/CPCProt/blob/df1ad1118544ed349b5e711207660a7c205b3128/embed_fasta.py"""
        encoded = [
            numpy.array(self.tokenizer.encode(sequence)) for sequence in batch
        ]
        # 11 is the minimum patch size, so we need to zero-pad shorter sequences
        pad_length = max(max([i.shape[0] for i in encoded]), 11)
        padded = [numpy.pad(i, (0, pad_length - i.shape[0])) for i in encoded]
        torch_inputs = torch.from_numpy(numpy.array(padded))
        yield from self._model.get_z_mean(torch_inputs).detach().cpu().numpy()

    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        return embedding
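
The comment in embed_batch is the key detail of this embedder: 11 is the minimum patch size, so every batch is zero-padded to max(longest sequence, 11). A self-contained sketch of just that step, with made-up token ids standing in for tokenizer.encode output:

import numpy

# Stand-in for tokenizer.encode output: two short "sequences" of token ids.
encoded = [numpy.array([3, 7, 9]), numpy.array([4, 4, 8, 2, 6])]

# 11 is the minimum patch size, so pad every sequence up to at least 11 positions.
pad_length = max(max(i.shape[0] for i in encoded), 11)
padded = [numpy.pad(i, (0, pad_length - i.shape[0])) for i in encoded]

batch = numpy.array(padded)
assert batch.shape == (2, 11)  # both rows zero-padded to the minimum length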
Example #6
    def __init__(self,
                 data_file: Union[str, Path],
                 min_len: int = 0,
                 max_len: int = 100000000,
                 tokenizer: Union[str, Tokenizer] = 'iupac',
                 scramble: bool = False):

        data_file = Path(data_file)
        if not data_file.exists():
            raise FileNotFoundError(data_file)

        if isinstance(tokenizer, str):
            tokenizer = Tokenizer(vocab=tokenizer)
        self.tokenizer = tokenizer

        self.min_len = min_len
        self.max_len = max_len
        self.num_too_short = 0
        self.num_too_long = 0
        self.data = dict()

        idx = 0  # may or may not always correspond to the ID in the Fasta...

        with open(data_file, 'r') as f:
            for line in f:
                if line[0] == ">":
                    line = line.rstrip().split("_")
                    clan = int(line[3])
                    pseudolabel = int(line[5])

                else:
                    seq = line.rstrip()
                    if len(seq) < self.min_len:
                        self.num_too_short += 1
                        continue

                    elif len(seq) > self.max_len:
                        self.num_too_long += 1
                        seq = seq[:self.max_len]

                    self.data[idx] = {
                        'primary': seq,
                        'pseudolabel': pseudolabel,
                        'clan': clan,
                        'protein_length': len(seq)
                    }
                    idx += 1

        self._num_examples = len(self.data)
        self._scramble = scramble
Example #7
class CPCProtEmbedder(EmbedderInterface):
    """CPCProt Embedder

    Self-Supervised Contrastive Learning of Protein Representations By Mutual Information Maximization
    Amy X. Lu, Haoran Zhang, Marzyeh Ghassemi, Alan Moses
    bioRxiv 2020.09.04.283929; doi: https://doi.org/10.1101/2020.09.04.283929
    """

    name = "cpcprot"
    embedding_dimension = 512
    number_of_layers = 1

    _necessary_files = ["model_file"]

    def __init__(self,
                 device: Union[None, str, torch.device] = None,
                 **kwargs):
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")
        raw_model = CPCProtModel().to(self._device)
        state_dict = dict(
            torch.load(self._options["model_file"], map_location=self._device))
        for i in list(state_dict.keys()):
            if i.startswith("module."):
                state_dict[i[7:]] = state_dict[i]
                del state_dict[i]
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())

    def embed(self, sequence: str) -> ndarray:
        [embedding] = self.embed_batch([sequence])
        return embedding

    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        """See https://github.com/amyxlu/CPCProt/blob/df1ad1118544ed349b5e711207660a7c205b3128/embed_fasta.py"""
        encoded = [
            numpy.array(self.tokenizer.encode(sequence)) for sequence in batch
        ]
        # 11 is the minimum patch size, so we need to zero-pad shorter sequences
        pad_length = max(max([i.shape[0] for i in encoded]), 11)
        padded = [numpy.pad(i, (0, pad_length - i.shape[0])) for i in encoded]
        torch_inputs = torch.from_numpy(numpy.array(padded))
        yield from self._model.get_z_mean(torch_inputs).detach().cpu().numpy()

    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        return embedding
Example #8
class PfamDataset(Dataset):
    """
    Modified from https://github.com/songlab-cal/tape/blob/master/tape/datasets.py
    Creates the Pfam Dataset. Modified to trim lengths for the patched model.
    """

    def __init__(self,
                 data_path: Union[str, Path],
                 in_memory: bool = False,
                 min_len: int = 0,
                 max_len: int = sys.maxsize,
                 scramble: bool = False):
        super().__init__()
        self.tokenizer = Tokenizer()
        self._min_len = min_len
        self._max_len = max_len
        self._scramble = scramble

        self.num_too_short = 0
        self.num_too_long = 0

        data_path = Path(data_path)
        # data_file = f'pfam/pfam_{split}.lmdb'
        # self.data = dataset_factory(data_path / data_file, in_memory)
        self.data = dataset_factory(data_path, in_memory)

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        # must use `convert_tokens_to_ids` to avoid adding the [CLS] and [SEP] tokens:
        token_ids = np.array(self.tokenizer.convert_tokens_to_ids(item['primary']))

        ## added to trim lengths:
        if len(token_ids) < self._min_len:
            self.num_too_short += 1
            # Sentinel value; too-short items are presumably filtered out
            # downstream (e.g. in the collate function).
            token_ids = "DROP"
        elif len(token_ids) > self._max_len:
            self.num_too_long += 1
            token_ids = token_ids[:self._max_len]

        # Only scramble real token arrays; shuffling the "DROP" sentinel string
        # would raise a TypeError.
        if self._scramble and not isinstance(token_ids, str):
            np.random.shuffle(token_ids)
        ######

        return token_ids, item['clan'], item['family'], item['protein_length']
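
The comment in __getitem__ is worth keeping in mind when mixing these snippets: convert_tokens_to_ids (used here and in Example #3) maps residues to ids one-to-one, while encode (used in Examples #5 and #7) additionally wraps the sequence in the [CLS] and [SEP] special tokens. A small sketch, assuming the tape-style Tokenizer used throughout these examples:

# Sketch only: same Tokenizer as in the snippets above; the exact special
# tokens added by encode() depend on the vocabulary.
tokenizer = Tokenizer(vocab="iupac")
seq = "MKTAYIAK"

plain = tokenizer.convert_tokens_to_ids(seq)  # one id per residue
with_specials = tokenizer.encode(seq)         # also includes [CLS]/[SEP]

print(len(seq), len(plain), len(with_specials))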