def fasta_to_padded_data(fasta_fpath: str):
    tokenizer = Tokenizer()
    families = []
    clans = []
    seqs = []
    with open(fasta_fpath) as f:
        for line in f:
            if line[0] == ">":
                # Header fields are underscore-separated; field 1 is the
                # family id and field 3 is the clan id.
                line = line.rstrip().split("_")
                families.append(int(line[1]))
                clans.append(int(line[3]))
            else:
                seq = tokenizer.convert_tokens_to_ids(line.rstrip())
                seqs.append(np.array(seq))
    families = np.array(families)
    clans = np.array(clans)
    seqs = pad_sequences(np.array(seqs))
    return families, clans, seqs
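# A minimal usage sketch for fasta_to_padded_data; the file name below is a
# placeholder, and the header layout is assumed to match the parser above:
families, clans, seqs = fasta_to_padded_data("pfam_holdout.fasta")
print(seqs.shape)  # (num_sequences, padded_length)
print(families[:5], clans[:5])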
class CPCProtEmbedder(EmbedderInterface):
    """CPCProt Embedder

    Lu, Amy X., et al. "Self-supervised contrastive learning of protein
    representations by mutual information maximization." bioRxiv (2020).
    https://doi.org/10.1101/2020.09.04.283929
    """

    name = "cpcprot"
    embedding_dimension = 512
    number_of_layers = 1
    _necessary_files = ["model_file"]

    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")
        # If we don't do this here, CPCProtModel will end up on the GPU if one
        # is available, even if we passed the CPU as device. As far as we know,
        # this is the best way to derive from DEFAULT_CONFIG.
        dict_cfg = DEFAULT_CONFIG.to_dict()
        dict_cfg["use_cuda"] = self._device.type == "cuda"
        raw_model = CPCProtModel(cfg=CPCProtConfig.from_dict(dict_cfg)).to(self._device)
        state_dict = dict(torch.load(self._options["model_file"], map_location=self._device))
        # The checkpoint was saved from a DataParallel wrapper, so strip the
        # "module." prefix from parameter names before loading.
        for i in list(state_dict.keys()):
            if i.startswith("module."):
                state_dict[i[7:]] = state_dict[i]
                del state_dict[i]
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())

    def embed(self, sequence: str) -> ndarray:
        [embedding] = self.embed_batch([sequence])
        return embedding

    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        """See https://github.com/amyxlu/CPCProt/blob/df1ad1118544ed349b5e711207660a7c205b3128/embed_fasta.py"""
        encoded = [
            numpy.array(self.tokenizer.encode(sequence)) for sequence in batch
        ]
        # 11 is the minimum patch size, so we need to zero-pad shorter sequences
        pad_length = max(max([i.shape[0] for i in encoded]), 11)
        padded = [numpy.pad(i, (0, pad_length - i.shape[0])) for i in encoded]
        torch_inputs = torch.from_numpy(numpy.array(padded))
        yield from self._model.get_z_mean(torch_inputs).detach().cpu().numpy()

    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        return embedding
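# A hedged usage sketch for CPCProtEmbedder: "model_file" is assumed to be
# forwarded through **kwargs into self._options (the checkpoint path below is
# a placeholder), and embed() returns one 512-d vector per protein, so
# reduce_per_protein is the identity:
embedder = CPCProtEmbedder(device="cpu", model_file="path/to/cpcprot.ckpt")
per_protein = embedder.embed("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")
assert per_protein.shape == (512,)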
def __init__(self,
             data_file: Path,
             min_len: int = 0,
             max_len: int = 100000000,
             tokenizer: Union[str, Tokenizer] = 'iupac',
             scramble: bool = False):
    data_file = Path(data_file)
    if not data_file.exists():
        raise FileNotFoundError(data_file)

    if isinstance(tokenizer, str):
        tokenizer = Tokenizer(vocab=tokenizer)
    self.tokenizer = tokenizer
    self.min_len = min_len
    self.max_len = max_len
    self.num_too_short = 0
    self.num_too_long = 0

    self.data = dict()
    idx = 0  # may or may not always correspond to the ID in the FASTA...
    with open(data_file, 'r') as f:
        for line in f:
            if line[0] == ">":
                # Headers are underscore-separated; field 3 holds the clan id
                # and field 5 the pseudolabel.
                line = line.rstrip().split("_")
                clan = int(line[3])
                pseudolabel = int(line[5])
            else:
                seq = line.rstrip()
                if len(seq) < self.min_len:
                    self.num_too_short += 1
                    continue
                elif len(seq) > self.max_len:
                    self.num_too_long += 1
                    seq = seq[:self.max_len]
                self.data[idx] = {
                    'primary': seq,
                    'pseudolabel': pseudolabel,
                    'clan': clan,
                    'protein_length': len(seq)
                }
                idx += 1
    self._num_examples = len(self.data)
    self._scramble = scramble
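# A worked example of the header parsing above, with a hypothetical header
# whose underscore-separated fields put the clan id at index 3 and the
# pseudolabel at index 5, matching the indices used in __init__:
header = ">seq0_17_clan_3_pseudo_42"
fields = header.rstrip().split("_")
assert int(fields[3]) == 3 and int(fields[5]) == 42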
class CPCProtEmbedder(EmbedderInterface):
    """CPCProt Embedder

    Self-Supervised Contrastive Learning of Protein Representations By
    Mutual Information Maximization
    Amy X. Lu, Haoran Zhang, Marzyeh Ghassemi, Alan Moses
    bioRxiv 2020.09.04.283929; doi: https://doi.org/10.1101/2020.09.04.283929
    """

    name = "cpcprot"
    embedding_dimension = 512
    number_of_layers = 1
    _necessary_files = ["model_file"]

    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")
        raw_model = CPCProtModel().to(self._device)
        state_dict = dict(torch.load(self._options["model_file"], map_location=self._device))
        for i in list(state_dict.keys()):
            if i.startswith("module."):
                state_dict[i[7:]] = state_dict[i]
                del state_dict[i]
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())

    def embed(self, sequence: str) -> ndarray:
        [embedding] = self.embed_batch([sequence])
        return embedding

    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        """See https://github.com/amyxlu/CPCProt/blob/df1ad1118544ed349b5e711207660a7c205b3128/embed_fasta.py"""
        encoded = [
            numpy.array(self.tokenizer.encode(sequence)) for sequence in batch
        ]
        # 11 is the minimum patch size, so we need to zero-pad shorter sequences
        pad_length = max(max([i.shape[0] for i in encoded]), 11)
        padded = [numpy.pad(i, (0, pad_length - i.shape[0])) for i in encoded]
        torch_inputs = torch.from_numpy(numpy.array(padded))
        yield from self._model.get_z_mean(torch_inputs).detach().cpu().numpy()

    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        return embedding
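# The "module." handling in __init__ above, restated as a self-contained
# sketch: checkpoints saved from a torch.nn.DataParallel wrapper prefix every
# parameter name with "module.", which must be stripped before loading into a
# bare module. The helper name is ours, not from the original code:
def strip_dataparallel_prefix(state_dict: dict) -> dict:
    return {key[len("module."):] if key.startswith("module.") else key: value
            for key, value in state_dict.items()}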
class PfamDataset(Dataset):
    """
    Modified from https://github.com/songlab-cal/tape/blob/master/tape/datasets.py
    Creates the Pfam Dataset. Modified to trim lengths for the patched model.
    """

    def __init__(self,
                 data_path: Union[str, Path],
                 in_memory: bool = False,
                 min_len: int = 0,
                 max_len: int = sys.maxsize,
                 scramble: bool = False):
        super().__init__()
        self.tokenizer = Tokenizer()
        self._min_len = min_len
        self._max_len = max_len
        self._scramble = scramble
        self.num_too_short = 0
        self.num_too_long = 0
        data_path = Path(data_path)
        # data_file = f'pfam/pfam_{split}.lmdb'
        # self.data = dataset_factory(data_path / data_file, in_memory)
        self.data = dataset_factory(data_path, in_memory)

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        # Must use `convert_tokens_to_ids` to avoid adding the [CLS] and [SEP] tokens:
        token_ids = np.array(self.tokenizer.convert_tokens_to_ids(item['primary']))
        # Added to trim lengths: sequences below min_len are marked with the
        # "DROP" sentinel (to be filtered downstream), overlong ones truncated.
        if len(token_ids) < self._min_len:
            self.num_too_short += 1
            token_ids = "DROP"
        elif len(token_ids) > self._max_len:
            self.num_too_long += 1
            token_ids = token_ids[:self._max_len]
        if self._scramble:
            # Note: shuffling assumes token_ids is an array, i.e. not the
            # "DROP" sentinel.
            np.random.shuffle(token_ids)
        return token_ids, item['clan'], item['family'], item['protein_length']
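# A hedged sketch of consuming PfamDataset: the LMDB path is a placeholder
# (dataset_factory is assumed to accept it), and examples whose token_ids were
# set to the "DROP" sentinel are filtered out before batching:
dataset = PfamDataset("pfam/pfam_valid.lmdb", min_len=11, max_len=512)
examples = [dataset[i] for i in range(len(dataset))]
kept = [ex for ex in examples if not isinstance(ex[0], str)]  # drop "DROP"
print(f"kept {len(kept)} of {len(examples)} examples")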