def __init__(self, dataset, max_len: int):
    # NOTE(review): this free-standing def appears to be a stray duplicate of
    # ScnDataset.__init__ (the identical method is defined inside the class
    # below) — confirm against upstream before removing.
    super(ScnDataset, self).__init__()
    self.dataset = dataset    # underlying sidechainnet dataset (indexable)
    self.max_len = max_len    # crop length applied per sample in collate_fn
    # presumably sidechainnet's collate-fn factory; False selects its
    # non-aggregated variant — TODO confirm against get_collate_fn's signature
    self.scn_collate_fn = get_collate_fn(False)
    self.vocab = ProteinVocabulary()  # used to decode int sequences back to chars
class ScnDataset(Dataset):
    """Wrapper around a sidechainnet dataset that collates with the
    sidechainnet collate function and crops the first sample of each
    batch to at most ``max_len`` residues."""

    def __init__(self, dataset, max_len: int):
        super(ScnDataset, self).__init__()
        self.dataset = dataset
        self.max_len = max_len
        self.scn_collate_fn = get_collate_fn(False)
        self.vocab = ProteinVocabulary()

    def collate_fn(self, batch):
        """Collate ``batch`` via sidechainnet, then crop sample 0 to max_len.

        Returns a dict with the cropped sequence string, flattened true
        coordinates, angles, mask, and the padding-residue count.
        """
        collated = self.scn_collate_fn(batch)

        # decode each integer-encoded sequence back into a character string
        to_char = self.vocab.int2char
        decoded = [
            "".join(to_char(aa) for aa in ints)
            for ints in collated.int_seqs.numpy()
        ]

        limit = self.max_len
        seq = decoded[0][:limit]

        # reshape to (residues, 14 atoms, 3) so the residue crop lines up
        # with the sequence crop, then flatten back to (atoms, 3)
        true_coords = collated.crds[0].view(-1, 14, 3)[:limit].view(-1, 3)
        angles = collated.angs[0, :limit]
        mask = collated.msks[0, :limit]

        # count padding residues ("_") inside the cropped sequence
        padding_seq = (np.array([*seq]) == "_").sum()

        return {
            "seq": seq,
            "true_coords": true_coords,
            "angles": angles,
            "padding_seq": padding_seq,
            "mask": mask,
        }

    def __getitem__(self, index: int):
        return self.dataset[index]

    def __len__(self) -> int:
        return len(self.dataset)
def scn_cloud_mask(scn_seq, boolean=True):
    """ Gets the boolean mask atom positions (not all aas have same atoms).
        Inputs:
        * scn_seq: (batch, length) sequence as provided by Sidechainnet package
        * boolean: whether to return as array of idxs or boolean values
        Outputs: (batch, length, NUM_COORDS_PER_RES) boolean mask
    """
    # scaffold of zeros; per-residue atom slots get switched on below
    mask = torch.zeros(*scn_seq.shape, NUM_COORDS_PER_RES, device=scn_seq.device)

    for batch_idx, seq in enumerate(scn_seq.cpu().numpy()):
        for pos, aa in enumerate(seq):
            # backbone always contributes 4 atoms: ...N-C-C(=O)...;
            # the remainder comes from this residue's sidechain atom list
            sidechain_atoms = SC_BUILD_INFO[VOCAB.int2chars(aa)]["atom-names"]
            mask[batch_idx, pos, : 4 + len(sidechain_atoms)] = 1

    return mask.bool() if boolean else mask.nonzero()
# bio from Bio import SeqIO import itertools import string # sidechainnet from sidechainnet.utils.sequence import ProteinVocabulary, ONE_TO_THREE_LETTER_MAP from sidechainnet.utils.measure import GLOBAL_PAD_CHAR from sidechainnet.structure.build_info import NUM_COORDS_PER_RES, BB_BUILD_INFO, SC_BUILD_INFO from sidechainnet.structure.StructureBuilder import _get_residue_build_iter # build vocabulary VOCAB = ProteinVocabulary() # constants import alphafold2_pytorch.constants as constants # helpers def exists(val): return val is not None # constants: same as in alphafold2.py DISTANCE_THRESHOLDS = torch.linspace(2, 20, steps = constants.DISTOGRAM_BUCKETS) # distance binning function
import pytorch_lightning as pl LightningDataModule = pl.LightningDataModule except ImportError: LightningDataModule = object CACHE_PATH = Path("~/.cache/alphafold2_pytorch").expanduser() DATA_DIR = CACHE_PATH / "trrosetta" / "trrosetta" URL = "http://s3.amazonaws.com/proteindata/data_pytorch/trrosetta.tar.gz" REMOVE_KEYS = dict.fromkeys(string.ascii_lowercase) REMOVE_KEYS["."] = None REMOVE_KEYS["*"] = None translation = str.maketrans(REMOVE_KEYS) DEFAULT_VOCAB = ProteinVocabulary() def default_tokenize(seq: str) -> List[int]: return [DEFAULT_VOCAB[ch] for ch in seq] def read_fasta(filename: str) -> List[Tuple[str, str]]: def remove_insertions(sequence: str) -> str: return sequence.translate(translation) return [(record.description, remove_insertions(str(record.seq))) for record in SeqIO.parse(filename, "fasta")] def read_pdb(pdb: str):