Example #1
class ScnDataset(Dataset):
    def __init__(self, dataset, max_len: int):
        super(ScnDataset, self).__init__()
        self.dataset = dataset

        self.max_len = max_len
        self.scn_collate_fn = get_collate_fn(False)
        self.vocab = ProteinVocabulary()
Example #2
import numpy as np
from torch.utils.data import Dataset

from sidechainnet.utils.sequence import ProteinVocabulary
# get_collate_fn is provided by sidechainnet's collate utilities; the import path
# below is an assumption and may vary across sidechainnet versions
from sidechainnet.dataloaders.collate import get_collate_fn


class ScnDataset(Dataset):
    def __init__(self, dataset, max_len: int):
        super(ScnDataset, self).__init__()
        self.dataset = dataset

        self.max_len = max_len
        self.scn_collate_fn = get_collate_fn(False)
        self.vocab = ProteinVocabulary()

    def collate_fn(self, batch):
        # run sidechainnet's own collate first, then keep only the first item of the batch
        batch = self.scn_collate_fn(batch)

        # decode integer-encoded sequences back into one-letter amino acid strings
        real_seqs = [
            "".join([self.vocab.int2char(aa) for aa in seq])
            for seq in batch.int_seqs.numpy()
        ]
        seq = real_seqs[0][:self.max_len]

        # regroup flat coordinates as (length, 14, 3), truncate to max_len, then re-flatten
        true_coords = batch.crds[0].view(-1, 14, 3)[:self.max_len].view(-1, 3)
        angles = batch.angs[0, :self.max_len]
        mask = batch.msks[0, :self.max_len]

        # count pad characters ("_") in the truncated sequence
        padding_seq = (np.array([*seq]) == "_").sum()
        return {
            "seq": seq,
            "true_coords": true_coords,
            "angles": angles,
            "padding_seq": padding_seq,
            "mask": mask,
        }

    def __getitem__(self, index: int):
        return self.dataset[index]

    def __len__(self) -> int:
        return len(self.dataset)
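
A minimal usage sketch for the class above, under the assumption that sidechainnet is installed and that scn.load(..., with_pytorch="dataloaders") returns PyTorch DataLoaders whose .dataset can be wrapped; max_len=256 is a placeholder, and batch_size=1 matches collate_fn, which only keeps the first item of each batch:

import sidechainnet as scn
from torch.utils.data import DataLoader

# hypothetical wiring, not part of the original example
raw = scn.load(casp_version=12, with_pytorch="dataloaders")
train_set = ScnDataset(raw["train"].dataset, max_len=256)
loader = DataLoader(train_set, batch_size=1, collate_fn=train_set.collate_fn)

batch = next(iter(loader))
print(batch["seq"][:10], batch["true_coords"].shape, batch["padding_seq"])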
Example #3
import torch

from sidechainnet.utils.sequence import ProteinVocabulary
from sidechainnet.structure.build_info import NUM_COORDS_PER_RES, SC_BUILD_INFO

VOCAB = ProteinVocabulary()


def scn_cloud_mask(scn_seq, boolean=True):
    """ Gets the boolean mask atom positions (not all aas have same atoms). 
        Inputs: 
        * scn_seq: (batch, length) sequence as provided by Sidechainnet package
        * boolean: whether to return as array of idxs or boolean values
        Outputs: (batch, length, NUM_COORDS_PER_RES) boolean mask 
    """
    # scaffolds 
    mask = torch.zeros(*scn_seq.shape, NUM_COORDS_PER_RES, device=scn_seq.device)
    # fill 
    for n, seq in enumerate(scn_seq.cpu().numpy()):
        for i, aa in enumerate(seq):
            # get num of atom positions - backbone is 4: ...N-C-C(=O)...
            n_atoms = 4 + len(SC_BUILD_INFO[VOCAB.int2chars(aa)]["atom-names"])
            mask[n, i, :n_atoms] = 1
    if boolean:
        return mask.bool()
    return mask.nonzero()
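
A quick sanity check for the function above (assumptions: VOCAB and scn_cloud_mask are in scope; indexing the vocabulary with VOCAB[ch] mirrors the tokenizer shown in Example #4):

import torch

# toy batch: one sequence of three residues, integer-encoded with the vocabulary
int_seq = torch.tensor([[VOCAB[ch] for ch in "MKV"]])
cloud_mask = scn_cloud_mask(int_seq)             # (1, 3, NUM_COORDS_PER_RES), dtype=bool
print(cloud_mask.shape, cloud_mask.sum(dim=-1))  # number of real atoms per residue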
Example #4
# stdlib
import itertools
import string
from pathlib import Path
from typing import List, Tuple

# torch
import torch

# bio
from Bio import SeqIO

# sidechainnet

from sidechainnet.utils.sequence import ProteinVocabulary, ONE_TO_THREE_LETTER_MAP
from sidechainnet.utils.measure import GLOBAL_PAD_CHAR
from sidechainnet.structure.build_info import NUM_COORDS_PER_RES, BB_BUILD_INFO, SC_BUILD_INFO
from sidechainnet.structure.StructureBuilder import _get_residue_build_iter

# build vocabulary

VOCAB = ProteinVocabulary()

# constants

import alphafold2_pytorch.constants as constants

# helpers

def exists(val):
    return val is not None

# constants: same as in alphafold2.py

DISTANCE_THRESHOLDS = torch.linspace(2, 20, steps = constants.DISTOGRAM_BUCKETS)

# distance binning function
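# NOTE: assumed sketch, not the original helper (which is not part of this excerpt);
# it buckets pairwise distances against DISTANCE_THRESHOLDS with torch.bucketize
def distance_to_buckets(distance, thresholds = DISTANCE_THRESHOLDS):
    # returns, for every distance, the index of the distogram bin it falls into
    return torch.bucketize(distance, thresholds)
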
# optional dependency: use pytorch-lightning's LightningDataModule when available,
# otherwise fall back to a plain object base class
try:
    import pytorch_lightning as pl

    LightningDataModule = pl.LightningDataModule
except ImportError:
    LightningDataModule = object

CACHE_PATH = Path("~/.cache/alphafold2_pytorch").expanduser()
DATA_DIR = CACHE_PATH / "trrosetta" / "trrosetta"
URL = "http://s3.amazonaws.com/proteindata/data_pytorch/trrosetta.tar.gz"

REMOVE_KEYS = dict.fromkeys(string.ascii_lowercase)
REMOVE_KEYS["."] = None
REMOVE_KEYS["*"] = None
translation = str.maketrans(REMOVE_KEYS)
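# e.g. "MKv.A*".translate(translation) -> "MKA": lowercase insertion states, "." and "*" are removed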

DEFAULT_VOCAB = ProteinVocabulary()


def default_tokenize(seq: str) -> List[int]:
    return [DEFAULT_VOCAB[ch] for ch in seq]


def read_fasta(filename: str) -> List[Tuple[str, str]]:
    def remove_insertions(sequence: str) -> str:
        return sequence.translate(translation)

    return [(record.description, remove_insertions(str(record.seq)))
            for record in SeqIO.parse(filename, "fasta")]
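
# usage sketch ("alignment.fasta" is a placeholder path, not part of the original module):
# records = read_fasta("alignment.fasta")       # [(description, sequence), ...] with insertions stripped
# token_ids = default_tokenize(records[0][1])   # integer ids from DEFAULT_VOCAB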


def read_pdb(pdb: str):