def calculate_features(pdb, chain, id):
    """Voxelize 1000 rotations of a structure and log the ones that do not fit.

    For every rotation yielded by ``Structure.rotate(1000)`` the atoms are
    mapped to voxel space and loaded into a fresh ``scn.InputBatch`` built
    with a 264 x 264 x 264 spatial size. When ``setLocations`` raises
    ``AssertionError``, one tab-separated record is appended to
    ``<pdb>_<chain>_<id>.txt`` containing: pdb, chain, id, the three values
    unpacked from ``rotation[1:]`` (named theta, phi, z here), the rounded-up
    length of the bounding-box diagonal of the voxel indices, and the
    per-axis minimum and maximum voxel index.

    Args:
        pdb: Structure identifier, forwarded to ``Structure``.
        chain: Chain identifier, forwarded to ``Structure``.
        id: Passed to ``Structure`` as ``id=`` and used in the log file name.

    Returns:
        None. The only output is the (optional) log file.
    """
    from molmimic.biopdbtools import Structure

    spatial_size = torch.LongTensor((264, 264, 264))
    log_path = "{}_{}_{}.txt".format(pdb, chain, id)
    structure = Structure(pdb, chain, id=id)

    for rotation in structure.rotate(1000):
        voxel_idx, voxel_data = structure.map_atoms_to_voxel_space()
        batch = scn.InputBatch(3, spatial_size)
        batch.addSample()
        try:
            batch.setLocations(
                torch.from_numpy(voxel_idx).long(),
                torch.from_numpy(voxel_data).float(),
                0)
        except AssertionError:
            # Record how far the rotated structure extends so the failing
            # orientations can be inspected afterwards.
            theta, phi, z = rotation[1:]
            lower = np.min(voxel_idx, axis=0)
            upper = np.max(voxel_idx, axis=0)
            diagonal = int(np.ceil(np.linalg.norm(upper - lower)))

            with open(log_path, "a") as log:
                record = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                    pdb, chain, id, theta, phi, z, diagonal,
                    lower[0], lower[1], lower[2],
                    upper[0], upper[1], upper[2])
                log.write(record + "\n")

        # Drop the per-rotation buffers before the next iteration allocates
        # new ones.
        del batch, voxel_idx, voxel_data
Esempio n. 2
0
def feature_selection(ibis_data):
    """Fit several feature-selection models on an IBIS dataset and print results.

    Builds a feature matrix with one row per dataset item (the item's
    ``"data"`` entry with NaNs replaced via ``np.nan_to_num``), fits the
    "linear", "variance", "lasso" and "pca" models on it, and prints the
    names of the features each non-PCA model supports. Finally prints the
    number of PCA components whose cumulative explained-variance ratio first
    exceeds 0.95, together with the per-component ratios.

    Args:
        ibis_data: Forwarded unchanged to ``IBISDataset``.

    Returns:
        None. All results are printed to stdout.
    """
    dataset = IBISDataset(ibis_data)
    feature_names = Structure.get_feature_names()

    # Each factory result is unpacked below as a (model, should_create) pair.
    models = {
        "linear": linear_selection(),
        "variance": variance_selection(),
        "lasso": lasso_selection(),
        "pca": inc_pca(),
    }

    # Allocate the full (n_samples, n_features) matrix. np.array() of the
    # shape tuple would only build a two-element vector holding the two sizes.
    # NOTE(review): assumes dataset[i]["data"] holds len(feature_names)
    # values per item -- confirm against IBISDataset.
    X = np.zeros((len(dataset), len(feature_names)))
    for i in range(len(dataset)):
        X[i] = np.nan_to_num(dataset[i]["data"])

    for name, (model, should_create) in models.items():
        try:
            model.fit(X)
        except ValueError as e:
            # Report the failing model and its input, then try the next one.
            print(name)
            print(e)
            print(X)
            print("")
            continue
        if should_create:
            # Wrap the already fitted estimator so it exposes get_support().
            model = SelectFromModel(model, prefit=True)
        if name != "pca":
            selected = [
                feature_names[i]
                for i, s in enumerate(model.get_support()) if s
            ]
            print("{} = {}".format(name, selected))

    pca_model = models["pca"][0]
    cumsum = np.cumsum(pca_model.explained_variance_ratio_)
    # argmax over the boolean mask returns the first index above the
    # threshold; +1 converts that index into a component count.
    d = np.argmax(cumsum > 0.95) + 1
    print("pca, n-components: {}; explained variance: {}".format(
        d, pca_model.explained_variance_ratio_))
Esempio n. 3
0
def get_ibis(dataset_name, pdb_ibis_file, use_cdd_domain=None, multimers=False, check_binding_sites=False):
    """Split an IBIS file into per-domain, per-PDB TSV files of PPI entries.

    Entries produced by ``parse_ibis`` are grouped by their ``"Query"`` field
    (``groupby`` only merges consecutive entries, so the input is presumably
    already ordered by query -- verify against the data files). The last
    character of the query is used as the chain, the rest as the PDB id.
    Every entry whose ``"Interaction_type"`` is ``"PPI"`` and that passes the
    optional filters is appended to
    ``<out_dir>/<cdd without "/">/<pdb>.tsv`` as the tab-separated fields:
    pdb, chain, residue numbers, binding-site residue string, multimer flag
    (``1``/``0``), query domain, interaction partner, PDB evidence, observed
    flag (``1``/``0``).

    Args:
        dataset_name: Forwarded to ``get_interfaces_path`` to find the output
            directory.
        pdb_ibis_file: Path of the IBIS file to read.
        use_cdd_domain: When not None, only entries with this query domain
            are kept.
        multimers: When true, entries whose query domain differs from the
            interaction partner are skipped.
        check_binding_sites: When true, the structure is loaded and each
            entry's binding-site residues are compared with the residues
            found in the structure; mismatching entries are reported and
            skipped.

    Returns:
        None. Output is written to the TSV files.
    """
    out_dir = get_interfaces_path(dataset_name)
    # Domains whose output directory is already known to exist.
    seen_cdd = set()
    with open(pdb_ibis_file) as pdb_ibis:
        for pdb_chain, entries in groupby(parse_ibis(pdb_ibis), key=lambda l: l["Query"]):
            pdb, chain = pdb_chain[:-1], pdb_chain[-1]

            if check_binding_sites:
                try:
                    structure = Structure.from_pdb(pdb, chain)
                except InvalidPDB:
                    continue
                except Exception:
                    # KeyboardInterrupt/SystemExit are not Exception
                    # subclasses, so they still propagate.
                    print("Error: {}".format(traceback.format_exc()))
                    continue

            for entry in entries:
                if entry["Interaction_type"] not in ("PPI",):
                    continue

                cdd = entry["Query_Domain"]
                partner = entry["Interaction_Partner"]
                residues = entry["PDB_Residue_No"].lstrip().replace(" ", ",")
                residue_str = entry["Binding_Site_Residues"]
                observed = entry["Is_Observed"] == "1"
                pdb_evidence = entry["PDB_Evidence"]
                # Keep this a real boolean: the previous "1"/"0" strings were
                # both truthy, so the multimers filter below could never fire.
                is_multimer = cdd == partner

                # NOTE(review): the original comment here read "Query domain
                # should not be target domain", which does not match this
                # condition; the condition is kept as written -- confirm the
                # intended meaning of `multimers`.
                if multimers and not is_multimer:
                    continue

                if use_cdd_domain is not None and cdd != use_cdd_domain:
                    continue

                if check_binding_sites:
                    # Rebuild the sequence from the structure and drop the
                    # letters of residues the structure does not contain.
                    found = []
                    removed = 0
                    for i, r in enumerate(residues.split(",")):
                        if r == "X":
                            continue

                        res = structure.get_residue_from_resseq(r)

                        if res is None:
                            # Earlier deletions shifted the remaining letters
                            # left, so correct the position by their count.
                            pos = i - removed
                            residue_str = residue_str[:pos] + residue_str[pos + 1:]
                            removed += 1
                        else:
                            found.append(PDB.Polypeptide.three_to_one(res.get_resname()))
                    pdb_seq = "".join(found)

                    if pdb_seq != residue_str:
                        print("{} {} does not match {} =/= {}".format(
                            entry["Query"], residues, pdb_seq, residue_str))
                        continue

                cdd_dir = os.path.join(out_dir, cdd.replace("/", ""))
                if cdd not in seen_cdd:
                    # Make sure the per-domain directory exists before the
                    # first append; tolerate another process creating it.
                    if not os.path.isdir(cdd_dir):
                        try:
                            os.makedirs(cdd_dir)
                        except OSError:
                            if not os.path.isdir(cdd_dir):
                                raise
                    seen_cdd.add(cdd)

                with open(os.path.join(cdd_dir, "{}.tsv".format(pdb)), "a+") as f:
                    f.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        pdb, chain, residues, residue_str,
                        "1" if is_multimer else "0",
                        cdd, partner, pdb_evidence, int(observed)))
Esempio n. 4
0
    def __data_generation(self, indexes, epoch=None, batch=None):
        """Build the input and truth volumes for one batch of samples.

        For each position in ``indexes`` the row ``self.data.iloc[index]`` is
        read and its ``pdb``, ``chain``, ``resi`` and ``unique_obs_int``
        fields are passed to ``Structure.features_from_string``. The returned
        index/value pairs are scattered into ``X`` (features) and ``y``
        (truth). A sample that raises is logged to stdout and to
        ``<pdb>_<chain>_<unique_obs_int>_<epoch>_<batch>.error`` and is left
        out of the batch, so the returned arrays shrink by one per failure.

        Args:
            indexes: Row positions into ``self.data`` for this batch.
            epoch: Only used in the name of the error file.
            batch: Only used in the name of the error file.

        Returns:
            Tuple ``(X, y)`` with shapes
            ``(n_ok, v0, v1, v2, n_channels)`` and ``(n_ok, v0, v1, v2, 1)``,
            where ``n_ok`` is ``self.batch_size`` minus the number of failed
            samples and ``n_channels`` is 21 when ``self.only_aa`` is set,
            otherwise ``Structure.nFeatures``.
        """
        n_channels = 21 if self.only_aa else Structure.nFeatures
        # The third spatial axis takes input_shape[2]; index 1 was used twice
        # before, which is only right when the last two sizes are equal.
        grid = (self.input_shape[0], self.input_shape[1], self.input_shape[2])
        X = np.zeros((self.batch_size,) + grid + (n_channels,))
        y = np.zeros((self.batch_size,) + grid + (1,))

        failures = 0
        for batch_index, index in enumerate(indexes):
            datum = self.data.iloc[index]
            print("Running {} ({}.{}): {}".format(
                datum["unique_obs_int"], datum["pdb"], datum["chain"],
                ",".join(
                    "{}{}".format(i, n) for i, n in zip(
                        datum["resi"].split(","), datum["resn"].split(",")))))
            if self.only_aa:
                print("AA only")

            try:
                # Failed samples take no slot, so later samples are shifted
                # down by the number of failures seen so far.
                (data_idx, data), (truth_idx, truth) = \
                    Structure.features_from_string(
                        datum["pdb"],
                        datum["chain"],
                        datum["resi"],
                        id=datum["unique_obs_int"],
                        input_shape=self.input_shape,
                        batch_index=batch_index - failures,
                        only_aa=self.only_aa)
            except Exception:
                # KeyboardInterrupt/SystemExit are not Exception subclasses
                # and therefore still propagate.
                failures += 1

                trace = traceback.format_exc()
                print("Error: {}".format(trace))
                error_path = "{}_{}_{}_{}_{}.error".format(
                    datum["pdb"], datum["chain"], datum["unique_obs_int"],
                    epoch, batch)
                with open(error_path, "w") as ef:
                    ef.write(trace + "\n")
                continue

            # Index with a tuple of per-axis index arrays. A list of lists
            # is no longer interpreted that way by current NumPy releases.
            # NOTE(review): assumes data_idx/truth_idx are (n_points, 5)
            # integer arrays -- confirm against features_from_string.
            X[tuple(data_idx.T)] = data.flatten()
            y[tuple(truth_idx.T)] = truth

        if failures:
            # Trim the unused trailing slots once instead of copying both
            # arrays on every failure.
            n_ok = max(self.batch_size - failures, 0)
            X, y = X[:n_ok], y[:n_ok]

        return X, y
def calculate_time(pdb, chain, id):
    """Time feature extraction for one chain and write a one-line report.

    Measures the wall-clock duration of ``Structure.features_from_string``
    and writes a single tab-separated line to ``pdbs/time_<pdb>_<chain>.txt``
    with: pdb, chain, id, elapsed seconds, number of rows in the returned
    index array, and the rounded-up length of the bounding-box diagonal of
    those indices.

    Args:
        pdb: Structure identifier, forwarded to ``features_from_string``.
        chain: Chain identifier, forwarded to ``features_from_string``.
        id: Forwarded as ``id=`` and written to the report.

    Returns:
        None. The report file is overwritten on every call.
    """
    import time
    from molmimic.biopdbtools import Structure
    import numpy as np

    began = time.time()
    voxel_idx, _ = Structure.features_from_string(pdb, chain, id=id)
    elapsed = time.time() - began

    n_rows = voxel_idx.shape[0]

    lower = np.min(voxel_idx, axis=0)
    upper = np.max(voxel_idx, axis=0)
    diagonal = int(np.ceil(np.linalg.norm(upper - lower)))

    report_path = "pdbs/time_{}_{}.txt".format(pdb, chain)
    with open(report_path, "w") as report:
        line = "{}\t{}\t{}\t{}\t{}\t{}".format(
            pdb, chain, id, elapsed, n_rows, diagonal)
        report.write(line + "\n")
Esempio n. 6
0
def load_model(model_file, no_batch_norm=False, use_resnet_unet=True, dropout_depth=False, dropout_width=False, dropout_p=0.5, unclustered=False, nFeatures=None, only_aa=False, only_atom=False, non_geom_features=False, use_deepsite_features=False, old_version=False):
    """Build a network, load saved weights into it and return it in eval mode.

    Args:
        model_file: Path (or file object) handed to ``torch.load``.
        no_batch_norm: Disables batch norm in ``UNet3D``; ignored when
            ``use_resnet_unet`` is true.
        use_resnet_unet: Build a ``ResNetUNet`` when true, otherwise a
            ``UNet3D``.
        dropout_depth, dropout_width, dropout_p: Forwarded to ``ResNetUNet``.
        unclustered: Unused here; kept for interface compatibility.
        nFeatures: Number of input features. When None or negative it is
            obtained from ``Structure.number_of_features`` using the four
            feature flags below.
        only_aa, only_atom, non_geom_features, use_deepsite_features:
            Forwarded to ``Structure.number_of_features``.
        old_version: Forwarded to ``ResNetUNet``. When false and the saved
            state does not match the model, loading is retried once with
            ``old_version=True``.

    Returns:
        The model with weights loaded, cast to a CUDA float tensor type when
        CUDA is available (CPU float otherwise) and switched to evaluation
        mode.

    Raises:
        KeyError, RuntimeError: The state dict matches neither model layout;
            the error from the first attempt is the one re-raised.
    """
    use_cuda = torch.cuda.is_available()
    dtype = 'torch.cuda.FloatTensor' if use_cuda else 'torch.FloatTensor'

    if nFeatures is None or nFeatures < 0:
        nFeatures = Structure.number_of_features(
            only_aa=only_aa,
            only_atom=only_atom,
            non_geom_features=non_geom_features,
            use_deepsite_features=use_deepsite_features)

    if use_resnet_unet:
        model = ResNetUNet(
            nFeatures, 2,
            dropout_depth=dropout_depth,
            dropout_width=dropout_width,
            dropout_p=dropout_p,
            old_version=old_version)
    else:
        model = UNet3D(nFeatures, 2, batchnorm=not no_batch_norm)

    if use_cuda:
        states = torch.load(model_file)
    else:
        # Keep every storage on the CPU so checkpoints saved from a GPU can
        # still be opened on hosts without CUDA.
        states = torch.load(
            model_file, map_location=lambda storage, loc: storage)

    try:
        model.load_state_dict(states)
    except (KeyError, RuntimeError) as err:
        # Older torch releases signal missing/unexpected keys with KeyError,
        # newer ones with RuntimeError; both mean the layout does not match.
        if old_version:
            raise
        try:
            return load_model(
                model_file,
                no_batch_norm=no_batch_norm,
                use_resnet_unet=use_resnet_unet,
                dropout_depth=dropout_depth,
                dropout_width=dropout_width,
                dropout_p=dropout_p,
                unclustered=unclustered,
                nFeatures=nFeatures,
                only_aa=only_aa,
                only_atom=only_atom,
                non_geom_features=non_geom_features,
                use_deepsite_features=use_deepsite_features,
                old_version=True
            )
        except (KeyError, RuntimeError):
            # Report the failure of the requested layout, not the fallback's.
            raise err

    model.type(dtype)
    model.train(False)  # Set model to evaluate mode

    return model
Esempio n. 7
0
def _voxels_to_residue_numbers(structure, voxels, voxel_atom_ratio):
    """Return residue numbers (as strings) of atoms sufficiently covered by voxels.

    Each voxel is attributed to the first atom returned by
    ``structure.convert_voxels(v, level="A")``. An atom is kept when its voxel
    count divided by ``atom_volume(structure, atom)`` is at least
    ``voxel_atom_ratio``; the kept atoms are unfolded to residues.
    """
    hits = Counter()
    for v in voxels:
        atoms = structure.convert_voxels(v, level="A")
        if len(atoms) > 0:
            hits[atoms[0]] += 1

    kept_atoms = [
        atom for atom, count in hits.items()
        if float(count) / atom_volume(structure, atom) >= voxel_atom_ratio
    ]
    return [str(r.get_id()[1]) for r in unfold_entities(kept_atoms, "R")]


def view_in_pymol(id,
                  predicted_voxels=None,
                  truth_voxels=None,
                  voxel_atom_ratio=.2):
    """Write a PyMOL command file highlighting true and predicted binding sites.

    The structure named by ``id`` is fetched and shown as a gray surface.
    Residues derived from ``truth_voxels`` are coloured orange, residues
    derived from ``predicted_voxels`` magenta, and -- when both are given --
    residues that are predicted but not true are coloured blue. The commands
    are written to ``<id>_pymol.cmd``.

    Args:
        id: Identifier of the form ``"<pdb>.<chain>"``.
        predicted_voxels: Iterable of predicted voxels, or None to skip.
        truth_voxels: Iterable of ground-truth voxels, or None to skip.
        voxel_atom_ratio: Minimum ratio of voxel hits to
            ``atom_volume(structure, atom)`` for an atom to count.

    Returns:
        None. Output is the command file.
    """
    pdb, chain = id.split(".")
    structure = Structure.from_pdb(pdb, chain, rotate=False)

    cmd = """fetch {id}
remove hetatm
hide everything, {id}
show surface, {id}
color gray90, {id}
""".format(id=id)

    if truth_voxels is not None:
        truth_residues = _voxels_to_residue_numbers(
            structure, truth_voxels, voxel_atom_ratio)
        truth_resi = "+".join(truth_residues)

        cmd += """select true_binding_site, resi {true_resi}
color orange, true_binding_site
""".format(true_resi=truth_resi)

    if predicted_voxels is not None:
        predicted_residues = _voxels_to_residue_numbers(
            structure, predicted_voxels, voxel_atom_ratio)
        # Join the predicted residues here; the truth list was joined before,
        # which mislabelled the selection and failed without truth voxels.
        predicted_resi = "+".join(predicted_residues)

        cmd += """select predicted_binding_site, resi {predicted_resi}
color magenta, predicted_binding_site
""".format(predicted_resi=predicted_resi)

    if truth_voxels is not None and predicted_voxels is not None:
        false_positive_residues = set(predicted_residues) - set(truth_residues)
        # Sort numerically so the generated file is reproducible.
        fp_resi = "+".join(sorted(false_positive_residues, key=int))
        cmd += """select false_positive_binding_site, resi {fp_resi}
color blue, false_positive_binding_site
""".format(fp_resi=fp_resi)

    with open("{}_pymol.cmd".format(id), "w") as f:
        f.write(cmd + "\n")