def calculate_features(pdb, chain, id):
    """Voxelize 1000 rotations of a structure and log those that fail to load.

    For every rotation yielded by ``Structure.rotate(1000)`` the atoms are
    mapped to voxel space and placed into a fresh ``scn.InputBatch`` with a
    264 x 264 x 264 spatial size. When ``setLocations`` raises
    ``AssertionError`` a tab-separated row is appended to
    ``<pdb>_<chain>_<id>.txt``.

    Each logged row holds: pdb, chain, id, the three trailing components of
    the rotation (theta, phi, z), the rounded-up length of the bounding-box
    diagonal of the voxel indices, and the first three components of the
    minimum and maximum index.

    Args:
        pdb: PDB identifier forwarded to ``Structure``.
        chain: Chain identifier forwarded to ``Structure``.
        id: Identifier forwarded to ``Structure`` and used in the log name.
    """
    from molmimic.biopdbtools import Structure

    spatial_size = torch.LongTensor((264, 264, 264))
    structure = Structure(pdb, chain, id=id)

    for rotation in structure.rotate(1000):
        indices, data = structure.map_atoms_to_voxel_space()

        batch = scn.InputBatch(3, spatial_size)
        batch.addSample()

        locations = torch.from_numpy(indices).long()
        features = torch.from_numpy(data).float()
        try:
            batch.setLocations(locations, features, 0)
        except AssertionError:
            # Presumably the rotated structure no longer fits in the grid;
            # record the rotation and the extent of the indices.
            theta, phi, z = rotation[1:]
            lower = np.min(indices, axis=0)
            upper = np.max(indices, axis=0)
            diagonal = int(np.ceil(np.linalg.norm(upper - lower)))
            row = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                pdb, chain, id, theta, phi, z, diagonal,
                lower[0], lower[1], lower[2],
                upper[0], upper[1], upper[2])
            with open("{}_{}_{}.txt".format(pdb, chain, id), "a") as log:
                log.write(row + "\n")

        # Drop the per-rotation buffers before the next iteration.
        del batch, indices, data
def feature_selection(ibis_data):
    """Fit several feature-selection models on an IBIS dataset and report them.

    Builds a ``(n_samples, n_features)`` matrix from ``dataset[i]["data"]``
    (NaNs replaced via ``np.nan_to_num``), fits every model in ``models`` on
    it, prints the selected feature names for each non-PCA model, and finally
    prints the number of PCA components needed to exceed 95% cumulative
    explained variance.

    Each factory (``linear_selection`` etc.) is expected to return a
    ``(model, should_create)`` pair; when ``should_create`` is true the fitted
    model is wrapped in ``SelectFromModel(model, prefit=True)``.

    Args:
        ibis_data: Passed straight to ``IBISDataset``.
    """
    dataset = IBISDataset(ibis_data)
    feature_names = Structure.get_feature_names()

    models = {
        "linear": linear_selection(),
        "variance": variance_selection(),
        "lasso": lasso_selection(),
        "pca": inc_pca(),
    }

    # Allocate the full matrix. The previous ``np.array((n, m))`` produced a
    # two-element 1-D array rather than an n-by-m matrix.
    X = np.zeros((len(dataset), len(feature_names)))
    for i in range(len(dataset)):
        X[i] = np.nan_to_num(dataset[i]["data"])

    for name, (model, should_create) in models.items():
        try:
            model.fit(X)
        except ValueError as e:
            # Report the failing model and its input, then move on.
            print(name)
            print(e)
            print(X)
            print("")
            continue

        if should_create:
            model = SelectFromModel(model, prefit=True)

        if name != "pca":
            selected = [feature_names[i]
                        for i, s in enumerate(model.get_support()) if s]
            print("{} = {}".format(name, selected))

    pca = models["pca"][0]
    if not hasattr(pca, "explained_variance_ratio_"):
        # The PCA fit failed above (already reported); nothing to summarise.
        return

    cumsum = np.cumsum(pca.explained_variance_ratio_)
    # First component count whose cumulative ratio exceeds 0.95.
    d = np.argmax(cumsum > 0.95) + 1
    print("pca, n-components: {}; explained variance: {}".format(
        d, pca.explained_variance_ratio_))
def _filter_binding_site(structure, residues, residue_str):
    """Return ``(pdb_seq, residue_str)`` restricted to residues in ``structure``.

    ``residues`` is a comma-separated list of residue numbers and
    ``residue_str`` is assumed to hold one character per entry of that list.
    Entries equal to ``"X"`` are skipped for ``pdb_seq`` but keep their
    character; entries the structure cannot resolve lose their character.
    """
    resseqs = residues.split(",")
    pdb_seq = []
    kept = []
    for i, resseq in enumerate(resseqs):
        letter = residue_str[i:i + 1]
        if resseq == "X":
            kept.append(letter)
            continue
        res = structure.get_residue_from_resseq(resseq)
        if res is None:
            # Drop the character at the ORIGINAL position i. Deleting from
            # the string in place shifted every later index.
            continue
        kept.append(letter)
        pdb_seq.append(PDB.Polypeptide.three_to_one(res.get_resname()))
    # Preserve any characters beyond the listed residues untouched.
    kept.append(residue_str[len(resseqs):])
    return "".join(pdb_seq), "".join(kept)


def get_ibis(dataset_name, pdb_ibis_file, use_cdd_domain=None, multimers=False, check_binding_sites=False):
    """Split an IBIS file into per-domain, per-PDB TSV files of PPI binding sites.

    Entries from ``parse_ibis`` are grouped by their ``"Query"`` field (the
    PDB id followed by a one-character chain). Each ``"PPI"`` entry that
    passes the filters is appended as one tab-separated row to
    ``<out_dir>/<cdd without "/">/<pdb>.tsv`` with the columns: pdb, chain,
    residues, residue string, is_multimer (``"1"``/``"0"``), cdd, partner,
    PDB evidence, observed (``1``/``0``).

    Args:
        dataset_name: Passed to ``get_interfaces_path`` to find the output dir.
        pdb_ibis_file: Path of the IBIS file to read.
        use_cdd_domain: When not ``None``, keep only entries whose
            ``"Query_Domain"`` equals this value.
        multimers: When true, skip entries whose query domain differs from
            the interaction partner.
        check_binding_sites: When true, load the structure and keep only
            entries whose binding-site residues match its sequence.
    """
    out_dir = get_interfaces_path(dataset_name)

    with open(pdb_ibis_file) as pdb_ibis:
        # groupby only merges consecutive rows, so the file is assumed to be
        # ordered by query.
        for pdb_chain, entries in groupby(parse_ibis(pdb_ibis), key=lambda l: l["Query"]):
            pdb, chain = pdb_chain[:-1], pdb_chain[-1]

            structure = None
            if check_binding_sites:
                try:
                    structure = Structure.from_pdb(pdb, chain)
                except InvalidPDB:
                    continue
                except Exception:
                    # KeyboardInterrupt / SystemExit are not Exception
                    # subclasses and therefore still propagate.
                    print("Error: {}".format(traceback.format_exc()))
                    continue

            for entry in entries:
                if entry["Interaction_type"] != "PPI":
                    continue  # "LIG" entries are not handled

                cdd = entry["Query_Domain"]
                partner = entry["Interaction_Partner"]
                residues = entry["PDB_Residue_No"].lstrip().replace(" ", ",")
                residue_str = entry["Binding_Site_Residues"]
                observed = entry["Is_Observed"] == "1"
                pdb_evidence = entry["PDB_Evidence"]

                # Keep this a bool: the former "1"/"0" strings were both
                # truthy, so the filter below could never trigger.
                is_multimer = cdd == partner

                # NOTE(review): the original inline comment read "Query domain
                # should not be target domain", which contradicts this
                # condition; the condition is kept as written - confirm.
                if multimers and not is_multimer:
                    continue

                if use_cdd_domain is not None and cdd != use_cdd_domain:
                    continue

                if check_binding_sites:
                    # Check if positions match structure
                    pdb_seq, residue_str = _filter_binding_site(
                        structure, residues, residue_str)
                    if pdb_seq != residue_str:
                        print("{} {} does not match {} =/= {}".format(
                            entry["Query"], residues, pdb_seq, residue_str))
                        continue

                out_path = os.path.join(
                    out_dir, cdd.replace("/", ""), "{}.tsv".format(pdb))
                with open(out_path, "a+") as f:
                    f.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        pdb, chain, residues, residue_str,
                        "1" if is_multimer else "0", cdd, partner,
                        pdb_evidence, int(observed)) + "\n")
def __data_generation(self, indexes, epoch=None, batch=None):
    """Generate the input and target volumes for one batch.

    Reads ``self.batch_size``, ``self.input_shape``, ``self.only_aa`` and
    ``self.data`` (indexed with ``.iloc``). For every index the features are
    computed with ``Structure.features_from_string`` and scattered into the
    dense arrays. A sample whose feature computation raises is dropped: the
    batch shrinks by one, the traceback is printed and also written to
    ``<pdb>_<chain>_<unique_obs_int>_<epoch>_<batch>.error``.

    Args:
        indexes: Row positions into ``self.data`` making up this batch.
        epoch: Only used in the name of the error file.
        batch: Only used in the name of the error file.

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(n_samples, d0, d1, d2, n_channels)`` and ``y`` has shape
        ``(n_samples, d0, d1, d2, 1)``; ``n_samples`` is ``self.batch_size``
        minus the number of failed samples.
    """
    n_channels = 21 if self.only_aa else Structure.nFeatures
    # Use all three spatial dimensions. The third one previously repeated
    # input_shape[1], which is only correct for cubic volumes.
    volume = (self.input_shape[0], self.input_shape[1], self.input_shape[2])

    X = np.zeros((self.batch_size,) + volume + (n_channels,))
    y = np.zeros((self.batch_size,) + volume + (1,))

    failures = 0
    for batch_index, index in enumerate(indexes):
        datum = self.data.iloc[index]
        print("Running {} ({}.{}): {}".format(
            datum["unique_obs_int"], datum["pdb"], datum["chain"],
            ",".join([
                "{}{}".format(i, n) for i, n in zip(
                    datum["resi"].split(","), datum["resn"].split(","))
            ])))
        if self.only_aa:
            print("AA only")

        try:
            (data_idx, data), (truth_idx, truth) = Structure.features_from_string(
                datum["pdb"],
                datum["chain"],
                datum["resi"],
                id=datum["unique_obs_int"],
                input_shape=self.input_shape,
                # Failed samples are removed, so later ones shift down.
                batch_index=batch_index - failures,
                only_aa=self.only_aa)
        except Exception:
            # KeyboardInterrupt / SystemExit are not Exception subclasses
            # and therefore still propagate.
            failures += 1
            # Drop the last (still empty) slot; same content as the former
            # np.resize call without re-copying the whole batch.
            X = X[:-1]
            y = y[:-1]
            trace = traceback.format_exc()
            print("Error: {}".format(trace))
            error_name = "{}_{}_{}_{}_{}.error".format(
                datum["pdb"], datum["chain"], datum["unique_obs_int"],
                epoch, batch)
            with open(error_name, "w") as ef:
                ef.write("{}\n".format(trace))
            continue

        # Index with a tuple of per-axis index arrays. A list of lists was
        # only treated like this by legacy NumPy and is no longer supported.
        X[tuple(data_idx.T)] = data.flatten()
        y[tuple(truth_idx.T)] = truth

    return X, y
def calculate_time(pdb, chain, id):
    """Time ``Structure.features_from_string`` and write a one-line report.

    The report is written to ``pdbs/time_<pdb>_<chain>.txt`` (overwriting any
    previous file) as tab-separated values: pdb, chain, id, elapsed seconds,
    number of rows in the returned indices (recorded as the atom count), and
    the rounded-up length of the bounding-box diagonal of those indices.

    Args:
        pdb: PDB identifier forwarded to ``features_from_string``.
        chain: Chain identifier forwarded to ``features_from_string``.
        id: Identifier forwarded as the ``id`` keyword and written to the report.
    """
    import time
    from molmimic.biopdbtools import Structure
    import numpy as np

    began = time.time()
    indices, _ = Structure.features_from_string(pdb, chain, id=id)
    elapsed = time.time() - began

    atom_count = indices.shape[0]
    lower = np.min(indices, axis=0)
    upper = np.max(indices, axis=0)
    diagonal = int(np.ceil(np.linalg.norm(upper - lower)))

    with open("pdbs/time_{}_{}.txt".format(pdb, chain), "w") as report:
        line = "{}\t{}\t{}\t{}\t{}\t{}".format(
            pdb, chain, id, elapsed, atom_count, diagonal)
        report.write(line + "\n")
def load_model(model_file, no_batch_norm=False, use_resnet_unet=True, dropout_depth=False, dropout_width=False, dropout_p=0.5, unclustered=False, nFeatures=None, only_aa=False, only_atom=False, non_geom_features=False, use_deepsite_features=False, old_version=False):
    """Build a network, load saved weights into it and return it in eval mode.

    Args:
        model_file: Path of the saved state dict, read with ``torch.load``.
        no_batch_norm: Disables batch norm in ``UNet3D`` (ignored for
            ``ResNetUNet``).
        use_resnet_unet: Build ``ResNetUNet`` when true, else ``UNet3D``.
        dropout_depth, dropout_width, dropout_p: Forwarded to ``ResNetUNet``.
        unclustered: Not used by this function; only forwarded on retry.
        nFeatures: Number of input features. When ``None`` or negative it is
            taken from ``Structure.number_of_features`` using the four
            feature flags below.
        only_aa, only_atom, non_geom_features, use_deepsite_features:
            Forwarded to ``Structure.number_of_features``.
        old_version: Forwarded to ``ResNetUNet``. When false and the state
            dict does not fit the model, loading is retried once with
            ``old_version=True``.

    Returns:
        The model, cast to a CUDA float tensor type when CUDA is available
        (CPU float otherwise) and switched to evaluation mode.

    Raises:
        KeyError, RuntimeError: The state dict fits neither model layout.
    """
    cuda_available = torch.cuda.is_available()
    dtype = 'torch.cuda.FloatTensor' if cuda_available else 'torch.FloatTensor'

    if nFeatures is None or nFeatures < 0:
        nFeatures = Structure.number_of_features(
            only_aa=only_aa,
            only_atom=only_atom,
            non_geom_features=non_geom_features,
            use_deepsite_features=use_deepsite_features)

    if use_resnet_unet:
        model = ResNetUNet(
            nFeatures, 2,
            dropout_depth=dropout_depth,
            dropout_width=dropout_width,
            dropout_p=dropout_p,
            old_version=old_version)
    else:
        model = UNet3D(nFeatures, 2, batchnorm=not no_batch_norm)

    if cuda_available:
        states = torch.load(model_file)
    else:
        # Remap CUDA-saved tensors onto the CPU so checkpoints written on a
        # GPU machine can still be loaded here.
        states = torch.load(
            model_file, map_location=lambda storage, loc: storage)

    try:
        model.load_state_dict(states)
    except (KeyError, RuntimeError):
        # Older PyTorch raises KeyError for unexpected/missing keys, newer
        # releases raise RuntimeError; handle both the same way.
        if not old_version:
            try:
                return load_model(
                    model_file,
                    no_batch_norm=no_batch_norm,
                    use_resnet_unet=use_resnet_unet,
                    dropout_depth=dropout_depth,
                    dropout_width=dropout_width,
                    dropout_p=dropout_p,
                    unclustered=unclustered,
                    nFeatures=nFeatures,
                    only_aa=only_aa,
                    only_atom=only_atom,
                    non_geom_features=non_geom_features,
                    use_deepsite_features=use_deepsite_features,
                    old_version=True)
            except (KeyError, RuntimeError):
                pass
        raise

    model.type(dtype)
    model.train(False)  # Set model to evaluate mode
    return model
def _voxels_to_residue_ids(structure, voxels, voxel_atom_ratio):
    """Return residue numbers (as strings) sufficiently covered by ``voxels``.

    Each voxel is credited to the first atom returned by
    ``structure.convert_voxels``. An atom is kept when its voxel count divided
    by ``atom_volume(structure, atom)`` is at least ``voxel_atom_ratio``.
    """
    atom_counts = Counter()
    for v in voxels:
        atoms = structure.convert_voxels(v, level="A")
        if len(atoms) > 0:
            atom_counts[atoms[0]] += 1

    covered_atoms = [
        atom for atom, count in atom_counts.items()
        if float(count) / atom_volume(structure, atom) >= voxel_atom_ratio
    ]
    return [str(r.get_id()[1]) for r in unfold_entities(covered_atoms, "R")]


def view_in_pymol(id, predicted_voxels=None, truth_voxels=None, voxel_atom_ratio=.2):
    """Write a PyMOL command script highlighting binding-site residues.

    The script is written to ``<id>_pymol.cmd``. It fetches the structure,
    shows it as a gray surface, and colors the true binding site orange, the
    predicted one magenta and, when both are given, false positives blue.

    Args:
        id: ``"<pdb>.<chain>"`` identifier; split on ``"."``.
        predicted_voxels: Iterable of predicted voxels, or ``None``.
        truth_voxels: Iterable of ground-truth voxels, or ``None``.
        voxel_atom_ratio: Minimum fraction of an atom's volume (in voxels)
            that must be covered for the atom to count.
    """
    pdb, chain = id.split(".")
    structure = Structure.from_pdb(pdb, chain, rotate=False)

    # One PyMOL command per line.
    cmd = (
        "fetch {id}\n"
        "remove hetatm\n"
        "hide everything, {id}\n"
        "show surface, {id}\n"
        "color gray90, {id}\n"
    ).format(id=id)

    truth_residues = None
    if truth_voxels is not None:
        truth_residues = _voxels_to_residue_ids(
            structure, truth_voxels, voxel_atom_ratio)
        cmd += (
            "select true_binding_site, resi {true_resi}\n"
            "color orange, true_binding_site\n"
        ).format(true_resi="+".join(truth_residues))

    predicted_residues = None
    if predicted_voxels is not None:
        predicted_residues = _voxels_to_residue_ids(
            structure, predicted_voxels, voxel_atom_ratio)
        # Join the PREDICTED residues. This used to join the truth residues,
        # and raised NameError when no truth voxels were given.
        cmd += (
            "select predicted_binding_site, resi {predicted_resi}\n"
            "color magenta, predicted_binding_site\n"
        ).format(predicted_resi="+".join(predicted_residues))

    if truth_residues is not None and predicted_residues is not None:
        # Sort numerically so the script is identical between runs.
        false_positive_residues = sorted(
            set(predicted_residues) - set(truth_residues), key=int)
        cmd += (
            "select false_positive_binding_site, resi {fp_resi}\n"
            "color blue, false_positive_binding_site\n"
        ).format(fp_resi="+".join(false_positive_residues))

    with open("{}_pymol.cmd".format(id), "w") as f:
        f.write(cmd + "\n")