def featurize_dude(dude_dir, target, pickle_dir, num_jobs):
    """Featurize DUD-E docked poses and write the features to a gzipped pickle.

    The receptor for ``target`` is loaded once, then every docked pose of
    each active compound is turned into a feature vector with
    ``Binana.compute_input_vector``. The result, a dict mapping compound name
    to the list of per-pose feature vectors, is pickled to
    ``<dude_dir>/<target>/out.pkl.gz``.

    Parameters
    ----------
    dude_dir: string
      Path to DUD-E directory
    target: string
      Name of DUD-E target. ``<dude_dir>/<target>`` must contain ``actives``
      and ``decoys`` directories plus ``receptor_hyd.pdb`` and
      ``receptor_hyd.pdbqt``.
    pickle_dir: string
      Path to directory to output pickles. NOTE(review): currently unused;
      the pickle is always written inside the target directory.
    num_jobs: int
      Number of jobs the actives would be split over. Only used to compute
      and print the per-job compound count; must be non-zero.
    """
    target_dir = os.path.join(dude_dir, target)
    actives_dir = os.path.join(target_dir, "actives")
    decoys_dir = os.path.join(target_dir, "decoys")
    actives = os.listdir(actives_dir)
    decoys = os.listdir(decoys_dir)
    pickle_out = os.path.join(target_dir, "out.pkl.gz")
    # Just for debugging purposes
    # NOTE(review): while this truncation is in place only the first active
    # is featurized.
    actives = actives[:1]

    num_per_job = int(math.ceil(len(actives) / float(num_jobs)))
    print("Number per job: %d" % num_per_job)

    # Load the receptor belonging to the requested target rather than a fixed
    # absolute path, so that targets other than aa2ar use their own protein.
    protein_pdb_path = os.path.join(target_dir, "receptor_hyd.pdb")
    protein_pdbqt_path = os.path.join(target_dir, "receptor_hyd.pdbqt")

    print("About to load protein from input files")
    protein_pdb_obj = PDB()
    protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

    binana = Binana()
    # Not used for validation in this function yet.
    feature_len = binana.num_features()
    feature_vectors = {}
    for compound in actives:
        # The compound name is everything before the first "." of the entry.
        compound_name = compound.split(".")[0]
        compound_pdbqt = compound_name + "_hyd_out.pdbqt"
        compound_pdbqt = os.path.join(actives_dir, compound_pdbqt)

        # Convert the pdbqt to pdb
        pdbqt_to_pdb(compound_pdbqt, actives_dir)
        # Presumably the file produced by the conversion above -- TODO confirm.
        compound_pdb = compound_name + "_hyd_out.pdb"
        compound_pdb = os.path.join(actives_dir, compound_pdb)

        structures = MultiStructure()
        structures.load_from_files(compound_pdb, compound_pdbqt)

        # One feature vector per loaded molecule, in sorted key order.
        vectors = []
        for key in sorted(structures.molecules.keys()):
            structure = structures.molecules[key]
            print("type(structure)")
            print(type(structure))
            vectors.append(
                binana.compute_input_vector(structure, protein_pdb_obj))
        feature_vectors[compound_name] = vectors

    with gzip.open(pickle_out, "wb") as f:
        pickle.dump(feature_vectors, f)

    # Decoys are listed and truncated but not featurized yet.
    decoys = decoys[:1]
def featurize_fingerprint(pdb_directories, pickle_out):
    """Featurize all pdbs in provided directories.

    For each directory the hydrogenated ligand and protein ``.pdb`` /
    ``.pdbqt`` files are located by filename suffix, loaded into ``PDB``
    objects and featurized with ``Binana.compute_input_vector``. The ligand
    SMILES string (``Chem.MolToSmiles``) and the list of protein residue
    names (from ``md.load``) are stored alongside the features. Directories
    whose ligand cannot be parsed by ``Chem.MolFromPDBFile`` are skipped.

    Parameters
    ----------
    pdb_directories: iterable of string
      Directories that each contain files ending in ``_ligand_hyd.pdb``,
      ``_ligand_hyd.pdbqt``, ``_protein_hyd.pdb`` and ``_protein_hyd.pdbqt``.
    pickle_out: string
      Path of the (uncompressed) pickle file to write. It receives a dict
      mapping each directory to a ``(features, smiles, seq)`` tuple.

    Raises
    ------
    ValueError
      If one of the four input files is missing from a directory, or if the
      computed feature vector does not have ``binana.num_features()`` entries.
    """
    # Instantiate copy of binana vector
    binana = Binana()
    # See features/tests/nnscore_test.py:TestBinana.testComputeInputVector
    # for derivation.
    feature_len = binana.num_features()
    feature_vectors = {}
    for count, pdb_dir in enumerate(pdb_directories):
        # Report the directory being processed, not the ``dir`` builtin.
        print("\nprocessing %d-th pdb %s" % (count, pdb_dir))

        print("About to extract ligand and protein input files")
        ligand_pdb, ligand_pdbqt = None, None
        protein_pdb, protein_pdbqt = None, None
        for f in os.listdir(pdb_dir):
            # Exact suffix tests; the "." is matched literally.
            if f.endswith("_ligand_hyd.pdb"):
                ligand_pdb = f
            elif f.endswith("_ligand_hyd.pdbqt"):
                ligand_pdbqt = f
            elif f.endswith("_protein_hyd.pdb"):
                protein_pdb = f
            elif f.endswith("_protein_hyd.pdbqt"):
                protein_pdbqt = f

        print("Extracted Input Files:")
        # Printed as a tuple on purpose.
        print((ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt))
        if not (ligand_pdb and ligand_pdbqt and protein_pdb and protein_pdbqt):
            raise ValueError("Required files not present for %s" % pdb_dir)
        ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
        ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt)
        protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
        protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt)

        print("About to load ligand from input files")
        ligand_pdb_obj = PDB()
        ligand_pdb_obj.load_from_files(ligand_pdb_path, ligand_pdbqt_path)

        print("About to load protein from input files")
        protein_pdb_obj = PDB()
        protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

        print("About to generate feature vector.")
        features = binana.compute_input_vector(ligand_pdb_obj,
                                               protein_pdb_obj)
        if len(features) != feature_len:
            raise ValueError("Feature length incorrect on %s" % pdb_dir)
        print("Feature vector generated correctly.")

        print("About to compute ligand smiles string.")
        ligand_mol = Chem.MolFromPDBFile(ligand_pdb_path)
        # TODO(rbharath): Why does this fail sometimes?
        if ligand_mol is None:
            # Best-effort: skip this directory, but say so instead of
            # dropping it silently.
            print("Could not parse ligand %s; skipping." % ligand_pdb_path)
            continue
        smiles = Chem.MolToSmiles(ligand_mol)

        print("About to compute sequence.")
        protein = md.load(protein_pdb_path)
        seq = [r.name for r in protein.top.residues]

        # Write the computed quantities
        feature_vectors[pdb_dir] = (features, smiles, seq)

    print("About to write pickle to " + pickle_out)
    with open(pickle_out, "wb") as f:
        pickle.dump(feature_vectors, f)
def featurize_fingerprint(pdb_directories, pickle_out):
    """Featurize all pdbs in provided directories.

    Every directory is expected to hold one hydrogenated ligand and one
    hydrogenated protein, each as a ``.pdb`` and a ``.pdbqt`` file. Both are
    loaded into ``PDB`` objects, and ``Binana.compute_input_vector`` produces
    the feature vector. The ligand SMILES (``Chem.MolToSmiles``) and the
    protein residue names (via ``md.load``) are recorded with it. A directory
    is skipped when ``Chem.MolFromPDBFile`` returns ``None`` for its ligand.

    Parameters
    ----------
    pdb_directories: iterable of string
      Directories containing files whose names end in ``_ligand_hyd.pdb``,
      ``_ligand_hyd.pdbqt``, ``_protein_hyd.pdb`` and ``_protein_hyd.pdbqt``.
    pickle_out: string
      Path of the gzip-compressed pickle to write. The pickled object is a
      dict mapping each directory to ``(features, smiles, seq)``.

    Raises
    ------
    ValueError
      If a directory lacks one of the four input files, or if a feature
      vector's length differs from ``binana.num_features()``.
    """
    # Instantiate copy of binana vector
    binana = Binana()
    # See features/tests/nnscore_test.py:TestBinana.testComputeInputVector
    # for derivation.
    feature_len = binana.num_features()
    feature_vectors = {}
    for count, pdb_dir in enumerate(pdb_directories):
        # Format the directory path here; ``dir`` is the Python builtin.
        print("\nprocessing %d-th pdb %s" % (count, pdb_dir))

        print("About to extract ligand and protein input files")
        ligand_pdb, ligand_pdbqt = None, None
        protein_pdb, protein_pdbqt = None, None
        for f in os.listdir(pdb_dir):
            # Literal suffix checks instead of regexes with an unescaped ".".
            if f.endswith("_ligand_hyd.pdb"):
                ligand_pdb = f
            elif f.endswith("_ligand_hyd.pdbqt"):
                ligand_pdbqt = f
            elif f.endswith("_protein_hyd.pdb"):
                protein_pdb = f
            elif f.endswith("_protein_hyd.pdbqt"):
                protein_pdbqt = f

        print("Extracted Input Files:")
        # Kept as a single tuple argument so the output stays a tuple repr.
        print((ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt))
        if (not ligand_pdb or not ligand_pdbqt or
                not protein_pdb or not protein_pdbqt):
            raise ValueError("Required files not present for %s" % pdb_dir)
        ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
        ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt)
        protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
        protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt)

        print("About to load ligand from input files")
        ligand_pdb_obj = PDB()
        ligand_pdb_obj.load_from_files(ligand_pdb_path, ligand_pdbqt_path)

        print("About to load protein from input files")
        protein_pdb_obj = PDB()
        protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

        print("About to generate feature vector.")
        features = binana.compute_input_vector(ligand_pdb_obj,
                                               protein_pdb_obj)
        if len(features) != feature_len:
            raise ValueError("Feature length incorrect on %s" % pdb_dir)
        print("Feature vector generated correctly.")

        print("About to compute ligand smiles string.")
        ligand_mol = Chem.MolFromPDBFile(ligand_pdb_path)
        # TODO(rbharath): Why does this fail sometimes?
        if ligand_mol is None:
            # Skipping stays best-effort, but the skip is now reported.
            print("Could not parse ligand %s; skipping." % ligand_pdb_path)
            continue
        smiles = Chem.MolToSmiles(ligand_mol)

        print("About to compute sequence.")
        protein = md.load(protein_pdb_path)
        seq = [r.name for r in protein.top.residues]

        # Write the computed quantities
        feature_vectors[pdb_dir] = (features, smiles, seq)

    print("About to write pickle to " + pickle_out)
    with gzip.open(pickle_out, "wb") as f:
        pickle.dump(feature_vectors, f)
def featurize_dude(dude_dir, target, pickle_dir, num_jobs):
    """Featurize DUD-E docked poses and write the features to a gzipped pickle.

    Loads the receptor of ``target`` once and computes one feature vector
    per docked pose of every active compound using
    ``Binana.compute_input_vector``. A dict mapping compound name to its list
    of feature vectors is pickled to ``<dude_dir>/<target>/out.pkl.gz``.

    Parameters
    ----------
    dude_dir: string
      Path to DUD-E directory
    target: string
      Name of DUD-E target. The directory ``<dude_dir>/<target>`` must hold
      the ``actives`` and ``decoys`` directories as well as
      ``receptor_hyd.pdb`` and ``receptor_hyd.pdbqt``.
    pickle_dir: string
      Path to directory to output pickles. NOTE(review): not used at the
      moment; output always goes into the target directory.
    num_jobs: int
      Number of jobs to split the actives over. Currently only the resulting
      per-job count is printed; must be non-zero.
    """
    target_dir = os.path.join(dude_dir, target)
    actives_dir = os.path.join(target_dir, "actives")
    decoys_dir = os.path.join(target_dir, "decoys")
    actives = os.listdir(actives_dir)
    decoys = os.listdir(decoys_dir)
    pickle_out = os.path.join(target_dir, "out.pkl.gz")
    # Just for debugging purposes
    # NOTE(review): with this slice in place a single active is processed.
    actives = actives[:1]

    num_per_job = int(math.ceil(len(actives) / float(num_jobs)))
    print("Number per job: %d" % num_per_job)

    # Receptor files are resolved relative to the requested target instead of
    # one fixed absolute location, so each target uses its own protein.
    protein_pdb_path = os.path.join(target_dir, "receptor_hyd.pdb")
    protein_pdbqt_path = os.path.join(target_dir, "receptor_hyd.pdbqt")

    print("About to load protein from input files")
    protein_pdb_obj = PDB()
    protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

    binana = Binana()
    # Retained from the original flow; no length check is done here yet.
    feature_len = binana.num_features()
    feature_vectors = {}
    for compound in actives:
        # Name of the compound: the part of the entry before the first ".".
        compound_name = compound.split(".")[0]
        compound_pdbqt = os.path.join(
            actives_dir, compound_name + "_hyd_out.pdbqt")

        # Convert the pdbqt to pdb
        pdbqt_to_pdb(compound_pdbqt, actives_dir)
        # Presumably written by the conversion step above -- TODO confirm.
        compound_pdb = os.path.join(
            actives_dir, compound_name + "_hyd_out.pdb")

        structures = MultiStructure()
        structures.load_from_files(compound_pdb, compound_pdbqt)

        # Featurize every loaded molecule in sorted key order.
        vectors = []
        for key in sorted(structures.molecules.keys()):
            structure = structures.molecules[key]
            print("type(structure)")
            print(type(structure))
            vectors.append(
                binana.compute_input_vector(structure, protein_pdb_obj))
        feature_vectors[compound_name] = vectors

    with gzip.open(pickle_out, "wb") as f:
        pickle.dump(feature_vectors, f)

    # Decoys are only listed and truncated so far; they are not featurized.
    decoys = decoys[:1]