Beispiel #1
0
def featurize_dude(dude_dir, target, pickle_dir, num_jobs):
    """Featurize DUD-E docked actives and pickle the feature vectors.

    Loads the hydrogenated receptor of ``target``, then for each active
    loads its docked ``*_hyd_out.pdbqt`` / ``*_hyd_out.pdb`` pair as a
    MultiStructure and computes one Binana input vector per molecule in it
    (presumably one molecule per docked pose -- confirm against
    MultiStructure). The result, a dict mapping compound name to its list
    of vectors, is written gzip-compressed to
    ``<dude_dir>/<target>/out.pkl.gz``. Decoys are not featurized.

    Parameters
    ----------
    dude_dir: string
      Path to DUD-E directory
    target: string
      Name of DUD-E target.
    pickle_dir: string
      Path to directory to output pickles. NOTE: currently unused; the
      pickle is written into the target directory instead.
    num_jobs: int
      Number of jobs the actives would be split across. Currently only
      used to report the per-job batch size; must be non-zero.
    """
    target_dir = os.path.join(dude_dir, target)
    actives_dir = os.path.join(target_dir, "actives")
    pickle_out = os.path.join(target_dir, "out.pkl.gz")
    # Just for debugging purposes: only the first listed active is used.
    # TODO: drop the slice once the pipeline is validated.
    actives = os.listdir(actives_dir)[:1]

    num_per_job = int(math.ceil(len(actives) / float(num_jobs)))
    print("Number per job: %d" % num_per_job)

    # Resolve the receptor from the requested target rather than from a
    # fixed absolute path, so every target is scored against its own
    # protein. Assumes the hydrogenated receptor files sit directly in the
    # target directory.
    protein_pdb_path = os.path.join(target_dir, "receptor_hyd.pdb")
    protein_pdbqt_path = os.path.join(target_dir, "receptor_hyd.pdbqt")

    print("About to load protein from input files")
    protein_pdb_obj = PDB()
    protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

    binana = Binana()
    feature_vectors = {}
    for compound in actives:
        # Everything before the first "." is treated as the compound name.
        compound_name = compound.split(".")[0]
        compound_pdbqt = os.path.join(
            actives_dir, compound_name + "_hyd_out.pdbqt")

        # Convert the pdbqt to pdb (presumably written next to the pdbqt
        # in actives_dir; the path below relies on that).
        pdbqt_to_pdb(compound_pdbqt, actives_dir)
        compound_pdb = os.path.join(
            actives_dir, compound_name + "_hyd_out.pdb")

        structures = MultiStructure()
        structures.load_from_files(compound_pdb, compound_pdbqt)

        vectors = []
        # Sorted keys keep the vector order deterministic.
        for key in sorted(structures.molecules):
            structure = structures.molecules[key]
            print("type(structure)")
            print(type(structure))
            vectors.append(
                binana.compute_input_vector(structure, protein_pdb_obj))
        feature_vectors[compound_name] = vectors

    with gzip.open(pickle_out, "wb") as f:
        pickle.dump(feature_vectors, f)
def featurize_fingerprint(pdb_directories, pickle_out):
  """Featurize all pdbs in provided directories.

  For each directory the hydrogenated ligand and protein files
  (``*_ligand_hyd.pdb``, ``*_ligand_hyd.pdbqt``, ``*_protein_hyd.pdb``,
  ``*_protein_hyd.pdbqt``) are located and loaded, a Binana input vector is
  computed, a SMILES string is derived from the ligand pdb and the residue
  names of the protein are collected. The results are pickled
  (uncompressed) to ``pickle_out`` as a dict mapping each directory to a
  ``(features, smiles, seq)`` tuple. Directories whose ligand cannot be
  parsed into a molecule are skipped.

  Parameters
  ----------
  pdb_directories: iterable of string
    Directories to featurize.
  pickle_out: string
    Path of the pickle file to write.

  Raises
  ------
  ValueError
    If a directory lacks one of the four input files, or if the computed
    feature vector does not have ``binana.num_features()`` entries.
  """
  # Instantiate copy of binana vector
  binana = Binana()
  # See features/tests/nnscore_test.py:TestBinana.testComputeInputVector
  # for derivation.
  feature_len = binana.num_features()
  feature_vectors = {}
  for count, pdb_dir in enumerate(pdb_directories):
    print("\nprocessing %d-th pdb %s" % (count, pdb_dir))

    print("About to extract ligand and protein input files")
    ligand_pdb, ligand_pdbqt = None, None
    protein_pdb, protein_pdbqt = None, None
    # Literal suffix tests: a regex with an unescaped "." would also accept
    # names such as "x_ligand_hydXpdb". If several files share a suffix the
    # last one listed wins.
    for f in os.listdir(pdb_dir):
      if f.endswith("_ligand_hyd.pdb"):
        ligand_pdb = f
      elif f.endswith("_ligand_hyd.pdbqt"):
        ligand_pdbqt = f
      elif f.endswith("_protein_hyd.pdb"):
        protein_pdb = f
      elif f.endswith("_protein_hyd.pdbqt"):
        protein_pdbqt = f

    print("Extracted Input Files:")
    # Double parentheses: print the names as one tuple.
    print((ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt))
    if not (ligand_pdb and ligand_pdbqt and protein_pdb and protein_pdbqt):
      raise ValueError("Required files not present for %s" % pdb_dir)

    ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
    ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt)
    protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
    protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt)

    print("About to load ligand from input files")
    ligand_pdb_obj = PDB()
    ligand_pdb_obj.load_from_files(ligand_pdb_path, ligand_pdbqt_path)

    print("About to load protein from input files")
    protein_pdb_obj = PDB()
    protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

    print("About to generate feature vector.")
    features = binana.compute_input_vector(ligand_pdb_obj, protein_pdb_obj)
    if len(features) != feature_len:
      raise ValueError("Feature length incorrect on %s" % pdb_dir)
    print("Feature vector generated correctly.")

    print("About to compute ligand smiles string.")
    ligand_mol = Chem.MolFromPDBFile(ligand_pdb_path)
    # TODO(rbharath): Why does this fail sometimes?
    if ligand_mol is None:
      # Report the skip so the missing entry can be traced later.
      print("Skipping %s: ligand pdb could not be parsed" % pdb_dir)
      continue
    smiles = Chem.MolToSmiles(ligand_mol)

    print("About to compute sequence.")
    protein = md.load(protein_pdb_path)
    seq = [r.name for r in protein.top.residues]

    # Write the computed quantities
    feature_vectors[pdb_dir] = (features, smiles, seq)
  print("About to write pickle to " + pickle_out)
  with open(pickle_out, "wb") as f:
    pickle.dump(feature_vectors, f)
Beispiel #3
0
def featurize_fingerprint(pdb_directories, pickle_out):
    """Featurize all pdbs in provided directories.

    For each directory the hydrogenated ligand and protein files
    (``*_ligand_hyd.pdb``, ``*_ligand_hyd.pdbqt``, ``*_protein_hyd.pdb``,
    ``*_protein_hyd.pdbqt``) are located and loaded, a Binana input vector
    is computed, a SMILES string is derived from the ligand pdb and the
    residue names of the protein are collected. The results are pickled,
    gzip-compressed, to ``pickle_out`` as a dict mapping each directory to
    a ``(features, smiles, seq)`` tuple. Directories whose ligand cannot be
    parsed into a molecule are skipped.

    Parameters
    ----------
    pdb_directories: iterable of string
      Directories to featurize.
    pickle_out: string
      Path of the gzip-compressed pickle file to write.

    Raises
    ------
    ValueError
      If a directory lacks one of the four input files, or if the computed
      feature vector does not have ``binana.num_features()`` entries.
    """
    # Instantiate copy of binana vector
    binana = Binana()
    # See features/tests/nnscore_test.py:TestBinana.testComputeInputVector
    # for derivation.
    feature_len = binana.num_features()
    feature_vectors = {}
    for count, pdb_dir in enumerate(pdb_directories):
        print("\nprocessing %d-th pdb %s" % (count, pdb_dir))

        print("About to extract ligand and protein input files")
        ligand_pdb, ligand_pdbqt = None, None
        protein_pdb, protein_pdbqt = None, None
        # Literal suffix tests: a regex with an unescaped "." would also
        # accept names such as "x_ligand_hydXpdb". If several files share a
        # suffix the last one listed wins.
        for f in os.listdir(pdb_dir):
            if f.endswith("_ligand_hyd.pdb"):
                ligand_pdb = f
            elif f.endswith("_ligand_hyd.pdbqt"):
                ligand_pdbqt = f
            elif f.endswith("_protein_hyd.pdb"):
                protein_pdb = f
            elif f.endswith("_protein_hyd.pdbqt"):
                protein_pdbqt = f

        print("Extracted Input Files:")
        # Double parentheses: print the names as one tuple.
        print((ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt))
        if not (ligand_pdb and ligand_pdbqt and protein_pdb
                and protein_pdbqt):
            raise ValueError("Required files not present for %s" % pdb_dir)

        ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
        ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt)
        protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
        protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt)

        print("About to load ligand from input files")
        ligand_pdb_obj = PDB()
        ligand_pdb_obj.load_from_files(ligand_pdb_path, ligand_pdbqt_path)

        print("About to load protein from input files")
        protein_pdb_obj = PDB()
        protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

        print("About to generate feature vector.")
        features = binana.compute_input_vector(ligand_pdb_obj, protein_pdb_obj)
        if len(features) != feature_len:
            raise ValueError("Feature length incorrect on %s" % pdb_dir)
        print("Feature vector generated correctly.")

        print("About to compute ligand smiles string.")
        ligand_mol = Chem.MolFromPDBFile(ligand_pdb_path)
        # TODO(rbharath): Why does this fail sometimes?
        if ligand_mol is None:
            # Report the skip so the missing entry can be traced later.
            print("Skipping %s: ligand pdb could not be parsed" % pdb_dir)
            continue
        smiles = Chem.MolToSmiles(ligand_mol)

        print("About to compute sequence.")
        protein = md.load(protein_pdb_path)
        seq = [r.name for r in protein.top.residues]

        # Write the computed quantities
        feature_vectors[pdb_dir] = (features, smiles, seq)
    print("About to write pickle to " + pickle_out)
    with gzip.open(pickle_out, "wb") as f:
        pickle.dump(feature_vectors, f)
def featurize_dude(dude_dir, target, pickle_dir, num_jobs):
  """Featurize DUD-E docked actives and pickle the feature vectors.

  Loads the hydrogenated receptor of ``target``, then for each active loads
  its docked ``*_hyd_out.pdbqt`` / ``*_hyd_out.pdb`` pair as a
  MultiStructure and computes one Binana input vector per molecule in it
  (presumably one molecule per docked pose -- confirm against
  MultiStructure). The result, a dict mapping compound name to its list of
  vectors, is written gzip-compressed to ``<dude_dir>/<target>/out.pkl.gz``.
  Decoys are not featurized.

  Parameters
  ----------
  dude_dir: string
    Path to DUD-E directory
  target: string
    Name of DUD-E target.
  pickle_dir: string
    Path to directory to output pickles. NOTE: currently unused; the
    pickle is written into the target directory instead.
  num_jobs: int
    Number of jobs the actives would be split across. Currently only used
    to report the per-job batch size; must be non-zero.
  """
  target_dir = os.path.join(dude_dir, target)
  actives_dir = os.path.join(target_dir, "actives")
  pickle_out = os.path.join(target_dir, "out.pkl.gz")
  # Just for debugging purposes: only the first listed active is used.
  # TODO: drop the slice once the pipeline is validated.
  actives = os.listdir(actives_dir)[:1]

  num_per_job = int(math.ceil(len(actives) / float(num_jobs)))
  print("Number per job: %d" % num_per_job)

  # Resolve the receptor from the requested target rather than from a fixed
  # absolute path, so every target is scored against its own protein.
  # Assumes the hydrogenated receptor files sit directly in the target
  # directory.
  protein_pdb_path = os.path.join(target_dir, "receptor_hyd.pdb")
  protein_pdbqt_path = os.path.join(target_dir, "receptor_hyd.pdbqt")

  print("About to load protein from input files")
  protein_pdb_obj = PDB()
  protein_pdb_obj.load_from_files(protein_pdb_path, protein_pdbqt_path)

  binana = Binana()
  feature_vectors = {}
  for compound in actives:
    # Everything before the first "." is treated as the compound name.
    compound_name = compound.split(".")[0]
    compound_pdbqt = os.path.join(
        actives_dir, compound_name + "_hyd_out.pdbqt")

    # Convert the pdbqt to pdb (presumably written next to the pdbqt in
    # actives_dir; the path below relies on that).
    pdbqt_to_pdb(compound_pdbqt, actives_dir)
    compound_pdb = os.path.join(actives_dir, compound_name + "_hyd_out.pdb")

    structures = MultiStructure()
    structures.load_from_files(compound_pdb, compound_pdbqt)

    vectors = []
    # Sorted keys keep the vector order deterministic.
    for key in sorted(structures.molecules):
      structure = structures.molecules[key]
      print("type(structure)")
      print(type(structure))
      vectors.append(
          binana.compute_input_vector(structure, protein_pdb_obj))
    feature_vectors[compound_name] = vectors

  with gzip.open(pickle_out, "wb") as f:
    pickle.dump(feature_vectors, f)