def test_read_text(self):
    """Parse the generated test text file and compare it to the known answer.

    Every key of the expected data must be present in the parsed result
    with an equal value; extra keys in the parsed result are not checked.
    """
    txt_path = self.make_test_txt()
    parsed = ReadText().parse_txt(txt_path)
    expected = self.get_parsed_test_data()
    for field, expected_value in expected.items():
        self.assertEqual(parsed[field], expected_value)
def run_scientific_protein_prep(self, challenge_data_path, pdb_protein_path, working_folder):
    """Run scientific protein prep on every candidate receptor of a challenge week.

    The method works in two passes:

    1. For each target directory of the (first) week reported by
       ``ChallengeData.get_targets()``, create ``<working_folder>/<target id>``
       and copy in ``<target id>.txt``, ``center.txt`` and every candidate
       file matching ``*-<target id>_*.pdb`` (ligand files containing
       ``lig.pdb`` are ignored).
    2. For each copied candidate, create a per-candidate directory, run
       ``self.receptor_scientific_prep`` inside it and, when a non-empty
       output file was produced, copy that file up into the target directory.

    Failures of individual candidates are logged and skipped.

    Args:
        challenge_data_path: Path to the challenge data directory.
        pdb_protein_path: Unused by this method; kept for interface
            compatibility.
        working_folder: Directory in which the target/candidate directories
            are created. The process working directory is changed to it.

    Returns:
        ``False`` if the challenge data directory is not valid for CELPP,
        otherwise ``None``.

    Side effects:
        Changes the current working directory; on normal completion it is
        left at ``working_folder``.
    """
    abs_challenge_data_path = os.path.abspath(challenge_data_path)
    chal_data_obj = ChallengeData(abs_challenge_data_path)
    if not chal_data_obj.is_valid_for_celpp():
        logging.info(
            '%s is not a valid CELPP challenge data directory. Unable to run protein prep.',
            abs_challenge_data_path)
        return False

    week_chal_data_dict = chal_data_obj.get_targets()
    # list() keeps the indexing valid when keys() returns a view object
    # instead of a list.
    week_name = list(week_chal_data_dict.keys())[0]
    abs_week_path = os.path.join(abs_challenge_data_path, week_name)
    pot_target_dirs = week_chal_data_dict[week_name]

    os.chdir(working_folder)
    current_dir_layer_1 = os.getcwd()

    ## Get all potential target directories and candidates within
    valid_candidates = {}

    # Ensure that the challengedata targets are valid and copy in files
    for pot_target_dir in pot_target_dirs:
        os.chdir(current_dir_layer_1)
        pot_target_id = os.path.basename(pot_target_dir.strip('/'))

        # Does it look like a pdb id?
        if len(pot_target_id) != 4:
            logging.info(
                'Filtering potential target directories: %s is not 4 characters long. Skipping',
                pot_target_id)
            continue

        os.mkdir(pot_target_id)
        valid_candidates[pot_target_id] = []
        target_dir_path = os.path.join(abs_week_path, pot_target_id)

        # Copy in <targ id>.txt file
        targ_info_basename = pot_target_id + '.txt'
        shutil.copyfile(os.path.join(target_dir_path, targ_info_basename),
                        os.path.join(pot_target_id, targ_info_basename))

        # Copy in center.txt file
        shutil.copyfile(os.path.join(target_dir_path, 'center.txt'),
                        os.path.join(pot_target_id, 'center.txt'))

        # Copy in each valid candidate
        for candidate_file in glob.glob('%s/*-%s_*.pdb' % (target_dir_path, pot_target_id)):
            # The LMCSS ligand will be in a pdb file called something like
            # celpp_week19_2016/1fcz/LMCSS-1fcz_1fcz-156-lig.pdb
            # We want to make sure we don't treat this like a receptor
            if 'lig.pdb' in candidate_file:
                continue
            candidate_file_basename = os.path.basename(candidate_file)
            shutil.copyfile(candidate_file,
                            os.path.join(pot_target_id, candidate_file_basename))
            valid_candidates[pot_target_id].append(candidate_file_basename)

    for target_id in valid_candidates:
        os.chdir(current_dir_layer_1)
        os.chdir(target_id)
        current_dir_layer_2 = os.getcwd()

        targ_info_dict = ReadText().parse_txt(target_id + '.txt')

        for candidate_filename in valid_candidates[target_id]:
            os.chdir(current_dir_layer_2)

            ## Parse the candidate name
            ## Get the method type, target, and candidate info from the filename
            # for example, this will parse 'hiResApo-5hib_2eb2.pdb' into
            # [('hiResApo', '5hib', '2eb2', '')]
            parsed_name = re.findall(
                r'([a-zA-Z0-9]+)-([a-zA-Z0-9]+)_([a-zA-Z0-9]+)-?([a-zA-Z0-9]*)\.pdb',
                candidate_filename)
            if len(parsed_name) != 1:
                logging.info(
                    'Failed to parse docked structure name "%s". Parsing yielded %r',
                    candidate_filename, parsed_name)
                continue

            (candidate_structure_type,
             candidate_structure_target,
             candidate_structure_candidate) = parsed_name[0][:3]
            candidate_prefix = '%s-%s_%s' % (candidate_structure_type,
                                             candidate_structure_target,
                                             candidate_structure_candidate)

            # Make candidate prep directory
            os.mkdir(candidate_prefix)

            # Copy in raw candidate file
            shutil.copyfile(candidate_filename,
                            os.path.join(candidate_prefix, candidate_filename))

            # Copy in center file
            shutil.copyfile('center.txt',
                            os.path.join(candidate_prefix, 'center.txt'))

            # Move into candidate prep directory
            os.chdir(candidate_prefix)

            # Run prep
            prepared_protein_file = "%s_prepared%s" % (
                candidate_prefix, ProteinPrep.OUTPUT_PROTEIN_SUFFIX)
            try:
                preparation_result = self.receptor_scientific_prep(
                    candidate_filename,
                    prepared_protein_file,
                    targ_info_dict=targ_info_dict)
            except Exception:
                # Any failure in the user-supplied prep only skips this
                # candidate; KeyboardInterrupt/SystemExit still propagate.
                logging.info(traceback.format_exc())
                logging.info(
                    'try/except statement caught error in scientific protein prep. Skipping candidate %s',
                    candidate_prefix)
                continue

            # Equality (not identity) is kept on purpose so that any value
            # comparing equal to False is still treated as a failure.
            if preparation_result == False:  # noqa: E712
                logging.info("Unable to prepare this protein:%s", candidate_filename)
                continue

            if not os.path.exists(prepared_protein_file):
                logging.info(
                    'Expected output file %s does not exist after protein preparation. Assuming that protein prep failed. Skipping candidate %s',
                    prepared_protein_file, candidate_prefix)
                continue

            if os.path.getsize(prepared_protein_file) == 0:
                logging.info(
                    'Expected output file %s has size 0. Assuming that protein prep failed. Skipping candidate %s',
                    prepared_protein_file, candidate_prefix)
                continue

            # Publish the prepared receptor one level up, in the target directory
            shutil.copyfile(prepared_protein_file,
                            os.path.join(current_dir_layer_2, prepared_protein_file))
            logging.info("Successfully prepared this protein:%s", prepared_protein_file)

    os.chdir(current_dir_layer_1)
def run_scientific_ligand_prep(self, challenge_data_path, pdb_protein_path, working_folder):
    """Run scientific ligand prep for every target of a challenge week.

    For each target directory of the (first) week reported by
    ``ChallengeData.get_targets()``, a ``<working_folder>/<target id>``
    directory is created and ``<target id>.txt`` plus the single
    ``lig_*.smi`` file are copied in. Targets without exactly one SMILES
    file are skipped (their directory and .txt copy remain). Then
    ``self.ligand_scientific_prep`` is run inside each remaining target
    directory and its output file is checked for existence and non-zero
    size. Failures of individual targets are logged and skipped.

    Args:
        challenge_data_path: Path to the challenge data directory.
        pdb_protein_path: Unused by this method; kept for interface
            compatibility.
        working_folder: Directory in which the target directories are
            created. The process working directory is changed to it.

    Returns:
        ``False`` if the challenge data directory is not valid for CELPP,
        otherwise ``None``.

    Side effects:
        Changes the current working directory and does not restore it; after
        the prep loop it is left in the last processed target directory.
    """
    abs_challenge_data_path = os.path.abspath(challenge_data_path)
    chal_data_obj = ChallengeData(abs_challenge_data_path)
    if not chal_data_obj.is_valid_for_celpp():
        logging.info(
            '%s is not a valid CELPP challenge data directory. Unable to run ligand prep.',
            abs_challenge_data_path)
        return False

    week_chal_data_dict = chal_data_obj.get_targets()
    # list() keeps the indexing valid when keys() returns a view object
    # instead of a list.
    week_name = list(week_chal_data_dict.keys())[0]
    abs_week_path = os.path.join(abs_challenge_data_path, week_name)
    pot_target_dirs = week_chal_data_dict[week_name]

    os.chdir(working_folder)
    current_dir_layer_1 = os.getcwd()

    ## Get all potential target directories and candidates within
    # Maps target id -> basename of its ligand SMILES file
    valid_targets = {}

    # Ensure that the challengedata targets are valid and copy in files
    for pot_target_dir in pot_target_dirs:
        os.chdir(current_dir_layer_1)
        pot_targ_id = os.path.basename(pot_target_dir.strip('/'))

        # Does it look like a pdb id?
        if len(pot_targ_id) != 4:
            logging.info(
                'Filtering potential target directories: %s is not 4 characters long. Skipping',
                pot_targ_id)
            continue

        os.mkdir(pot_targ_id)
        target_dir_path = os.path.join(abs_week_path, pot_targ_id)

        # Copy in <targ id>.txt file
        targ_info_basename = pot_targ_id + '.txt'
        shutil.copyfile(os.path.join(target_dir_path, targ_info_basename),
                        os.path.join(pot_targ_id, targ_info_basename))

        # Pull in the ligand inchi/smiles
        lig_smiles_files = glob.glob('%s/lig_*.smi' % (target_dir_path))
        if len(lig_smiles_files) != 1:
            logging.info(
                'Unable to find unambiguous ligand smiles for %s - glob returned %r',
                pot_targ_id, lig_smiles_files)
            continue

        lig_smiles_file = lig_smiles_files[0]
        local_smiles_file = os.path.basename(lig_smiles_file)
        shutil.copyfile(lig_smiles_file,
                        os.path.join(pot_targ_id, local_smiles_file))
        valid_targets[pot_targ_id] = local_smiles_file

    for target_id, smiles_filename in valid_targets.items():
        os.chdir(current_dir_layer_1)
        os.chdir(target_id)

        # Parse the <targ id>.txt file
        targ_info_dict = ReadText().parse_txt(target_id + '.txt')

        # Prepare the ligand
        lig_prefix = smiles_filename.replace('.smi', '')
        prepared_lig_file = '%s_prepared%s' % (
            lig_prefix, LigandPrep.OUTPUT_LIG_SUFFIX)
        try:
            lig_prep_result = self.ligand_scientific_prep(
                smiles_filename,
                prepared_lig_file,
                targ_info_dict=targ_info_dict)
        except Exception:
            # Any failure in the user-supplied prep only skips this target;
            # KeyboardInterrupt/SystemExit still propagate.
            logging.info(sys.exc_info())
            logging.info(
                "try/except caught error in ligand_scientific_prep function. Skipping target %s",
                target_id)
            continue

        # Equality (not identity) is kept on purpose so that any value
        # comparing equal to False is still treated as a failure.
        if lig_prep_result == False:  # noqa: E712
            logging.info(
                "Unable to prepare the ligand for this target protein: %s. Skipping",
                target_id)
            continue

        if not os.path.exists(prepared_lig_file):
            logging.info(
                'Expected output file %s does not exist. Assuming that ligand prep failed. Skipping target %s',
                prepared_lig_file, target_id)
            continue

        if os.path.getsize(prepared_lig_file) == 0:
            logging.info(
                'Expected output file %s has size 0. Assuming that ligand prep failed. Skipping candidate %s',
                prepared_lig_file, target_id)
            continue

        logging.info("Successfully prepared ligand %s for target %s",
                     lig_prefix, target_id)
def run_dock(self, prot_sci_prep_dir, lig_sci_prep_dir, dock_dir):
    """Run technical prep and docking for every prepared target/candidate pair.

    The method works in two phases:

    1. Discovery: every four-character subdirectory of ``prot_sci_prep_dir``
       is treated as a target. A target is marked valid once a pocket
       center, a single scientifically prepped ligand, a parseable ligand
       filename and at least one candidate protein file have been found.
    2. Processing: for each valid target (sorted by name) a directory is
       created under ``dock_dir``; ``self.ligand_technical_prep`` is run
       once per target, then for each candidate
       ``self.receptor_technical_prep`` and ``self.dock`` are run in their
       own subdirectories. Successful docking results are copied up into
       the target's docking directory.

    Failures of individual targets or candidates are logged and skipped.

    Args:
        prot_sci_prep_dir: Directory holding the per-target scientific
            protein prep results.
        lig_sci_prep_dir: Directory holding the per-target scientific
            ligand prep results (also the source of ``<target id>.txt``).
        dock_dir: Existing directory in which the docking directories are
            created.

    Returns:
        None.

    Side effects:
        Changes the current working directory and does not restore it.
    """
    abs_lig_sci_prep_dir = os.path.abspath(lig_sci_prep_dir)
    abs_prot_sci_prep_dir = os.path.abspath(prot_sci_prep_dir)
    abs_dock_dir = os.path.abspath(dock_dir)

    targ_prot_prep_dirs = glob.glob('%s/????' % (abs_prot_sci_prep_dir))

    # ---- Phase 1: discover valid targets and their candidates ----
    targ_dic = {}
    for targ_prot_prep_dir in targ_prot_prep_dirs:
        targ_name = os.path.basename(targ_prot_prep_dir.rstrip('/'))
        targ_lig_prep_dir = os.path.join(abs_lig_sci_prep_dir, targ_name)

        logging.info(
            "============= Starting to process target:%s =============",
            targ_name)

        # Until we find a good ligand and cand receptor, mark this as invalid
        targ_dic[targ_name] = {'valid_targ': False}

        # Get the binding pocket center
        pocket_center = self.get_pocket_center(targ_prot_prep_dir)
        if pocket_center == False:  # noqa: E712 - helper signals failure with False
            logging.info(
                'Failed to find pocket center file in dirctory %s. Skipping target %s.',
                targ_prot_prep_dir, targ_name)
            continue

        # Get the ligand name in this directory
        sci_prepped_lig_file = self.get_sci_prepped_lig(
            targ_lig_prep_dir, Dock.SCI_PREPPED_LIG_SUFFIX)
        if sci_prepped_lig_file == False:  # noqa: E712
            logging.info(
                'Unable to find single ligand for target dir %s. Skipping target %s.',
                targ_lig_prep_dir, targ_name)
            continue

        # Process ligand file names
        lig_prefix = self.parse_lig_filename(sci_prepped_lig_file)
        if lig_prefix == False:  # noqa: E712
            logging.info(
                'Unable to parse ligand filename %s. Skipping target %s.',
                sci_prepped_lig_file, targ_name)
            # Actually skip the target, as the log message announces.
            continue

        # Get the cand protein names in this directory
        potential_cand_proteins = glob.glob(
            '%s/*-????_????%s' % (targ_prot_prep_dir,
                                  Dock.SCI_PREPPED_PROT_SUFFIX))

        # Process potential cand protein names
        for potential_cand_protein in potential_cand_proteins:
            potential_cand_basename = os.path.basename(potential_cand_protein)
            category, targ_id, cand_id = self.parse_cand_name(
                potential_cand_basename)
            targ_txt_file = os.path.join(targ_lig_prep_dir, targ_id + '.txt')

            # If this is the first valid cand for this target, make a new
            # dictionary key and mark the target as valid
            targ_entry = targ_dic[targ_name]
            if 'valid_cands' not in targ_entry:
                targ_entry['valid_targ'] = True
                targ_entry['prot_prep_dir'] = targ_prot_prep_dir
                targ_entry['lig_prep_dir'] = targ_lig_prep_dir
                targ_entry['pocket_center'] = pocket_center
                targ_entry['targ_txt_file'] = targ_txt_file
                targ_entry['lig_file'] = sci_prepped_lig_file
                targ_entry['lig_prefix'] = lig_prefix
                targ_entry['valid_cands'] = []
            targ_entry['valid_cands'].append(
                (potential_cand_protein, category, targ_id, cand_id))

    os.chdir(abs_dock_dir)

    # Print out all of the target/cands we will dock to
    logging.info('targ_dic is: ')
    for targ in targ_dic:
        logging.info('%s: %s', targ, targ_dic[targ])

    # Generate a list of targets for docking
    targ_names = sorted(
        name for name in targ_dic if targ_dic[name]['valid_targ'])

    # ---- Phase 2: technical prep and docking ----
    # Begin populating the target directory
    os.chdir(abs_dock_dir)
    for targ_name in targ_names:
        targ_entry = targ_dic[targ_name]
        pocket_center = targ_entry['pocket_center']
        os.chdir(abs_dock_dir)

        # Make the target directory
        os.mkdir(targ_name)
        os.chdir(targ_name)
        abs_targ_dock_dir = os.getcwd()

        # Copy the targ.txt file in
        copy_dest = '%s/%s' % (abs_targ_dock_dir,
                               os.path.basename(targ_entry['targ_txt_file']))
        shutil.copyfile(targ_entry['targ_txt_file'], copy_dest)

        # Parse the targ.txt file
        targ_info_dict = ReadText().parse_txt(copy_dest)

        #### Run CELPPade technical prep
        ### Ligand technical prep
        ## Ligand technical prep setup
        lig_tech_prep_dir = 'lig_%s_tech_prep' % (targ_entry['lig_prefix'])
        os.mkdir(lig_tech_prep_dir)

        # Copy the sci prepped ligand in
        lig_base_filename = os.path.basename(targ_entry['lig_file'])
        copy_dest = '%s/%s' % (lig_tech_prep_dir, lig_base_filename)
        shutil.copyfile(targ_entry['lig_file'], copy_dest)
        os.chdir(lig_tech_prep_dir)

        ## Call user-defined ligand technical prep
        try:
            tech_prepped_lig_file_list = self.ligand_technical_prep(
                lig_base_filename, targ_info_dict=targ_info_dict)
        except Exception:
            # Any failure in the user-supplied prep only skips this target;
            # KeyboardInterrupt/SystemExit still propagate.
            logging.info(sys.exc_info())
            # The message has a single placeholder, so pass a single value
            # (the previous extra argument raised TypeError in this handler).
            logging.info(
                'try/except statement caught error in function lig_technical_prep. Skipping target %s.',
                targ_name)
            continue

        ## Ensure that ligand technical prep was successful
        # Check for function-reported failure
        if tech_prepped_lig_file_list == False:  # noqa: E712
            logging.info(
                'Technical ligand preparation failed on %s. Skipping target %s.',
                os.path.abspath(lig_base_filename), targ_name)
            continue

        # Ensure that ligand technical prep returns a list of filenames
        if not isinstance(tech_prepped_lig_file_list, list):
            logging.info(
                'Technical ligand preparation for %s did not return a list of filenames. Skipping target %s.',
                os.path.abspath(lig_base_filename), targ_name)
            continue

        # Ensure that all files in list really exist; a missing file skips
        # the whole target rather than just the check loop.
        missing_lig_files = [
            filename for filename in tech_prepped_lig_file_list
            if not os.path.exists(filename)
        ]
        if missing_lig_files:
            for filename in missing_lig_files:
                logging.info(
                    'Technical ligand preparation for %s returned file list %r, but file %s does not exist. Skipping target %s.',
                    os.path.abspath(lig_base_filename),
                    tech_prepped_lig_file_list, filename, targ_name)
            continue

        ## Prepare to copy these files for later
        tech_prepped_lig_file_list = [
            os.path.abspath(i) for i in tech_prepped_lig_file_list
        ]
        targ_entry['tech_prepped_lig_files'] = tech_prepped_lig_file_list
        logging.info(
            'Technical ligand prep successful for %s. All files exist from returned list %r. ',
            os.path.abspath(lig_base_filename), tech_prepped_lig_file_list)

        ### Candidate tech prep
        for cand_file, category, targ_id, cand_id in targ_entry['valid_cands']:
            os.chdir(abs_targ_dock_dir)

            ## Candidate tech prep setup
            cand_tech_prep_dir = '%s_%s_tech_prep/' % (category, cand_id)
            os.mkdir(cand_tech_prep_dir)
            prot_base_filename = os.path.basename(cand_file)
            copy_dest = '%s/%s' % (cand_tech_prep_dir, prot_base_filename)
            shutil.copyfile(cand_file, copy_dest)
            os.chdir(cand_tech_prep_dir)

            ## Call user-defined protein technical prep
            try:
                tech_prepped_prot_file_list = self.receptor_technical_prep(
                    prot_base_filename,
                    pocket_center,
                    targ_info_dict=targ_info_dict)
            except Exception:
                logging.info(sys.exc_info())
                # Two placeholders, two values (the previous extra argument
                # raised TypeError in this handler).
                logging.info(
                    'try/except statement caught error in function receptor_technical_prep. Skipping candidate %s for target %s.',
                    cand_id, targ_name)
                continue

            ## Ensure that receptor technical prep was successful
            # Check for function-reported failure
            if tech_prepped_prot_file_list == False:  # noqa: E712
                logging.info(
                    'Technical protein preparation failed on %s. Skipping candidate %s for target %s.',
                    os.path.abspath(prot_base_filename), cand_id, targ_name)
                continue

            # Ensure that receptor technical prep returns a list of filenames
            if not isinstance(tech_prepped_prot_file_list, list):
                logging.info(
                    'Technical protein preparation for %s did not return a list of filenames. Skipping candidate %s for target %s.',
                    os.path.abspath(prot_base_filename), cand_id, targ_name)
                continue

            # Ensure that all files in list really exist; a missing file
            # skips this candidate. The message reports the protein file and
            # protein file list (not the ligand ones).
            missing_prot_files = [
                filename for filename in tech_prepped_prot_file_list
                if not os.path.exists(filename)
            ]
            if missing_prot_files:
                for filename in missing_prot_files:
                    logging.info(
                        'Technical protein preparation for %s returned file list %r, but file %s does not exist. Skipping candidate %s for target %s.',
                        os.path.abspath(prot_base_filename),
                        tech_prepped_prot_file_list, filename, cand_id,
                        targ_name)
                continue

            ## Prepare to copy these files for docking step
            tech_prepped_prot_file_list = [
                os.path.abspath(i) for i in tech_prepped_prot_file_list
            ]
            logging.info(
                'Protein technical prep successful for %s. All files exist from returned list %r.',
                cand_file, tech_prepped_prot_file_list)

            #### Run CELPPade docking
            os.chdir(abs_targ_dock_dir)
            cand_dock_dir = '%s_%s_docking' % (category, cand_id)
            os.mkdir(cand_dock_dir)
            os.chdir(cand_dock_dir)

            ## Prepare expected file names
            output_receptor_pdb = '%s-%s_%s_docked.pdb' % (category, targ_id,
                                                          cand_id)
            output_lig_mol = '%s-%s_%s_docked.mol' % (category, targ_id,
                                                      cand_id)

            ## Copy in tech prepped files
            for filename in tech_prepped_lig_file_list:
                shutil.copyfile(filename, os.path.basename(filename))
            for filename in tech_prepped_prot_file_list:
                shutil.copyfile(filename, os.path.basename(filename))

            ## Do the actual docking
            try:
                dock_results = self.dock(tech_prepped_lig_file_list,
                                         tech_prepped_prot_file_list,
                                         output_receptor_pdb,
                                         output_lig_mol,
                                         targ_info_dict=targ_info_dict)
            except Exception:
                logging.info(sys.exc_info())
                logging.info(
                    'try/except statement caught error in dock() function. Docking was given '
                    'inputs tech_prepped_lig_file_list=%r tech_prepped_prot_file_list=%r '
                    'output_receptor_pdb=%r output_lig_mol=%r. Skipping docking to this '
                    'candidate.',
                    tech_prepped_lig_file_list, tech_prepped_prot_file_list,
                    output_receptor_pdb, output_lig_mol)
                continue

            ## Check for success
            # Check for self-reported failure
            if dock_results == False:  # noqa: E712
                logging.info(
                    'Docking returned False given inputs: tech_prepped_lig_file_list=%r '
                    'tech_prepped_prot_file_list=%r output_receptor_pdb=%r output_lig_mol=%r. '
                    'Skipping docking to this candidate.',
                    tech_prepped_lig_file_list, tech_prepped_prot_file_list,
                    output_receptor_pdb, output_lig_mol)
                continue

            # Ensure that correct output files exist
            if (not os.path.exists(output_receptor_pdb)
                    or os.path.getsize(output_receptor_pdb) == 0):
                logging.info(
                    'Docking did not create receptor pdb file %s given inputs: '
                    'tech_prepped_lig_file_list=%r tech_prepped_prot_file_list=%r '
                    'output_receptor_pdb=%r output_lig_mol=%r. Skipping docking '
                    'to this candidate.',
                    output_receptor_pdb, tech_prepped_lig_file_list,
                    tech_prepped_prot_file_list, output_receptor_pdb,
                    output_lig_mol)
                continue

            if (not os.path.exists(output_lig_mol)
                    or os.path.getsize(output_lig_mol) == 0):
                logging.info(
                    'Docking did not create ligand mol file %s given inputs: '
                    'tech_prepped_lig_file_list=%r tech_prepped_prot_file_list=%r '
                    'output_receptor_pdb=%r output_lig_mol=%r. Skipping docking '
                    'to this candidate.',
                    output_lig_mol, tech_prepped_lig_file_list,
                    tech_prepped_prot_file_list, output_receptor_pdb,
                    output_lig_mol)
                continue

            logging.info(
                'Docking was successful for %s. Final receptor and ligand '
                'files %s and %s exist and are nonzero size.',
                cand_file, output_receptor_pdb, output_lig_mol)

            # Prepare to copy docking results into final result directory
            abs_output_receptor_pdb = os.path.abspath(output_receptor_pdb)
            abs_output_lig_mol = os.path.abspath(output_lig_mol)

            # Copy the files one directory up
            os.chdir(abs_targ_dock_dir)
            shutil.copyfile(abs_output_receptor_pdb, output_receptor_pdb)
            shutil.copyfile(abs_output_lig_mol, output_lig_mol)