def run_agadir_on_1000_fastas():
    """
    Run Agadir-related processing on every mutant fasta file in the '<startnum>...<endnum>'
    output subdirectory. (startnum and endnum are expected to be defined at module level.)
    """
    path_output_mutants = os.path.join(Paths.OUTPUT, Paths.DIR_MUTANTS_FASTAS.value,
                                       str(startnum) + Str.DOTS3.value + str(endnum))
    path_to_fastas = path_output_mutants + '/**/*' + Str.FSTAEXT.value
    # Recursive glob, naturally sorted so that e.g. 'file2' precedes 'file10'.
    path_fastafile_list = natsort.natsorted(glob.glob(path_to_fastas, recursive=True))
    agadir = Agadir(Cond.INCELL_MAML.value)
    for path_fastafile in path_fastafile_list:
        time.sleep(1)
        if GUM.using_cluster():
            jobname = Paths.PREFIX_WRITE.value + path_fastafile.split('/')[-1]
            path_to_script = os.path.join(Paths.SRC,
                                          'write_1fastafile_per_fasta_from_multifastafile_zeus.py')
            Cluster.write_job_q_bash(jobname, path_job_q_dir=Paths.CONFIG_JOBQ,
                                     python_script_with_paths=path_to_script + Str.SPCE.value +
                                     path_fastafile)
            Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_JOBQ)
        else:
            GUM.write_1_fastafile_per_fasta_from_multifastafile(path_dst=Paths.INPUT,
                                                                path_fastafile=path_fastafile)
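# The natural-sort step above matters because plain lexicographic sorting would put
# 'mutant_10.fasta' before 'mutant_2.fasta'. A minimal, self-contained illustration
# (the filenames here are made up for the example):
import natsort

fasta_paths = ['mutants/mutant_10.fasta', 'mutants/mutant_2.fasta', 'mutants/mutant_1.fasta']
print(sorted(fasta_paths))             # ['mutants/mutant_1.fasta', 'mutants/mutant_10.fasta', 'mutants/mutant_2.fasta']
print(natsort.natsorted(fasta_paths))  # ['mutants/mutant_1.fasta', 'mutants/mutant_2.fasta', 'mutants/mutant_10.fasta']
# The same ordering applies to a recursive glob like the one above:
# natsort.natsorted(glob.glob('output/mutants/**/*.fasta', recursive=True))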
        res2 = regex2.search(filename)
        fxmutname = filename[res.end():res2.end()]
        return fxmutname

    """
    3. Read csv files and write to a single csvfile for mysql dump.
    """
    for path_output_csvfile in path_output_csvfiles:
        csvfilename = os.path.basename(path_output_csvfile)
        pdbname = _extract_pdbname(csvfilename)
        fxmutantname = _extract_fxmutantname(csvfilename)
        if using_cluster:
            path_jobq_dir = GUM.os_makedirs(Paths.CONFIG_BM_JOBQ, pdbname, fxmutantname)
            Cluster.write_job_q_bash(
                jobname=Paths.PREFIX_FX_RM.value + fxmutantname,
                path_job_q_dir=path_jobq_dir,
                python_script_with_paths=os.path.join(
                    Paths.SE_SRC_CLSTR_PYSCRPTS.value,
                    'write_1csvfile_from_csvPerMutantfiles_zeus.py' + Str.SPCE.value +
                    path_output_csvfile + Str.SPCE.value + path_output_ac_or_bm_dir +
                    Str.SPCE.value + pdbname))
            Cluster.run_job_q(path_job_q_dir=path_jobq_dir)
        else:
            GUM.write_1_csvfile_from_csv_per_mutants(path_output_csvfile,
                                                     path_output_ac_or_bm_dir, pdbname)
    # pydevd.stoptrace()
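# A minimal, self-contained sketch of the slicing logic used by _extract_fxmutantname above.
# The filename format and the two regexes are assumptions for illustration only (here,
# csv files named like 'RepairPDB_1_RA1A.csv', with the mutant name sandwiched between
# the pdb name and the extension):
import re

def extract_fxmutantname_sketch(filename: str) -> str:
    regex1 = re.compile(r'RepairPDB_\d+_')  # assumed: matches up to the end of the pdb name
    regex2 = re.compile(r'.*(?=\.csv)')     # assumed: matches everything before '.csv'
    res = regex1.search(filename)
    res2 = regex2.search(filename)
    # slice out the text between the end of the first match and the end of the second
    return filename[res.end():res2.end()]

print(extract_fxmutantname_sketch('RepairPDB_1_RA1A.csv'))  # 'RA1A'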
# path_input_fastafile_list = sys.argv[1].split(';')
path_input_fastafile_list = glob.glob(sys.argv[1] + '/*.fasta')
path_output = sys.argv[2]
path_config_job = sys.argv[3]
path_output_blastp = sys.argv[4]
write_idmaps_for_mysqldb = sys.argv[5] == 'True'
write_csv = sys.argv[6] == 'True'
write_xml = sys.argv[7] == 'True'
write_json = sys.argv[8] == 'True'

for path_fastafile in path_input_fastafile_list:
    with open(path_fastafile) as fastafile_opened:
        fastafile_name = path_fastafile.split('/')[-1].split('.')[0]
        jobname = 'BLSTP_' + fastafile_name
        Cluster.write_job_q_bash(jobname=jobname, path_job_q_dir=path_config_job, queue='all.q',
                                 memory_limit_GB='3', cluster_node='hodor1.vib')
        path_output_blastp_fastaname = GUM.os_makedirs(path_output_blastp, fastafile_name)
        os.chdir(path_output_blastp_fastaname)
        Cluster.run_job_q(path_job_q_dir=path_config_job)
        Cluster.wait_for_grid_engine_job_to_complete(grid_engine_job_prefix_or_full_name=jobname)
        path_raw_blstp_xml = IdProt._write_raw_blast_xml(
            path_output, fastafile_name,
            blastp_result=NCBIWWW.qblast(program=Biopy.BlastParam.BLST_P.value,
                                         database=Biopy.BlastParam.SWSPRT.value,
                                         sequence=fastafile_opened.read(),
                                         entrez_query=Biopy.BlastParam.HOMSAP_ORG.value,
                                         alignments=Biopy.BlastParam.MAX_ALIGN_20.value,
                                         hitlist_size=Biopy.BlastParam.MAX_HIT_20.value))
        blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(path_raw_blstp_xml, fastafile_name,
                                                            path_fastafile)
        # blastp_dict_list.append(blastp_dict)
        if write_idmaps_for_mysqldb:
            IdProt._write_idmaps_for_mysqldb(path_output, blastp_dict, write_csv=write_csv,
                                             write_xml=write_xml, write_json=write_json)
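# For reference, a minimal standalone Biopython blastp query of the kind wrapped above.
# The literal values are assumptions about what the Biopy.BlastParam enums resolve to
# (program 'blastp', database 'swissprot', a human-only entrez filter, 20 alignments/hits):
from Bio.Blast import NCBIWWW, NCBIXML

def run_blastp_sketch(fasta_str: str):
    result_handle = NCBIWWW.qblast(program='blastp',
                                   database='swissprot',
                                   sequence=fasta_str,
                                   entrez_query='"Homo sapiens"[Organism]',
                                   alignments=20,
                                   hitlist_size=20)
    # parse the returned XML into a single Bio.Blast record
    return NCBIXML.read(result_handle)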
def start(operations: dict, use_multithread: bool, path_input: str, path_output: str,
          path_pdbfiles: list, path_fastafiles: list, specific_fxmutants: list,
          amino_acids: list, write_1_fasta_only: bool, write_fasta_per_mut: bool,
          write_to_csv_dumpfile_after_each_mutant: bool):
    """
    Iterate through a list of fasta files or pdb files and perform Agadir and/or FoldX
    computations, as specified by 'operations'.
    :param operations: Each operation paired with a True/False flag indicating whether to perform it.
    :param use_multithread: True to employ parallel processing.
    :param path_input: Absolute path to input_data root dir.
    :param path_output: Absolute path to output_data root dir.
    :param path_pdbfiles: Absolute paths to pdb input files.
    :param path_fastafiles: Absolute paths to fasta input files.
    :param specific_fxmutants: Given when only specific mutants should be calculated.
    :param amino_acids: Amino acids that mutation operations should mutate to.
    :param write_1_fasta_only: True to write all fasta output data to 1 fasta file, each entry
    separated by a newline.
    :param write_fasta_per_mut: True to write fasta output data as 1 fasta file per mutant
    (uses a lot of disk space).
    :param write_to_csv_dumpfile_after_each_mutant: True to write ddG values from fxout files to
    one csv file (for database dump).
    """
    if path_fastafiles:
        if operations[Scheduler.Strs.OPER_RUN_MUT_FSTA.value]:
            path_output_fastas_3dots = GUM.make_path_fastas_3dots_dirs(path_output,
                                                                       path_fastafiles[0])
            mutate_fasta = MutateFasta(amino_acids)
            for path_fastafile in path_fastafiles:
                # Throttle the loop for large batches.
                sleep_secs = 0 if len(path_fastafiles) < 200 else len(path_fastafiles) / 5000
                time.sleep(sleep_secs)
                if use_multithread:
                    # Scheduler._launch_thread(target=mutate_fasta.mutate_every_residue,
                    #                          args=[path_fastafile, write_1_fasta_only,
                    #                                write_fasta_per_mut, path_output_fastas_3dots])
                    Scheduler._launch_process(target=mutate_fasta.mutate_every_residue,
                                              args=[path_fastafile, write_1_fasta_only,
                                                    write_fasta_per_mut,
                                                    path_output_fastas_3dots])
                elif not GUM.using_cluster():
                    mutate_fasta.mutate_every_residue(path_fastafile, write_1_fasta_only,
                                                      write_fasta_per_mut,
                                                      path_output_fastas_3dots)
                if GUM.using_cluster():
                    jobname = Paths.PREFIX_MUTFSTA.value + path_fastafile.split('/')[-1]
                    write_1_fasta_only = True
                    write_fasta_per_mut = False
                    Cluster.write_job_q_bash(
                        jobname=jobname,
                        path_job_q_dir=Paths.SE_CONFIG_MUTFASTA_JOBQ.value,
                        python_script_with_paths=os.path.join(Paths.SE_SRC.value,
                                                              'run_mutate_fasta_zeus.py') +
                        Str.SPCE.value + path_fastafile +
                        Str.SPCE.value + str(write_1_fasta_only) +
                        Str.SPCE.value + str(write_fasta_per_mut) +
                        Str.SPCE.value + path_output_fastas_3dots,
                        queue='', n_slots='', total_memory_GB='', memory_limit_GB='3',
                        cluster_node='')
                    Cluster.run_job_q(path_job_q_dir=Paths.SE_CONFIG_MUTFASTA_JOBQ.value)
        if operations[Scheduler.Strs.OPER_RUN_AGDR.value]:
            agadir = Agadir(Cond.INCELL_MAML_FX.value)
            for path_fastafile in path_fastafiles:
                sleep_secs = 0 if len(path_fastafiles) < 200 else len(path_fastafiles) / 1000
                time.sleep(sleep_secs)
                if GUM.using_cluster():
                    print('Calling scheduler.do_agadir using_cluster condition')
                    jobname = Paths.PREFIX_AGADIR.value + path_fastafile.split('/')[-1]
                    Cluster.write_job_q_bash(
                        jobname=jobname,
                        path_job_q_dir=Paths.SE_CONFIG_AGAD_JOBQ.value,
                        python_script_with_paths=os.path.join(
                            Paths.SE_SRC.value,
                            'run_agadir_on_multifastas_zeus.py' + Str.SPCE.value +
                            path_fastafile + Str.SPCE.value + Paths.SE_OUTPUT.value))
                    Cluster.run_job_q(path_job_q_dir=Paths.SE_CONFIG_AGAD_JOBQ.value)
                path_dst = GUM.make_path_agadir_3dots_filename_mutants_dirs(
                    path_output, path_fastafile, add_filename_subdir=True)
                if use_multithread:
                    # Scheduler._launch_thread(target=agadir.run_agadir_on_multifastas,
                    #                          args=[path_fastafile, path_output])
                    Scheduler._launch_process(target=agadir.run_agadir_on_multifastas,
                                              args=[path_fastafile, path_dst])
                elif not GUM.using_cluster() and not use_multithread:
                    agadir.run_agadir_on_multifastas(path_fastafile, path_dst)
    if path_pdbfiles:
        for path_pdbfile in path_pdbfiles:
            if operations[Scheduler.Strs.OPER_RUN_FX_BM.value]:
                buildmodel = FoldX().BuildModel(Cond.INCELL_MAML_FX.value)
                if use_multithread:
                    Scheduler._launch_thread(target=buildmodel.mutate_protein_structure,
                                             args=[path_pdbfile, amino_acids,
                                                   specific_fxmutants])
                else:
                    buildmodel.mutate_protein_structure(
                        path_pdbfile, amino_acids, specific_fxmutants,
                        write_to_csv_dumpfile_after_each_mutant=write_to_csv_dumpfile_after_each_mutant)
            if operations[Scheduler.Strs.OPER_RUN_FX_AC.value]:
                analysecomplex = FoldX().AnalyseComplex(Cond.INCELL_MAML_FX.value)
                if use_multithread:
                    # args must be a sequence, not a bare string (a string would be unpacked
                    # character by character).
                    Scheduler._launch_thread(target=analysecomplex.calculate_complex_energies,
                                             args=[path_pdbfile])
                else:
                    analysecomplex.calculate_complex_energies(
                        path_pdbfile, specific_fxmutants,
                        write_to_csv_dumpfile_after_each_mutant=write_to_csv_dumpfile_after_each_mutant)
            if operations[Scheduler.Strs.OPER_RUN_FX_RPR.value]:
                repair = FoldX().Repair(Cond.INCELL_MAML_FX.value)
                if use_multithread:
                    Scheduler._launch_thread(target=repair.do_repair, args=[path_pdbfile])
                else:
                    repair.do_repair(path_pdbfile)
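# Scheduler._launch_process and _launch_thread are not shown in this excerpt. A plausible
# minimal shape for them, assuming they are thin wrappers over the standard library
# (returning the handle so the caller can join; the real helpers may manage a pool instead):
from multiprocessing import Process
from threading import Thread

def _launch_process_sketch(target, args):
    process = Process(target=target, args=tuple(args))
    process.start()
    return process  # caller may join() later

def _launch_thread_sketch(target, args):
    thread = Thread(target=target, args=tuple(args))
    thread.start()
    return thread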
            fxmutantname = os.path.basename(path_output_bm_pdb_fxmutant_dir)
            if fx.has_already_removed_config_logs_fxoutfile(path_output_bm_pdb_fxmutant_dir):
                print('No configs, logs, or unnecessary fxoutfiles found in ' + str(pdbname) +
                      '_' + fxmutantname + '. Hence, nothing to delete.')
                continue
            if using_cluster:
                path_jobq_dir = GUM.os_makedirs(Paths.CONFIG_BM_JOBQ, pdbname, fxmutantname)
                Cluster.write_job_q_bash(
                    jobname=Paths.PREFIX_FX_RM.value + fxmutantname,
                    path_job_q_dir=path_jobq_dir,
                    python_script_with_paths=os.path.join(
                        Paths.SE_SRC_CLSTR_PYSCRPTS.value,
                        'remove_files_zeus.py' + Str.SPCE.value +
                        path_output_bm_pdb_fxmutant_dir))
                Cluster.run_job_q(path_job_q_dir=path_jobq_dir)
            else:
                fx.rm_config_files(path_output_bm_pdb_fxmutant_dir)
                fx.rm_cluster_logfiles(path_output_bm_pdb_fxmutant_dir,
                                       rm_non_empty_err_files=True)
                fx.rm_unnecessary_fxoutfiles(path_output_bm_pdb_fxmutant_dir)
    if delete_from_analysecomplex_outputs:
        fx = FoldX()
        path_output_bm_pdb_fxmutant_dirs = []
        path_output_ac_pdb_fxmutant_dirs = []
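# The fx.rm_* helpers are not shown in this excerpt. A minimal sketch of the kind of
# cleanup they presumably perform; the glob patterns are illustrative guesses, not the
# project's actual file naming:
import glob
import os

def rm_files_matching_sketch(path_dir: str, patterns=('*.cfg', '*.log', '*_ErrorFile.txt')):
    for pattern in patterns:
        for path_file in glob.glob(os.path.join(path_dir, pattern)):
            os.remove(path_file)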
class TestCluster(TestCase):
    # Currently the tests copy all configuration and input data from the main directory into the
    # tests before running (i.e. here in the setUpClass method). The data in those main folders
    # will be programmatically generated but is currently manually transferred.
    @classmethod
    def setUpClass(cls):
        if not os.path.exists(TPLS.MC_TESTS_CONFIG.value):
            GUM.linux_copy_all_files_in_dir(path_src_dir=TPLS.CONFIG_FOR_READ_ONLY.value,
                                            path_dst_dir=TPLS.MC_TESTS_CONFIG.value)
        if not os.path.exists(TPLS.MC_TESTS_INPUT.value):
            GUM.linux_copy_all_files_in_dir(path_src_dir=TPLS.INPUT_FOR_READ_ONLY.value,
                                            path_dst_dir=TPLS.MC_TESTS_INPUT.value)

    # @classmethod
    # def tearDownClass(cls):
    #     HM.remove_config_folders()

    def setUp(self):
        self.cluster = Cluster()

    def tearDown(self):
        self.cluster = None

    # write_job_q_bash has keyword arguments that default to '' for the following:
    # python_script_with_paths, queue, n_slots, total_memory_GB, memory_limit_GB, cluster_node.
    # None of these values is given here, so the defaults apply and the resulting bash string
    # does not include them.
    def test_write_job_q_bash(self):
        # arrange
        pdbname = 'RepairPDB_1_A'
        fxbm_jobname_prefix = 'FXBM_'
        fx_mutant_name = 'RA1A'
        jobname = fxbm_jobname_prefix + fx_mutant_name
        expected_job_q = '#!/bin/bash\n' + '#$ -N ' + jobname + '\n' + '#$ -V\n' + '#$ -cwd\n' + \
                         'source ~/.bash_profile\n' + TPLS.ZEUS_FOLDX_EXE.value + \
                         ' -runfile runscript.txt\n'
        # not_expected_job_q is the same as expected_job_q but has an extra single space
        single_space = ' '
        not_expected_job_q = '#!/bin/bash' + single_space + '\n' + '#$ -N ' + jobname + '\n' + \
                             '#$ -V\n' + '#$ -cwd\n' + 'source ~/.bash_profile\n' + \
                             TPLS.ZEUS_FOLDX_EXE.value + ' -runfile runscript.txt\n'
        # not_expected_job_q_2 is the same as expected_job_q but is missing a \n
        missing_new_line = ''
        not_expected_job_q_2 = '#!/bin/bash\n' + '#$ -N ' + jobname + missing_new_line + \
                               '#$ -V\n' + '#$ -cwd\n' + 'source ~/.bash_profile\n' + \
                               TPLS.ZEUS_FOLDX_EXE.value + ' -runfile runscript.txt\n'
        # act
        actual_job_q = self.cluster.write_job_q_bash(jobname, TPLS.MC_TESTS_CONFIG_JOBQ.value,
                                                     using_runscript=True)
        # assert
        self.assertEqual(expected_job_q, actual_job_q)
        self.assertNotEqual(not_expected_job_q, actual_job_q)
        self.assertNotEqual(not_expected_job_q_2, actual_job_q)
        self._test_job_q_bash_file_created(os.path.join(TPLS.MC_TESTS_CONFIG_JOBQ.value, 'job.q'))

    def _test_job_q_bash_file_created(self, path_job_q_file):
        # os.path.dirname gives the containing directory; str.strip(...) would strip a
        # character set rather than the trailing filename.
        path_job_q_dir = os.path.dirname(path_job_q_file)
        self.assertTrue(os.path.exists(path_job_q_dir),
                        'path to job.q directory (which should contain the job.q file) does not '
                        'exist: ' + path_job_q_dir)
        if os.path.exists(path_job_q_dir):
            self.assertTrue(os.path.exists(path_job_q_file),
                            '(Absolute path to) job.q file does not exist: ' + path_job_q_file)
            self.assertTrue(os.path.isfile(path_job_q_file))
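# The test above pins down the exact job.q format. For reference, a minimal standalone
# function that would produce that same string (a sketch only, not the project's
# Cluster.write_job_q_bash, which supports many more options):
import os

def write_minimal_job_q_sketch(jobname: str, path_job_q_dir: str, foldx_exe: str) -> str:
    job_q = ('#!/bin/bash\n'
             '#$ -N ' + jobname + '\n'  # grid engine job name
             '#$ -V\n'                  # export environment variables to the job
             '#$ -cwd\n'                # run from the current working directory
             'source ~/.bash_profile\n'
             + foldx_exe + ' -runfile runscript.txt\n')
    with open(os.path.join(path_job_q_dir, 'job.q'), 'w') as f:
        f.write(job_q)
    return job_q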
def map_seq_to_swsprt_acc_id_and_write_files(path_input_fastafiles, path_output: str,
                                             write_idmaps_for_mysqldb: bool, write_csv=True,
                                             write_xml=True, write_json=False):
    """
    Maps the specified protein sequences in FASTA format to sequences in the SwissProt database,
    to find 100% identity hits. The results, including the SwissProt accession id, are written to
    a csv file. Expects a directory location of fastafiles (not a fastafile itself). (This method
    relies on the presence of fastafiles in the specified dir, in order for them to be run on
    Blastp. This transfer is currently done manually.)
    :param path_input_fastafiles: Absolute path of root directory for input fastafiles
    (e.g. /input_data/fastas_10).
    :param path_output: Absolute path of root directory for blastp output files
    (..../output_data/).
    :param write_idmaps_for_mysqldb: True (by default) to build a dictionary mapping RvdK's ids
    to swsprt accession nos & write files.
    :param write_csv: True to write csvfiles.
    :param write_xml: True to write xmlfiles.
    :param write_json: True to write jsonfiles.
    :return: List of dictionary data structure representations of each parsed & filtered Blastp
    run result.
    """
    if isinstance(path_input_fastafiles, str):
        path_input_fastafiles = [path_input_fastafiles]
    blastp_dict_list = []
    # There are problems with using Biopython.Blast on the cluster that have not yet been
    # solved. The blast module loaded on the cluster (v 2.5.0+) may be used instead of Biopython.
    if GUM.using_cluster():
        # The block below is commented out because Biopython Blast did not work on the cluster,
        # for reasons not yet understood.
        # for path_fastafile in path_input_fastafile_list:
        #     with open(path_fastafile) as fastafile_opened:
        #         fastafilename = path_fastafile.split('/')[-1].split('.')[0]
        #         jobname = 'BLSTP_' + fastafilename
        #         Cluster.write_job_q_bash(job_name=jobname, path_job_q_dir=Paths.CONFIG_JOBQ)
        #         path_output_blastp_fastaname = GUM._os_makedirs(Paths.OUTPUT_BLASTP,
        #                                                         fastafilename)
        #         os.chdir(path_output_blastp_fastaname)
        #         Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_JOBQ)
        #         Cluster.wait_for_grid_engine_job_to_complete(grid_engine_jobname=jobname)
        #         path_blstp_xml = IdProt._write_raw_blast_xml(
        #             path_output, fastafilename,
        #             blastp_result=Biopy.run_blastp(fastafile_opened.read()))
        #         blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(path_blstp_xml,
        #                                                             fastafilename,
        #                                                             path_fastafile)
        #         # blastp_dict_list.append(blastp_dict)
        #         if write_idmaps_for_mysqldb:
        #             IdProt._write_idmaps_for_mysqldb(path_output, blastp_dict,
        #                                              write_csv=write_csv, write_xml=write_xml,
        #                                              write_json=write_json)
        # NOTE: run_blastp_zeus.py globs '*.fasta' from its first argument, so a directory path
        # (not the list built above) is what it expects here.
        python_script_w_paths = os.path.join(Paths.SRC, 'run_blastp_zeus.py') + ' ' + \
            path_input_fastafiles + ' ' + path_output + ' ' + Paths.CONFIG_BLST_JOBQ + ' ' + \
            Paths.OUTPUT_BLASTP + ' ' + str(write_idmaps_for_mysqldb) + ' ' + str(write_csv) + \
            ' ' + str(write_xml) + ' ' + str(write_json)
        Cluster.write_job_q_bash(jobname='IdProtJobs', path_job_q_dir=Paths.CONFIG_BLST_JOBQ,
                                 python_script_with_paths=python_script_w_paths)
        Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_BLST_JOBQ)
    else:
        for path_fastafile in path_input_fastafiles:
            with open(path_fastafile) as f:
                fasta_str = f.read()
            fastafilename = path_fastafile.split('/')[-1].split('.')[0]
            if IdProt._has_all_A_sequence(path_fastafile):
                print('This sequence has all As, BLAST would think it is a nucleotide sequence '
                      'and fail. So it is not being run: ' + path_fastafile)
                continue
            path_output_blastp_fastafilename = IdProt._build_dir_tree_with_intermed_dir(
                path_root=path_output, intermed_dir=Paths.DIR_BLASTP.value,
                fastadir=fastafilename)
            if os.path.exists(os.path.join(path_output_blastp_fastafilename,
                                           fastafilename + Str.XMLEXT.value)):
                continue
            blastp_result = Biopy.run_blastp(fasta_str)
            path_raw_blstp_xml = IdProt._write_raw_blast_xml(path_output, fastafilename,
                                                             blastp_result)
            blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(path_raw_blstp_xml,
                                                                fastafilename, path_fastafile)
            blastp_dict_list.append(blastp_dict)
            if write_idmaps_for_mysqldb:
                IdProt._write_idmaps_for_mysqldb(path_output, blastp_dict, write_csv=write_csv,
                                                 write_xml=write_xml, write_json=write_json)
    return blastp_dict_list
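# IdProt._has_all_A_sequence is not shown in this excerpt. A plausible minimal implementation
# of the check described above (a sketch; the real helper may differ):
def _has_all_A_sequence_sketch(path_fastafile: str) -> bool:
    with open(path_fastafile) as f:
        # concatenate sequence lines, skipping any '>' header line(s)
        seq = ''.join(line.strip() for line in f if not line.startswith('>'))
    return bool(seq) and set(seq) == {'A'}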
        path_output_bm_pdb_fxmutant_dirs = glob.glob(os.path.join(Paths.OUTPUT_BM, pdbname, '*'))
        for path_output_bm_pdb_fxmutant_dir in path_output_bm_pdb_fxmutant_dirs:
            fxmutantname = os.path.basename(path_output_bm_pdb_fxmutant_dir)
            os.chdir(path_output_bm_pdb_fxmutant_dir)
            bm = fx.BuildModel(Cond.INCELL_MAML_FX.value)
            while not bm.has_already_generated_avg_bm_fxoutfile(path_output_bm_pdb_fxmutant_dir):
                print(fx.Strs.AVG_BMDL_.value + pdbname + fx.Strs.FXOUTEXT.value +
                      ' has not been created yet for ' + fxmutantname +
                      '. Therefore cannot write to csv file just yet.')
                if using_cluster:
                    jobname = Paths.PREFIX_FX_BM.value + fxmutantname
                    Cluster.wait_for_grid_engine_job_to_complete(jobname)
            if using_cluster:
                path_jobq_dir = GUM.os_makedirs(Paths.CONFIG_BM_JOBQ, pdbname, fxmutantname)
                Cluster.write_job_q_bash(
                    jobname=Paths.PREFIX_WRITE.value + fxmutantname,
                    path_job_q_dir=path_jobq_dir,
                    python_script_with_paths=os.path.join(
                        Paths.SE_SRC_CLSTR_PYSCRPTS.value,
                        'write_csvdumpfile_from_fxout_zeus.py' + Str.SPCE.value +
                        path_output_bm_pdb_fxmutant_dir))
                Cluster.run_job_q(path_job_q_dir=path_jobq_dir)
            else:
                # non-cluster fallback: write the csv dump directly
                fx.write_to_csvfile_for_db_dump(path_output_bm_pdb_fxmutant_dir)
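# Cluster.wait_for_grid_engine_job_to_complete is not shown in this excerpt. A plausible
# minimal polling loop over SGE's qstat (a sketch; the real method may match on a job-name
# prefix and handle qstat errors):
import subprocess
import time

def wait_for_grid_engine_job_sketch(jobname: str, poll_secs: int = 10):
    while True:
        qstat_output = subprocess.run(['qstat'], capture_output=True, text=True).stdout
        # qstat truncates job names in its table output, so compare against a prefix
        if jobname[:10] not in qstat_output:
            break
        time.sleep(poll_secs)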