Example #1
def run_agadir_on_1000_fastas():
    path_output_mutants = os.path.join(
        Paths.OUTPUT, Paths.DIR_MUTANTS_FASTAS.value,
        str(startnum) + Str.DOTS3.value + str(endnum))
    path_to_fastas = path_output_mutants + '/**/*' + Str.FSTAEXT.value
    path_fastafile_list = natsort.natsorted(
        glob.glob(path_to_fastas, recursive=True))
    agadir = Agadir(Cond.INCELL_MAML.value)
    for path_fastafile in path_fastafile_list:
        time.sleep(1)
        if GUM.using_cluster():
            jobname = Paths.PREFIX_WRITE.value + path_fastafile.split('/')[-1]
            path_to_script = os.path.join(
                Paths.SRC,
                'write_1fastafile_per_fasta_from_multifastafile_zeus.py')
            Cluster.write_job_q_bash(jobname,
                                     path_job_q_dir=Paths.CONFIG_JOBQ,
                                     python_script_with_paths=path_to_script +
                                     Str.SPCE.value + path_fastafile)
            Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_JOBQ)
        else:
            GUM.write_1_fastafile_per_fasta_from_multifastafile(
                path_dst=Paths.INPUT, path_fastafile=path_fastafile)


def _extract_fxmutantname(filename):
    # Fragment of a separate helper (called in the loop below); `regex2` and
    # `res` are defined in lines elided from this snippet.
    res2 = regex2.search(filename)
    fxmutname = filename[res.end():res2.end()]
    return fxmutname
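
In the snippet above, the off-cluster branch calls GUM.write_1_fastafile_per_fasta_from_multifastafile to split a multi-FASTA file into one file per record. A minimal sketch of that splitting step, assuming a standard multi-FASTA layout (the function name and arguments come from the snippet; the parsing logic here is an illustrative assumption, not the project's implementation):

import os

def write_1_fastafile_per_fasta_from_multifastafile(path_dst: str,
                                                    path_fastafile: str):
    """Write one .fasta file per record of a multi-FASTA file into path_dst."""
    def _write_record(title, seq_lines):
        # Use the first token of the header (without '>') as the filename.
        name = title.lstrip('>').split()[0]
        with open(os.path.join(path_dst, name + '.fasta'), 'w') as out:
            out.write(title + '\n' + '\n'.join(seq_lines) + '\n')

    title, seq_lines = None, []
    with open(path_fastafile) as f:
        for line in f:
            if line.startswith('>'):
                if title is not None:
                    _write_record(title, seq_lines)  # flush previous record
                title, seq_lines = line.rstrip(), []
            elif line.strip():
                seq_lines.append(line.rstrip())
    if title is not None:
        _write_record(title, seq_lines)  # flush the last record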


"""
3. Read csv files and write them to a single csv file for a MySQL dump.
"""
for path_output_csvfile in path_output_csvfiles:
    csvfilename = os.path.basename(path_output_csvfile)
    pdbname = _extract_pdbname(csvfilename)
    fxmutantname = _extract_fxmutantname(csvfilename)
    if using_cluster:
        path_jobq_dir = GUM.os_makedirs(Paths.CONFIG_BM_JOBQ, pdbname,
                                        fxmutantname)
        Cluster.write_job_q_bash(
            jobname=Paths.PREFIX_FX_RM.value + fxmutantname,
            path_job_q_dir=path_jobq_dir,
            python_script_with_paths=os.path.join(
                Paths.SE_SRC_CLSTR_PYSCRPTS.value,
                'write_1csvfile_from_csvPerMutantfiles_zeus.py' +
                Str.SPCE.value + path_output_csvfile + Str.SPCE.value +
                path_output_ac_or_bm_dir + Str.SPCE.value + pdbname))
        Cluster.run_job_q(path_job_q_dir=path_jobq_dir)
    else:
        GUM.write_1_csvfile_from_csv_per_mutants(path_output_csvfile,
                                                 path_output_ac_or_bm_dir,
                                                 pdbname)

# pydevd.stoptrace()
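
Off-cluster, the snippet above calls GUM.write_1_csvfile_from_csv_per_mutants to merge the per-mutant csv files into a single dump file. A minimal sketch of that merge, assuming one csv per mutant subdirectory sharing a common header row (the function name and arguments come from the snippet; the directory layout and header handling are assumptions):

import glob
import os

def write_1_csvfile_from_csv_per_mutants(path_output_csvfile: str,
                                         path_output_ac_or_bm_dir: str,
                                         pdbname: str):
    """Concatenate every per-mutant csv under <dir>/<pdbname>/ into one csv."""
    per_mutant_csvs = sorted(glob.glob(os.path.join(
        path_output_ac_or_bm_dir, pdbname, '*', '*.csv')))
    with open(path_output_csvfile, 'w') as dump:
        for i, path_csv in enumerate(per_mutant_csvs):
            with open(path_csv) as f:
                lines = f.readlines()
            # Keep the header row only once, from the first file.
            dump.writelines(lines if i == 0 else lines[1:])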
Example #3
# path_input_fastafile_list = sys.argv[1].split(';')
path_input_fastafile_list = glob.glob(sys.argv[1] + '/*.fasta')
path_output = sys.argv[2]
path_config_job = sys.argv[3]
path_output_blastp = sys.argv[4]
write_idmaps_for_mysqldb = sys.argv[5] == 'True'
write_csv = sys.argv[6] == 'True'
write_xml = sys.argv[7] == 'True'
write_json = sys.argv[8] == 'True'


for path_fastafile in path_input_fastafile_list:
    with open(path_fastafile) as fastafile_opened:
        fastafile_name = path_fastafile.split('/')[-1].split('.')[0]
        jobname = 'BLSTP_' + fastafile_name
        Cluster.write_job_q_bash(jobname=jobname, path_job_q_dir=path_config_job, queue='all.q', memory_limit_GB='3',
                                 cluster_node='hodor1.vib')
        path_output_blastp_fastaname = GUM.os_makedirs(path_output_blastp, fastafile_name)
        os.chdir(path_output_blastp_fastaname)
        Cluster.run_job_q(path_job_q_dir=path_config_job)
        Cluster.wait_for_grid_engine_job_to_complete(grid_engine_job_prefix_or_full_name=jobname)
        path_raw_blstp_xml = IdProt._write_raw_blast_xml(path_output, fastafile_name,
                                                         blastp_result=NCBIWWW.qblast(
                                                             program=Biopy.BlastParam.BLST_P.value,
                                                             database=Biopy.BlastParam.SWSPRT.value,
                                                             sequence=fastafile_opened.read(),
                                                             entrez_query=Biopy.BlastParam.HOMSAP_ORG.value,
                                                             alignments=Biopy.BlastParam.MAX_ALIGN_20.value,
                                                             hitlist_size=Biopy.BlastParam.MAX_HIT_20.value))
        blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(path_raw_blstp_xml, fastafile_name, path_fastafile)
        # blastp_dict_list.append(blastp_dict)
        if write_idmaps_for_mysqldb:
            IdProt._write_idmaps_for_mysqldb(path_output, blastp_dict,
                                             write_csv=write_csv,
                                             write_xml=write_xml,
                                             write_json=write_json)
Example #4
    def start(operations: dict, use_multithread: bool, path_input: str,
              path_output: str, path_pdbfiles: list, path_fastafiles: list,
              specific_fxmutants: list, amino_acids: list,
              write_1_fasta_only: bool, write_fasta_per_mut: bool,
              write_to_csv_dumpfile_after_each_mutant: bool):
        """
        Iterate through a list of fasta files or pdb files and perform Agadir or FoldX computations as specified by
        'operations'. (A usage sketch follows this snippet.)
        :param operations: Each operation paired with a True/False flag indicating whether to perform it.
        :param use_multithread: True to employ parallel processing.
        :param path_input: Absolute path to input_data root dir.
        :param path_output: Absolute path to output_data root dir.
        :param path_pdbfiles: Absolute path to pdb input files.
        :param path_fastafiles: Absolute path to fasta input files.
        :param specific_fxmutants: Given when only specific mutants should be computed.
        :param amino_acids: Amino acids that mutation operations should mutate residues to.
        :param write_1_fasta_only: True to write any fasta output data to 1 fasta file, each separated by \n.
        :param write_fasta_per_mut: True to write any fasta output data as 1 fasta file per mutant. (Uses a lot of disk space).
        :param write_to_csv_dumpfile_after_each_mutant: True to write ddG values from fxout files to one csv file (for database
        dump).
        """
        if path_fastafiles:
            if operations[Scheduler.Strs.OPER_RUN_MUT_FSTA.value]:
                path_output_fastas_3dots = GUM.make_path_fastas_3dots_dirs(
                    path_output, path_fastafiles[0])
                mutate_fasta = MutateFasta(amino_acids)
                for path_fastafile in path_fastafiles:
                    sleep_secs = 0 if len(
                        path_fastafiles) < 200 else len(path_fastafiles) / 5000
                    time.sleep(sleep_secs)
                    if use_multithread:
                        # Scheduler._launch_thread(target=mutate_fasta.mutate_every_residue,
                        #                          args=[path_fastafile, write_1_fasta_only, write_fasta_per_mut,
                        #                                path_output_3dots])
                        Scheduler._launch_process(
                            target=mutate_fasta.mutate_every_residue,
                            args=[
                                path_fastafile, write_1_fasta_only,
                                write_fasta_per_mut, path_output_fastas_3dots
                            ])
                    elif not GUM.using_cluster():
                        mutate_fasta.mutate_every_residue(
                            path_fastafile, write_1_fasta_only,
                            write_fasta_per_mut, path_output_fastas_3dots)
                    if GUM.using_cluster():
                        jobname = Paths.PREFIX_MUTFSTA.value + path_fastafile.split(
                            '/')[-1]
                        write_1_fasta_only = True
                        write_fasta_per_mut = False
                        Cluster.write_job_q_bash(
                            jobname=jobname,
                            path_job_q_dir=Paths.SE_CONFIG_MUTFASTA_JOBQ.value,
                            python_script_with_paths=os.path.join(
                                Paths.SE_SRC.value, 'run_mutate_fasta_zeus.py')
                            + Str.SPCE.value + path_fastafile +
                            Str.SPCE.value + str(write_1_fasta_only) +
                            Str.SPCE.value + str(write_fasta_per_mut) +
                            Str.SPCE.value + path_output_fastas_3dots,
                            queue='',
                            n_slots='',
                            total_memory_GB='',
                            memory_limit_GB='3',
                            cluster_node='')
                        Cluster.run_job_q(
                            path_job_q_dir=Paths.SE_CONFIG_MUTFASTA_JOBQ.value)

            if operations[Scheduler.Strs.OPER_RUN_AGDR.value]:
                agadir = Agadir(Cond.INCELL_MAML_FX.value)
                for path_fastafile in path_fastafiles:
                    sleep_secs = 0 if len(
                        path_fastafiles) < 200 else len(path_fastafiles) / 1000
                    time.sleep(sleep_secs)
                    if GUM.using_cluster():
                        print(
                            'Calling scheduler.do_agadir using_cluster condition'
                        )
                        jobname = Paths.PREFIX_AGADIR.value + path_fastafile.split(
                            '/')[-1]
                        Cluster.write_job_q_bash(
                            jobname=jobname,
                            path_job_q_dir=Paths.SE_CONFIG_AGAD_JOBQ.value,
                            python_script_with_paths=os.path.join(
                                Paths.SE_SRC.value,
                                'run_agadir_on_multifastas_zeus.py' +
                                Str.SPCE.value + path_fastafile +
                                Str.SPCE.value + Paths.SE_OUTPUT.value))
                        Cluster.run_job_q(
                            path_job_q_dir=Paths.SE_CONFIG_AGAD_JOBQ.value)

                    path_dst = GUM.make_path_agadir_3dots_filename_mutants_dirs(
                        path_output, path_fastafile, add_filename_subdir=True)
                    if use_multithread:
                        # Scheduler._launch_thread(target=agadir.run_agadir_on_multifastas,
                        #                          args=[path_fastafile, path_output])
                        Scheduler._launch_process(
                            target=agadir.run_agadir_on_multifastas,
                            args=[path_fastafile, path_dst])
                    elif not GUM.using_cluster() and not use_multithread:
                        agadir.run_agadir_on_multifastas(
                            path_fastafile, path_dst)
        if path_pdbfiles:
            for path_pdbfile in path_pdbfiles:
                if operations[Scheduler.Strs.OPER_RUN_FX_BM.value]:
                    buildmodel = FoldX().BuildModel(Cond.INCELL_MAML_FX.value)
                    if use_multithread:
                        Scheduler._launch_thread(
                            target=buildmodel.mutate_protein_structure,
                            args=[
                                path_pdbfile, amino_acids, specific_fxmutants
                            ])
                    else:
                        buildmodel.mutate_protein_structure(
                            path_pdbfile,
                            amino_acids,
                            specific_fxmutants,
                            write_to_csv_dumpfile_after_each_mutant=
                            write_to_csv_dumpfile_after_each_mutant)
                if operations[Scheduler.Strs.OPER_RUN_FX_AC.value]:
                    analysecomplex = FoldX().AnalyseComplex(
                        Cond.INCELL_MAML_FX.value)
                    if use_multithread:
                        Scheduler._launch_thread(
                            target=analysecomplex.calculate_complex_energies,
                            # args must be a list/tuple, not a bare string:
                            args=[path_pdbfile])
                    else:
                        analysecomplex.calculate_complex_energies(
                            path_pdbfile,
                            specific_fxmutants,
                            write_to_csv_dumpfile_after_each_mutant=
                            write_to_csv_dumpfile_after_each_mutant)
                if operations[Scheduler.Strs.OPER_RUN_FX_RPR.value]:
                    repair = FoldX().Repair(Cond.INCELL_MAML_FX.value)
                    if use_multithread:
                        Scheduler._launch_thread(target=repair.do_repair,
                                                 # args must be a list/tuple:
                                                 args=[path_pdbfile])
                    else:
                        repair.do_repair(path_pdbfile)
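
A hedged usage sketch for the start method above, assuming it is exposed as a static method on Scheduler. The operation keys mirror the Scheduler.Strs members used in the snippet; every literal value and path below is an illustrative assumption:

operations = {
    Scheduler.Strs.OPER_RUN_MUT_FSTA.value: True,  # mutate every residue of each fasta
    Scheduler.Strs.OPER_RUN_AGDR.value: True,      # run Agadir on the mutants
    Scheduler.Strs.OPER_RUN_FX_BM.value: False,    # FoldX BuildModel
    Scheduler.Strs.OPER_RUN_FX_AC.value: False,    # FoldX AnalyseComplex
    Scheduler.Strs.OPER_RUN_FX_RPR.value: False,   # FoldX Repair
}
Scheduler.start(operations=operations,
                use_multithread=False,
                path_input='/abs/path/input_data',    # assumed path
                path_output='/abs/path/output_data',  # assumed path
                path_pdbfiles=[],
                path_fastafiles=['/abs/path/input_data/fastas/1_A.fasta'],  # assumed
                specific_fxmutants=[],
                amino_acids=['A', 'C', 'D'],  # illustrative subset
                write_1_fasta_only=True,
                write_fasta_per_mut=False,
                write_to_csv_dumpfile_after_each_mutant=False)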
Example #5
                fxmutantname = os.path.basename(
                    path_output_bm_pdb_fxmutant_dir)
                if fx.has_already_removed_config_logs_fxoutfile(
                        path_output_bm_pdb_fxmutant_dir):
                    print(
                        'No configs, logs, or unnecessary fxoutfiles found in '
                        + str(pdbname) + '_' + fxmutantname +
                        '. Hence, nothing to delete.')
                    continue
                if using_cluster:
                    path_jobq_dir = GUM.os_makedirs(Paths.CONFIG_BM_JOBQ,
                                                    pdbname, fxmutantname)
                    Cluster.write_job_q_bash(
                        jobname=Paths.PREFIX_FX_RM.value + fxmutantname,
                        path_job_q_dir=path_jobq_dir,
                        python_script_with_paths=os.path.join(
                            Paths.SE_SRC_CLSTR_PYSCRPTS.value,
                            'remove_files_zeus.py' + Str.SPCE.value +
                            path_output_bm_pdb_fxmutant_dir))
                    Cluster.run_job_q(path_job_q_dir=path_jobq_dir)
                else:
                    fx.rm_config_files(path_output_bm_pdb_fxmutant_dir)
                    fx.rm_cluster_logfiles(path_output_bm_pdb_fxmutant_dir,
                                           rm_non_empty_err_files=True)
                    fx.rm_unnecessary_fxoutfiles(
                        path_output_bm_pdb_fxmutant_dir)

if delete_from_analysecomplex_outputs:
    fx = FoldX()
    path_output_bm_pdb_fxmutant_dirs = []
    path_output_ac_pdb_fxmutant_dirs = []


class TestCluster(TestCase):

    # Currently the tests copy all configuration and input data from the main directory into the test directories
    # before running (i.e. here in the setUpClass method).
    # The data in those main folders will be programmatically generated, but is currently transferred manually.
    @classmethod
    def setUpClass(cls):
        if not os.path.exists(TPLS.MC_TESTS_CONFIG.value):
            GUM.linux_copy_all_files_in_dir(path_src_dir=TPLS.CONFIG_FOR_READ_ONLY.value,
                                            path_dst_dir=TPLS.MC_TESTS_CONFIG.value)

        if not os.path.exists(TPLS.MC_TESTS_INPUT.value):
            GUM.linux_copy_all_files_in_dir(path_src_dir=TPLS.INPUT_FOR_READ_ONLY.value,
                                            path_dst_dir=TPLS.MC_TESTS_INPUT.value)

    # @classmethod
    # def tearDownClass(cls):
    #     # HM.remove_config_folders()

    def setUp(self):
        self.cluster = Cluster()

    def tearDown(self):
        self.cluster = None

    # The method has named keyword arguments set to '' for the following: python_script_with_paths, queue, n_slots,
    # total_memory_GB, memory_limit_GB, cluster_node. None of these values is given here, so the defaults are applied
    # such that the resulting bash string does not include them.
    def test_write_job_q_bash(self):
        # arrange
        pdbname = 'RepairPDB_1_A'
        fxbm_jobname_prefix = 'FXBM_'
        fx_mutant_name = 'RA1A'
        jobname = fxbm_jobname_prefix + fx_mutant_name
        expected_job_q = '#!/bin/bash\n' + '#$ -N ' + jobname + '\n' + '#$ -V\n' + '#$ -cwd\n' + \
                         'source ~/.bash_profile\n' + TPLS.ZEUS_FOLDX_EXE.value + ' -runfile runscript.txt\n'
        # not_expected_job_q is the same as expected_job_q but has an extra single space
        single_space = ' '
        not_expected_job_q = '#!/bin/bash' + single_space + '\n' + '#$ -N ' + jobname + '\n' + '#$ -V\n' + \
                             '#$ -cwd\n' + 'source ~/.bash_profile\n' + TPLS.ZEUS_FOLDX_EXE.value + \
                             ' -runfile runscript.txt\n'
        # not_expected_job_q_2 is the same as expected_job_q but is missing a \n
        missing_new_line = ''
        not_expected_job_q_2 = '#!/bin/bash\n' + '#$ -N ' + jobname + missing_new_line + '#$ -V\n' + \
                             '#$ -cwd\n' + 'source ~/.bash_profile\n' + TPLS.ZEUS_FOLDX_EXE.value + \
                             ' -runfile runscript.txt\n'
        # act
        actual_job_q = self.cluster.write_job_q_bash(jobname, TPLS.MC_TESTS_CONFIG_JOBQ.value, using_runscript=True)
        # assert
        self.assertEqual(expected_job_q, actual_job_q)
        self.assertNotEqual(not_expected_job_q, actual_job_q)
        self.assertNotEqual(not_expected_job_q_2, actual_job_q)
        self._test_job_q_bash_file_created(os.path.join(TPLS.MC_TESTS_CONFIG_JOBQ.value, 'job.q'))

    def _test_job_q_bash_file_created(self, path_job_q_file):
        path_job_q_dir = os.path.dirname(path_job_q_file)
        self.assertTrue(os.path.exists(path_job_q_dir), 'path to job.q directory (which should contain the job.q file) '
                                                        'does not exist: ' + path_job_q_dir)
        if os.path.exists(path_job_q_dir):
            self.assertTrue(os.path.exists(path_job_q_file), '(Absolute path to) job.q file does not exist: '
                            + path_job_q_file)
            self.assertTrue(os.path.isfile(path_job_q_file))
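
The expected string in test_write_job_q_bash pins down the job.q layout exactly. A minimal sketch of a builder that produces that bash string (a simplified stand-in for illustration, not the project's Cluster.write_job_q_bash, which also handles queue, slot, and memory options):

def build_job_q_bash(jobname: str, foldx_exe: str,
                     using_runscript: bool = True) -> str:
    """Build Grid Engine job.q contents matching the test's expected string."""
    job_q = '#!/bin/bash\n'
    job_q += '#$ -N ' + jobname + '\n'  # job name directive
    job_q += '#$ -V\n'                  # export environment variables to the job
    job_q += '#$ -cwd\n'                # run from the current working directory
    job_q += 'source ~/.bash_profile\n'
    if using_runscript:
        job_q += foldx_exe + ' -runfile runscript.txt\n'
    return job_q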
    def map_seq_to_swsprt_acc_id_and_write_files(
            path_input_fastafiles,
            path_output: str,
            write_idmaps_for_mysqldb: bool,
            write_csv=True,
            write_xml=True,
            write_json=False):
        """
        Maps the specified protein sequences in FASTA format to sequences in the SwissProt database, to find 100% identity
        hits. The results, including the SwissProt accession id, are written to a csv file.
        Expects a directory location of fastafiles (not a fastafile itself).
        (This method relies on the presence of fastafiles in the specified dir for them to be run on Blastp.
        This transfer is currently done manually. A usage sketch follows this snippet.)
        :param path_input_fastafiles: Absolute path of root directory for input fastafiles (e.g. /input_data/fastas_10).
        :param path_output: Absolute path of root directory for blastp output files (..../output_data/).
        :param write_idmaps_for_mysqldb: True (by default) builds a dictionary mapping RvdK's ids to swsprt accession nos & writes files.
        :param write_csv: True to write csvfiles.
        :param write_xml: True to write xmlfiles.
        :param write_json: True to write jsonfiles.
        :return: List of dictionary data structure representations of each parsed & filtered Blastp run result.
        """
        if isinstance(path_input_fastafiles, str):
            path_input_fastafiles = [path_input_fastafiles]
        blastp_dict_list = []
        # There are problems with using Biopython.Blast on the cluster that I have not yet solved. I may use the
        # blast module that is loaded on the cluster (v 2.5.0+) instead of via Biopython.
        if GUM.using_cluster():
            # THE 18 OR SO LINES BELOW HERE ARE COMMENTED OUT BECAUSE BIOPYTHON BLAST DID NOT WORK ON THE CLUSTER AND I DON'T
            # YET KNOW WHY.
            # for path_fastafile in path_input_fastafile_list:
            #     with open(path_fastafile) as fastafile_opened:
            #         fastafilename = path_fastafile.split('/')[-1].split('.')[0]
            #         jobname = 'BLSTP_' + fastafilename
            #         Cluster.write_job_q_bash(job_name=jobname, path_job_q_dir=Paths.CONFIG_JOBQ)
            #         path_output_blastp_fastaname = GUM._os_makedirs(Paths.OUTPUT_BLASTP, fastafilename)
            #         os.chdir(path_output_blastp_fastaname)
            #         Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_JOBQ)
            #         Cluster.wait_for_grid_engine_job_to_complete(grid_engine_jobname=jobname)
            #         path_blstp_xml = IdProt._write_raw_blast_xml(path_output, fastafilename,
            #                                                 blastp_result=Biopy.run_blastp(fastafile_opened.read()))
            #         blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(path_blstp_xml, fastafilename,
            #         path_fastafile)
            #         # blastp_dict_list.append(blastp_dict)
            #         if write_idmaps_for_mysqldb:
            #             IdProt._write_idmaps_for_mysqldb(path_output, blastp_dict, write_csv=write_csv,
            #                                              write_xml=write_xml,
            #                                              write_json=write_json)

            # path_input_fastafiles was normalised to a list above, so join it back into
            # the space-separated argument string that run_blastp_zeus.py expects.
            python_script_w_paths = os.path.join(Paths.SRC, 'run_blastp_zeus.py') + ' ' + \
                                    ' '.join(path_input_fastafiles) + ' ' + path_output + ' ' + \
                                    Paths.CONFIG_BLST_JOBQ + ' ' + Paths.OUTPUT_BLASTP + ' ' + \
                                    str(write_idmaps_for_mysqldb) + ' ' + str(write_csv) + ' ' + \
                                    str(write_xml) + ' ' + str(write_json)
            Cluster.write_job_q_bash(
                jobname='IdProtJobs',
                path_job_q_dir=Paths.CONFIG_BLST_JOBQ,
                python_script_with_paths=python_script_w_paths)
            Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_BLST_JOBQ)
        else:
            for path_fastafile in path_input_fastafiles:
                with open(path_fastafile) as f:
                    fasta_str = f.read()
                    fastafilename = path_fastafile.split('/')[-1].split('.')[0]
                if IdProt._has_all_A_sequence(path_fastafile):
                    print(
                        'This sequence is all As; BLAST would treat it as a nucleotide sequence and fail, so it is '
                        'not being run: ' + path_fastafile)
                    continue
                path_output_blastp_fastafilename = IdProt._build_dir_tree_with_intermed_dir(
                    path_root=path_output,
                    intermed_dir=Paths.DIR_BLASTP.value,
                    fastadir=fastafilename)
                if os.path.exists(
                        os.path.join(path_output_blastp_fastafilename,
                                     fastafilename + Str.XMLEXT.value)):
                    continue
                blastp_result = Biopy.run_blastp(fasta_str)
                path_raw_blstp_xml = IdProt._write_raw_blast_xml(
                    path_output, fastafilename, blastp_result)
                blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(
                    path_raw_blstp_xml, fastafilename, path_fastafile)
                blastp_dict_list.append(blastp_dict)
                if write_idmaps_for_mysqldb:
                    IdProt._write_idmaps_for_mysqldb(path_output,
                                                     blastp_dict,
                                                     write_csv=write_csv,
                                                     write_xml=write_xml,
                                                     write_json=write_json)
        return blastp_dict_list
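
A hedged usage sketch for the method above, assuming it is exposed as a static method on IdProt as the snippet suggests; the paths are illustrative assumptions:

blastp_dicts = IdProt.map_seq_to_swsprt_acc_id_and_write_files(
    path_input_fastafiles='/abs/path/input_data/fastas_10',  # dir of fastafiles
    path_output='/abs/path/output_data',
    write_idmaps_for_mysqldb=True,
    write_csv=True,
    write_xml=True,
    write_json=False)
for blastp_dict in blastp_dicts:
    print(blastp_dict)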
Example #9
path_output_bm_pdb_fxmutant_dirs = glob.glob(
    os.path.join(Paths.OUTPUT_BM, pdbname, '*'))
for path_output_bm_pdb_fxmutant_dir in path_output_bm_pdb_fxmutant_dirs:
    fxmutantname = os.path.basename(path_output_bm_pdb_fxmutant_dir)
    os.chdir(path_output_bm_pdb_fxmutant_dir)
    bm = fx.BuildModel(Cond.INCELL_MAML_FX.value)
    while not bm.has_already_generated_avg_bm_fxoutfile(
            path_output_bm_pdb_fxmutant_dir):
        print(fx.Strs.AVG_BMDL_.value + pdbname + fx.Strs.FXOUTEXT.value +
              ' has not been created yet for ' + fxmutantname +
              '. Therefore cannot write to csv file just yet.')
        if using_cluster:
            jobname = Paths.PREFIX_FX_BM.value + fxmutantname
            Cluster.wait_for_grid_engine_job_to_complete(jobname)
    fx.write_to_csvfile_for_db_dump(path_output_bm_pdb_fxmutant_dir)
    if using_cluster:
        path_jobq_dir = GUM.os_makedirs(Paths.CONFIG_BM_JOBQ, pdbname,
                                        fxmutantname)
        Cluster.write_job_q_bash(
            jobname=Paths.PREFIX_WRITE.value + fxmutantname,
            path_job_q_dir=path_jobq_dir,
            python_script_with_paths=os.path.join(
                Paths.SE_SRC_CLSTR_PYSCRPTS.value,
                'write_csvdumpfile_from_fxout_zeus.py' + Str.SPCE.value +
                path_output_bm_pdb_fxmutant_dir))
        Cluster.run_job_q(path_job_q_dir=path_jobq_dir)
    else:
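
The while loop above blocks on Cluster.wait_for_grid_engine_job_to_complete. A minimal sketch of how such a wait can be implemented by polling Grid Engine's qstat (an assumption about the implementation, not the project's code; note that qstat truncates job names in its listing, hence the prefix match):

import subprocess
import time

def wait_for_grid_engine_job_to_complete(jobname: str, poll_secs: int = 10):
    """Block until no job whose (truncated) name matches jobname is in qstat."""
    while True:
        qstat_out = subprocess.run(['qstat'], capture_output=True,
                                   text=True).stdout
        # qstat truncates job names in its table, so match on a short prefix.
        if jobname[:10] not in qstat_out:
            return
        time.sleep(poll_secs)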