def test_sub_4(self):
    directory = os.getcwd()
    jobs = [
        make_script([["sleep 5"], ['echo "file {0}"'.format(i)]], directory=directory)
        for i in range(5)
    ]
    array_script, array_jobs = prep_array_script(jobs, directory, SunGridEngine.TASK_ID)
    jobid = SunGridEngine.sub(array_script, array=[1, 5], log=os.devnull,
                              name=inspect.stack()[0][3], shell="/bin/sh")
    while SunGridEngine.stat(jobid):
        time.sleep(1)
    for i, j in enumerate(jobs):
        f = j.replace(".sh", ".log")
        self.assertTrue(os.path.isfile(f))
        self.assertEqual("file {0}".format(i), open(f).read().strip())
        os.unlink(f)
    for f in jobs + [array_script, array_jobs]:
        os.unlink(f)
def test_rls_1(self):
    jobs = [make_script(["touch", "pyjob_rls_test_1"])]
    jobid = LoadSharingFacility.sub(jobs, hold=True, name=inspect.stack()[0][3], shell="/bin/sh")
    time.sleep(5)
    LoadSharingFacility.rls(jobid)
    start, timeout = time.time(), False
    while LoadSharingFacility.stat(jobid):
        # Don't wait too long - one minute, then fail
        if ((time.time() - start) // 60) >= 1:
            LoadSharingFacility.kill(jobid)
            timeout = True
        time.sleep(10)
    for f in jobs:
        os.unlink(f)
    if timeout:
        self.fail("Timeout")
    else:
        self.assertTrue(os.path.isfile('pyjob_rls_test_1'))
        os.unlink('pyjob_rls_test_1')
def test_sub_7(self):
    assert "PYJOB_ENV1" not in os.environ
    os.environ["PYJOB_ENV1"] = "pyjob_random1"
    jobs = [
        make_script(["echo $PYJOB_ENV1"], directory=os.getcwd())
        for _ in range(2)
    ]
    array_script, array_jobs = prep_array_script(jobs, os.getcwd(), SunGridEngine.TASK_ID)
    jobid = SunGridEngine.sub(array_script, array=[1, 2], directory=os.getcwd(), log=os.devnull,
                              name=inspect.stack()[0][3], shell="/bin/sh")
    while SunGridEngine.stat(jobid):
        time.sleep(1)
    for i, j in enumerate(jobs):
        f = j.replace(".sh", ".log")
        self.assertTrue(os.path.isfile(f))
        self.assertEqual(os.environ["PYJOB_ENV1"], open(f).read().strip())
        os.unlink(f)
    for f in jobs + [array_script, array_jobs]:
        os.unlink(f)
def submit_jobs(self, results, nproc=1, process_all=False, submit_qtype=None, submit_queue=False, monitor=None):
    """Submit jobs to run in serial or on a cluster

    Parameters
    ----------
    results : list
        Results from :obj:`LatticeSearchResult`, :obj:`AmoreRotationScore` or :obj:`PhaserRotationScore`
    nproc : int, optional
        Number of processors to use [default: 1]
    process_all : bool, optional
        Run all MR jobs to completion instead of terminating after the first success [default: False]
    submit_qtype : str
        The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster
    monitor : str

    Returns
    -------
    file
        Output pdb from mr
    file
        Output hkl from mr - if using phaser
    file
        Output log file from mr program
    file
        Output pdb from refinement
    file
        Output hkl from refinement
    file
        Output log file from refinement program

    """
    if not os.path.isdir(self.output_dir):
        os.mkdir(self.output_dir)

    run_files = []
    sol_cont = SolventContent(self.cell_parameters, self.space_group)
    mat_prob = MatthewsProbability(self.cell_parameters, self.space_group)

    for result in results:
        mr_workdir = os.path.join(self.output_dir, result.pdb_code, 'mr', self.mr_program)
        mr_logfile = os.path.join(mr_workdir, '{0}_mr.log'.format(result.pdb_code))
        mr_pdbout = os.path.join(mr_workdir, '{0}_mr_output.pdb'.format(result.pdb_code))
        mr_hklout = os.path.join(mr_workdir, '{0}_mr_output.mtz'.format(result.pdb_code))

        ref_workdir = os.path.join(mr_workdir, 'refine')
        ref_hklout = os.path.join(ref_workdir, '{0}_refinement_output.mtz'.format(result.pdb_code))
        ref_logfile = os.path.join(ref_workdir, '{0}_ref.log'.format(result.pdb_code))
        ref_pdbout = os.path.join(ref_workdir, '{0}_refinement_output.pdb'.format(result.pdb_code))

        diff_mapout1 = os.path.join(ref_workdir, '{0}_refmac_2fofcwt.map'.format(result.pdb_code))
        diff_mapout2 = os.path.join(ref_workdir, '{0}_refmac_fofcwt.map'.format(result.pdb_code))

        if os.path.isfile(ref_logfile):
            rp = refmac_parser.RefmacParser(ref_logfile)
            if _mr_job_succeeded(rp.final_r_fact, rp.final_r_free):
                score = MrScore(pdb_code=result.pdb_code)
                if self.mr_program == "molrep":
                    mp = molrep_parser.MolrepParser(mr_logfile)
                    score.molrep_score = mp.score
                    score.molrep_tfscore = mp.tfscore
                elif self.mr_program == "phaser":
                    pp = phaser_parser.PhaserParser(mr_logfile)
                    score.phaser_tfz = pp.tfz
                    score.phaser_llg = pp.llg
                    score.phaser_rfz = pp.rfz
                rp = refmac_parser.RefmacParser(ref_logfile)
                score.final_r_free = rp.final_r_free
                score.final_r_fact = rp.final_r_fact
                self._search_results = [score]
                return

        if isinstance(result, AmoreRotationScore) or isinstance(result, PhaserRotationScore):
            pdb_struct = PdbStructure()
            pdb_struct.from_file(result.dat_path)
            mr_pdbin = os.path.join(self.output_dir, result.pdb_code + ".pdb")
            pdb_struct.save(mr_pdbin)
        elif isinstance(result, LatticeSearchResult):
            pdb_struct = PdbStructure()
            pdb_struct.from_file(result.pdb_path)
            mr_pdbin = result.pdb_path
        else:
            raise ValueError("Do not recognize result container")

        solvent_content = sol_cont.calculate_from_struct(pdb_struct)
        if solvent_content > 30:
            solvent_content, n_copies = mat_prob.calculate_content_ncopies_from_struct(pdb_struct)
        else:
            pdb_struct.keep_first_chain_only()
            pdb_struct.save(mr_pdbin)
            solvent_content, n_copies = mat_prob.calculate_content_ncopies_from_struct(pdb_struct)
            msg = "%s is predicted to be too large to fit in the unit " \
                + "cell with a solvent content of at least 30 percent, " \
                + "therefore MR will use only the first chain"
            logger.debug(msg, result.pdb_code)

        mr_cmd = [
            CMD_PREFIX, "ccp4-python", "-m", self.mr_python_module,
            "-hklin", self.mtz,
            "-hklout", mr_hklout,
            "-pdbin", mr_pdbin,
            "-pdbout", mr_pdbout,
            "-logfile", mr_logfile,
            "-work_dir", mr_workdir,
            "-nmol", n_copies,
            "-sgalternative", self.sgalternative,
        ]

        ref_cmd = [
            CMD_PREFIX, "ccp4-python", "-m", self.refine_python_module,
            "-pdbin", mr_pdbout,
            "-pdbout", ref_pdbout,
            "-hklin", mr_hklout,
            "-hklout", ref_hklout,
            "-logfile", ref_logfile,
            "-work_dir", ref_workdir,
            "-refinement_type", self.refine_type,
            "-ncyc", self.refine_cycles,
        ]

        if self.mr_program == "molrep":
            mr_cmd += ["-space_group", self.space_group]
        elif self.mr_program == "phaser":
            mr_cmd += [
                "-i", self.i,
                "-sigi", self.sigi,
                "-f", self.f,
                "-sigf", self.sigf,
                "-solvent", solvent_content,
                "-timeout", self.timeout,
            ]

        if isinstance(result, LatticeSearchResult):
            mr_cmd += ['-autohigh', 4.0, '-hires', 5.0]

        # ====
        # Create a run script - prefix __needs__ to contain mr_program so we can find log
        # Leave order of this as SGE does not like scripts with numbers as first char
        # ====
        prefix, stem = self.mr_program + "_", result.pdb_code

        fft_cmd1, fft_stdin1 = self.fft(ref_hklout, diff_mapout1, "2mfo-dfc")
        run_stdin_1 = tmp_file(directory=self.output_dir, prefix=prefix, stem=stem, suffix="_1.stdin")
        with open(run_stdin_1, 'w') as f_out:
            f_out.write(fft_stdin1)

        fft_cmd2, fft_stdin2 = self.fft(ref_hklout, diff_mapout2, "mfo-dfc")
        run_stdin_2 = tmp_file(directory=self.output_dir, prefix=prefix, stem=stem, suffix="_2.stdin")
        with open(run_stdin_2, 'w') as f_out:
            f_out.write(fft_stdin2)

        ccp4_scr = os.environ["CCP4_SCR"]
        if self.tmp_dir:
            tmp_dir = os.path.join(self.tmp_dir)
        else:
            tmp_dir = os.path.join(self.output_dir)

        cmd = [
            [EXPORT, "CCP4_SCR=" + tmp_dir],
            mr_cmd + [os.linesep],
            ref_cmd + [os.linesep],
            fft_cmd1 + ["<", run_stdin_1, os.linesep],
            fft_cmd2 + ["<", run_stdin_2, os.linesep],
            [EXPORT, "CCP4_SCR=" + ccp4_scr],
        ]
        run_script = make_script(cmd, directory=self.output_dir, prefix=prefix, stem=stem)
        run_log = run_script.rsplit(".", 1)[0] + '.log'
        run_files += [(run_script, run_stdin_1, run_stdin_2, run_log, mr_pdbout, mr_logfile, ref_logfile)]

    if not self.mute:
        logger.info("Running %s Molecular Replacement", self.mr_program)

    run_scripts, _, _, _, mr_pdbouts, mr_logfiles, ref_logfiles = zip(*run_files)

    j = Job(submit_qtype)
    j.submit(run_scripts, directory=self.output_dir, nproc=nproc, name='simbad_mr',
             queue=submit_queue, permit_nonzero=True)

    interval = int(numpy.log(len(run_scripts)) / 3)
    interval_in_seconds = interval if interval >= 5 else 5
    if process_all:
        j.wait(interval=interval_in_seconds, monitor=monitor)
    else:
        j.wait(interval=interval_in_seconds, monitor=monitor, check_success=mr_succeeded_log)

    mr_results = []
    for result, mr_logfile, mr_pdbout, ref_logfile in zip(results, mr_logfiles, mr_pdbouts, ref_logfiles):
        if not os.path.isfile(mr_logfile):
            logger.debug("Cannot find %s MR log file: %s", self.mr_program, mr_logfile)
            continue
        elif not os.path.isfile(ref_logfile):
            logger.debug("Cannot find %s refine log file: %s", self.mr_program, ref_logfile)
            continue
        elif not os.path.isfile(mr_pdbout):
            logger.debug("Cannot find %s output file: %s", self.mr_program, mr_pdbout)
            continue

        score = MrScore(pdb_code=result.pdb_code)

        if self.mr_program == "molrep":
            mp = molrep_parser.MolrepParser(mr_logfile)
            score.molrep_score = mp.score
            score.molrep_tfscore = mp.tfscore
        elif self.mr_program == "phaser":
            pp = phaser_parser.PhaserParser(mr_logfile)
            score.phaser_tfz = pp.tfz
            score.phaser_llg = pp.llg
            score.phaser_rfz = pp.rfz

        if self._dano is not None:
            try:
                anode = anomalous_util.AnodeSearch(self.mtz, self.output_dir, self.mr_program)
                anode.run(result)
                a = anode.search_results()
                score.dano_peak_height = a.dano_peak_height
                score.nearest_atom = a.nearest_atom
            except RuntimeError:
                logger.debug("RuntimeError: Unable to create DANO map for: %s", result.pdb_code)
            except IOError:
                logger.debug("IOError: Unable to create DANO map for: %s", result.pdb_code)

        if os.path.isfile(ref_logfile):
            rp = refmac_parser.RefmacParser(ref_logfile)
            score.final_r_free = rp.final_r_free
            score.final_r_fact = rp.final_r_fact
        else:
            logger.debug("Cannot find %s log file: %s", self.refine_program, ref_logfile)

        mr_results += [score]

    self._search_results = mr_results
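
# Example (hedged sketch, not part of the class above): a typical call to submit_jobs
# from a search driver. The "mr" instance, the rotation-search results and the queue
# name are placeholders for illustration; only the keyword arguments mirror the
# signature documented above.
#
#   mr.submit_jobs(rotation_results,        # e.g. a list of AmoreRotationScore objects
#                  nproc=4,                 # processors available per job
#                  process_all=False,       # stop waiting once one MR job succeeds
#                  submit_qtype="SGE",      # cluster type, SGE or LSF
#                  submit_queue="all.q")    # cluster queue to submit to
#   for score in mr._search_results:        # populated with MrScore objects on completion
#       print(score.pdb_code, score.final_r_fact, score.final_r_free)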
def comparison(self, models, structures):
    """Compare a list of model structures to a second list of reference structures

    Parameters
    ----------
    models : list
        List containing the paths to the model structure files
    structures : list
        List containing the paths to the reference structure files

    Returns
    -------
    entries : list
        List of TMscore data entries on a per-model basis

    """
    if len(models) < 1 or len(structures) < 1:
        msg = 'No model structures provided' if len(models) < 1 else 'No reference structures provided'
        logger.critical(msg)
        raise RuntimeError(msg)
    elif len(structures) == 1:
        logger.info('Using single structure provided for all model comparisons')
        structures = [structures[0] for _ in xrange(len(models))]
    elif len(models) != len(structures):
        msg = "Unequal number of models and structures!"
        logger.critical(msg)
        raise RuntimeError(msg)

    if self.method == "tmalign":
        pt = tm_parser.TMalignLogParser()
    elif self.method == "tmscore":
        pt = tm_parser.TMscoreLogParser()
    else:
        msg = "Invalid method selected: %s" % self.method
        logger.critical(msg)
        raise RuntimeError(msg)

    logger.info('Using algorithm: {0}'.format(self.method))
    logger.info('------- Evaluating decoys -------')

    data_entries, job_scripts, log_files = [], [], []
    for model_pdb, structure_pdb in zip(models, structures):
        model_name = os.path.splitext(os.path.basename(model_pdb))[0]
        structure_name = os.path.splitext(os.path.basename(structure_pdb))[0]
        stem = "_".join([model_name, structure_name, self.method])
        if os.path.isfile(model_pdb) and os.path.isfile(structure_pdb):
            data_entries.append([model_name, structure_name, model_pdb, structure_pdb])
            script = make_script([self.executable, model_pdb, structure_pdb],
                                 prefix="tmscore_", stem=stem, directory=self.tmp_dir)
            job_scripts.append(script)
            log_files.append(os.path.splitext(script)[0] + ".log")
        else:
            if not os.path.isfile(model_pdb):
                logger.warning("Cannot find: %s", model_pdb)
            if not os.path.isfile(structure_pdb):
                logger.warning("Cannot find: %s", structure_pdb)
            continue

    logger.info('Executing TManalysis scripts')
    j = Job(self._qtype)
    j.submit(job_scripts, nproc=self._nproc, max_array_jobs=self._max_array_jobs,
             queue=self._queue, name="tmscore")
    j.wait(interval=1)

    self.entries = []
    for entry, log, script in zip(data_entries, log_files, job_scripts):
        try:
            pt.reset()
            pt.parse(log)
        except Exception:
            logger.critical("Error processing the %s log file: %s", self.method, log)
            log = "None"
        model_name, structure_name, model_pdb, structure_pdb = entry
        _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt)
        self.entries.append(_entry)
        os.unlink(script)
    return self.entries
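
# Example (hedged sketch): scoring a set of decoy models against one reference
# structure with comparison(). The wrapper object "scorer" and how it is constructed
# are assumptions for illustration; a single reference structure is reused for every
# model, as handled explicitly at the top of the method.
#
#   entries = scorer.comparison(sorted(glob.glob("decoys/*.pdb")), ["native.pdb"])
#   for entry in entries:
#       print(entry)    # per-model TM-score data, as assembled by self._store()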
def create_morda_db(database, nproc=2, submit_qtype=None, submit_queue=False, chunk_size=5000):
    """Create the MoRDa search database

    Parameters
    ----------
    database : str
        The path to the database folder
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster
    chunk_size : int, optional
        The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
        Windows is currently not supported

    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
    # simbad_dat_path = os.path.join(database, '**', '*.dat')
    simbad_pdb_path = os.path.join(database, '**', '*.pdb')
    morda_dat_files = set([os.path.basename(f) for f in glob.glob(morda_dat_path)])
    # simbad_dat_files = set([os.path.basename(f) for f in glob.glob(simbad_dat_path)])
    simbad_dat_files = set([os.path.basename(f).split('.')[0] + '.dat' for f in glob.glob(simbad_pdb_path)])
    # erroneous_files = set(["1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat", "1l0sA_0.dat"])
    erroneous_files = set(["1bbzA_0.pdb", "1gt0D_0.pdb", "1h3oA_0.pdb", "1kskA_1.pdb", "1l0sA_0.pdb"])

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning("File flagged to be erroneous ... " +
                               "removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [os.path.join(database, name[1:3], name) for name in erroneous_files]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return
    else:
        logger.info("%d new entries were found in the MoRDa database, " +
                    "updating SIMBAD database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")
    run_dir = tmp_dir(directory=os.getcwd())

    # Submit in chunks, so we don't take up too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        what_to_do = []
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            # final_file = os.path.join(database, code[1:3], code + ".dat")
            final_file = os.path.join(database, code[1:3], code + '.pdb')
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            script = make_script(
                [["export CCP4_SCR=" + tmp_d], ["export MRD_DB=" + os.environ['MRD_DB']],
                 ["cd", tmp_d], [exe, "-c", code, "-m", "d"]],
                directory=tmp_d)
            log = script.rsplit('.', 1)[0] + '.log'
            what_to_do += [(script, log, tmp_d, (get_model_output, final_file))]

        scripts, _, tmps, files = zip(*what_to_do)
        j = Job(submit_qtype)
        j.submit(scripts, name='morda_db', nproc=nproc, queue=submit_queue)
        j.wait()

        sub_dir_names = set([os.path.basename(f).rsplit('.', 1)[0][1:3] for f in chunk_dat_files])
        for sub_dir_name in sub_dir_names:
            sub_dir = os.path.join(database, sub_dir_name)
            if os.path.isdir(sub_dir):
                continue
            os.makedirs(sub_dir)

        for output, final in files:
            if os.path.isfile(output):
                # simbad.db.convert_pdb_to_dat(output, final)
                shutil.move(output, final)
            else:
                logger.critical("File missing: {}".format(output))

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
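
# Example (hedged sketch): building or updating the MoRDa-derived database on a
# cluster. The database path and queue name are placeholders; create_morda_db()
# resolves MRD_DB itself, works through the entries in chunks and removes its
# temporary directories as it goes.
#
#   create_morda_db("/data/simbad/morda_db",
#                   nproc=4,
#                   submit_qtype="SGE",
#                   submit_queue="all.q",
#                   chunk_size=5000)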
def create_contaminant_db(database, add_morda_domains, nproc=2, submit_qtype=None, submit_queue=False):
    """Create a contaminant database

    Parameters
    ----------
    database : str
        The path to the database folder
    add_morda_domains : bool
        Retrospectively add morda domains to a contaminant database updated when morda was not installed
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster

    Raises
    ------
    RuntimeError
        dimple.contaminants.prepare module not available
    RuntimeError
        Windows is currently not supported

    """
    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))

    import dimple.main
    logger.info('DIMPLE version: %s', dimple.main.__version__)

    if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
        msg = "This feature will be available with dimple version 2.5.7"
        raise RuntimeError(msg)

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    import dimple.contaminants.prepare

    dimple.contaminants.prepare.main(verbose=False)

    simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
    existing_dat_files = [os.path.basename(f).split('.')[0].lower() for f in glob.iglob(simbad_dat_path)]
    erroneous_files = ['4v43']
    dimple_files = ['cached', 'data.json', 'data.py']

    with open("data.json") as data_file:
        data = json.load(data_file)

    results = []
    for child in data["children"]:
        try:
            for child_2 in child["children"]:
                space_group = child_2["name"].replace(" ", "")
                for child_3 in child_2["children"]:
                    pdb_code = child_3["name"].split()[0].lower()
                    if (pdb_code in existing_dat_files or pdb_code in erroneous_files) and not add_morda_domains:
                        continue
                    uniprot_name = child["name"]
                    uniprot_mnemonic = uniprot_name.split('_')[1]
                    score = ContaminantSearchResult(pdb_code, space_group, uniprot_name, uniprot_mnemonic)
                    results.append(score)
        except KeyError:
            pass

    if len(results) == 0:
        logger.info("Contaminant database up to date")
    else:
        if add_morda_domains:
            logger.info("Adding morda domains to contaminant database")
        else:
            logger.info("%d new entries were found in the contaminant database, " +
                        "updating SIMBAD database", len(results))

        if "MRD_DB" in os.environ:
            morda_installed_through_ccp4 = True
        else:
            morda_installed_through_ccp4 = False

        if add_morda_domains and not morda_installed_through_ccp4:
            logger.critical("Morda not installed locally, unable to add morda domains to contaminant database")

        if morda_installed_through_ccp4:
            morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
            morda_dat_files = set([os.path.basename(f) for f in glob.iglob(morda_dat_path)])
            exe = os.path.join(os.environ['MRD_PROG'], "get_model")
        else:
            logger.info("Morda not installed locally, therefore morda domains will not be added to contaminant database")

        what_to_do = []
        for result in results:
            stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic,
                                result.uniprot_name, result.space_group)
            if not os.path.exists(stem):
                os.makedirs(stem)

            content = PdbStructure.get_pdb_content(result.pdb_code)
            if content is None:
                logger.debug("Encountered a problem downloading PDB %s - skipping entry", result.pdb_code)
            else:
                dat_content = simbad.db._str_to_dat(content)
                with open(os.path.join(stem, result.pdb_code + ".dat"), "w") as f_out:
                    f_out.write(dat_content)

                if not simbad.db.is_valid_dat(os.path.join(stem, result.pdb_code + ".dat")):
                    logger.debug("Unable to convert %s to dat file", result.pdb_code)

            if morda_installed_through_ccp4:
                for dat_file in morda_dat_files:
                    if result.pdb_code.lower() == dat_file[0:4]:
                        stem = os.path.join(database, result.uniprot_mnemonic, result.uniprot_name,
                                            result.space_group, "morda")
                        if not os.path.exists(stem):
                            os.makedirs(stem)
                        code = dat_file.rsplit('.', 1)[0]
                        final_file = os.path.join(stem, dat_file)
                        tmp_d = tmp_dir(directory=os.getcwd())
                        get_model_output = os.path.join(tmp_d, code + ".pdb")
                        script = make_script(
                            [["export CCP4_SCR=" + tmp_d], ["cd", tmp_d], [exe, "-c", code, "-m", "d"]],
                            directory=tmp_d)
                        log = script.rsplit('.', 1)[0] + '.log'
                        what_to_do += [(script, log, tmp_d, (get_model_output, final_file))]

        if len(what_to_do) > 0:
            scripts, _, tmps, files = zip(*what_to_do)
            j = Job(submit_qtype)
            j.submit(scripts, name='cont_db', nproc=nproc, queue=submit_queue)
            j.wait()

            for output, final in files:
                if os.path.isfile(output):
                    simbad.db.convert_pdb_to_dat(output, final)
                else:
                    logger.critical("File missing: {}".format(output))

            for d in tmps:
                shutil.rmtree(d)

    for f in dimple_files:
        if os.path.isdir(f):
            shutil.rmtree(f)
        elif os.path.isfile(f):
            os.remove(f)

    validate_compressed_database(database)
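
# Example (hedged sketch): refreshing the contaminant database. The database path and
# queue name are placeholders; add_morda_domains only has an effect when a local MoRDa
# installation (MRD_DB in the environment) is available.
#
#   create_contaminant_db("/data/simbad/contaminant_db",
#                         add_morda_domains=False,
#                         nproc=4,
#                         submit_qtype="SGE",
#                         submit_queue="all.q")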