def test_21(self):
    script = Script()
    script.append("test line")
    content = ["what the hell"]
    script.content = content
    assert script == ["what the hell"]
    assert script is not content
def write_jobscript(name,
                    keyword_file,
                    amoptd,
                    directory=None,
                    job_time=86400,
                    extra_options=None):
    """Create the script to run MrBump for this PDB."""
    # job_time and extra_options are accepted for interface compatibility but
    # are unused here; extra_options defaults to None to avoid a mutable default.
    if not directory:
        directory = os.getcwd()

    # Create the script to run MrBump
    script = Script(directory=directory,
                    prefix="",
                    stem=name,
                    suffix=ample_util.SCRIPT_EXT)
    if not sys.platform.startswith("win"):
        script.append('[[ ! -d $CCP4_SCR ]] && mkdir $CCP4_SCR\n\n')

    # Get the MrBump command-line
    jobcmd = mrbump_cmd.mrbump_cmd(name, amoptd['mtz'], amoptd['mr_sequence'],
                                   keyword_file)
    script.append(jobcmd)

    # Write script
    script.write()
    logger.debug("Wrote MRBUMP script: %s", script.path)

    return script
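# Usage sketch (hypothetical paths; `amoptd` only needs the 'mtz' and
# 'mr_sequence' keys that write_jobscript reads above):
#
#   amoptd = {'mtz': '/data/input.mtz', 'mr_sequence': '/data/target.fasta'}
#   script = write_jobscript('search_model_1', '/data/keywords.txt', amoptd)
#   print(script.path)  # location of the MrBump script written to disk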
def write_script(self, work_dir, name, args, testcase_type):
    """Write script"""
    linechar = "^" if sys.platform.startswith('win') else "\\"
    script = Script(directory=work_dir, stem=name)
    test_exe = os.path.join(os.environ["CCP4"], "bin", "ample")
    if sys.platform.startswith("win"):
        test_exe += ample_util.SCRIPT_EXT
    if testcase_type == ENSEMBLER:
        test_exe = '{0} -m ample.ensembler'.format(
            os.path.join(os.environ["CCP4"], "bin", "ccp4-python"))
    elif testcase_type == MODELLING:
        test_exe = '{0} -m ample.modelling'.format(
            os.path.join(os.environ["CCP4"], "bin", "ccp4-python"))

    # All arguments need to be strings
    args = [list(map(str, a)) for a in args]
    script.append("{0} {1}".format(test_exe, linechar))
    for argt in args:
        script.append(" ".join(argt) + " " + linechar)
    return script
def generate_script(self, dat_model):
    logger.debug(
        "Generating script to perform AMORE rotation function on %s",
        dat_model.pdb_code)

    pdb_model = self.template_model.format(dat_model.pdb_code)
    table1 = self.template_table1.format(dat_model.pdb_code)
    hklpck1 = self.template_hklpck1.format(dat_model.pdb_code)
    clmn0 = self.template_clmn0.format(dat_model.pdb_code)
    clmn1 = self.template_clmn1.format(dat_model.pdb_code)
    mapout = self.template_mapout.format(dat_model.pdb_code)

    conv_py = "\"from simbad.db import convert_dat_to_pdb; convert_dat_to_pdb('{}', '{}')\""
    conv_py = conv_py.format(dat_model.dat_path, pdb_model)

    tab_cmd = [
        self.amore_exe, "xyzin1", pdb_model, "xyzout1", pdb_model, "table1",
        table1
    ]
    tab_stdin = self.tabfun_stdin_template.format(x=dat_model.x,
                                                  y=dat_model.y,
                                                  z=dat_model.z,
                                                  a=90,
                                                  b=90,
                                                  c=120)

    rot_cmd = [
        self.amore_exe, 'table1', table1, 'HKLPCK1', hklpck1, 'hklpck0',
        self.hklpck0, 'clmn1', clmn1, 'clmn0', clmn0, 'MAPOUT', mapout
    ]
    rot_stdin = self.rotfun_stdin_template.format(shres=self.shres,
                                                  intrad=dat_model.intrad,
                                                  pklim=self.pklim,
                                                  npic=self.npic,
                                                  step=self.rotastep)
    rot_log = self.template_rot_log.format(dat_model.pdb_code)

    tmp_dir = self.template_tmp_dir.format(dat_model.pdb_code)
    cmd = [
        [EXPORT, "CCP4_SCR=" + tmp_dir],
        ["mkdir", "-p", "$CCP4_SCR\n"],
        [CMD_PREFIX, "$CCP4/bin/ccp4-python", "-c", conv_py, os.linesep],
        tab_cmd + ["<< eof >", os.devnull],
        [tab_stdin],
        ["eof"],
        [os.linesep],
        rot_cmd + ["<< eof >", rot_log],
        [rot_stdin],
        ["eof"],
        [os.linesep],
        ["grep", "-m 1", "SOLUTIONRCD", rot_log, os.linesep],
        ["rm", "-rf", "$CCP4_SCR\n"],
        [EXPORT, "CCP4_SCR=" + self.ccp4_scr],
    ]

    amore_script = Script(directory=self.script_log_dir,
                          prefix="amore_",
                          stem=dat_model.pdb_code)
    for c in cmd:
        amore_script.append(' '.join(map(str, c)))
    amore_log = amore_script.path.rsplit(".", 1)[0] + '.log'
    amore_files = (amore_log, dat_model.dat_path)
    amore_script.write()
    return amore_script, amore_files
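# Illustrative layout of the shell script assembled above, with placeholder
# paths (the two heredocs feed the tabfun/rotfun stdin templates to AMORE):
#
#   export CCP4_SCR=<tmp_dir>
#   mkdir -p $CCP4_SCR
#   <CMD_PREFIX> $CCP4/bin/ccp4-python -c "from simbad.db import convert_dat_to_pdb; ..."
#   <amore_exe> xyzin1 <model>.pdb xyzout1 <model>.pdb table1 <table1> << eof > /dev/null
#   <tabfun stdin>
#   eof
#   <amore_exe> table1 <table1> HKLPCK1 <hklpck1> hklpck0 <hklpck0> clmn1 <clmn1> clmn0 <clmn0> MAPOUT <mapout> << eof > <rot_log>
#   <rotfun stdin>
#   eof
#   grep -m 1 SOLUTIONRCD <rot_log>
#   rm -rf $CCP4_SCR
#   export CCP4_SCR=<original ccp4_scr>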
def example_function(option):
    cmd = ["echo {}".format(option)]
    script = Script(directory=os.getcwd())
    for c in cmd:
        script.append(c)
    return script
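# Usage sketch, relying only on the pyjob Script API already used in this
# module (append(), write() and the .path attribute):
#
#   script = example_function("hello world")
#   script.write()      # persist the script to disk
#   print(script.path)  # e.g. ./<random stem>.sh on POSIX platforms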
def test_20(self):
    script = Script()
    script.append("test line")
    script.content = ["what the hell"]
    assert script == ["what the hell"]
def _create_runscript(self):
    """Utility method to create runscript"""
    runscript = Script(
        directory=self.directory,
        prefix="slurm_",
        suffix=".script",
        stem=str(uuid.uuid1().int),
    )
    runscript.append(self.__class__.SCRIPT_DIRECTIVE + " --export=ALL")
    runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" --job-name={self.name}")
    if self.dependency:
        cmd = f'--depend=afterok:{":".join(map(str, self.dependency))}'
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.queue:
        cmd = f"-p {self.queue}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.nprocesses:
        cmd = f"-n {self.nprocesses}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.directory:
        cmd = f"--workdir={self.directory}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.runtime:
        cmd = f"-t {self.runtime}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.extra:
        cmd = " ".join(map(str, self.extra))
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if len(self.script) > 1:
        logf = runscript.path.replace(".script", ".log")
        jobsf = runscript.path.replace(".script", ".jobs")
        with open(jobsf, "w") as f_out:
            f_out.write("\n".join(self.script))
        cmd = f"--array=1-{len(self.script)}%{self.max_array_size}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -o {logf}")
        runscript.extend(self.get_array_bash_extension(jobsf, 0))
    else:
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -o {self.log[0]}")
        runscript.append(self.script[0])
    return runscript
def _create_runscript(self):
    """Utility method to create runscript"""
    runscript = Script(
        directory=self.directory,
        prefix="lsf_",
        suffix=".script",
        stem=str(uuid.uuid1().int),
    )
    if self.dependency:
        cmd = "-w {}".format(" && ".join(
            [f"deps({d})" for d in self.dependency]))
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.directory:
        cmd = f"-cwd {self.directory}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.priority:
        cmd = f"-sp {self.priority}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.queue:
        cmd = f"-q {self.queue}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.runtime:
        cmd = f"-W {self.runtime}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.shell:
        cmd = f"-L {self.shell}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.nprocesses:
        cmd = f'-R "span[ptile={self.nprocesses}]"'
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.extra:
        cmd = " ".join(map(str, self.extra))
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if len(self.script) > 1:
        logf = runscript.path.replace(".script", ".log")
        jobsf = runscript.path.replace(".script", ".jobs")
        with open(jobsf, "w") as f_out:
            f_out.write("\n".join(self.script))
        cmd = f"-J {self.name}[1-{len(self.script)}]%{self.max_array_size}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -o {logf}")
        runscript.extend(self.get_array_bash_extension(jobsf, 1))
    else:
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -J {self.name}")
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -o {self.log[0]}")
        runscript.append(self.script[0])
    return runscript
def _create_runscript(self):
    """Utility method to create runscript"""
    runscript = Script(
        directory=self.directory,
        prefix="sge_",
        suffix=".script",
        stem=str(uuid.uuid1().int),
    )
    runscript.append(self.__class__.SCRIPT_DIRECTIVE + " -V")
    runscript.append(self.__class__.SCRIPT_DIRECTIVE + " -w e")
    runscript.append(self.__class__.SCRIPT_DIRECTIVE + " -j yes")
    runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -N {self.name}")
    if self.dependency:
        cmd = f'-hold_jid {",".join(map(str, self.dependency))}'
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.priority:
        cmd = f"-p {self.priority}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.queue:
        cmd = f"-q {self.queue}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.runtime:
        cmd = f"-l h_rt={self.get_time(self.runtime)}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.shell:
        cmd = f"-S {self.shell}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.nprocesses and self.environment:
        cmd = f"-pe {self.environment} {self.nprocesses}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.directory:
        cmd = f"-wd {self.directory}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if self.extra:
        cmd = " ".join(map(str, self.extra))
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
    if len(self.script) > 1:
        logf = runscript.path.replace(".script", ".log")
        jobsf = runscript.path.replace(".script", ".jobs")
        with open(jobsf, "w") as f_out:
            f_out.write("\n".join(self.script))
        cmd = f"-t 1-{len(self.script)} -tc {self.max_array_size}"
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + " " + cmd)
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -o {logf}")
        runscript.extend(self.get_array_bash_extension(jobsf, 0))
    else:
        runscript.append(self.__class__.SCRIPT_DIRECTIVE + f" -o {self.log[0]}")
        runscript.append(self.script[0])
    return runscript
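# Illustrative header emitted by the SGE variant above for a single-script
# task, assuming SCRIPT_DIRECTIVE == "#$" (job name, queue and log path are
# placeholders):
#
#   #$ -V
#   #$ -w e
#   #$ -j yes
#   #$ -N my_job
#   #$ -q all.q
#   #$ -o /path/to/job.log
#   <contents of self.script[0]>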
def generate_script(self, dat_model):
    logger.debug(
        "Generating script to perform PHASER rotation function on %s",
        dat_model.pdb_code)

    pdb_model = self.template_model.format(dat_model.pdb_code)
    template_rot_log = os.path.join("$CCP4_SCR", "{0}_rot.log")

    conv_py = "\"from simbad.db import convert_dat_to_pdb; convert_dat_to_pdb('{}', '{}')\""
    conv_py = conv_py.format(dat_model.dat_path, pdb_model)

    rot_log = template_rot_log.format(dat_model.pdb_code)
    tmp_dir = self.template_tmp_dir.format(dat_model.pdb_code)

    phaser_cmd = [
        "simbad.rotsearch.phaser_rotation_search",
        "-eid", self.eid,
        "-hklin", self.mtz,
        "-f", self.mtz_labels.f,
        "-sigf", self.mtz_labels.sigf,
        "-i", self.mtz_labels.i,
        "-sigi", self.mtz_labels.sigi,
        "-pdbin", pdb_model,
        "-logfile", rot_log,
        "-solvent", dat_model.solvent,
        "-nmol", dat_model.nmol,
        "-work_dir", tmp_dir,
    ]
    phaser_cmd = " ".join(str(e) for e in phaser_cmd)

    cmd = [
        [EXPORT, "CCP4_SCR=" + tmp_dir],
        ["mkdir", "-p", "$CCP4_SCR\n"],
        [CMD_PREFIX, "$CCP4/bin/ccp4-python", "-c", conv_py, os.linesep],
        [CMD_PREFIX, "$CCP4/bin/ccp4-python", "-m", phaser_cmd, os.linesep],
        ["rm", "-rf", "$CCP4_SCR\n"],
        [EXPORT, "CCP4_SCR=" + self.ccp4_scr],
    ]

    phaser_script = Script(directory=self.script_log_dir,
                           prefix="phaser_",
                           stem=dat_model.pdb_code)
    for c in cmd:
        phaser_script.append(' '.join(map(str, c)))
    phaser_log = phaser_script.path.rsplit(".", 1)[0] + '.log'
    phaser_files = (phaser_log, dat_model.dat_path)
    phaser_script.write()
    return phaser_script, phaser_files
def comparison(self, models, structures):
    """
    Compare a list of model structures to a second list of reference structures

    Parameters
    ----------
    models : list
       List containing the paths to the model structure files
    structures : list
       List containing the paths to the reference structure files

    Returns
    -------
    entries : list
       List of TMscore data entries on a per-model basis
    """
    if len(models) < 1 or len(structures) < 1:
        msg = 'No model structures provided' if len(models) < 1 else 'No reference structures provided'
        logger.critical(msg)
        raise RuntimeError(msg)
    elif len(structures) == 1:
        logger.info('Using single structure provided for all model comparisons')
        structures = [structures[0] for _ in range(len(models))]
    elif len(models) != len(structures):
        msg = "Unequal number of models and structures!"
        logger.critical(msg)
        raise RuntimeError(msg)

    if self.method == "tmalign":
        pt = tm_parser.TMalignLogParser()
    elif self.method == "tmscore":
        pt = tm_parser.TMscoreLogParser()
    else:
        msg = "Invalid method selected: %s" % self.method
        logger.critical(msg)
        raise RuntimeError(msg)

    logger.info('Using algorithm: {0}'.format(self.method))
    logger.info('------- Evaluating decoys -------')
    data_entries, log_files, job_scripts = [], [], []
    collector = ScriptCollector(None)
    for model_pdb, structure_pdb in zip(models, structures):
        model_name = os.path.splitext(os.path.basename(model_pdb))[0]
        structure_name = os.path.splitext(os.path.basename(structure_pdb))[0]
        stem = "_".join([model_name, structure_name, self.method])
        if os.path.isfile(model_pdb) and os.path.isfile(structure_pdb):
            data_entries.append([model_name, structure_name, model_pdb, structure_pdb])
            script = Script(directory=self.tmp_dir, prefix="tmscore_", stem=stem)
            script.append(" ".join([self.executable, model_pdb, structure_pdb]))
            collector.add(script)
            job_scripts.append(script.path)
            log_files.append(os.path.splitext(script.path)[0] + ".log")
        else:
            if not os.path.isfile(model_pdb):
                logger.warning("Cannot find: %s", model_pdb)
            if not os.path.isfile(structure_pdb):
                logger.warning("Cannot find: %s", structure_pdb)
            continue

    logger.info('Executing TManalysis scripts')
    with TaskFactory(
            self._qtype,
            collector,
            name="tmscore",
            nprocesses=self._nproc,
            max_array_size=self._max_array_jobs,
            queue=self._queue,
            shell="/bin/bash",
    ) as task:
        task.run()
        task.wait(interval=1)

    self.entries = []
    for entry, log, script in zip(data_entries, log_files, job_scripts):
        try:
            pt.reset()
            pt.parse(log)
        except Exception:
            logger.critical("Error processing the %s log file: %s", self.method, log)
            log = "None"
        model_name, structure_name, model_pdb, structure_pdb = entry
        _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt)
        self.entries.append(_entry)
        os.unlink(script)
    return self.entries
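# Usage sketch (`scorer` is a hypothetical instance of the class defining
# comparison(), configured with method="tmscore" and a resolved executable):
#
#   entries = scorer.comparison(models=["decoy_1.pdb", "decoy_2.pdb"],
#                               structures=["native.pdb"])
#   # a single reference is reused for every model; each entry holds the
#   # parsed TM-score data for one model/reference pair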
def create_ensemble_db(database,
                       pdb_db,
                       nproc=2,
                       submit_qtype=None,
                       submit_queue=False,
                       chunk_size=5000):
    """Create the MoRDa ensemble search database

    Parameters
    ----------
    database : str
       The path to the database folder
    pdb_db : str
       The path to a local copy of the Protein Data Bank
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster
    chunk_size : int, optional
       The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
       Windows is currently not supported
    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(
            os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
    simbad_dat_path = os.path.join(database, '**', '*.dat')
    morda_dat_files = set(os.path.basename(f) for f in glob.glob(morda_dat_path))
    simbad_dat_files = set(os.path.basename(f) for f in glob.glob(simbad_dat_path))
    erroneous_files = {
        "1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat", "1l0sA_0.dat"
    }

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning(
                    "File flagged to be erroneous ... "
                    "removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [
        os.path.join(database, name[1:3], name) for name in erroneous_files
    ]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD ensemble database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_ensemble.txt'))
        return
    else:
        logger.info(
            "%d new entries were found in the MoRDa database, "
            "updating SIMBAD ensemble database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")

    mrbump_stdin = """
    MDLS True
    MDLC False
    MDLD False
    MDLP False
    MDLM False
    MDLU False
    CHECK False
    UPDATE False
    PICKLE False
    MRNUM 5
    SCOP False
    DEBUG False
    RLEVEL 100
    GESAMT_MERGE False
    USEE True
    GESE True
    GEST True
    AMPT False
    DOPHMMER True
    DOHHPRED False
    PDBLOCAL {}
    END
    """.format(pdb_db)

    run_dir = tmp_dir(directory=os.getcwd())

    # Generate the sub directories in advance
    sub_dir_names = set(
        os.path.basename(f).rsplit('.', 1)[0][1:3] for f in dat_files)
    for sub_dir_name in sub_dir_names:
        sub_dir = os.path.join(database, sub_dir_name)
        if os.path.isdir(sub_dir):
            continue
        os.makedirs(sub_dir)

    # Submit in chunks, so we don't take too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        files = []
        collector = ScriptCollector(None)
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + ".dat")
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            get_seq_output = os.path.join(tmp_d, code + ".seq")
            mrbump_directory = os.path.join(tmp_d, 'search_mrbump_1')
            cmd = [
                ["export CCP4_SCR={}".format(tmp_d)],
                ["export MRD_DB={}".format(os.environ['MRD_DB'])],
                ["cd", tmp_d],
                [exe, "-c", code, "-m", "d"],
                [
                    'ccp4-python', '-c',
                    "'import simbad.util; "
                    "simbad.util.get_sequence(\"{0}\", \"{1}\")'".format(
                        get_model_output, get_seq_output)
                ],
                ['mrbump', 'seqin', get_seq_output, '<< eof'],
                [mrbump_stdin],
                ['eof'],
                [
                    'ccp4-python', '-c',
                    "'import simbad.util; "
                    "simbad.util.get_mrbump_ensemble(\"{0}\", \"{1}\")'".format(
                        mrbump_directory, final_file)
                ],
            ]
            script = Script(directory=tmp_d)
            for c in cmd:
                script.append(' '.join(map(str, c)))
            collector.add(script)
            log = script.path.rsplit('.', 1)[0] + '.log'
            files += [(script.path, log, tmp_d)]

        scripts, _, tmps = zip(*files)

        submit_chunk(collector=collector,
                     run_dir=os.getcwd(),
                     nproc=nproc,
                     job_name='ensemble_db',
                     submit_qtype=submit_qtype,
                     submit_queue=submit_queue,
                     permit_nonzero=True,
                     monitor=None,
                     success_func=None)

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])
    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_ensemble.txt'))
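# Hypothetical invocation (paths and queue name are placeholders):
#
#   create_ensemble_db("/data/simbad/db", "/data/pdb", nproc=4,
#                      submit_qtype="SGE", submit_queue="all.q")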
def create_morda_db(database,
                    nproc=2,
                    submit_qtype=None,
                    submit_queue=False,
                    chunk_size=5000):
    """Create the MoRDa search database

    Parameters
    ----------
    database : str
       The path to the database folder
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster
    chunk_size : int, optional
       The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
       Windows is currently not supported
    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(
            os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
    simbad_dat_path = os.path.join(database, '**', '*.dat')
    morda_dat_files = set(os.path.basename(f) for f in glob.glob(morda_dat_path))
    simbad_dat_files = set(os.path.basename(f) for f in glob.glob(simbad_dat_path))
    erroneous_files = {
        "1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat", "1l0sA_0.dat"
    }

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning(
                    "File flagged to be erroneous ... "
                    "removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [
        os.path.join(database, name[1:3], name) for name in erroneous_files
    ]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return
    else:
        logger.info(
            "%d new entries were found in the MoRDa database, "
            "updating SIMBAD database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")
    run_dir = tmp_dir(directory=os.getcwd())

    # Submit in chunks, so we don't take too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        files = []
        collector = ScriptCollector(None)
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + ".dat")
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            cmd = [["export CCP4_SCR=" + tmp_d],
                   ["export MRD_DB=" + os.environ['MRD_DB']],
                   ["cd", tmp_d],
                   [exe, "-c", code, "-m", "d"]]
            script = Script(directory=tmp_d)
            for c in cmd:
                script.append(' '.join(map(str, c)))
            collector.add(script)
            log = script.path.rsplit('.', 1)[0] + '.log'
            files += [(script.path, log, tmp_d, (get_model_output, final_file))]

        scripts, _, tmps, files = zip(*files)

        submit_chunk(collector=collector,
                     run_dir=os.getcwd(),
                     nproc=nproc,
                     job_name='morda_db',
                     submit_qtype=submit_qtype,
                     submit_queue=submit_queue,
                     permit_nonzero=True,
                     monitor=None,
                     success_func=None)

        sub_dir_names = set(
            os.path.basename(f).rsplit('.', 1)[0][1:3] for f in chunk_dat_files)
        for sub_dir_name in sub_dir_names:
            sub_dir = os.path.join(database, sub_dir_name)
            if os.path.isdir(sub_dir):
                continue
            os.makedirs(sub_dir)

        for output, final in files:
            if os.path.isfile(output):
                simbad.db.convert_pdb_to_dat(output, final)
            else:
                logger.critical("File missing: {}".format(output))

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])
    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
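# Hypothetical invocation, mirroring create_ensemble_db above (placeholders):
#
#   create_morda_db("/data/simbad/db", nproc=4, submit_qtype="SGE",
#                   submit_queue="all.q", chunk_size=1000)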
def create_contaminant_db(database,
                          add_morda_domains,
                          nproc=2,
                          submit_qtype=None,
                          submit_queue=False):
    """Create a contaminant database

    Parameters
    ----------
    database : str
       The path to the database folder
    add_morda_domains : bool
       Retrospectively add morda domains to a contaminant database updated when morda was not installed
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster

    Raises
    ------
    RuntimeError
       dimple.contaminants.prepare module not available
    RuntimeError
       Windows is currently not supported
    """
    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(
            os.path.dirname(database)))

    import dimple.main
    logger.info('DIMPLE version: %s', dimple.main.__version__)

    if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
        msg = "This feature will be available with dimple version 2.5.7"
        raise RuntimeError(msg)

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    import dimple.contaminants.prepare
    dimple.contaminants.prepare.main(verbose=False)

    simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
    existing_dat_files = [
        os.path.basename(f).split('.')[0].lower()
        for f in glob.iglob(simbad_dat_path)
    ]
    erroneous_files = ['4v43']
    dimple_files = ['cached', 'data.json', 'data.py']

    with open("data.json") as data_file:
        data = json.load(data_file)

    results = []
    for child in data["children"]:
        try:
            for child_2 in child["children"]:
                space_group = child_2["name"].replace(" ", "")
                for child_3 in child_2["children"]:
                    pdb_code = child_3["name"].split()[0].lower()
                    if (pdb_code in existing_dat_files or
                            pdb_code in erroneous_files) and not add_morda_domains:
                        continue
                    uniprot_name = child["name"]
                    uniprot_mnemonic = uniprot_name.split('_')[1]
                    score = ContaminantSearchResult(pdb_code, space_group,
                                                    uniprot_name,
                                                    uniprot_mnemonic)
                    results.append(score)
        except KeyError:
            pass

    if len(results) == 0:
        logger.info("Contaminant database up to date")
    else:
        if add_morda_domains:
            logger.info("Adding morda domains to contaminant database")
        else:
            logger.info(
                "%d new entries were found in the contaminant database, "
                "updating SIMBAD database", len(results))

        if "MRD_DB" in os.environ:
            morda_installed_through_ccp4 = True
        else:
            morda_installed_through_ccp4 = False

        if add_morda_domains and not morda_installed_through_ccp4:
            logger.critical(
                "Morda not installed locally, unable to add morda domains to contaminant database"
            )

        if morda_installed_through_ccp4:
            morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home',
                                          'ca_DOM', '*.dat')
            morda_dat_files = set(
                os.path.basename(f) for f in glob.iglob(morda_dat_path))
            exe = os.path.join(os.environ['MRD_PROG'], "get_model")
        else:
            logger.info(
                "Morda not installed locally, therefore morda domains will not be added to contaminant database"
            )

        files = []
        collector = ScriptCollector(None)
        for result in results:
            stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic,
                                result.uniprot_name, result.space_group)
            if not os.path.exists(stem):
                os.makedirs(stem)

            content = PdbStructure.get_pdb_content(result.pdb_code)
            if content is None:
                logger.debug(
                    "Encountered a problem downloading PDB %s - skipping entry",
                    result.pdb_code)
            else:
                dat_content = simbad.db._str_to_dat(content)
                with open(os.path.join(stem, result.pdb_code + ".dat"), "w") as f_out:
                    f_out.write(dat_content)

                if not simbad.db.is_valid_dat(
                        os.path.join(stem, result.pdb_code + ".dat")):
                    logger.debug("Unable to convert %s to dat file",
                                 result.pdb_code)

            if morda_installed_through_ccp4:
                for dat_file in morda_dat_files:
                    if result.pdb_code.lower() == dat_file[0:4]:
                        stem = os.path.join(database, result.uniprot_mnemonic,
                                            result.uniprot_name,
                                            result.space_group, "morda")
                        if not os.path.exists(stem):
                            os.makedirs(stem)
                        code = dat_file.rsplit('.', 1)[0]
                        final_file = os.path.join(stem, dat_file)
                        tmp_d = tmp_dir(directory=os.getcwd())
                        get_model_output = os.path.join(tmp_d, code + ".pdb")
                        # Concatenate directly so the shell sees CCP4_SCR=<dir>
                        # rather than a stray space after the "="
                        cmd = [["export CCP4_SCR=" + tmp_d],
                               ["cd", tmp_d],
                               [exe, "-c", code, "-m", "d"]]
                        script = Script(directory=tmp_d)
                        for c in cmd:
                            script.append(' '.join(map(str, c)))
                        collector.add(script)
                        log = script.path.rsplit('.', 1)[0] + '.log'
                        files += [(script.path, log, tmp_d,
                                   (get_model_output, final_file))]

        if len(files) > 0:
            scripts, _, tmps, files = zip(*files)

            submit_chunk(collector=collector,
                         run_dir=os.getcwd(),
                         nproc=nproc,
                         job_name='cont_db',
                         submit_qtype=submit_qtype,
                         submit_queue=submit_queue,
                         permit_nonzero=True,
                         monitor=None,
                         success_func=None)

            for output, final in files:
                if os.path.isfile(output):
                    simbad.db.convert_pdb_to_dat(output, final)
                else:
                    logger.critical("File missing: {}".format(output))

            for d in tmps:
                shutil.rmtree(d)

    for f in dimple_files:
        if os.path.isdir(f):
            shutil.rmtree(f)
        elif os.path.isfile(f):
            os.remove(f)

    validate_compressed_database(database)
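# Hypothetical invocation (add_morda_domains=True is only useful when a local
# MoRDa installation exposes MRD_DB/MRD_PROG, as checked above):
#
#   create_contaminant_db("/data/simbad/db", add_morda_domains=False,
#                         nproc=4, submit_qtype="SGE", submit_queue="all.q")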