def _add_datasets(project: Project):
    """
    For all the project's crystals, look for datasets on the file system.
    Add database entries for all new datasets found on the file system.
    """

    def dataset_exists(crystal, run: int):
        dset = crystal.get_dataset(run)
        return dset is not None

    for dset_dir in project.get_dataset_dirs():
        crystal_id = dset_dir.name
        crystal = project.get_crystal(crystal_id)
        if crystal is None:
            # not part of the project
            continue

        for run in get_dataset_runs(dset_dir):
            if dataset_exists(crystal, run):
                # skip already existing dataset
                continue

            meta_data = get_dataset_metadata(project, dset_dir, crystal_id, run)
            if meta_data is None:
                print(
                    f"warning: no meta-data found for {crystal_id} {run}, skipping the dataset"
                )
                continue

            # TODO: this is MAXIV specific, think about site-independent style
            shift_dir = dset_dir.parents[2].relative_to(project.proposal_dir)

            dataset = project.db.DataSet(
                crystal=crystal,
                data_root_dir=str(shift_dir),
                run=run,
                detector=meta_data.detector,
                resolution=meta_data.resolution,
                images=meta_data.images,
                start_time=meta_data.start_time,
                end_time=meta_data.end_time,
                wavelength=meta_data.wavelength,
                start_angle=meta_data.start_angle,
                angle_increment=meta_data.angle_increment,
                exposure_time=meta_data.exposure_time,
                detector_distance=meta_data.detector_distance,
                xbeam=meta_data.xbeam,
                ybeam=meta_data.ybeam,
                beam_shape=meta_data.beam_shape,
                transmission=meta_data.transmission,
                slit_gap_horizontal=meta_data.slit_gap_horizontal,
                slit_gap_vertical=meta_data.slit_gap_vertical,
                flux=meta_data.flux,
                beam_size_at_sample_x=meta_data.beam_size_at_sample_x,
                beam_size_at_sample_y=meta_data.beam_size_at_sample_y,
            )

            for snapshot_index in meta_data.snapshot_indices:
                project.db.DataSetSnapshot(dataset=dataset, index=snapshot_index)
def _save_pdb(project: Project, pdb_id, filename, pdb_data):
    name = path.splitext(filename)[0]

    nohet_filename = f"{name}_noHETATM.pdb"
    noanisou_filename = f"{name}_noANISOU.pdb"
    nohetanisou_filename = f"{name}_noANISOU_noHETATM.pdb"
    txc_filename = f"{name}_txc.pdb"

    orig_pdb = _add_pdb_entry(project, filename, pdb_id)
    nohet_pdb = _add_pdb_entry(project, nohet_filename, pdb_id)
    noanisou_pdb = _add_pdb_entry(project, noanisou_filename, pdb_id)
    nohetnoanisou_pdb = _add_pdb_entry(project, nohetanisou_filename, pdb_id)

    # write original pdb file 'as-is' to models folder
    with open_proj_file(project, project.get_pdb_file(orig_pdb)) as dest:
        dest.write(pdb_data)

    # filter out all HETATM and ANISOU records and write the rest as *_noANISOU_noHETATM.pdb
    with open_proj_file(project, project.get_pdb_file(nohetnoanisou_pdb)) as dest:
        for line in pdb_data.splitlines(keepends=True):
            if not line.startswith(b"HETATM") and not line.startswith(b"ANISOU"):
                dest.write(line)

    # filter out all HETATM records and write the rest as *_noHETATM.pdb
    with open_proj_file(project, project.get_pdb_file(nohet_pdb)) as dest:
        for line in pdb_data.splitlines(keepends=True):
            if not line.startswith(b"HETATM"):
                dest.write(line)

    # filter out all ANISOU records and write the rest as *_noANISOU.pdb
    with open_proj_file(project, project.get_pdb_file(noanisou_pdb)) as dest:
        for line in pdb_data.splitlines(keepends=True):
            if not line.startswith(b"ANISOU"):
                dest.write(line)

    n_chains = pdb_chains(pdb_data.splitlines(keepends=True))
    if n_chains > 1:
        txc_pdb = _add_pdb_entry(project, txc_filename, pdb_id)
        input_pdb_name = path.join(project.models_dir, f"{name}.pdb")

        jobs = JobsSet("phenix ensembler")
        batch = SITE.get_hpc_runner().new_batch_file(
            "phenix ensembler",
            project_script(project, "phenix_ensembler.sh"),
            project_syslog_path(project, "phenix_ensembler_%j.out"),
            project_syslog_path(project, "phenix_ensembler_%j.err"),
        )
        batch.load_modules(["gopresto", PHENIX_MOD])
        batch.add_commands(
            f"cd {project.models_dir}",
            f"phenix.ensembler {input_pdb_name} trim=TRUE output.location='{project.models_dir}'",
            f"mv {project.models_dir}/ensemble_merged.pdb {project.get_pdb_file(txc_pdb)}",
        )
        batch.save()

        jobs.add_job(batch)
        jobs.submit()
def _migrate_proc_tool(project: Project, old_paths: OldPaths, tool: str):
    _log(f"MIGRATE PROC TOOL {tool} RESULTS")

    tool_dir = "xdsxscale" if tool == "xds" else tool

    for dset in _dbg_cutoff(project.get_datasets()):
        src_dir = Path(old_paths.process_dir, dset.crystal.id, dset.name, tool_dir)
        if not src_dir.is_dir():
            # no tool results found for this dataset
            continue

        dest_dir = Path(project.get_dataset_process_dir(dset), tool)

        _log(f"{src_dir} -> {dest_dir}")
        _copytree(src_dir, dest_dir)

        update_proc_tool_status(project, tool, dset)
def get_summary_url(project: Project, method: str, dataset_name: str) -> str:
    summary_path = Path(
        project.pandda_processed_dataset_dir(method, dataset_name),
        "html",
        f"{dataset_name}.html",
    )

    return str(summary_path.relative_to(project.project_dir))
def get_refine_log_files(project: Project, dataset, processing_tool) -> Iterator[Path]:
    res_dir = Path(
        project.get_dataset_results_dir(dataset), processing_tool, "fspipeline"
    )
    project_dir = project.project_dir

    for path in res_dir.glob("**/*.log"):
        if path.is_file():
            yield path.relative_to(project_dir)
def scrape_results(project: Project, dataset) -> Optional[ProcStats]:
    xds_dir = _get_xds_dir(project, dataset)

    stats = xia2.scrape_results(project, xds_dir)
    if stats is None:
        return None

    stats.isa = _scrape_isa(project, xds_dir)

    return stats
def get_project_libraries(project: Project) -> Set[Library]:
    libs = set()

    for crystal in project.get_crystals():
        if crystal.is_apo():
            continue

        frag = Fragment.get_by_id(crystal.fragment_id)
        libs.add(frag.library)

    return libs
def scrape_results(project: Project, dataset) -> Iterator[LigfitResult]:
    res_dir = project.get_dataset_results_dir(dataset)

    for ref_dir in subdirs(res_dir, 2):
        score = _scrape_score(ref_dir)

        proc_tool = ref_dir.parent.name
        refine_tool = ref_dir.name

        if score is None:
            status = ToolStatus.FAILURE
        else:
            status = ToolStatus.SUCCESS

        yield LigfitResult(proc_tool, refine_tool, "rhofit", status, score)
def _migrate_refine(project: Project, old_paths: OldPaths):
    _log("MIGRATE REFINE RESULTS")

    for dset in _dbg_cutoff(project.get_datasets()):
        for proc_tool in ["xds", "xdsapp", "dials"]:
            proc_dir = "xdsxscale" if proc_tool == "xds" else proc_tool

            src_dir = Path(old_paths.results_dir, dset.name, proc_dir)
            if not src_dir.is_dir():
                # no tool results found for this dataset
                continue

            dest_dir = Path(project.get_dataset_results_dir(dset), proc_tool)

            _log(f"{src_dir} -> {dest_dir}")

            # note, we can't use ignore_dangling_symlinks=True
            # here due to https://bugs.python.org/issue38523
            _copytree(src_dir, dest_dir)

        for tool in REFINE_TOOLS:
            update_refine_tool_status(project, tool, dset)

        for tool in LIGFIT_TOOLS:
            _maybe_resync_ligfit_tool(project, tool, dset)
def _get_fragments(project: Project) -> Dict[Fragment, List]:
    fragments: Dict = {}

    for crystal in project.get_crystals():
        if crystal.is_apo():
            continue

        frag = get_crystals_fragment(crystal)
        crystals = fragments.get(frag, [])
        crystals.append(crystal)
        fragments[frag] = crystals

    return fragments
def pandda_worker(project: Project, method, methodshort, options, cif_method):
    rn = str(randint(10000, 99999))
    prepare_scripts = []

    proc_tool, refine_tool = method.split("_")
    refine_results = _get_best_results(project, proc_tool, refine_tool)

    selection = PanddaSelectedDatasets()

    for refine_result in refine_results:
        res_dir = project.get_refine_result_dir(refine_result)
        final_pdb = Path(res_dir, "final.pdb")
        final_mtz = Path(res_dir, "final.mtz")

        selection.add(refine_result.dataset.name, final_pdb)

        res_high, free_r_flag, native_f, sigma_fp = read_info(project, str(final_mtz))

        script = _write_prepare_script(
            project,
            rn,
            method,
            refine_result.dataset,
            final_pdb,
            final_mtz,
            res_high,
            free_r_flag,
            native_f,
            sigma_fp,
            cif_method,
        )

        prepare_scripts.append(script)

    pandda_dir = Path(project.pandda_dir, method)
    pandda_dir.mkdir(parents=True, exist_ok=True)

    selection.save(pandda_dir)

    main_script = _write_main_script(project, pandda_dir, method, methodshort, options)

    #
    # submit all pandda scripts to the HPC
    #
    jobs = JobsSet("PanDDa")

    for prep_script in prepare_scripts:
        jobs.add_job(prep_script)

    jobs.add_job(main_script, run_after=prepare_scripts)

    jobs.submit()
def _get_key(project: Project) -> str:
    """
    get project's encryption key in BASE64 format

    raises CryptoKeyError if no encryption key is uploaded
    or if the project is not encrypted
    """
    if not project.encrypted:
        raise CryptoKeyError(ENCRYPTION_DISABLED_MSG)

    if not project.has_encryption_key():
        raise CryptoKeyError("no key uploaded")

    return base64.b64encode(project.encryption_key).decode()
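# Hypothetical usage sketch (not part of the original module): illustrates how a
# caller is expected to consume _get_key() and its CryptoKeyError, per the docstring
# above. The helper name _example_get_key is an assumption for illustration only.
def _example_get_key(project: Project):
    try:
        return _get_key(project)
    except CryptoKeyError as err:
        # project is unencrypted, or no key has been uploaded yet
        print(f"cannot derive key: {err}")
        return None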
def _find_results(project: Project, dataset):
    edna_res_dir = Path(
        project.get_dataset_root_dir(dataset),
        "process",
        project.protein,
        f"{dataset.crystal.id}",
        f"xds_{dataset.name}_1",
        "EDNA_proc",
        "results",
    )

    if edna_res_dir.is_dir():
        mtz_file = next(edna_res_dir.glob("*.mtz"), None)
        return edna_res_dir, mtz_file

    return None, None
def _get_refine_results(
    project: Project, filters: str, use_ligandfit: bool, use_rhofit: bool
):
    datasets = []

    if filters == "NEW":
        if use_ligandfit:
            datasets.append(get_ligfit_datasets(project, filters, "ligandfit"))
        if use_rhofit:
            datasets.append(get_ligfit_datasets(project, filters, "rhofit"))
    else:
        datasets.append(get_ligfit_datasets(project, filters, None))

    for dataset in itertools.chain(*datasets):
        for result in project.get_datasets_refine_results(dataset):
            yield result
def find_site_centroids(project: Project, method: str):
    sites_csv = Path(
        project.pandda_method_dir(method),
        "pandda",
        "analyses",
        "pandda_analyse_sites.csv",
    )

    with sites_csv.open("r") as f:
        sitelist = f.readlines()

    centroids = []

    # skip the CSV header row; columns 8-10 (zero-based) hold the site centroid coordinates
    for _site in sitelist[1:]:
        centroid = literal_eval(",".join(_site.replace('"', "").split(",")[8:11]))
        centroids.append(list(centroid))

    return centroids
def _get_summary_report(project: Project, dataset) -> Optional[Path]:
    autoproc_res_dir = Path(
        project.get_dataset_root_dir(dataset),
        "process",
        project.protein,
        f"{dataset.crystal.id}",
        f"xds_{dataset.name}_1",
        "autoPROC",
    )

    glob = autoproc_res_dir.glob(str(Path("cn*", "AutoPROCv1_0_anom", "summary.html")))
    summary_file = next(glob, None)
    if summary_file is not None and summary_file.is_file():
        return summary_file

    # no autoPROC summary report found
    return None
def get_proc_datasets(project: Project, filter: str, tool: str):
    """
    perform datasets filtering for 'data processing' jobs

    filters supported:

    'ALL' - all of the project's datasets
    'NEW' - datasets that have not been processed with the specified tool yet

    otherwise the filter is expected to be a comma separated list of dataset IDs
    """
    if filter == "ALL":
        return project.get_datasets()

    if filter == "NEW":
        return _no_results_datasets(project, tool)

    return _dataset_by_ids(project, filter)
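# Hypothetical usage sketch (not part of the original module): illustrates the three
# filter forms documented in get_proc_datasets() above. The tool name "xds" and the
# ID list "17,42" are example values only.
def _example_proc_dataset_filters(project: Project):
    everything = get_proc_datasets(project, "ALL", "xds")    # all of the project's datasets
    unprocessed = get_proc_datasets(project, "NEW", "xds")   # datasets not yet processed with 'xds'
    selected = get_proc_datasets(project, "17,42", "xds")    # explicit dataset IDs
    return everything, unprocessed, selected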
def get_refine_datasets(project: Project, filter: str, refine_tool: str):
    """
    perform datasets filtering for 'structure refinement' jobs

    filters supported:

    'ALL' - all of the project's datasets
    'NEW' - datasets that have not been processed with the specified refinement tool yet

    otherwise the filter is expected to be a comma separated list of dataset IDs
    """
    if filter == "ALL":
        return project.get_datasets()

    if filter == "NEW":
        return _no_results_datasets(project, refine_tool)

    return _dataset_by_ids(project, filter)
def _get_best_results(project: Project, proc_tool, refine_tool):
    if proc_tool == "frag":
        proc_tool = None

    if refine_tool == "plex":
        refine_tool = None

    for dataset in project.get_datasets():
        refine_results = _get_refine_results(project, dataset, proc_tool, refine_tool)

        # sort refine results by R-work, R-free and resolution,
        # the result with lowest value(s) is the 'best' one
        best_result = refine_results.order_by(
            lambda r: (r.r_work, r.r_free, r.resolution)
        ).first()

        if best_result is None:
            # no refine results with the requested proc/refine tool combination
            continue

        yield best_result
def get_ligfit_datasets(project: Project, filter: str, ligfit_tool: Optional[str]):
    """
    perform datasets filtering for 'ligand fitting' jobs

    filters supported:

    'ALL' - all of the project's datasets
    'NEW' - datasets that have not been processed with the specified ligand fitting tool yet

    otherwise the filter is expected to be a comma separated list of dataset IDs
    """
    if filter == "ALL":
        return project.get_datasets()

    if filter == "NEW":
        assert ligfit_tool is not None
        return _no_results_datasets(project, ligfit_tool)

    return _dataset_by_ids(project, filter)
def _get_refine_logs(project: Project, dataset):
    def _add_logs(proc_tool, ref_tool, log_files):
        if ref_tool not in logs:
            logs[ref_tool] = {}

        logs[ref_tool][proc_tool] = log_files

    # maps refine tool -> processing tool -> sorted log files
    logs: Dict[str, Dict] = {}

    for ref_result in project.get_datasets_refine_results(dataset):
        log_files = get_refine_log_files(
            project, dataset, ref_result.process_tool, ref_result.refine_tool
        )
        if ref_result.refine_tool == "fspipeline":
            # wrap fspipeline logs into custom object, to handle
            # log files inside subdirectories
            log_files = [FSPipelineLogPath(log) for log in log_files]  # type: ignore

        _add_logs(ref_result.process_tool, ref_result.refine_tool, sorted(log_files))

    return logs
def get_ligfit_result_by_id(project: Project, result_id):
    result = project.get_ligfit_result(result_id)
    if result is None:
        raise Http404(f"no ligfit result with id '{result_id}' exists")

    return result
def get_dataset_by_id(project: Project, dataset_id):
    dataset = project.get_dataset(dataset_id)
    if dataset is None:
        raise Http404(f"no dataset with id '{dataset_id}' exists")

    return dataset
def get_pdb_by_id(project: Project, pdb_id):
    pdb = project.get_pdb(pdb_id)
    if pdb is None:
        raise Http404(f"no PDB with id '{pdb_id}' found")

    return pdb
def launch_refine_jobs(
    project: Project,
    filters,
    pdb_file,
    space_group,
    run_aimless,
    refine_tool,
    refine_tool_commands,
    cpus,
):
    epoch = round(time.time())
    jobs = JobsSet("Refine")
    hpc = SITE.get_hpc_runner()

    for dset in get_refine_datasets(project, filters, refine_tool):
        for tool, input_mtz in _find_input_mtzs(project, dset):
            batch = hpc.new_batch_file(
                f"refine {tool} {dset.name}",
                project_script(project, f"refine_{tool}_{refine_tool}_{dset.name}.sh"),
                project_log_path(
                    project, f"refine_{tool}_{dset.name}_{epoch}_%j_out.txt"
                ),
                project_log_path(
                    project, f"refine_{tool}_{dset.name}_{epoch}_%j_err.txt"
                ),
                cpus,
            )
            batch.set_options(
                time=Duration(hours=12),
                nodes=1,
                mem_per_cpu=DataSize(gigabyte=5),
            )

            batch.add_commands(crypt_shell.crypt_cmd(project))

            batch.assign_variable("WORK_DIR", "`mktemp -d`")
            batch.add_commands(
                "cd $WORK_DIR",
                crypt_shell.fetch_file(project, pdb_file, "model.pdb"),
                crypt_shell.fetch_file(project, input_mtz, "input.mtz"),
            )

            # TODO: load tool specific modules?
            batch.load_modules(HPC_MODULES)

            if run_aimless:
                batch.add_commands(_aimless_cmd(space_group.short_name, "input.mtz"))

            results_dir = Path(project.get_dataset_results_dir(dset), tool)

            batch.add_commands(
                *refine_tool_commands,
                _upload_result_cmd(project, results_dir),
                "cd",
                "rm -rf $WORK_DIR",
            )

            batch.save()
            jobs.add_job(batch)

            add_update_job(jobs, hpc, project, refine_tool, dset, batch)

    jobs.submit()
def _get_processing_info(project: Project, dataset) -> Iterator[ProcessingInfo]:
    for proc_res in project.get_datasets_process_results(dataset):
        yield ProcessingInfo(proc_res)
def _get_xds_dir(project: Project, dataset) -> Path:
    return Path(project.get_dataset_process_dir(dataset), "xds")
def _write_prepare_script(
    project: Project,
    rn,
    method,
    dataset,
    pdb,
    mtz,
    resHigh,
    free_r_flag,
    native_f,
    sigma_fp,
    cif_method,
):
    epoch = round(time.time())
    output_dir = Path(project.pandda_method_dir(method), dataset.name)

    hpc = SITE.get_hpc_runner()
    batch = hpc.new_batch_file(
        f"PnD{rn}",
        project_script(project, f"pandda_prepare_{method}_{dataset.name}.sh"),
        project_log_path(project, f"{dataset.name}_PanDDA_{epoch}_%j_out.txt"),
        project_log_path(project, f"{dataset.name}_PanDDA_{epoch}_%j_err.txt"),
        cpus=1,
    )

    batch.set_options(time=Duration(minutes=15), memory=DataSize(gigabyte=5))

    batch.add_command(crypt_shell.crypt_cmd(project))

    batch.assign_variable("DEST_DIR", output_dir)
    batch.assign_variable("WORK_DIR", "`mktemp -d`")
    batch.add_commands(
        "cd $WORK_DIR",
        crypt_shell.fetch_file(project, pdb, "final.pdb"),
        crypt_shell.fetch_file(project, mtz, "final.mtz"),
    )

    batch.purge_modules()
    batch.load_modules(
        ["gopresto", versions.PHENIX_MOD, versions.CCP4_MOD, versions.BUSTER_MOD]
    )

    if not dataset.crystal.is_apo():
        fragment = get_crystals_fragment(dataset.crystal)
        # non-apo crystal should have a fragment
        assert fragment

        if cif_method == "elbow":
            cif_cmd = f"phenix.elbow --smiles='{fragment.smiles}' --output=$WORK_DIR/{fragment.code} --opt\n"
        else:
            assert cif_method == "grade"
            cif_cmd = (
                f"grade '{fragment.smiles}' -ocif $WORK_DIR/{fragment.code}.cif "
                f"-opdb $WORK_DIR/{fragment.code}.pdb -nomogul\n"
            )

        batch.add_command(cif_cmd)

    batch.add_commands(
        f'printf "monitor BRIEF\\n labin file 1 -\\n ALL\\n resolution file 1 999.0 {resHigh}\\n" | \\\n'
        " cad hklin1 $WORK_DIR/final.mtz hklout $WORK_DIR/final.mtz",
        "uniqueify -f FreeR_flag $WORK_DIR/final.mtz $WORK_DIR/final.mtz",
        f'printf "COMPLETE FREE={free_r_flag} \\nEND\\n" | \\\n'
        " freerflag hklin $WORK_DIR/final.mtz hklout $WORK_DIR/final_rfill.mtz",
        f"phenix.maps final_rfill.mtz final.pdb maps.input.reflection_data.labels='{native_f},{sigma_fp}'",
        "mv final.mtz final_original.mtz",
        "mv final_map_coeffs.mtz final.mtz",
        "rm -rf $DEST_DIR",
        crypt_shell.upload_dir(project, "$WORK_DIR", "$DEST_DIR"),
        "rm -rf $WORK_DIR",
    )

    batch.save()

    return batch
def _no_results_datasets(project: Project, tool: str):
    for dataset in project.get_datasets():
        res = dataset.result.select(tool=tool).first()
        if res is None:
            yield dataset
def _dataset_by_ids(project: Project, dataset_ids: str):
    for dset_id in dataset_ids.split(","):
        yield project.get_dataset(dset_id)