Example #1
def _add_datasets(project: Project):
    """
    For all the project's crystals, look for datasets on the file system.
    Add database entries for all new datasets found on the file system.
    """

    def dataset_exists(crystal, run: int):
        dset = crystal.get_dataset(run)
        return dset is not None

    for dset_dir in project.get_dataset_dirs():
        crystal_id = dset_dir.name
        crystal = project.get_crystal(crystal_id)
        if crystal is None:
            # not part of the project
            continue

        for run in get_dataset_runs(dset_dir):
            if dataset_exists(crystal, run):
                # skip already existing dataset
                continue

            meta_data = get_dataset_metadata(project, dset_dir, crystal_id, run)
            if meta_data is None:
                print(
                    f"warning: no meta-data found for {crystal_id} {run}, skipping the dataset"
                )
                continue

            # TODO: this is MAXIV specific, think about site-independent style
            shift_dir = dset_dir.parents[2].relative_to(project.proposal_dir)

            dataset = project.db.DataSet(
                crystal=crystal,
                data_root_dir=str(shift_dir),
                run=run,
                detector=meta_data.detector,
                resolution=meta_data.resolution,
                images=meta_data.images,
                start_time=meta_data.start_time,
                end_time=meta_data.end_time,
                wavelength=meta_data.wavelength,
                start_angle=meta_data.start_angle,
                angle_increment=meta_data.angle_increment,
                exposure_time=meta_data.exposure_time,
                detector_distance=meta_data.detector_distance,
                xbeam=meta_data.xbeam,
                ybeam=meta_data.ybeam,
                beam_shape=meta_data.beam_shape,
                transmission=meta_data.transmission,
                slit_gap_horizontal=meta_data.slit_gap_horizontal,
                slit_gap_vertical=meta_data.slit_gap_vertical,
                flux=meta_data.flux,
                beam_size_at_sample_x=meta_data.beam_size_at_sample_x,
                beam_size_at_sample_y=meta_data.beam_size_at_sample_y,
            )

            for snapshot_index in meta_data.snapshot_indices:
                project.db.DataSetSnapshot(dataset=dataset, index=snapshot_index)
Example #2
def _save_pdb(project: Project, pdb_id, filename, pdb_data):
    name = path.splitext(filename)[0]
    nohet_filename = f"{name}_noHETATM.pdb"
    noanisou_filename = f"{name}_noANISOU.pdb"
    nohetanisou_filename = f"{name}_noANISOU_noHETATM.pdb"
    txc_filename = f"{name}_txc.pdb"

    orig_pdb = _add_pdb_entry(project, filename, pdb_id)
    nohet_pdb = _add_pdb_entry(project, nohet_filename, pdb_id)
    noanisou_pdb = _add_pdb_entry(project, noanisou_filename, pdb_id)
    nohetnoanisou_pdb = _add_pdb_entry(project, nohetanisou_filename, pdb_id)

    # write original pdb file 'as-is' to models folder
    with open_proj_file(project, project.get_pdb_file(orig_pdb)) as dest:
        dest.write(pdb_data)

    # filter out all HETATM and ANISOU entries from pdb and write it as *_noANISOU_noHETATM.pdb
    with open_proj_file(project, project.get_pdb_file(nohetnoanisou_pdb)) as dest:
        for line in pdb_data.splitlines(keepends=True):
            if not line.startswith(b"HETATM") and not line.startswith(b"ANISOU"):
                dest.write(line)

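    # filter out all HETATM entries from pdb and write it as *_noHETATM.pdb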
    with open_proj_file(project, project.get_pdb_file(nohet_pdb)) as dest:
        for line in pdb_data.splitlines(keepends=True):
            if not line.startswith(b"HETATM"):
                dest.write(line)

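    # filter out all ANISOU entries from pdb and write it as *_noANISOU.pdb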
    with open_proj_file(project, project.get_pdb_file(noanisou_pdb)) as dest:
        for line in pdb_data.splitlines(keepends=True):
            if not line.startswith(b"ANISOU"):
                dest.write(line)

    n_chains = pdb_chains(pdb_data.splitlines(keepends=True))

    if n_chains > 1:
        txc_pdb = _add_pdb_entry(project, txc_filename, pdb_id)

        input_pdb_name = path.join(project.models_dir, f"{name}.pdb")

        jobs = JobsSet("phenix ensembler")
        batch = SITE.get_hpc_runner().new_batch_file(
            "phenix ensembler",
            project_script(project, "phenix_ensembler.sh"),
            project_syslog_path(project, "phenix_ensembler_%j.out"),
            project_syslog_path(project, "phenix_ensembler_%j.err"),
        )
        batch.load_modules(["gopresto", PHENIX_MOD])
        batch.add_commands(
            f"cd {project.models_dir}",
            f"phenix.ensembler {input_pdb_name} trim=TRUE output.location='{project.models_dir}'",
            f"mv {project.models_dir}/ensemble_merged.pdb {project.get_pdb_file(txc_pdb)}",
        )
        batch.save()
        jobs.add_job(batch)
        jobs.submit()
Example #3
def _migrate_proc_tool(project: Project, old_paths: OldPaths, tool: str):
    _log(f"MIGRATE PROC TOOL {tool} RESULTS")
    tool_dir = "xdsxscale" if tool == "xds" else tool

    for dset in _dbg_cutoff(project.get_datasets()):
        src_dir = Path(old_paths.process_dir, dset.crystal.id, dset.name,
                       tool_dir)
        if not src_dir.is_dir():
            # no tool results found for this dataset
            continue

        dest_dir = Path(project.get_dataset_process_dir(dset), tool)
        _log(f"{src_dir} -> {dest_dir}")
        _copytree(src_dir, dest_dir)
        update_proc_tool_status(project, tool, dset)
Example #4
def get_summary_url(project: Project, method: str, dataset_name: str) -> str:
    summary_path = Path(
        project.pandda_processed_dataset_dir(method, dataset_name),
        "html",
        f"{dataset_name}.html",
    )

    return str(summary_path.relative_to(project.project_dir))
Example #5
def get_refine_log_files(project: Project, dataset, processing_tool) -> Iterator[Path]:
    res_dir = Path(
        project.get_dataset_results_dir(dataset), processing_tool, "fspipeline"
    )
    project_dir = project.project_dir

    for path in res_dir.glob("**/*.log"):
        if path.is_file():
            yield path.relative_to(project_dir)
Example #6
def scrape_results(project: Project, dataset) -> Optional[ProcStats]:
    xds_dir = Path(project.get_dataset_process_dir(dataset), "xds")

    stats = xia2.scrape_results(project, _get_xds_dir(project, dataset))
    if stats is None:
        return None

    stats.isa = _scrape_isa(project, xds_dir)

    return stats
Example #7
def get_project_libraries(project: Project) -> Set[Library]:
    libs = set()

    for crystal in project.get_crystals():
        if crystal.is_apo():
            continue

        frag = Fragment.get_by_id(crystal.fragment_id)
        libs.add(frag.library)

    return libs
Example #8
def scrape_results(project: Project, dataset) -> Iterator[LigfitResult]:
    res_dir = project.get_dataset_results_dir(dataset)

    for ref_dir in subdirs(res_dir, 2):
        score = _scrape_score(ref_dir)
        proc_tool = ref_dir.parent.name
        refine_tool = ref_dir.name
        if score is None:
            status = ToolStatus.FAILURE
        else:
            status = ToolStatus.SUCCESS
        yield LigfitResult(proc_tool, refine_tool, "rhofit", status, score)
Example #9
def _migrate_refine(project: Project, old_paths: OldPaths):
    _log("MIGRATE REFINE RESULTS")
    for dset in _dbg_cutoff(project.get_datasets()):
        for proc_tool in ["xds", "xdsapp", "dials"]:
            proc_dir = "xdsxscale" if proc_tool == "xds" else proc_tool
            src_dir = Path(old_paths.results_dir, dset.name, proc_dir)
            if not src_dir.is_dir():
                # no tool results found for this dataset
                continue

            dest_dir = Path(project.get_dataset_results_dir(dset), proc_tool)
            _log(f"{src_dir} -> {dest_dir}")
            # note, we can't use ignore_dangling_symlinks=True
            # here due to https://bugs.python.org/issue38523
            _copytree(src_dir, dest_dir)

        for tool in REFINE_TOOLS:
            update_refine_tool_status(project, tool, dset)

        for tool in LIGFIT_TOOLS:
            _maybe_resync_ligfit_tool(project, tool, dset)
Example #10
def _get_fragments(project: Project) -> Dict[Fragment, List]:
    fragments: Dict = {}

    for crystal in project.get_crystals():
        if crystal.is_apo():
            continue

        frag = get_crystals_fragment(crystal)
        crystals = fragments.get(frag, [])
        crystals.append(crystal)
        fragments[frag] = crystals

    return fragments
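The lookup/append/reassign pattern above can also be written with dict.setdefault; a minimal, behavior-equivalent sketch (the function name is hypothetical, the same Project, Fragment and get_crystals_fragment helpers are assumed):

def _get_fragments_sketch(project: Project) -> Dict[Fragment, List]:
    fragments: Dict[Fragment, List] = {}

    for crystal in project.get_crystals():
        if crystal.is_apo():
            continue

        # setdefault inserts an empty list the first time a fragment is seen,
        # so no temporary get()/reassign is needed
        fragments.setdefault(get_crystals_fragment(crystal), []).append(crystal)

    return fragments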
Example #11
def pandda_worker(project: Project, method, methodshort, options, cif_method):
    rn = str(randint(10000, 99999))
    prepare_scripts = []

    proc_tool, refine_tool = method.split("_")
    refine_results = _get_best_results(project, proc_tool, refine_tool)

    selection = PanddaSelectedDatasets()

    for refine_result in refine_results:
        res_dir = project.get_refine_result_dir(refine_result)
        final_pdb = Path(res_dir, "final.pdb")
        final_mtz = Path(res_dir, "final.mtz")

        selection.add(refine_result.dataset.name, final_pdb)

        res_high, free_r_flag, native_f, sigma_fp = read_info(project, str(final_mtz))

        script = _write_prepare_script(
            project,
            rn,
            method,
            refine_result.dataset,
            final_pdb,
            final_mtz,
            res_high,
            free_r_flag,
            native_f,
            sigma_fp,
            cif_method,
        )

        prepare_scripts.append(script)

    pandda_dir = Path(project.pandda_dir, method)
    pandda_dir.mkdir(parents=True, exist_ok=True)

    selection.save(Path(pandda_dir))

    main_script = _write_main_script(project, pandda_dir, method, methodshort, options)

    #
    # submit all PanDDA scripts to the HPC
    #
    jobs = JobsSet("PanDDa")

    for prep_script in prepare_scripts:
        jobs.add_job(prep_script)

    jobs.add_job(main_script, run_after=prepare_scripts)
    jobs.submit()
Example #12
def _get_key(project: Project) -> str:
    """
    get project's encryption key in BASE64 format

    raises CryptoKeyError if the encryption key has not been uploaded
    or if the project is not encrypted
    """
    if not project.encrypted:
        raise CryptoKeyError(ENCRYPTION_DISABLED_MSG)

    if not project.has_encryption_key():
        raise CryptoKeyError("no key uploaded")

    return base64.b64encode(project.encryption_key).decode()
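A minimal usage sketch (the wrapping helper below is hypothetical) showing how the two documented failure modes of _get_key can be surfaced to a caller:

def _try_get_key(project: Project) -> Optional[str]:
    # hypothetical convenience wrapper: returns the BASE64 key, or None when
    # the project is not encrypted or no key has been uploaded yet
    try:
        return _get_key(project)
    except CryptoKeyError as err:
        print(f"warning: {err}")
        return None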
Example #13
def _find_results(project: Project, dataset):
    edna_res_dir = Path(
        project.get_dataset_root_dir(dataset),
        "process",
        project.protein,
        f"{dataset.crystal.id}",
        f"xds_{dataset.name}_1",
        "EDNA_proc",
        "results",
    )

    if edna_res_dir.is_dir():
        mtz_file = next(edna_res_dir.glob("*.mtz"), None)
        return edna_res_dir, mtz_file

    return None, None
Example #14
def _get_refine_results(project: Project, filters: str, use_ligandfit: bool,
                        use_rhofit: bool):
    datasets = []

    if filters == "NEW":
        if use_ligandfit:
            datasets.append(get_ligfit_datasets(project, filters, "ligandfit"))

        if use_rhofit:
            datasets.append(get_ligfit_datasets(project, filters, "rhofit"))
    else:
        datasets.append(get_ligfit_datasets(project, filters, None))

    for dataset in itertools.chain(*datasets):
        for result in project.get_datasets_refine_results(dataset):
            yield result
Example #15
def find_site_centroids(project: Project, method: str):
    sites_csv = Path(
        project.pandda_method_dir(method),
        "pandda",
        "analyses",
        "pandda_analyse_sites.csv",
    )
    with sites_csv.open("r") as f:
        sitelist = f.readlines()

    centroids = list()
    for _site in sitelist[1:]:
        centroid = literal_eval(",".join(
            _site.replace('"', "").split(",")[8:11]))
        centroids.append(list(centroid))

    return centroids
Example #16
def _get_summary_report(project: Project, dataset) -> Optional[Path]:
    autoproc_res_dir = Path(
        project.get_dataset_root_dir(dataset),
        "process",
        project.protein,
        f"{dataset.crystal.id}",
        f"xds_{dataset.name}_1",
        "autoPROC",
    )

    glob = autoproc_res_dir.glob(str(Path("cn*", "AutoPROCv1_0_anom", "summary.html")))

    summary_file = next(glob, None)
    if summary_file is not None and summary_file.is_file():
        return summary_file

    # no autoPROC summary report found
    return None
Example #17
def get_proc_datasets(project: Project, filter: str, tool: str):
    """
    perform datasets filtering for 'data processing' jobs

    filters supported:

    'ALL' - all of the project's datasets

    'NEW' - datasets that have not been processed yet with the specified tool

    otherwise the filter is expected to be a comma-separated list of dataset IDs
    """
    if filter == "ALL":
        return project.get_datasets()

    if filter == "NEW":
        return _no_results_datasets(project, tool)

    return _dataset_by_ids(project, filter)
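As an illustration of the filter values described in the docstring (the tool name and dataset IDs below are made-up examples):

# all of the project's datasets
all_dsets = get_proc_datasets(project, "ALL", "xdsapp")

# only datasets not yet processed with the given tool
new_dsets = get_proc_datasets(project, "NEW", "xdsapp")

# an explicit comma-separated list of dataset IDs
some_dsets = get_proc_datasets(project, "1,17,42", "xdsapp")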
Example #18
def get_refine_datasets(project: Project, filter: str, refine_tool: str):
    """
    perform datasets filtering for 'structure refinement' jobs

    filters supported:

    'ALL' - all of the project's datasets

    'NEW' - datasets that have not been processed with the specified refinement tool yet

    otherwise the filter is expected to be a comma-separated list of dataset names
    """
    if filter == "ALL":
        return project.get_datasets()

    if filter == "NEW":
        return _no_results_datasets(project, refine_tool)

    return _dataset_by_ids(project, filter)
Example #19
def _get_best_results(project: Project, proc_tool, refine_tool):
    if proc_tool == "frag":
        proc_tool = None

    if refine_tool == "plex":
        refine_tool = None

    for dataset in project.get_datasets():
        refine_results = _get_refine_results(project, dataset, proc_tool, refine_tool)

        # sort refine results by R-work, R-free and resolution,
        # the result with lowest value(s) is the 'best' one
        best_result = refine_results.order_by(
            lambda r: (r.r_work, r.r_free, r.resolution)
        ).first()

        if best_result is None:
            # no refine results with the requested proc/refine tool combination
            continue

        yield best_result
Example #20
def get_ligfit_datasets(project: Project, filter: str,
                        ligfit_tool: Optional[str]):
    """
    perform datasets filtering for 'ligand fitting' jobs

    filters supported:

    'ALL' - all of the project's datasets

    'NEW' - datasets that have not been processed with the specified ligand fitting tool yet

    otherwise the filter is expected to be a comma-separated list of dataset names
    """
    if filter == "ALL":
        return project.get_datasets()

    if filter == "NEW":
        assert ligfit_tool is not None
        return _no_results_datasets(project, ligfit_tool)

    return _dataset_by_ids(project, filter)
Example #21
def _get_refine_logs(project: Project, dataset):
    def _add_logs(proc_tool, ref_tool, log_files):
        if ref_tool not in logs:
            logs[ref_tool] = {}

        logs[ref_tool][proc_tool] = log_files

    logs: Dict[str, Dict] = {}

    for ref_result in project.get_datasets_refine_results(dataset):
        log_files = get_refine_log_files(
            project, dataset, ref_result.process_tool, ref_result.refine_tool
        )

        if ref_result.refine_tool == "fspipeline":
            # wrap fspipeline logs into custom object, to handle
            # log files inside subdirectories
            log_files = [FSPipelineLogPath(log) for log in log_files]  # type: ignore

        _add_logs(ref_result.process_tool, ref_result.refine_tool, sorted(log_files))

    return logs
Example #22
def get_ligfit_result_by_id(project: Project, result_id):
    result = project.get_ligfit_result(result_id)
    if result is None:
        raise Http404(f"no ligfit result with id '{result_id}' exists")

    return result
Example #23
def get_dataset_by_id(project: Project, dataset_id):
    dataset = project.get_dataset(dataset_id)
    if dataset is None:
        raise Http404(f"no dataset with id '{dataset_id}' exists")

    return dataset
Example #24
def get_pdb_by_id(project: Project, pdb_id):
    pdb = project.get_pdb(pdb_id)
    if pdb is None:
        raise Http404(f"no PDB with id '{pdb_id}' found")

    return pdb
Example #25
def launch_refine_jobs(
    project: Project,
    filters,
    pdb_file,
    space_group,
    run_aimless,
    refine_tool,
    refine_tool_commands,
    cpus,
):
    epoch = round(time.time())
    jobs = JobsSet("Refine")
    hpc = SITE.get_hpc_runner()

    for dset in get_refine_datasets(project, filters, refine_tool):
        for tool, input_mtz in _find_input_mtzs(project, dset):
            batch = hpc.new_batch_file(
                f"refine {tool} {dset.name}",
                project_script(project,
                               f"refine_{tool}_{refine_tool}_{dset.name}.sh"),
                project_log_path(
                    project, f"refine_{tool}_{dset.name}_{epoch}_%j_out.txt"),
                project_log_path(
                    project, f"refine_{tool}_{dset.name}_{epoch}_%j_err.txt"),
                cpus,
            )
            batch.set_options(
                time=Duration(hours=12),
                nodes=1,
                mem_per_cpu=DataSize(gigabyte=5),
            )

            batch.add_commands(crypt_shell.crypt_cmd(project))

            batch.assign_variable("WORK_DIR", "`mktemp -d`")
            batch.add_commands(
                "cd $WORK_DIR",
                crypt_shell.fetch_file(project, pdb_file, "model.pdb"),
                crypt_shell.fetch_file(project, input_mtz, "input.mtz"),
            )

            # TODO: load tool specific modules?
            batch.load_modules(HPC_MODULES)

            if run_aimless:
                batch.add_commands(
                    _aimless_cmd(space_group.short_name, "input.mtz"))

            results_dir = Path(project.get_dataset_results_dir(dset), tool)

            batch.add_commands(
                *refine_tool_commands,
                _upload_result_cmd(project, results_dir),
                "cd",
                "rm -rf $WORK_DIR",
            )

            batch.save()
            jobs.add_job(batch)

            add_update_job(jobs, hpc, project, refine_tool, dset, batch)

    jobs.submit()
Example #26
def _get_processing_info(project: Project, dataset) -> Iterator[ProcessingInfo]:
    for proc_res in project.get_datasets_process_results(dataset):
        yield ProcessingInfo(proc_res)
Example #27
def _get_xds_dir(project: Project, dataset) -> Path:
    return Path(project.get_dataset_process_dir(dataset), "xds")
Example #28
def _write_prepare_script(
    project: Project,
    rn,
    method,
    dataset,
    pdb,
    mtz,
    resHigh,
    free_r_flag,
    native_f,
    sigma_fp,
    cif_method,
):
    epoch = round(time.time())
    output_dir = Path(project.pandda_method_dir(method), dataset.name)

    hpc = SITE.get_hpc_runner()
    batch = hpc.new_batch_file(
        f"PnD{rn}",
        project_script(project, f"pandda_prepare_{method}_{dataset.name}.sh"),
        project_log_path(project, f"{dataset.name}_PanDDA_{epoch}_%j_out.txt"),
        project_log_path(project, f"{dataset.name}_PanDDA_{epoch}_%j_err.txt"),
        cpus=1,
    )
    batch.set_options(time=Duration(minutes=15), memory=DataSize(gigabyte=5))

    batch.add_command(crypt_shell.crypt_cmd(project))
    batch.assign_variable("DEST_DIR", output_dir)
    batch.assign_variable("WORK_DIR", "`mktemp -d`")
    batch.add_commands(
        "cd $WORK_DIR",
        crypt_shell.fetch_file(project, pdb, "final.pdb"),
        crypt_shell.fetch_file(project, mtz, "final.mtz"),
    )

    batch.purge_modules()
    batch.load_modules(
        ["gopresto", versions.PHENIX_MOD, versions.CCP4_MOD, versions.BUSTER_MOD]
    )

    if not dataset.crystal.is_apo():
        fragment = get_crystals_fragment(dataset.crystal)
        # non-apo crystal should have a fragment
        assert fragment
        if cif_method == "elbow":
            cif_cmd = f"phenix.elbow --smiles='{fragment.smiles}' --output=$WORK_DIR/{fragment.code} --opt\n"
        else:
            assert cif_method == "grade"
            cif_cmd = (
                f"grade '{fragment.smiles}' -ocif $WORK_DIR/{fragment.code}.cif "
                f"-opdb $WORK_DIR/{fragment.code}.pdb -nomogul\n"
            )

        batch.add_command(cif_cmd)

    batch.add_commands(
        f'printf "monitor BRIEF\\n labin file 1 -\\n  ALL\\n resolution file 1 999.0 {resHigh}\\n" | \\\n'
        "    cad hklin1 $WORK_DIR/final.mtz hklout $WORK_DIR/final.mtz",
        "uniqueify -f FreeR_flag $WORK_DIR/final.mtz $WORK_DIR/final.mtz",
        f'printf "COMPLETE FREE={free_r_flag} \\nEND\\n" | \\\n'
        "    freerflag hklin $WORK_DIR/final.mtz hklout $WORK_DIR/final_rfill.mtz",
        f"phenix.maps final_rfill.mtz final.pdb maps.input.reflection_data.labels='{native_f},{sigma_fp}'",
        "mv final.mtz final_original.mtz",
        "mv final_map_coeffs.mtz final.mtz",
        "rm -rf $DEST_DIR",
        crypt_shell.upload_dir(project, "$WORK_DIR", "$DEST_DIR"),
        "rm -rf $WORK_DIR",
    )

    batch.save()
    return batch
Example #29
def _no_results_datasets(project: Project, tool: str):
    for dataset in project.get_datasets():
        res = dataset.result.select(tool=tool).first()
        if res is None:
            yield dataset
Example #30
def _dataset_by_ids(project: Project, dataset_ids: str):
    for dset_id in dataset_ids.split(","):
        yield project.get_dataset(dset_id)