Example #1
    def import_bedfiles(
        cls, technique, targets_path, baits_path, assembly, species, description=None
    ):
        """
        Register targets and baits BED files in the technique's storage
        directory and update its `reference_data`.

        Arguments:
            technique (str): technique slug.
            targets_path (str): path to targets bedfile.
            baits_path (str): path to baits bedfile.
            assembly (str): name of the reference genome for the BED files.
            species (str): name of genome species.
            description (str): a description of the BED files.

        Returns:
            dict: updated technique instance as retrieved from API.
        """
        utils.check_admin()
        technique = api.get_instance("techniques", technique)
        targets_key = f"{assembly}_targets_bedfile"
        baits_key = f"{assembly}_baits_bedfile"

        if targets_key in technique["reference_data"]:
            raise click.UsageError(
                f"Technique '{technique['slug']}' "
                f"has registered BED files for '{assembly}':\n"
                f'\n\t{technique["reference_data"][targets_key]}'
                f'\n\t{technique["reference_data"][baits_key]}'
            )

        if not technique["storage_url"]:
            technique = update_storage_url("techniques", technique["pk"])

        api.create_instance("assemblies", name=assembly, species=species)
        beds_dir = join(technique["storage_url"], "bed_files", assembly)
        base_name = slugify(f'{technique["slug"]}.{assembly}')
        targets_dst = join(beds_dir, f"{base_name}.targets.bed")
        baits_dst = join(beds_dir, f"{base_name}.baits.bed")
        os.makedirs(beds_dir, exist_ok=True)

        for src, dst in [(targets_path, targets_dst), (baits_path, baits_dst)]:
            cls.echo_src_dst("Copying", src, dst)
            shutil.copy(src, dst)
            click.secho(f"\nProcessing {basename(dst)}...", fg="blue")
            cls.process_bedfile(dst)

        click.secho(f'\nSuccess! patching {technique["slug"]}...', fg="green")

        for i, j in [(targets_key, targets_dst), (baits_key, baits_dst)]:
            technique["reference_data"][i] = {
                "url": j + ".gz",
                "description": description,
            }

        return api.patch_instance(
            endpoint="techniques",
            instance_id=technique["pk"],
            storage_usage=utils.get_tree_size(technique["storage_url"]),
            reference_data=technique["reference_data"],
        )
Example #2
    def import_data(
        cls,
        identifier,
        data_src,
        data_id,
        symlink,
        description,
        sub_dir=None,
        model="assemblies",
    ):
        """
        Register reference resources for a given assembly or technique.

        Arguments:
            identifier (str): name of assembly or technique.
            model (str): either `techniques` or `assemblies`.
            data_src (str): path to reference data.
            data_id (str): identifier that will be used for reference data.
            symlink (bool): if True, symlink instead of move.
            description (str): reference data description.
            sub_dir (str): target subdirectory for the resource, defaults to data_id.

        Returns:
            dict: updated instance as retrieved from API.
        """
        utils.check_admin()
        data_id = slugify(data_id, separator="_")
        click.echo(f'`data_id` set to: {click.style(data_id, fg="green")}')
        instance = api.get_instance(model, identifier)

        if data_id in instance["reference_data"]:
            raise click.UsageError(
                f"{instance['name']} has already reference data registered with id "
                f'"{data_id}":\n\n\t{instance["reference_data"][data_id]}'
            )

        if not instance["storage_url"]:
            instance = update_storage_url(model, instance["name"])

        data_dir = join(instance["storage_url"], sub_dir or data_id)
        data_dst = join(data_dir, basename(data_src))
        os.makedirs(data_dir, exist_ok=True)

        if symlink:
            cls.echo_src_dst("Linking", data_src, data_dst)
            cls.symlink(data_src, data_dst)
        else:
            cls.echo_src_dst("Moving", data_src, data_dst)
            cls.move(data_src, data_dst)

        click.secho(f'\nSuccess! patching {instance["name"]}...', fg="green")
        instance["reference_data"][data_id] = {}
        instance["reference_data"][data_id]["url"] = data_dst
        instance["reference_data"][data_id]["description"] = description
        return api.patch_instance(
            endpoint=model,
            instance_id=instance["pk"],
            storage_usage=utils.get_tree_size(instance["storage_url"]),
            reference_data=instance["reference_data"],
        )
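The core of the import is the symlink-vs-move decision. Below is a minimal, standard-library sketch of that branch (the helper name and paths are hypothetical; the real class delegates to its own `symlink` and `move` helpers):

import os
import shutil
from os.path import basename, join

def place_resource(data_src, data_dir, symlink=False):
    """Link or move data_src into data_dir and return the destination path."""
    data_dst = join(data_dir, basename(data_src))
    os.makedirs(data_dir, exist_ok=True)
    if symlink:
        os.symlink(data_src, data_dst)   # keep the original file in place
    else:
        shutil.move(data_src, data_dst)  # relocate the file into the data dir
    return data_dst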
Example #3
def process_finished(filters):
    """Process and update finished analyses."""
    utils.check_admin()
    filters.update(status="FINISHED")

    for i in api.get_instances("analyses", verbose=True, **filters):
        if i["status"] == "FINISHED":
            api.patch_analysis_status(i, "SUCCEEDED")
Example #4
def test_check_admin():
    admin = _DEFAULTS["ADMIN_USER"]
    _DEFAULTS["ADMIN_USER"] = "******"

    with pytest.raises(PermissionError) as error:
        utils.check_admin()

    assert "not the admin" in str(error.value)
    _DEFAULTS["ADMIN_USER"] = admin
Example #5
def _set_analysis_permissions(analysis):
    protect_results = analysis.status == "SUCCEEDED"
    unique_analysis_per_individual = False
    application_protect_results = True
    chgrp_cmd = (
        ["false"]
        if not system_settings.DEFAULT_LINUX_GROUP
        else ["chgrp", "-R", system_settings.DEFAULT_LINUX_GROUP, analysis.storage_url]
    )

    try:
        application = import_from_string(analysis.application.application_class)()
        unique_analysis_per_individual = application.unique_analysis_per_individual
        application_protect_results = application.application_protect_results
    except ImportError:
        pass

    if (
        # don't protect results if project-level analysis
        analysis.project_level_analysis
        # don't protect results if individual-level automerge
        or (analysis.individual_level_analysis and not unique_analysis_per_individual)
        # don't protect results if the application says so
        or not application_protect_results
    ):
        protect_results = False

    if protect_results:
        utils.check_admin()

        if analysis.ran_by != system_settings.api_username:
            src = analysis.storage_url + "__tmp"
            shutil.move(analysis.storage_url, src)
            cmd = utils.get_rsync_command(src, analysis.storage_url, chmod="a-w")
            subprocess.check_call(cmd, shell=True)
        else:
            subprocess.check_call(["chmod", "-R", "a-w", analysis.storage_url])

        try:
            subprocess.check_output(chgrp_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            pass

    elif not protect_results or analysis.status in {"FAILED", "FINISHED"}:
        for i in [chgrp_cmd, ["chmod", "-R", "g+rwX", analysis.storage_url]]:
            try:
                subprocess.check_output(i, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError:
                pass
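The group and mode changes at the end are applied on a best-effort basis: each command is run and any non-zero exit is swallowed. A small self-contained sketch of that pattern (the group name and path are hypothetical stand-ins for system_settings values):

import subprocess

storage_url = "/data/analyses/123"   # hypothetical analysis.storage_url
default_group = "bioinformatics"     # stands in for system_settings.DEFAULT_LINUX_GROUP

chgrp_cmd = (
    ["false"]
    if not default_group
    else ["chgrp", "-R", default_group, storage_url]
)

for cmd in [chgrp_cmd, ["chmod", "-R", "g+rwX", storage_url]]:
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError:
        pass  # permission changes are best effort; failures are ignored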
Example #6
def patch_results(filters, force):
    """Update the results field of many analyses."""
    utils.check_admin()
    skipped = []

    with click.progressbar(
            api.get_instances("analyses", verbose=True, **filters),
            label="Patching analyses...",
    ) as bar:
        for i in bar:
            if force or not i.results:
                results = api._get_analysis_results(i, raise_error=False)
                api.patch_instance("analyses", i.pk, results=results)
            else:  # pragma: no cover
                skipped.append(i)

    if skipped:  # pragma: no cover
        click.echo(
            f"{len(skipped)} analyses had results, use --force to update...")
Example #7
def update_experiment_bam_file(experiment, assembly_name, analysis_pk, bam_url):
    """
    Update default bam for an experiment given the assembly.

    Arguments:
        experiment (dict): experiment dict.
        assembly_name (str): assembly name.
        analysis_pk (int): analysis primary key.
        bam_url (str): bam url.

    Returns:
        dict: patched experiment instance
    """
    utils.check_admin()
    pk = experiment["pk"]
    bam_files = experiment["bam_files"]

    if bam_files.get(assembly_name, None):  # pragma: no cover
        raise click.UsageError(f"Experiment {pk} already has {assembly_name} bam")

    bam_files[assembly_name] = {"url": bam_url, "analysis": analysis_pk}
    return api.patch_instance("experiments", pk, bam_files=bam_files)
Example #8
    def import_data(
        self,
        directories,
        symlink=False,
        commit=False,
        key=lambda x: x["system_id"],
        files_data=None,
        dtypes=None,
        **filters,
    ):
        """
        Import raw data for multiple experiments.

        Each experiment's `storage_url`, `storage_usage`, and `raw_data` are
        updated.

        Arguments:
            directories (list): list of directories to be recursively explored.
            symlink (bool): if True symlink instead of moving.
            commit (bool): if True perform import operation.
            key (function): given an experiment dict, returns the identifier to match.
            filters (dict): key-value pairs to use as API query params.
            dtypes (list): data types that should be matched (e.g. BAM, PNG, etc.).
            files_data (dict): keys are files basenames and values are
                dicts with extra annotations such as PL, LB, or any other,
                see also annotate_file_data.

        Raises:
            click.UsageError: if `key` returns the same identifier for multiple
                experiments, if an experiment matches both FASTQ and BAM files,
                or if read 1 or read 2 can't be determined from matched FASTQ files.

        Returns:
            tuple: list of experiments for which data has been matched and a
                summary of the operation.
        """
        utils.check_admin()
        files_data = files_data or {}
        experiments_matched = []
        cache = defaultdict(dict)
        patterns = []
        identifiers = {}
        dtypes = set(dtypes or [])

        # validate files_data
        for i, j in files_data.items():
            if not isinstance(j, dict):  # pragma: no cover
                raise click.UsageError(f"Invalid file data, expected dict {i}: {j}")

        # get experiments and load cache dictionary
        for i in api.get_instances("experiments", verbose=True, **filters):
            index = f"primary_key_{i['pk']}"
            using_id = f"{i['system_id']} (Skipped, identifier is NULL)"
            identifier = key(i)

            if identifier in identifiers:  # duplicated identifiers not valid
                raise click.UsageError(
                    f"Can't use same identifier for {i['system_id']} "
                    f"and {identifiers[identifier]}: {identifier}"
                )

            if i["raw_data"] or i["bam_files"]:
                using_id = f"{i['system_id']} (Skipped, experiment has raw data)"
            elif identifier:
                identifiers[identifier] = i["system_id"]
                patterns.append(self.get_regex_pattern(index, identifier))
                using_id = f"{i['system_id']} (using {identifier})"

            cache[index]["using_id"] = using_id
            cache[index]["instance"] = i
            cache[index]["files"] = []

        if patterns:
            # see http://stackoverflow.com/questions/8888567 for pattern
            pattern = re.compile("|".join(patterns))
            data_storage_dir = system_settings.BASE_STORAGE_DIRECTORY
            label = f"Exploring directories..."

            # explore dirs
            for directory in set(directories):
                with click.progressbar(os.walk(directory), label=label) as bar:
                    for root, _, files in bar:
                        if not root.startswith(data_storage_dir):
                            for i in files:
                                if len(patterns) > 500:  # pragma: no cover
                                    click.echo(
                                        f"Matching {i} against "
                                        f"{len(patterns)} experiments..."
                                    )

                                path = join(root, i)
                                match = self.match_path(path, pattern)

                                if match and (not dtypes or match["dtype"] in dtypes):
                                    cache[match.pop("index")]["files"].append(match)

            # process files if needed
            label = "Processing..."
            bar = sorted(cache.values(), key=lambda x: x["instance"]["pk"])
            with click.progressbar(bar, label=label) as bar:
                for i in bar:
                    if commit and i["files"]:
                        experiments_matched.append(
                            self.import_files(
                                instance=i["instance"],
                                files=i["files"],
                                symlink=symlink,
                                files_data=files_data,
                            )
                        )
                    elif i["files"]:  # pragma: no cover
                        experiments_matched.append(i["instance"])

        return experiments_matched, self.get_summary(cache)
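The matching relies on one combined regular expression: each experiment contributes a named-group pattern, the patterns are joined with "|" (see the Stack Overflow link above), and a single search per filename reveals which experiment owns it. A minimal, self-contained sketch (group names and identifiers are hypothetical; the real patterns come from get_regex_pattern):

import re

patterns = [
    r"(?P<primary_key_1>.*Sample_A.*)",
    r"(?P<primary_key_2>.*Sample_B.*)",
]
pattern = re.compile("|".join(patterns))

match = pattern.search("/ifs/data/Sample_B_R1.fastq.gz")
if match:
    print(match.lastgroup)  # "primary_key_2", i.e. the cache index of the owner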