def import_bedfiles(
    cls, technique, targets_path, baits_path, assembly, species, description=None
):
    """
    Register targets and baits BED files in technique's storage dir and update
    its `reference_data`.

    Arguments:
        technique (str): technique slug.
        targets_path (str): path to targets bedfile.
        baits_path (str): path to baits bedfile.
        assembly (str): name of reference genome for bedfile.
        species (str): name of genome species.
        description (str): a description of the BED files.

    Returns:
        dict: updated technique instance as retrieved from API.
    """
    utils.check_admin()
    technique = api.get_instance("techniques", technique)
    targets_key = f"{assembly}_targets_bedfile"
    baits_key = f"{assembly}_baits_bedfile"

    if targets_key in technique["reference_data"]:
        raise click.UsageError(
            f"Technique '{technique['slug']}' "
            f"has registered BED files for '{assembly}':\n"
            f'\n\t{technique["reference_data"][targets_key]}'
            f'\n\t{technique["reference_data"][baits_key]}'
        )

    if not technique["storage_url"]:
        technique = update_storage_url("techniques", technique["pk"])

    api.create_instance("assemblies", name=assembly, species=species)
    beds_dir = join(technique["storage_url"], "bed_files", assembly)
    base_name = slugify(f'{technique["slug"]}.{assembly}')
    targets_dst = join(beds_dir, f"{base_name}.targets.bed")
    baits_dst = join(beds_dir, f"{base_name}.baits.bed")
    os.makedirs(beds_dir, exist_ok=True)

    for src, dst in [(targets_path, targets_dst), (baits_path, baits_dst)]:
        cls.echo_src_dst("Copying", src, dst)
        shutil.copy(src, dst)
        click.secho(f"\nProcessing {basename(dst)}...", fg="blue")
        cls.process_bedfile(dst)

    click.secho(f'\nSuccess! patching {technique["slug"]}...', fg="green")

    for i, j in [(targets_key, targets_dst), (baits_key, baits_dst)]:
        technique["reference_data"][i] = {
            "url": j + ".gz",
            "description": description,
        }

    return api.patch_instance(
        endpoint="techniques",
        instance_id=technique["pk"],
        storage_usage=utils.get_tree_size(technique["storage_url"]),
        reference_data=technique["reference_data"],
    )
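
# Usage sketch for import_bedfiles (illustrative only): the technique slug,
# paths, assembly, and species below are hypothetical placeholders, and
# `importer` stands for the enclosing importer class (not shown in this
# snippet); an admin session is assumed.
#
#   importer.import_bedfiles(
#       technique="example-panel",
#       targets_path="/data/bed/example.targets.bed",
#       baits_path="/data/bed/example.baits.bed",
#       assembly="GRCh37",
#       species="HUMAN",
#       description="Example capture panel BED files",
#   )
#
# Both files are copied into the technique's storage directory and registered
# under assembly-scoped keys, so a single technique can hold BED files for
# multiple assemblies.
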
def import_data(
    cls,
    identifier,
    data_src,
    data_id,
    symlink,
    description,
    sub_dir=None,
    model="assemblies",
):
    """
    Register reference resources for a given assembly or technique.

    Arguments:
        identifier (str): name of assembly or technique.
        model (str): either `techniques` or `assemblies`.
        data_src (str): path to reference data.
        data_id (str): identifier that will be used for reference data.
        symlink (bool): if True, symlink instead of moving.
        description (str): reference data description.
        sub_dir (str): target sub dir for the resource, default is data_id.

    Returns:
        dict: updated instance as retrieved from API.
    """
    utils.check_admin()
    data_id = slugify(data_id, separator="_")
    click.echo(f'`data_id` set to: {click.style(data_id, fg="green")}')
    instance = api.get_instance(model, identifier)

    if data_id in instance["reference_data"]:
        raise click.UsageError(
            f"{instance['name']} already has reference data registered with id "
            f'"{data_id}":\n\n\t{instance["reference_data"][data_id]}'
        )

    if not instance["storage_url"]:
        instance = update_storage_url(model, instance["name"])

    data_dir = join(instance["storage_url"], sub_dir or data_id)
    data_dst = join(data_dir, basename(data_src))
    os.makedirs(data_dir, exist_ok=True)

    if symlink:
        cls.echo_src_dst("Linking", data_src, data_dst)
        cls.symlink(data_src, data_dst)
    else:
        cls.echo_src_dst("Moving", data_src, data_dst)
        cls.move(data_src, data_dst)

    click.secho(f'\nSuccess! patching {instance["name"]}...', fg="green")
    instance["reference_data"][data_id] = {}
    instance["reference_data"][data_id]["url"] = data_dst
    instance["reference_data"][data_id]["description"] = description

    return api.patch_instance(
        endpoint=model,
        instance_id=instance["pk"],
        storage_usage=utils.get_tree_size(instance["storage_url"]),
        reference_data=instance["reference_data"],
    )
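
# Usage sketch for import_data (illustrative only): a hypothetical call
# registering a FASTA file for an assembly; the identifier, path, and data_id
# are placeholders and `importer` stands for the enclosing importer class.
#
#   importer.import_data(
#       identifier="GRCh37",
#       data_src="/data/reference/genome.fasta",
#       data_id="genome_fasta",
#       symlink=True,
#       description="Example reference genome FASTA",
#       model="assemblies",
#   )
#
# The resource ends up at <storage_url>/<sub_dir or data_id>/<basename> and is
# recorded in the instance's reference_data under the slugified data_id.
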
def process_finished(filters):
    """Process and update finished analyses."""
    utils.check_admin()
    filters.update(status="FINISHED")

    for i in api.get_instances("analyses", verbose=True, **filters):
        if i["status"] == "FINISHED":
            api.patch_analysis_status(i, "SUCCEEDED")
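
# Usage sketch for process_finished (illustrative only): the filter key below
# is a hypothetical API query param; any valid filters could be passed.
#
#   process_finished({"application__pk": 10})
#
# The status filter is forced to FINISHED, so only finished analyses are
# retrieved and promoted to SUCCEEDED.
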
def test_check_admin():
    admin = _DEFAULTS["ADMIN_USER"]
    _DEFAULTS["ADMIN_USER"] = "******"

    with pytest.raises(PermissionError) as error:
        utils.check_admin()

    assert "not the admin" in str(error.value)
    _DEFAULTS["ADMIN_USER"] = admin
def _set_analysis_permissions(analysis):
    """Set permissions on an analysis' storage directory based on its status."""
    protect_results = analysis.status == "SUCCEEDED"
    unique_analysis_per_individual = False
    application_protect_results = True
    chgrp_cmd = (
        ["false"]
        if not system_settings.DEFAULT_LINUX_GROUP
        else ["chgrp", "-R", system_settings.DEFAULT_LINUX_GROUP, analysis.storage_url]
    )

    try:
        application = import_from_string(analysis.application.application_class)()
        unique_analysis_per_individual = application.unique_analysis_per_individual
        application_protect_results = application.application_protect_results
    except ImportError:
        pass

    if (
        # don't protect results if project level analysis
        analysis.project_level_analysis
        # don't protect results if individual level automerge
        or (analysis.individual_level_analysis and not unique_analysis_per_individual)
        # don't protect results if the application says so
        or not application_protect_results
    ):
        protect_results = False

    if protect_results:
        utils.check_admin()

        if analysis.ran_by != system_settings.api_username:
            src = analysis.storage_url + "__tmp"
            shutil.move(analysis.storage_url, src)
            cmd = utils.get_rsync_command(src, analysis.storage_url, chmod="a-w")
            subprocess.check_call(cmd, shell=True)
        else:
            subprocess.check_call(["chmod", "-R", "a-w", analysis.storage_url])

        try:
            subprocess.check_output(chgrp_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            pass
    elif not protect_results or analysis.status in {"FAILED", "FINISHED"}:
        for i in [chgrp_cmd, ["chmod", "-R", "g+rwX", analysis.storage_url]]:
            try:
                subprocess.check_output(i, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError:
                pass
def patch_results(filters, force):
    """Update the results field of many analyses."""
    utils.check_admin()
    skipped = []

    with click.progressbar(
        api.get_instances("analyses", verbose=True, **filters),
        label="Patching analyses...",
    ) as bar:
        for i in bar:
            if force or not i.results:
                results = api._get_analysis_results(i, raise_error=False)
                api.patch_instance("analyses", i.pk, results=results)
            else:  # pragma: no cover
                skipped.append(i)

    if skipped:  # pragma: no cover
        click.echo(
            f"{len(skipped)} analyses had results, use --force to update..."
        )
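
# Usage sketch for patch_results (illustrative only): hypothetical filters;
# analyses that already have results are skipped unless force is True.
#
#   patch_results({"status": "SUCCEEDED"}, force=False)
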
def update_experiment_bam_file(experiment, assembly_name, analysis_pk, bam_url):
    """
    Update default bam for an experiment given the assembly.

    Arguments:
        experiment (dict): experiment dict.
        assembly_name (str): assembly name.
        analysis_pk (int): analysis primary key.
        bam_url (str): bam url.

    Returns:
        dict: patched experiment instance.
    """
    utils.check_admin()
    pk = experiment["pk"]
    bam_files = experiment["bam_files"]

    if bam_files.get(assembly_name, None):  # pragma: no cover
        raise click.UsageError(f"Experiment {pk} already has {assembly_name} bam")

    bam_files[assembly_name] = {"url": bam_url, "analysis": analysis_pk}
    return api.patch_instance("experiments", pk, bam_files=bam_files)
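
# Usage sketch for update_experiment_bam_file (illustrative only): the system
# id, assembly name, analysis pk, and bam path below are hypothetical; the
# experiment dict is fetched from the API first.
#
#   experiment = api.get_instance("experiments", "EXAMPLE_SYSTEM_ID")
#   update_experiment_bam_file(
#       experiment,
#       assembly_name="GRCh37",
#       analysis_pk=123,
#       bam_url="/data/analyses/123/example.bam",
#   )
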
def import_data(
    self,
    directories,
    symlink=False,
    commit=False,
    key=lambda x: x["system_id"],
    files_data=None,
    dtypes=None,
    **filters,
):
    """
    Import raw data for multiple experiments.

    Experiments' `storage_url`, `storage_usage`, and `raw_data` are updated.

    Arguments:
        directories (list): list of directories to be recursively explored.
        symlink (bool): if True, symlink instead of moving.
        commit (bool): if True, perform the import operation.
        key (function): given an experiment dict, returns the id to match.
        filters (dict): key value pairs to use as API query params.
        dtypes (list): data types that should be matched (e.g. BAM, PNG, etc.).
        files_data (dict): keys are files basenames and values are dicts with
            extra annotations such as PL, LB, or any other, see also
            annotate_file_data.

    Raises:
        click.UsageError: if `key` returns the same identifier for multiple
            experiments, if an experiment matches both fastq and bam files, or
            if read 1 or read 2 can't be determined from matched fastq files.

    Returns:
        tuple: list of experiments for which data has been matched and a
            summary of the operation.
    """
    utils.check_admin()
    files_data = files_data or {}
    experiments_matched = []
    cache = defaultdict(dict)
    patterns = []
    identifiers = {}
    dtypes = set(dtypes or [])

    # validate files_data
    for i, j in files_data.items():
        if not isinstance(j, dict):  # pragma: no cover
            raise click.UsageError(f"Invalid file data, expected dict {i}: {j}")

    # get experiments and load cache dictionary
    for i in api.get_instances("experiments", verbose=True, **filters):
        index = f"primary_key_{i['pk']}"
        using_id = f"{i['system_id']} (Skipped, identifier is NULL)"
        identifier = key(i)

        if identifier in identifiers:  # duplicated identifiers not valid
            raise click.UsageError(
                f"Can't use same identifier for {i['system_id']} "
                f"and {identifiers[identifier]}: {identifier}"
            )

        if i["raw_data"] or i["bam_files"]:
            using_id = f"{i['system_id']} (Skipped, experiment has raw data)"
        elif identifier:
            identifiers[identifier] = i["system_id"]
            patterns.append(self.get_regex_pattern(index, identifier))
            using_id = f"{i['system_id']} (using {identifier})"

        cache[index]["using_id"] = using_id
        cache[index]["instance"] = i
        cache[index]["files"] = []

    if patterns:
        # see http://stackoverflow.com/questions/8888567 for pattern
        pattern = re.compile("|".join(patterns))
        data_storage_dir = system_settings.BASE_STORAGE_DIRECTORY
        label = "Exploring directories..."

        # explore dirs
        for directory in set(directories):
            with click.progressbar(os.walk(directory), label=label) as bar:
                for root, _, files in bar:
                    if not root.startswith(data_storage_dir):
                        for i in files:
                            if len(patterns) > 500:  # pragma: no cover
                                click.echo(
                                    f"Matching {i} against "
                                    f"{len(patterns)} experiments..."
                                )

                            path = join(root, i)
                            match = self.match_path(path, pattern)

                            if match and (not dtypes or match["dtype"] in dtypes):
                                cache[match.pop("index")]["files"].append(match)

    # process files if needed
    label = "Processing..."
    bar = sorted(cache.values(), key=lambda x: x["instance"]["pk"])

    with click.progressbar(bar, label=label) as bar:
        for i in bar:
            if commit and i["files"]:
                experiments_matched.append(
                    self.import_files(
                        instance=i["instance"],
                        files=i["files"],
                        symlink=symlink,
                        files_data=files_data,
                    )
                )
            elif i["files"]:  # pragma: no cover
                experiments_matched.append(i["instance"])

    return experiments_matched, self.get_summary(cache)
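
# Usage sketch for the raw-data import (illustrative only): the directory and
# filter below are hypothetical, and `importer` stands for an instance of the
# enclosing importer class. With commit=False the call only reports what would
# be matched.
#
#   matched, summary = importer.import_data(
#       directories=["/incoming/run_001"],
#       symlink=True,
#       commit=False,
#       key=lambda x: x["system_id"],
#       projects__pk=100,
#   )
#
# Matching builds one regex alternative per experiment from `key` and scans the
# directories recursively; paths already under BASE_STORAGE_DIRECTORY are
# ignored.
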