def test_failed_signal():
    analysis = api.create_instance("analyses", **factories.AnalysisFactory())
    get_kwargs = dict(
        target_endpoint="analyses", endpoint="signals", target_id=analysis.pk
    )

    # check signals work and nothing is created
    api._run_signals("analyses", analysis, [besuhof_signal])
    assert len(api.get_instances(**get_kwargs)) == 0

    # check signals failed
    analysis = api.patch_instance("analyses", analysis.pk, notes="please fail")
    api._run_signals("analyses", analysis, [besuhof_signal])
    instances = api.get_instances(**get_kwargs)
    assert len(instances) == 1
    assert _FAILED_SIGNAL_MESSAGE in instances[0].data["failure_traceback"]

    # assert that error traceback is updated
    runner = CliRunner()
    args = f"-fi target_endpoint analyses -fi target_id {analysis.pk}".split()
    api.patch_instance("analyses", analysis.pk, notes="fail with different msg")
    runner.invoke(commands.rerun_signals, args, catch_exceptions=False)
    instances = api.get_instances(**get_kwargs)
    assert len(instances) == 1
    assert "but with a different msg..." in instances[0].data["failure_traceback"]

    # assert that signal is deleted after no failure is detected
    api.patch_instance("analyses", analysis.pk, notes="")
    runner.invoke(commands.rerun_signals, args, catch_exceptions=False)
    assert len(api.get_instances(**get_kwargs)) == 0
def test_get_bed():
    runner = CliRunner()
    technique = api.create_instance("techniques", **factories.TechniqueFactory())
    args = [str(technique.pk)]
    result = runner.invoke(commands.get_bed, args, catch_exceptions=False)
    assert "No BED files" in result.output

    api.patch_instance(
        "techniques",
        technique.pk,
        reference_data={"test_targets_bedfile": {"url": "/hello/world"}},
    )

    result = runner.invoke(commands.get_bed, args, catch_exceptions=False)
    assert "/hello/world" in result.output

    api.patch_instance(
        "techniques",
        technique.pk,
        reference_data={
            "test_targets_bedfile": {"url": "/hello/world"},
            "another_targets_bedfile": {"url": "/hello/world"},
        },
    )

    result = runner.invoke(commands.get_bed, args, catch_exceptions=False)
    assert "Multiple BEDs" in result.output
def test_get_experiments_from_default_cli_options(tmpdir):
    app = ExperimentsFromDefaulCLIApplication()
    experiments = [
        api.create_instance("experiments", **factories.ExperimentFactory())
        for i in range(4)
    ]

    analysis = api.create_instance(
        "analyses",
        **{
            **factories.AnalysisFactory(),
            "targets": experiments,
            "references": experiments,
        },
    )

    pairs_file = tmpdir.join("pairs.txt")
    pairs_file.write(
        experiments[1].system_id + "\t" + experiments[0].system_id + "\n"
    )

    # get coverage for invalid experiments
    api.patch_instance(
        "experiments", experiments[0].system_id, notes="raise validation error"
    )

    command = ExperimentsFromDefaulCLIApplication.as_cli_command()
    runner = CliRunner()
    result = runner.invoke(
        command,
        [
            "--pair",
            experiments[0].system_id,
            experiments[1].system_id,
            "--pairs",
            experiments[2].system_id,
            experiments[3].system_id,
            "--targets-filters",
            "pk",
            experiments[3].pk,
            "--references-filters",
            "pk",
            experiments[2].pk,
            "--analyses-filters",
            "pk",
            analysis.pk,
            "--pairs-from-file",
            str(pairs_file),
        ],
        catch_exceptions=False,
    )

    assert experiments[0].system_id in result.output
    assert "INVALID" in result.output

    # just get coverage for get_job_name
    assert ExperimentsFromDefaulCLIApplication.get_job_name(analysis)
def test_get_bams():
    runner = CliRunner()
    experiment = api.create_instance("experiments", **factories.ExperimentFactory())
    args = [str(experiment.pk)]
    result = runner.invoke(commands.get_bams, args, catch_exceptions=False)
    assert "No bams for" in result.output

    result = runner.invoke(
        commands.get_bams, args + ["--verbose"], catch_exceptions=False
    )
    assert experiment.system_id in result.output
    assert "None" in result.output

    api.patch_instance(
        "experiments",
        experiment.pk,
        bam_files={"grch": {"url": "/hello/world", "analysis": 1}},
    )

    result = runner.invoke(commands.get_bams, args, catch_exceptions=False)
    assert "/hello/world" in result.output

    api.patch_instance(
        "experiments",
        experiment.pk,
        bam_files={
            "a1": {"url": "/hello/world", "analysis": 1},
            "a2": {"url": "/hello/mars", "analysis": 2},
        },
    )

    result = runner.invoke(commands.get_bams, args, catch_exceptions=False)
    assert "Multiple bams" in result.output

    result = runner.invoke(
        commands.get_bams, args + ["--assembly", "a2"], catch_exceptions=False
    )
    assert "/hello/mars" in result.output
def import_bedfiles(
    cls, technique, targets_path, baits_path, assembly, species, description=None
):
    """
    Register targets and baits BED files in the technique's storage
    directory and update its `reference_data`.

    Arguments:
        technique (str): technique slug.
        targets_path (str): path to targets bedfile.
        baits_path (str): path to baits bedfile.
        assembly (str): name of reference genome for bedfile.
        species (str): name of genome species.
        description (str): a description of the BED files.

    Returns:
        dict: updated technique instance as retrieved from API.
    """
    utils.check_admin()
    technique = api.get_instance("techniques", technique)
    targets_key = f"{assembly}_targets_bedfile"
    baits_key = f"{assembly}_baits_bedfile"

    if targets_key in technique["reference_data"]:
        raise click.UsageError(
            f"Technique '{technique['slug']}' "
            f"has registered BED files for '{assembly}':\n"
            f"\n\t{technique['reference_data'][targets_key]}"
            f"\n\t{technique['reference_data'][baits_key]}"
        )

    if not technique["storage_url"]:
        technique = update_storage_url("techniques", technique["pk"])

    api.create_instance("assemblies", name=assembly, species=species)
    beds_dir = join(technique["storage_url"], "bed_files", assembly)
    base_name = slugify(f'{technique["slug"]}.{assembly}')
    targets_dst = join(beds_dir, f"{base_name}.targets.bed")
    baits_dst = join(beds_dir, f"{base_name}.baits.bed")
    os.makedirs(beds_dir, exist_ok=True)

    for src, dst in [(targets_path, targets_dst), (baits_path, baits_dst)]:
        cls.echo_src_dst("Copying", src, dst)
        shutil.copy(src, dst)
        click.secho(f"\nProcessing {basename(dst)}...", fg="blue")
        cls.process_bedfile(dst)

    click.secho(f'\nSuccess! patching {technique["slug"]}...', fg="green")

    for key, dst in [(targets_key, targets_dst), (baits_key, baits_dst)]:
        technique["reference_data"][key] = {
            "url": dst + ".gz",
            "description": description,
        }

    return api.patch_instance(
        endpoint="techniques",
        instance_id=technique["pk"],
        storage_usage=utils.get_tree_size(technique["storage_url"]),
        reference_data=technique["reference_data"],
    )
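# Usage sketch for import_bedfiles (hedged): the enclosing importer class is
# not named in this snippet, so it is passed in explicitly here; the technique
# slug, file paths, and assembly below are placeholder values, not real data.
def example_import_bedfiles(importer_cls):
    """Register a hypothetical targets/baits BED pair for a capture panel."""
    return importer_cls.import_bedfiles(
        technique="example-capture-panel",  # placeholder technique slug
        targets_path="/tmp/example.targets.bed",  # placeholder path
        baits_path="/tmp/example.baits.bed",  # placeholder path
        assembly="GRCh37",
        species="HOMO_SAPIENS",
        description="Example BED registration.",
    )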
def import_data(
    cls,
    identifier,
    data_src,
    data_id,
    symlink,
    description,
    sub_dir=None,
    model="assemblies",
):
    """
    Register reference resources for a given assembly or technique.

    Arguments:
        identifier (str): name of assembly or technique.
        model (str): either `techniques` or `assemblies`.
        data_src (str): path to reference data.
        data_id (str): identifier that will be used for reference data.
        symlink (bool): symlink instead of move.
        description (str): reference data description.
        sub_dir (str): target sub dir for the resource, default is data_id.

    Returns:
        dict: updated instance as retrieved from API.
    """
    utils.check_admin()
    data_id = slugify(data_id, separator="_")
    click.echo(f'`data_id` set to: {click.style(data_id, fg="green")}')
    instance = api.get_instance(model, identifier)

    if data_id in instance["reference_data"]:
        raise click.UsageError(
            f"{instance['name']} already has reference data registered with id "
            f'"{data_id}":\n\n\t{instance["reference_data"][data_id]}'
        )

    if not instance["storage_url"]:
        instance = update_storage_url(model, instance["name"])

    data_dir = join(instance["storage_url"], sub_dir or data_id)
    data_dst = join(data_dir, basename(data_src))
    os.makedirs(data_dir, exist_ok=True)

    if symlink:
        cls.echo_src_dst("Linking", data_src, data_dst)
        cls.symlink(data_src, data_dst)
    else:
        cls.echo_src_dst("Moving", data_src, data_dst)
        cls.move(data_src, data_dst)

    click.secho(f'\nSuccess! patching {instance["name"]}...', fg="green")
    instance["reference_data"][data_id] = {
        "url": data_dst,
        "description": description,
    }

    return api.patch_instance(
        endpoint=model,
        instance_id=instance["pk"],
        storage_usage=utils.get_tree_size(instance["storage_url"]),
        reference_data=instance["reference_data"],
    )
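# Usage sketch for import_data (hedged): mirrors the call made by the genome
# registration command further down; the identifier, source path, and data_id
# are placeholder values for an assembly assumed to already exist.
def example_import_reference_data():
    """Link a hypothetical VCF resource under an existing assembly."""
    return LocalReferenceDataImporter.import_data(
        identifier="GRCh37",  # placeholder assembly name
        data_src="/tmp/example_resource.vcf.gz",  # placeholder path
        data_id="example_vcf",
        symlink=True,  # link instead of moving the source file
        description="Example reference resource.",
        model="assemblies",
    )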
def patch_results(filters, force):
    """Update the results field of many analyses."""
    utils.check_admin()
    skipped = []

    with click.progressbar(
        api.get_instances("analyses", verbose=True, **filters),
        label="Patching analyses...",
    ) as bar:
        for i in bar:
            if force or not i.results:
                results = api._get_analysis_results(i, raise_error=False)
                api.patch_instance("analyses", i.pk, results=results)
            else:  # pragma: no cover
                skipped.append(i)

    if skipped:  # pragma: no cover
        click.echo(f"{len(skipped)} analyses had results, use --force to update...")
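# Usage sketch for patch_results (hedged): no click decorator is visible in
# this snippet, so a direct call is assumed to work here; the status filter
# below is a placeholder value.
def example_patch_results():
    """Recompute results for analyses matching a hypothetical filter."""
    patch_results(filters={"status": "SUCCEEDED"}, force=False)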
def get_analysis_results(self, analysis):
    target = analysis["targets"][0]
    outdir = analysis["storage_url"]
    multiqc = join(outdir, "multiqc")
    multiqc_data = join(multiqc, "multiqc_data")
    results = {
        "multiqc_html": join(multiqc, "multiqc_report.html"),
        "multiqc_data": join(multiqc_data, "multiqc_data.json"),
        "multiqc_stats": join(multiqc_data, "multiqc_general_stats.txt"),
        "read_length": None,
    }

    for key, i in results.items():
        if key == "multiqc_data":
            continue
        assert i is None or isfile(i), f"Missing result {i}"

    if target["technique"]["category"] == "DNA":
        read_length_column = "MEAN_READ_LENGTH"
        read_length_path = "multiqc_picard_AlignmentSummaryMetrics.txt"
        read_length_path = join(multiqc_data, read_length_path)
    else:
        read_length_column = "Read Length"
        read_length_path = join(multiqc_data, "multiqc_rna_seqc.txt")

    with open(read_length_path) as f:
        row = next(csv.DictReader(f, delimiter="\t"))
        results["read_length"] = float(row[read_length_column])

    if "read_length" in target:
        api.patch_instance(
            endpoint="experiments",
            instance_id=target["pk"],
            read_length=results["read_length"],
        )

    return results
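# Usage sketch for get_analysis_results (hedged): `qc_app` stands in for the
# application instance that defines this method, and the MultiQC outputs are
# assumed to already exist under the placeholder storage directory. The dict
# mirrors only the keys the method actually reads.
def example_get_analysis_results(qc_app):
    """Collect MultiQC results for a hypothetical DNA analysis."""
    analysis = {
        "storage_url": "/tmp/example_analysis",  # placeholder outdir
        "targets": [
            {"pk": 1, "read_length": None, "technique": {"category": "DNA"}}
        ],
    }
    return qc_app.get_analysis_results(analysis)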
def test_get_data(tmpdir):
    runner = CliRunner()
    experiment = api.create_instance("experiments", **factories.ExperimentFactory())
    experiment = data.update_storage_url("experiments", experiment.pk)
    args = [str(experiment.pk)]
    result = runner.invoke(commands.get_data, args, catch_exceptions=False)
    assert "No data for" in result.output

    result = runner.invoke(
        commands.get_bams, args + ["--verbose"], catch_exceptions=False
    )
    assert experiment.system_id in result.output
    assert "None" in result.output

    api.patch_instance(
        "experiments",
        experiment.pk,
        raw_data=[
            {"file_url": "/hello/world", "file_type": "TXT"},
            {"file_url": "/hello/mars", "file_type": "PNG"},
        ],
    )

    result = runner.invoke(commands.get_data, args, catch_exceptions=False)
    assert "/hello/world" in result.output
    assert "/hello/mars" in result.output

    result = runner.invoke(
        commands.get_data, args + ["--dtypes", "TXT"], catch_exceptions=False
    )
    assert "/hello/mars" not in result.output
def test_system_id():
    data_a = factories.ExperimentFactory()
    data_b = factories.ExperimentFactory(sample=data_a["sample"])
    instance_a = api.create_instance("experiments", **data_a)
    instance_b = api.create_instance("experiments", **data_b)
    system_ids = [instance_a["system_id"], instance_b["system_id"]]
    assert instance_a["sample"]["pk"] == instance_b["sample"]["pk"]
    assert api.get_instance("experiments", system_ids[0])["pk"] == instance_a["pk"]
    assert len(api.get_instances("experiments", system_ids)) == 2

    instance_a["sample"]["data"]["key"] = "value"
    instance_a["sample"]["notes"] = "a note"
    patched = api.patch_instance(
        "experiments", instance_a["pk"], sample=instance_a["sample"]
    )
    assert patched["sample"]["data"]["key"] == "value"
    assert patched["sample"]["notes"] == "a note"
def test_api_methods():
    endpoint = "diseases"
    diseases = [factories.DiseaseFactory() for _ in range(3)]
    created = [api.create_instance(endpoint, **i) for i in diseases]
    pk = created[0]["pk"]
    pks = [i["pk"] for i in created[:2]]
    patched = api.patch_instance(endpoint, pk, data={"one": 1})

    assert patched["data"]["one"] == 1
    assert api.get_instance(endpoint, pk)["pk"] == pk
    assert api.get_instances(endpoint, pk=pk)[0]["pk"] == pk
    assert api.get_instances_count(endpoint, pk=pk) == 1
    assert len(api.get_instances(endpoint)) == api.get_instances_count(endpoint)
    assert len(api.get_instances(endpoint, pks)) == 2
    assert len(api.get_instances(endpoint, pks, pk__in=pks)) == 2
    assert len(api.get_instances(endpoint, pks, pk__in=pks[0])) == 1

    for i in created:
        assert api.delete_instance(endpoint, i["pk"]) is None

    assert api.get_token_headers()["Authorization"]
def update_experiment_bam_file(experiment, assembly_name, analysis_pk, bam_url):
    """
    Update the default bam for an experiment given the assembly.

    Arguments:
        experiment (dict): experiment dict.
        assembly_name (str): assembly name.
        analysis_pk (int): analysis primary key.
        bam_url (str): bam url.

    Returns:
        dict: patched experiment instance.
    """
    utils.check_admin()
    pk = experiment["pk"]
    bam_files = experiment["bam_files"]

    if bam_files.get(assembly_name, None):  # pragma: no cover
        raise click.UsageError(f"Experiment {pk} already has {assembly_name} bam")

    bam_files[assembly_name] = {"url": bam_url, "analysis": analysis_pk}
    return api.patch_instance("experiments", pk, bam_files=bam_files)
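# Usage sketch for update_experiment_bam_file (hedged): the experiment is
# fetched first so `pk` and `bam_files` are present; the system id, analysis
# pk, and bam path below are placeholders.
def example_update_experiment_bam_file():
    """Register a hypothetical default bam for assembly GRCh37."""
    experiment = api.get_instance("experiments", "EXAMPLE_SYSTEM_ID")
    return update_experiment_bam_file(
        experiment=experiment,
        assembly_name="GRCh37",
        analysis_pk=1,
        bam_url="/tmp/example.GRCh37.bam",
    )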
def import_files(self, instance, files, files_data, symlink):
    """
    Move/link files into instance's `storage_url` and update database.

    Arguments:
        instance (dict): experiment instance.
        files (list): list of files to be imported.
        symlink (bool): whether to symlink or move the data.
        files_data (dict): keys are files basenames and values are dicts
            with extra annotations such as PL, LB, or any other.

    Raises:
        click.UsageError: if multiple data formats are found.

    Returns:
        dict: patched experiment instance.
    """
    raw_data = []
    src_dst = []

    if not instance["storage_url"]:
        instance = update_storage_url(
            endpoint="experiments", identifier=instance["pk"], use_hash=True
        )

    data_dir = join(instance["storage_url"], "data")
    os.makedirs(data_dir, exist_ok=True)

    for src, file_type in [(i["path"], i["dtype"]) for i in files]:
        file_name = basename(src)
        file_data = files_data.get(file_name, {})

        # make sure there are no duplicate file names
        if not file_name.startswith(instance["system_id"]):
            file_hash = hex(abs(hash(dirname(src))))[2:]
            file_name = f'{instance["system_id"]}_{file_hash}_{file_name}'

        # make sure we don't add the same file twice
        if all(i != src for i, _ in src_dst):
            dst = join(data_dir, file_name)
            src_dst.append((src, dst))
            raw_data.append(
                dict(
                    hash_value=getsize(src),
                    hash_method="os.path.getsize",
                    file_url=dst,
                    file_type=file_type,
                    file_data=self.annotate_file_data(
                        experiment=instance,
                        file_type=file_type,
                        file_data=file_data,
                        src=src,
                        dst=dst,
                    ),
                )
            )

    for src, dst in src_dst:
        if symlink:
            self.symlink(src, dst)
        else:
            self.move(src, dst)

    return api.patch_instance(
        endpoint="experiments",
        instance_id=instance["pk"],
        storage_url=instance["storage_url"],
        storage_usage=utils.get_tree_size(instance["storage_url"]),
        raw_data=sorted(raw_data, key=lambda i: i["file_url"]),
    )
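# Usage sketch for import_files (hedged): `importer` stands in for the data
# importer instance that defines this method; the FASTQ paths are placeholders
# and would need to exist on disk, since getsize is called on each source.
def example_import_files(importer):
    """Symlink two hypothetical FASTQ files into an experiment's storage."""
    experiment = api.get_instance("experiments", "EXAMPLE_SYSTEM_ID")
    return importer.import_files(
        instance=experiment,
        files=[
            {"path": "/tmp/example_R1.fastq", "dtype": "FASTQ_R1"},
            {"path": "/tmp/example_R2.fastq", "dtype": "FASTQ_R2"},
        ],
        files_data={"example_R1.fastq": {"PU": "EXAMPLE_PU"}},
        symlink=True,
    )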
def cmd(assembly, symlink, genome_path, dont_index):
    """
    Register an assembly reference genome.

    By default, an attempt to create indexes will be performed.
    """
    assembly = LocalReferenceDataImporter.import_data(
        data_id="genome_fasta",
        symlink=symlink,
        data_src=genome_path,
        identifier=assembly,
        model="assemblies",
        description="Reference Genome Fasta File.",
    )

    genome_fasta = assembly["reference_data"]["genome_fasta"]["url"]
    genome_dir = dirname(genome_fasta)
    commands = [
        ["bwa", "index", genome_fasta],
        ["samtools", "faidx", genome_fasta],
        [
            "samtools",
            "dict",
            genome_fasta,
            "-a",
            assembly["name"],
            "-s",
            assembly["species"],
            "-o",
            join(genome_fasta + ".dict"),
        ],
    ]

    for i in commands:
        if dont_index:
            click.secho(f"Skipping indexing:\n\n\t{' '.join(i)}", fg="yellow")
            continue

        try:  # pragma: no cover
            subprocess.check_call(i)
        except subprocess.CalledProcessError:  # pragma: no cover
            click.secho(f"INDEX FAILED, MUST BE FIXED:\n\n\t{' '.join(i)}", fg="red")

    indexes = {
        "bwa index": ["amb", "ann", "bwt", "pac", "sa"],
        "samtools faidx": ["fai"],
        "samtools dict": ["dict"],
    }

    for command, suffixes in indexes.items():
        for suffix in suffixes:
            assembly["reference_data"][f"genome_fasta_{suffix}"] = {
                "url": join(genome_fasta + f".{suffix}"),
                "description": f"Index generated by: {command}",
            }

    for i in glob(genome_fasta.split(".", 1)[0] + "*"):
        dst = join(genome_dir, assembly["name"] + "." + i.split(".", 1)[-1])

        if i != dst:
            utils.force_symlink(i, dst)

    api.patch_instance(
        endpoint="assemblies",
        instance_id=assembly["pk"],
        storage_usage=utils.get_tree_size(assembly["storage_url"]),
        reference_data=assembly["reference_data"],
    )
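# Usage sketch for the genome registration command (hedged): it is assumed to
# be exposed as a click command; the option names below are inferred from the
# function parameters and may not match the real CLI flags.
def example_register_genome():
    """Register a hypothetical GRCh37 fasta, skipping index creation."""
    runner = CliRunner()
    return runner.invoke(
        cmd,
        ["GRCh37", "/tmp/genome.fasta", "--symlink", "--dont-index"],
        catch_exceptions=False,
    )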
def update_storage_url(endpoint, identifier, use_hash=False, **data):
    """Make storage directory and return patched instance."""
    data["storage_url"] = get_storage_url(endpoint, identifier, use_hash)
    return api.patch_instance(endpoint, identifier, **data)
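# Usage sketch for update_storage_url (hedged): pk 1 is a placeholder; any
# extra keyword arguments are forwarded to the PATCH call alongside the
# freshly built storage_url.
def example_update_storage_url():
    """Create and register the storage directory for a hypothetical experiment."""
    return update_storage_url("experiments", 1, use_hash=True)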
def test_local_data_import(tmpdir):
    dirs = [tmpdir.strpath]
    projects = [api.create_instance("projects", **factories.ProjectFactory())]
    experiments = [factories.ExperimentFactory(projects=projects) for i in range(4)]
    experiments = [api.create_instance("experiments", **i) for i in experiments]
    keys = [i["pk"] for i in experiments]

    importer = data.LocalDataImporter()
    _, summary = importer.import_data(directories=dirs, pk__in=keys)
    obtained = len(summary.rsplit("no files matched"))
    assert obtained == 4 + 1

    # test can't determine type of fastq
    with pytest.raises(click.UsageError) as error:
        path_1 = tmpdir.join(f'{experiments[0]["system_id"]}.fastq')
        path_1.write("foo")
        importer.import_data(directories=dirs, pk__in=keys)

    path_1.remove()
    assert "cant determine fastq type from" in str(error.value)

    # test imports fastq
    path_1 = tmpdir.join(f'{experiments[0]["system_id"]}_R1_foo.fastq')
    path_2 = tmpdir.join(f'{experiments[0]["system_id"]}_R2_foo.fastq')
    path_1.write("foo")
    path_2.write("foo")
    _, summary = importer.import_data(directories=dirs, pk__in=keys, commit=True)
    assert "samples matched: 1" in summary
    assert api.Experiment(experiments[0].pk).get_fastq()

    # test can exclude formats
    path_1 = tmpdir.join(f'{experiments[1]["system_id"]}_1.fastq')
    path_2 = tmpdir.join(f'{experiments[1]["system_id"]}.bam')
    path_1.write("foo")
    path_2.write("foo")
    _, summary = importer.import_data(directories=dirs, pk__in=keys, dtypes=["BAM"])
    assert "FASTQ_R1" not in str(summary)
    assert "BAM" in str(summary)

    # test can import multiple formats
    _, summary = importer.import_data(directories=dirs, pk__in=keys, commit=True)
    assert "FASTQ_R1" in str(summary)
    assert "BAM" in str(summary)

    # test raise error if duplicated ids
    with pytest.raises(click.UsageError) as error:
        api.patch_instance("experiments", experiments[2]["pk"], identifier="dup_id")
        api.patch_instance("experiments", experiments[3]["pk"], identifier="dup_id")
        importer.import_data(
            key=lambda x: x["identifier"], directories=dirs, pk__in=keys
        )

    assert "same identifier for" in str(error.value)

    # test summary
    path_1 = tmpdir.join(f'_{experiments[2]["system_id"]}_cram1_.cram')
    path_2 = tmpdir.join(f'_{experiments[2]["system_id"]}_cram2_.cram')
    path_3 = tmpdir.join(f'_{experiments[3]["system_id"]}_bam1_.bam')
    path_4 = tmpdir.join(f'_{experiments[3]["system_id"]}_bam2_.bam')
    path_1.write("foo")
    path_2.write("foo")
    path_3.write("foo")
    path_4.write("foo")
    imported, summary = importer.import_data(
        directories=dirs, commit=True, symlink=True, pk__in=keys
    )

    project = api.get_instance("projects", projects[0]["pk"])
    assert project["storage_url"]
    assert imported[0]["storage_usage"] > 0
    assert imported[0]["raw_data"]
    assert imported[1]["raw_data"]
    assert "experiments" in imported[1]["storage_url"]
    assert len(os.listdir(os.path.join(imported[1]["storage_url"], "data"))) == 2
    assert "samples matched: 2" in summary
    assert "samples skipped: 2" in summary

    # test import data from command line and files_data functionality
    path_1 = tmpdir.join(f'{experiments[1]["system_id"]}_1.fastq')
    path_2 = tmpdir.join(f'{experiments[1]["system_id"]}_2.fastq')
    path_1.write("foo")
    path_2.write("foo")
    api.patch_instance("experiments", experiments[1]["pk"], raw_data=None)
    file_data = tmpdir.join("file_data.yaml")

    with open(file_data.strpath, "w") as f:
        yaml.dump(
            {
                os.path.basename(path_1.strpath): {"PU": "TEST_PU"},
                os.path.basename(path_2.strpath): {"PU": "TEST_PU"},
            },
            f,
            default_flow_style=False,
        )

    command = data.LocalDataImporter.as_cli_command()
    runner = CliRunner()
    args = [
        "-di",
        tmpdir.strpath,
        "-id",
        "system_id",
        "-fi",
        "pk__in",
        keys,
        "--files-data",
        file_data.strpath,
        "--commit",
    ]
    result = runner.invoke(command, args, catch_exceptions=False)
    assert "samples matched: 1" in result.output

    experiments[1] = api.get_instance("experiments", experiments[1]["pk"])
    assert experiments[1]["raw_data"][0]["file_data"]["PU"] == "TEST_PU"
    assert experiments[1]["raw_data"][1]["file_data"]["PU"] == "TEST_PU"

    # test import using invalid identifier
    args = ["-di", tmpdir.strpath, "-id", "sample", "-fi", "pk__in", keys]
    result = runner.invoke(command, args)
    assert "invalid type for identifier" in result.output