def test_validate_report_spec(self):
     """Validate every report-spec JSON file and check that it loads as a ReportSpec.

     Only files whose basename starts with ``report-specs`` are checked;
     everything else yielded by the walker is skipped.
     """
     for json_path in walker(DATA_DIR_REPORT_SPECS, json_filter):
         file_name = os.path.basename(json_path)
         if not file_name.startswith("report-specs"):
             continue
         # _to_assertion returns a callable that is invoked with this
         # test case instance
         run_validation = _to_assertion(json_path, validate_report_spec)
         run_validation(self)
         loaded_spec = load_report_spec_from_json(json_path)
         self.assertIsInstance(loaded_spec, ReportSpec)
 def test_validate_pipeline_datastore_view_rules(self):
     """Validate every datastore view-rules JSON file and check its loaded type.

     Each file yielded by the walker is run through the validator and must
     then load as a ``PipelineDataStoreViewRules`` instance.
     """
     for path in walker(DATA_DIR_DSVIEW, json_filter):
         f = _to_assertion(path, validate_datastore_view_rules)
         f(self)
         # Use the unittest assertion rather than a bare ``assert``: bare
         # asserts are stripped under ``python -O`` and give poorer failure
         # messages; this also matches the other tests in this file.
         self.assertIsInstance(
             load_pipeline_datastore_view_rules_from_json(path),
             PipelineDataStoreViewRules)
 def test_validate_pipeline_datastore_view_rules(self):
     """Check that each datastore view-rules JSON file validates and loads.

     A loaded file must be a ``PipelineDataStoreViewRules`` instance.
     """
     for rules_path in walker(DATA_DIR_DSVIEW, json_filter):
         # the wrapped validator is called with this test case instance
         run_validation = _to_assertion(rules_path,
                                        validate_datastore_view_rules)
         run_validation(self)
         loaded_rules = load_pipeline_datastore_view_rules_from_json(
             rules_path)
         self.assertIsInstance(loaded_rules, PipelineDataStoreViewRules)
Example #4
0
 def test_validate_report_spec(self):
     """Run the report-spec validator over the ``report-specs*`` JSON files.

     Each matching file must also load as a ``ReportSpec`` instance; files
     with any other basename prefix are ignored.
     """
     for spec_path in walker(DATA_DIR_REPORT_SPECS, json_filter):
         is_report_spec = os.path.basename(spec_path).startswith(
             "report-specs")
         if not is_report_spec:
             continue
         check = _to_assertion(spec_path, validate_report_spec)
         check(self)
         spec = load_report_spec_from_json(spec_path)
         self.assertIsInstance(spec, ReportSpec)
Example #5
0
def dataset_walker(root_dir):
    """Walk ``root_dir`` with ``walker``, filtering with ``is_xml_dataset``."""
    return walker(root_dir, is_xml_dataset)
Example #6
0
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None,
                      seqid_prefix=None, subreads_in=None):
    """
    Converts a dataset to a set of fastx file, possibly archived.
    Can take a subreadset or consensusreadset as input.
    Will convert to either fasta or fastq.
    If the dataset is barcoded, it will split the fastx files per-barcode.
    If the output file is .zip, the fastx file(s) will be archived accordingly.

    :param program_name: name of the converter to run; the output extension
        is derived from it by replacing ``bam2`` with ``.``
    :param fastx_reader: not referenced in this function body; kept so the
        signature stays compatible with existing callers
    :param fastx_writer: not referenced in this function body; kept so the
        signature stays compatible with existing callers
    :param input_file_name: dataset passed to the converter
    :param output_file_name: final output; a name ending in ``.zip``,
        ``.tar.gz`` or ``.tgz`` selects archive mode
    :param tmp_dir: parent directory for the scratch directory (``None``
        uses the ``tempfile`` default)
    :param seqid_prefix: optional value passed as ``--seqid-prefix``
    :param subreads_in: optional SubreadSet consulted in barcode mode to
        prefix output names with a sample name
    :return: the result of ``archive_files`` in archive mode, otherwise 0
    :raises AssertionError: if the converter exits with a non-zero code or
        no FASTX files are found in archive mode
    """
    # Function-scope imports so this block does not depend on the file's
    # import header.  ``shlex.quote`` replaces ``pipes.quote`` (the ``pipes``
    # module was removed in Python 3.13).
    import shlex
    import shutil

    assert isinstance(program_name, str)
    archive_exts = (".zip", ".tar.gz", ".tgz")
    barcode_mode = False
    barcode_sets = set()
    output_is_archive = output_file_name.endswith(archive_exts)
    if output_is_archive:
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
            if barcode_mode:
                # attempt to collect the labels of barcodes used on this
                # dataset.  assumes that all BAM files used the same barcodes
                for bam in ds_in.externalResources:
                    if bam.barcodes is not None:
                        barcode_sets.add(bam.barcodes)
    barcode_labels = []
    bio_samples_to_bc = None
    if barcode_mode:
        if len(barcode_sets) == 1:
            bc_file = list(barcode_sets)[0]
            log.info("Reading barcode labels from %s", bc_file)
            try:
                with BarcodeSet(bc_file) as bc_in:
                    for bc in bc_in:
                        barcode_labels.append(bc.id)
            except IOError as e:
                # best effort: without labels we fall back on the indices
                log.error("Can't read %s", bc_file)
                log.error(e)
        elif len(barcode_sets) > 1:
            log.warning("Multiple barcode sets used for this SubreadSet:")
            for fn in sorted(barcode_sets):
                log.warning("  %s", fn)
        else:
            log.info("No barcode labels available")
        if subreads_in is not None:
            bio_samples_to_bc = {}
            with SubreadSet(subreads_in, strict=True) as subread_ds:
                if subread_ds.isBarcoded:  # pylint: disable=no-member
                    # presumably a dict keyed by barcode label (it is queried
                    # with .get(bc_label, ...) below) - confirm with helper
                    bio_samples_to_bc = get_barcode_sample_mappings(subread_ds)
    base_ext = re.sub("bam2", ".", program_name)
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = tempfile.mkdtemp(dir=tmp_dir)
    tmp_out_prefix = op.join(tmp_out_dir, "tmp_fastx")
    # The command is joined into a single string, so quote every
    # caller-supplied value; quoting is a no-op for plain names but keeps
    # paths containing spaces or shell metacharacters intact.
    args = [program_name]
    if barcode_mode:
        args.append("--split-barcodes")
    args.extend(["-o", shlex.quote(tmp_out_prefix),
                 shlex.quote(input_file_name)])
    if seqid_prefix is not None:
        args.extend(["--seqid-prefix", shlex.quote(seqid_prefix)])
    cmd = " ".join(args)
    log.info(cmd)
    remove_files = []

    def _is_fastx_file(fn):
        return fn.startswith(tmp_out_prefix) and fn.endswith(suffix)

    def _label_or_none(x):
        # translate a barcode index to its label; negative indices become
        # "none", anything unparseable or out of range is returned as-is
        try:
            bc = int(x)
            if bc < 0:
                return "none"
            elif bc < len(barcode_labels):
                return barcode_labels[bc]
        except ValueError:
            pass
        return x

    try:
        result = run_cmd(cmd,
                         stdout_fh=sys.stdout,
                         stderr_fh=sys.stderr)
        # explicit raise instead of ``assert`` so the check survives
        # ``python -O``; the exception type is unchanged for callers
        if result.exit_code != 0:
            raise AssertionError("{p} exited with code {c}".format(
                p=program_name, c=result.exit_code))
        if output_is_archive:
            tc_out_dir = op.dirname(output_file_name)
            # strip only the trailing archive extension (literal match,
            # anchored at the end of the name)
            base = op.basename(output_file_name)
            for ext in archive_exts:
                if base.endswith(ext):
                    base = base[:-len(ext)]
                    break
            fastx_file_names = []
            # find the barcoded FASTX files and un-gzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in walker(tmp_out_dir, _is_fastx_file):
                if barcode_mode:
                    # bam2fastx outputs files with the barcode indices
                    # encoded in the file names; here we attempt to
                    # translate these to barcode labels, falling back on
                    # the original indices if necessary
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    bc_label = "unbarcoded"
                    if (bc_fwd_rev != ["65535", "65535"] and
                            bc_fwd_rev != ["-1", "-1"]):
                        bc_fwd_label = _label_or_none(bc_fwd_rev[0])
                        bc_rev_label = _label_or_none(bc_fwd_rev[1])
                        bc_label = "{f}--{r}".format(f=bc_fwd_label,
                                                     r=bc_rev_label)
                    suffix2 = ".{l}{t}".format(l=bc_label, t=base_ext)
                    if bio_samples_to_bc is not None:
                        sample = bio_samples_to_bc.get(bc_label, "unknown")
                        suffix2 = ".{}".format(sample) + suffix2
                else:
                    suffix2 = base_ext
                fn_out = base
                if not fn_out.endswith(suffix2):
                    # literal replacement: neither the extension nor the
                    # label/sample text may be interpreted as a regex
                    fn_out = fn_out.replace(base_ext, suffix2)
                fastx_out = op.join(tc_out_dir, fn_out)
                _ungzip_fastx(fn, fastx_out)
                fastx_file_names.append(fastx_out)
                remove_files.append(fn)
            if not fastx_file_names:
                raise AssertionError(
                    "No FASTX files found in {d}".format(d=tmp_out_dir))
            # the uncompressed files are only intermediates for the archive
            remove_files.extend(fastx_file_names)
            return archive_files(fastx_file_names, output_file_name)
        else:
            tmp_out = "{p}{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            _ungzip_fastx(tmp_out, output_file_name)
            remove_files = [tmp_out]
    finally:
        try:
            for fn in remove_files:
                os.remove(fn)
        finally:
            # always drop the scratch directory created above, including
            # any partial output left behind by a failed run
            shutil.rmtree(tmp_out_dir, ignore_errors=True)
    return 0
 def test_validate_pipeline_presets(self):
     """Validate every pipeline preset JSON file and check its loaded type.

     Each file yielded by the walker is run through the validator and must
     then load as a ``PipelinePreset`` instance.
     """
     for path in walker(DATA_DIR_PRESETS, json_filter):
         f = _to_assertion(path, validate_presets)
         f(self)
         # Use the unittest assertion rather than a bare ``assert``: bare
         # asserts are stripped under ``python -O`` and give poorer failure
         # messages; this also matches the other tests in this file.
         self.assertIsInstance(load_pipeline_presets_from(path),
                               PipelinePreset)
Example #8
0
def dataset_walker(root_dir):
    """Return ``walker(root_dir, is_xml_dataset)`` for the given root."""
    return walker(root_dir, is_xml_dataset)
 def test_validate_pipeline_presets(self):
     """Check that each pipeline preset JSON file validates and loads.

     A loaded file must be a ``PipelinePreset`` instance.
     """
     for preset_path in walker(DATA_DIR_PRESETS, json_filter):
         # the wrapped validator is called with this test case instance
         run_validation = _to_assertion(preset_path, validate_presets)
         run_validation(self)
         loaded_preset = load_pipeline_presets_from(preset_path)
         self.assertIsInstance(loaded_preset, PipelinePreset)
 def test_validate_tool_contracts(self):
     """Check that each tool contract JSON file validates and loads.

     A loaded file must be a ``ToolContract`` instance.
     """
     for contract_path in walker(DATA_DIR_TC, json_filter):
         # the wrapped validator is called with this test case instance
         run_validation = _to_assertion(contract_path, validate_tc)
         run_validation(self)
         loaded_contract = load_tool_contract_from(contract_path)
         self.assertIsInstance(loaded_contract, ToolContract)
Example #11
0
 def test_validate_tool_contracts(self):
     """Run the tool contract validator over every JSON file found.

     Each file must also load as a ``ToolContract`` instance.
     """
     for tc_path in walker(DATA_DIR_TC, json_filter):
         check = _to_assertion(tc_path, validate_tc)
         check(self)
         tool_contract = load_tool_contract_from(tc_path)
         self.assertIsInstance(tool_contract, ToolContract)