def _metadata_to_dataset(metadata_xml):
    """
    Generate a temporary dataset XML from a movie metadata XML file.

    Runs ``Constants.RS_MOVIE_TO_DS`` with ``metadata_xml`` and a temporary
    output path, captures the command's stderr in a temporary file, and scans
    that text for the path of an existing dataset file.

    :param metadata_xml: path to the metadata XML handed to the command
    :return: the first path found at the end of a stderr line that exists and
             satisfies ``is_dataset``, or None when no line qualifies
    """
    output = tempfile.NamedTemporaryFile(suffix=".hdfsubreadset.xml").name
    log.debug("Generating temporary dataset: {x}".format(x=output))
    cmd = "{m} {p} {o}".format(m=Constants.RS_MOVIE_TO_DS,
                               p=metadata_xml,
                               o=output)
    # the output from movie-metadata-to-dataset is not properly wrapped in
    # pbds namespace, but the tempfile indicated in the stdout is. Not sure
    # why there are two outputs
    # NOTE(review): the comment above mentions stdout, but the path is parsed
    # from the captured stderr below -- confirm which stream is intended.
    stderr_path = tempfile.NamedTemporaryFile(suffix=".stderr").name
    # Close the capture file before reading it back so the write handle is
    # released instead of being leaked for the lifetime of the process.
    with open(stderr_path, "w") as stderr_fh:
        run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=stderr_fh)
    with open(stderr_path, "r") as f:
        stderr = f.readlines()

    def _get_tmpfile(stderr):
        for line in stderr:
            # the candidate path is the last space-separated token of a line
            path = line.split(" ")[-1].rstrip()
            if os.path.exists(path):
                if is_dataset(path):
                    return path

    tmp_dataset_xml = _get_tmpfile(stderr)
    return tmp_dataset_xml
def archive_files(input_file_names, output_file_name, remove_path=True):
    """
    Create a gzipped tarball from a list of input files.

    :param input_file_names: paths of the files to put in the archive
    :param output_file_name: path of the tarball to create
    :param remove_path: if True, the directory will be removed from the input
        file names before archiving.  All inputs and the output file must be
        in the same directory for this to work.
    :return: 0 on success, otherwise the non-zero exit code of ``tar``
    """
    # Resolve the output path *before* changing directory: a bare file name
    # has an empty dirname (os.chdir("") fails) and a relative path would
    # otherwise be re-interpreted relative to the new working directory.
    output_file_name = op.abspath(output_file_name)
    if remove_path:
        input_file_names = [op.basename(fn) for fn in input_file_names]
    args = ["tar", "-czf", output_file_name] + list(input_file_names)
    log.info("Running '{a}'".format(a=" ".join(args)))
    _cwd = os.getcwd()
    try:
        # we want the files to have no leading path
        os.chdir(op.dirname(output_file_name))
        result = run_cmd(" ".join(args),
                         stdout_fh=sys.stdout,
                         stderr_fh=sys.stderr)
    finally:
        # always restore the caller's working directory
        os.chdir(_cwd)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    return 0
def _run_cmd(cmd):
    """
    Echo and run a shell command, raising if it fails.

    :param cmd: command line string passed to ``run_cmd``
    :return: the result object returned by ``run_cmd``
    :raises ValueError: if the command's exit code is non-zero
    """
    # print(...) with a single argument behaves identically on Python 2 and 3
    print(cmd)
    result = run_cmd(cmd, sys.stdout, sys.stderr)
    if result.exit_code != 0:
        print(result)
        raise ValueError("Failed to generate TC from {c}".format(c=cmd))
    return result
def run_fasta_to_reference(input_file_name, output_file_name,
                           organism=None, reference_name=None,
                           ploidy="haploid"):
    """
    Run ``fasta-to-reference`` on a ContigSet and save the ReferenceSet.

    :param input_file_name: ContigSet input; must wrap exactly one FASTA file
    :param output_file_name: path of the final ReferenceSet XML; its directory
        is also used as the tool's output directory
    :param organism: organism label; None or "" is passed as "unknown"
    :param reference_name: reference name; when None or "" the input file's
        base name without extension is used
    :param ploidy: ploidy label; None or "" is passed as "unknown"
    :return: 0 on success, otherwise the tool's non-zero exit code
    :raises TypeError: if the ContigSet has more than one external resource
    """
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    # The default organism=None used to be stringified to the literal "None";
    # treat a missing value the same way as an empty one.
    organism_arg = str(organism) if organism not in (None, "") else "unknown"
    ploidy_arg = str(ploidy) if ploidy not in (None, "") else "unknown"
    args = [
        "fasta-to-reference",
        "--organism", organism_arg,
        "--ploidy", ploidy_arg,
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f}".format(f=output_file_name))
        ds_ref.write(output_file_name)
    return 0
def run_fasta_to_referenceset(input_file_name, output_file_name):
    """
    Create an indexed ReferenceSet from ``input_file_name`` with the
    ``dataset`` command line tool.

    The command is tried as ``dataset`` first and, if that exits with code
    127 (presumably the shell's "command not found" -- confirm), once more
    as ``dataset.py``.

    :return: exit code of the last command that was run
    """
    # the '.py' name difference will be resolved in pbdataset/pbcoretools, but
    # for now, work with either
    for program in ("dataset", "dataset.py"):
        command = " ".join([
            "{p} create".format(p=program),
            "--type ReferenceSet",
            "--generateIndices",
            output_file_name,
            input_file_name,
        ])
        log.info(command)
        result = run_cmd(command, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
        if result.exit_code != 127:
            break
    return result.exit_code
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Barcode a SubreadSet with ``bam2bam`` and fix up the resulting dataset.

    :param subread_set_file: input SubreadSet XML
    :param barcode_set_file: BarcodeSet XML; must have a single resource reader
    :param output_file_name: SubreadSet XML to produce; the ``.subreadset.xml``
        suffix is stripped to form the ``bam2bam`` output prefix
    :param nproc: value passed to both ``-j`` and ``-b``
    :param score_mode: "asymmetric" or "symmetric"
    :return: 0 on success, otherwise the non-zero exit code of ``bam2bam``
    :raises ValueError: if ``score_mode`` is not recognized
    :raises NotImplementedError: if the BarcodeSet has several resource readers
    """
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    # escape the dots so only the literal extension is stripped
    new_prefix = re.sub(r"\.subreadset\.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    # Move the tool's XML aside (same directory) and rewrite it under the
    # requested name with metadata and name taken from the input dataset.
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
def _run_bax_to_bam(input_file_name, output_file_name):
    """
    Convert bax.h5 / HdfSubreadSet input to a SubreadSet with ``bax2bam``.

    :param input_file_name: a ``.bax.h5`` file or a dataset XML
    :param output_file_name: SubreadSet XML to write; the last two
        dot-separated components are dropped to form the BAM output prefix
    :return: 0 on success, otherwise the non-zero exit code of ``bax2bam``
    """
    import os  # local import: only needed for temporary-file cleanup

    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    tmp_input_xml = None
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        tmp_input_xml = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml").name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(tmp_input_xml)
        input_file_name_tmp = tmp_input_xml
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    log.info(" ".join(args))
    try:
        result = run_cmd(" ".join(args),
                         stdout_fh=sys.stdout,
                         stderr_fh=sys.stderr)
    finally:
        # the wrapper XML is only an input to bax2bam; do not leave it behind
        if tmp_input_xml is not None and os.path.exists(tmp_input_xml):
            os.remove(tmp_input_xml)
    if result.exit_code != 0:
        return result.exit_code
    with SubreadSet(output_file_name) as ds:
        ds.assertIndexed()
    return 0
def run_bam_to_fastx(program_name, input_file_name, output_file_name):
    """
    Run a bam2fastx-style program and optionally decompress its output.

    The program is given the output name without its extension (``.gz`` and
    the extension before it are both removed) as ``-o`` prefix.  When
    ``output_file_name`` does not end in ``.gz``, ``output_file_name + ".gz"``
    is decompressed into ``output_file_name``.

    :return: 0 on success, otherwise the program's non-zero exit code
    """
    def _splitext(path):
        # like os.path.splitext, but treats "<ext>.gz" as one extension
        base, ext = os.path.splitext(path)
        if ext == ".gz":
            base, ext2 = os.path.splitext(base)
            ext = ext2 + ext
        return base, ext

    args = [
        program_name,
        "-o", _splitext(output_file_name)[0],
        input_file_name,
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    if not output_file_name.endswith(".gz"):
        output_file_name_ = output_file_name + ".gz"
        # gzip yields bytes, so the destination must be opened in binary
        # mode; copy in chunks rather than loading the whole file in memory.
        with gzip.open(output_file_name_, "rb") as f_in:
            with open(output_file_name, "wb") as f_out:
                for chunk in iter(lambda: f_in.read(1024 * 1024), b""):
                    f_out.write(chunk)
    return 0
def __run_fasta_to_reference(program_name, dataset_class,
                             input_file_name, output_file_name,
                             organism=None, reference_name=None,
                             ploidy="haploid"):
    """
    Run a fasta-to-reference style program and save the resulting dataset.

    :param program_name: executable to run
    :param dataset_class: dataset class used to open the tool's XML; its
        lower-cased class name is also the expected XML file name
    :param input_file_name: ContigSet input; must wrap exactly one FASTA file
    :param output_file_name: path of the final dataset XML; its directory is
        also used as the tool's output directory
    :param organism: organism label; None or "" is passed as "unknown"
    :param reference_name: reference name; when None or "" the input file's
        base name without extension is used
    :param ploidy: ploidy label; None or "" is passed as "unknown"
    :return: 0 on success, otherwise the program's non-zero exit code
    :raises TypeError: if the ContigSet has more than one external resource
    """
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    # The default organism=None used to be stringified to the literal "None";
    # treat a missing value the same way as an empty one.
    organism_arg = str(organism) if organism not in (None, "") else "unknown"
    ploidy_arg = str(ploidy) if ploidy not in (None, "") else "unknown"
    args = [
        program_name,
        "--organism", organism_arg,
        "--ploidy", ploidy_arg,
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name,
                       "{t}.xml".format(t=dataset_class.__name__.lower()))
    assert op.isfile(ref_file), ref_file
    with dataset_class(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final {t} to {f}".format(
                 f=output_file_name, t=dataset_class.__name__))
        ds_ref.write(output_file_name)
    return 0
def _run_bax_to_bam(input_file_name, output_file_name):
    """
    Convert bax.h5 / HdfSubreadSet input to an indexed SubreadSet.

    Runs ``bax2bam``, then re-opens the XML it wrote, induces indices if the
    dataset is not indexed, and writes the final XML to ``output_file_name``.

    :param input_file_name: a ``.bax.h5`` file or a dataset XML
    :param output_file_name: SubreadSet XML to write; the last two
        dot-separated components are dropped to form the BAM output prefix
    :return: 0 on success, otherwise the non-zero exit code of ``bax2bam``
    """
    import os  # local import: only needed for temporary-file cleanup

    base_name = ".".join(output_file_name.split(".")[:-2])
    input_file_name_tmp = input_file_name
    tmp_input_xml = None
    # XXX bax2bam won't write an hdfsubreadset unless the input is XML too
    if input_file_name.endswith(".bax.h5"):
        tmp_input_xml = tempfile.NamedTemporaryFile(
            suffix=".hdfsubreadset.xml").name
        ds_tmp = HdfSubreadSet(input_file_name)
        ds_tmp.write(tmp_input_xml)
        input_file_name_tmp = tmp_input_xml
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name_tmp
    ]
    logging.info(" ".join(args))
    try:
        result = run_cmd(" ".join(args),
                         stdout_fh=sys.stdout,
                         stderr_fh=sys.stderr)
    finally:
        # the wrapper XML is only an input to bax2bam; do not leave it behind
        if tmp_input_xml is not None and os.path.exists(tmp_input_xml):
            os.remove(tmp_input_xml)
    if result.exit_code != 0:
        return result.exit_code
    tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    shutil.move(output_file_name, tmp)
    # FIXME it would be better to leave this to bax2bam
    with SubreadSet(tmp) as ds:
        if not ds.isIndexed:
            ds.induceIndices()
        ds.write(output_file_name)
    # Removed only after a successful rewrite, so that on failure the XML
    # produced by bax2bam is still available at the temporary location.
    os.remove(tmp)
    return 0
def run_fasta_to_fofn(input_file_name, output_file_name):
    """
    Write a one-line FOFN (file of file names) containing ``input_file_name``.

    The file is written directly instead of through a shell ``echo ... >``
    redirect, so paths containing spaces or shell metacharacters are handled
    correctly.

    :param input_file_name: path to record in the FOFN
    :param output_file_name: FOFN path to create or overwrite
    :return: 0 on success, 1 if the output file could not be written
    """
    logging.info("Writing {i} to FOFN {o}".format(i=input_file_name,
                                                  o=output_file_name))
    try:
        with open(output_file_name, "w") as fofn:
            # same content the shell's echo produced: the path plus a newline
            fofn.write("{f}\n".format(f=input_file_name))
    except (IOError, OSError) as err:
        # keep the exit-code style contract callers already rely on
        logging.error("Could not write {o}: {e}".format(o=output_file_name,
                                                        e=err))
        return 1
    return 0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    """
    Barcode every BAM resource of a SubreadSet with ``bam2bam``.

    For each external resource of the input, ``bam2bam`` is run on its
    subreads and scraps BAM files, and the resulting BAM files are added to a
    new SubreadSet that is finally written to ``output_file_name``.

    :param subread_set_file: input SubreadSet XML; relative BAM paths are
        resolved against its directory
    :param barcode_set_file: BarcodeSet XML; must have a single resource reader
    :param output_file_name: SubreadSet XML to write; output BAMs are placed
        in its directory
    :param nproc: value passed to both ``-j`` and ``-b``
    :return: 0 on success, otherwise the first non-zero ``bam2bam`` exit code
    :raises NotImplementedError: if the BarcodeSet has several resource readers
    :raises TypeError: if an input resource has no scraps BAM
    """
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                                 re.sub(".subreads.bam", "_barcoded",
                                        op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            # (the Python 2 "print args" debug statement was dropped; the
            # command is already recorded in the log)
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            # the scraps BAM is nested inside the subreads resource
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        # carry the input's filters over to the new dataset
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Barcode every BAM resource of a SubreadSet with ``bam2bam``.

    Each external resource's subreads/scraps pair is run through ``bam2bam``
    and the barcoded BAMs are collected into a new SubreadSet, which inherits
    the input's filters, metadata and name (plus " (barcoded)") and is
    written to ``output_file_name`` with a new UUID.

    :return: 0 on success, otherwise the first non-zero ``bam2bam`` exit code
    :raises ValueError: if ``score_mode`` is not "asymmetric" or "symmetric"
    :raises NotImplementedError: if the BarcodeSet has several resource readers
    :raises TypeError: if an input resource has no scraps BAM
    """
    if score_mode not in ("asymmetric", "symmetric"):
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    barcode_set = BarcodeSet(barcode_set_file)
    if len(barcode_set.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = barcode_set.toExternalFiles()[0]
    input_dir = op.dirname(subread_set_file)
    output_dir = op.dirname(output_file_name)
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for resource in ds.externalResources:
            subreads_bam = resource.bam
            scraps_bam = resource.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            # output prefix: <output dir>/<subreads name with _barcoded>
            barcoded_name = re.sub(".subreads.bam", "_barcoded",
                                   op.basename(subreads_bam))
            new_prefix = op.join(output_dir, barcoded_name)
            # relative resource paths are relative to the input XML
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(input_dir, subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(input_dir, scraps_bam)
            command = " ".join([
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam,
            ])
            log.info(command)
            result = run_cmd(command,
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            barcoded_subreads = new_prefix + ".subreads.bam"
            barcoded_scraps = new_prefix + ".scraps.bam"
            assert op.isfile(barcoded_subreads), "Missing {f}".format(
                f=barcoded_subreads)
            add_subread_resources(ds_new,
                                  subreads=barcoded_subreads,
                                  scraps=barcoded_scraps,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def run_rtc(rtc):
    """
    Dev Task for calling a subprocess exe. In this case it's python

    Builds a ``hello-world.py`` command line from the first input file, the
    first output file and the "nrecords" task option, and runs it.

    :param rtc: resolved tool contract providing ``task.options``,
        ``task.input_files`` and ``task.output_files``
    :return: the result object returned by ``run_cmd``
    """
    nrecords = rtc.task.options[_to_opt_id("nrecords")]
    _d = dict(i=rtc.task.input_files[0],
              o=rtc.task.output_files[0],
              r=nrecords)
    # The template previously referenced "{p}", which is not a key of _d and
    # made .format() raise KeyError; the output file is stored under "o".
    exe = "hello-world.py {i} {o} --nrecords {r}".format(**_d)
    result = run_cmd(exe, sys.stdout, sys.stderr)
    log.info("Completed running {e} Result {r}".format(e=exe, r=result))
    return result
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    """
    Run ``bam2bam`` barcoding over all BAM resources of a SubreadSet.

    A new SubreadSet referencing the barcoded BAM files is assembled; it
    takes over the filters, metadata and name (suffixed " (barcoded)") of
    the input and is saved to ``output_file_name`` under a new UUID.

    :return: 0 on success, otherwise the first non-zero ``bam2bam`` exit code
    :raises ValueError: if ``score_mode`` is not "asymmetric" or "symmetric"
    :raises NotImplementedError: if the BarcodeSet has several resource readers
    :raises TypeError: if an input resource has no scraps BAM
    """
    valid_modes = ["asymmetric", "symmetric"]
    if score_mode not in valid_modes:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    n_readers = len(bc.resourceReaders())
    if n_readers > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]

    def _resolve(path):
        # relative resource paths are taken relative to the input XML
        if op.isabs(path):
            return path
        return op.join(op.dirname(subread_set_file), path)

    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            bam_in = ext_res.bam
            scraps_in = ext_res.scraps
            assert bam_in is not None
            if scraps_in is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(bam_in)))
            bam_in = _resolve(bam_in)
            scraps_in = _resolve(scraps_in)
            args = ["bam2bam"]
            args += ["-j", str(nproc)]
            args += ["-b", str(nproc)]
            args += ["-o", new_prefix]
            args += ["--barcodes", barcode_fasta]
            args += ["--scoreMode", score_mode]
            args += [bam_in, scraps_in]
            cmd = " ".join(args)
            log.info(cmd)
            result = run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            bam_out = new_prefix + ".subreads.bam"
            scraps_out = new_prefix + ".scraps.bam"
            assert op.isfile(bam_out), "Missing {f}".format(f=bam_out)
            add_subread_resources(ds_new,
                                  subreads=bam_out,
                                  scraps=scraps_out,
                                  barcodes=barcode_set_file)
        # hand the input's filters, metadata and name to the new dataset
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None):
    """
    Convert a dataset to FASTA/FASTQ with a bam2fastx-style program.

    The program writes gzipped output under a temporary prefix.  For
    non-barcoded input that file is copied (``.gz`` output) or unzipped to
    ``output_file_name``.  When the output ends in ``.gz`` and the input
    dataset is barcoded, the program is run with ``--split-barcodes`` and the
    per-barcode files are unzipped and bundled with ``archive_files``.

    :param program_name: executable name; "bam2" is removed from it to obtain
        the output file extension
    :param fastx_reader: not used by this implementation
    :param fastx_writer: not used by this implementation
    :param tmp_dir: directory in which the temporary prefix is created
    :return: 0 on success, otherwise a non-zero exit code
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring
    assert isinstance(program_name, string_types)
    barcode_mode = False
    if output_file_name.endswith(".gz"):
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
    tmp_out_prefix = tempfile.NamedTemporaryFile(dir=tmp_dir).name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    base_ext = re.sub("bam2", "", program_name)
    if not barcode_mode:
        tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
        assert os.path.isfile(tmp_out), tmp_out
        if output_file_name.endswith(".gz"):
            log.info("cp {t} {f}".format(t=tmp_out, f=output_file_name))
            shutil.copyfile(tmp_out, output_file_name)
        else:
            _unzip_fastx(tmp_out, output_file_name)
        os.remove(tmp_out)
        return 0
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = op.dirname(tmp_out_prefix)
    tc_out_dir = op.dirname(output_file_name)
    barcoded_file_names = []
    # find the barcoded FASTX files and unzip them to the same
    # output directory and file prefix as the ultimate output
    for fn in os.listdir(tmp_out_dir):
        fn = op.join(tmp_out_dir, fn)
        if fn.startswith(tmp_out_prefix) and fn.endswith(suffix):
            # file names look like <prefix>.<fwd>_<rev>.<ext>.gz
            bc_fwd_rev = fn.split(".")[-3].split("_")
            suffix2 = ".{f}_{r}.{t}".format(
                f=bc_fwd_rev[0], r=bc_fwd_rev[1], t=base_ext)
            assert fn == tmp_out_prefix + suffix2 + ".gz"
            fn_out = re.sub(".gz$", suffix2, output_file_name)
            fastx_out = op.join(tc_out_dir, fn_out)
            _unzip_fastx(fn, fastx_out)
            barcoded_file_names.append(fn_out)
            os.remove(fn)
    assert len(barcoded_file_names) > 0
    return archive_files(barcoded_file_names, output_file_name)
def run_bax_to_bam(input_file_name, output_file_name):
    """
    Run ``bax2bam --subread`` on a dataset XML.

    The BAM output prefix is ``output_file_name`` without its final
    extension; the dataset XML is written to ``output_file_name``.

    :return: exit code of ``bax2bam``
    """
    output_prefix, _ = os.path.splitext(output_file_name)
    command = " ".join([
        "bax2bam",
        "--subread",
        "-o", output_prefix,
        "--output-xml", output_file_name,
        "--xml", input_file_name,
    ])
    logging.info(command)
    outcome = run_cmd(command, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
    return outcome.exit_code
def test_simple_run_cmd(self):
    """Run a simple shell command through run_cmd and expect exit code 0."""
    d = get_temp_dir("simple-cmd")
    txt_in = get_temp_file(".txt", d)
    txt_out = get_temp_file("*.txt", d)
    exe = "cat {i} > {o}".format(i=txt_in, o=txt_out)

    # this could all be bundled into a context manager
    # with RunCommand('/path/stdout', '/path/to/stderr') as r:
    #   r.exe("echo 'exe1')
    #   r.exe("echo 'exe2')
    #   result = r.get_result() # close the file handles
    stdout = get_temp_file("-stdout", d)
    stderr = get_temp_file("-stderr", d)
    with open(stdout, 'w') as fo:
        with open(stderr, 'w') as fe:
            result = run_cmd(exe, fo, fe)

    emsg = "Command {e} failed".format(e=exe)
    # assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(result.exit_code, 0, emsg)
def run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                     input_file_name, output_file_name,
                     min_subread_length=0):
    """
    Convert a dataset to FASTA/FASTQ, optionally filtering by length.

    The program writes gzipped output under a temporary prefix; the records
    are then re-read with ``fastx_reader`` and written to
    ``output_file_name`` with ``fastx_writer``.

    :param program_name: executable name; "bam2" is removed from it to obtain
        the output file extension
    :param fastx_reader: reader class/callable opened on the raw file handle
    :param fastx_writer: writer class/callable opened on ``output_file_name``
    :param min_subread_length: when >= 1, only records whose sequence is
        strictly longer than this value are written; negative values are
        ignored with a warning
    :return: 0 on success, otherwise the program's non-zero exit code
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring
    assert isinstance(program_name, string_types)
    # XXX this is really annoying; bam2fastx needs a --no-gzip feature
    tmp_out_prefix = tempfile.NamedTemporaryFile().name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    base_ext = re.sub("bam2", "", program_name)
    tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
    assert os.path.isfile(tmp_out), tmp_out
    logging.info("raw output in {f}".format(f=tmp_out))

    def _open_file(file_name):
        if file_name.endswith(".gz"):
            return gzip.open(file_name)
        else:
            return open(file_name)

    if min_subread_length > 0:
        logging.info("Filtering subreads by minimum length = {l}".format(
            l=min_subread_length))
    elif min_subread_length < 0:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("min_subread_length = {l}, ignoring".format(
            l=min_subread_length))
    try:
        with _open_file(tmp_out) as raw_in:
            with fastx_reader(raw_in) as fastx_in:
                with fastx_writer(output_file_name) as fastx_out:
                    for rec in fastx_in:
                        if (min_subread_length < 1 or
                                min_subread_length < len(rec.sequence)):
                            fastx_out.writeRecord(rec)
    finally:
        # remove the raw output even if the conversion fails part-way
        os.remove(tmp_out)
    return 0
def run_bax_to_bam(input_file_name, output_file_name):
    """
    Run ``bax2bam --subread`` and make sure the resulting dataset is indexed.

    After the tool finishes, its XML is moved to a temporary path, re-opened,
    indexed if necessary, and written back to ``output_file_name`` (without
    validation).

    :param input_file_name: dataset XML passed to ``--xml``
    :param output_file_name: dataset XML to write; its name without the final
        extension is the BAM output prefix
    :return: 0 on success, otherwise the non-zero exit code of ``bax2bam``
    """
    base_name = os.path.splitext(output_file_name)[0]
    args = [
        "bax2bam",
        "--subread",
        "-o", base_name,
        "--output-xml", output_file_name,
        "--xml", input_file_name
    ]
    logging.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    shutil.move(output_file_name, tmp)
    # FIXME it would be better to leave this to bax2bam
    with openDataSet(tmp) as ds:
        if not ds.isIndexed:
            ds.induceIndices()
        ds.write(output_file_name, validate=False)  # FIXME bad XML!
    # Removed only after a successful rewrite, so that on failure the XML
    # produced by bax2bam is still available at the temporary location.
    os.remove(tmp)
    return 0
def run_basecaller(trc_file, baz_file,
                   nproc=1,
                   stdout=sys.stdout,
                   stderr=sys.stderr,
                   basecaller_exe=Constants.BASECALLER_EXE,
                   basecaller_options=Constants.BASECALLER_OPTIONS,
                   basecaller_module=Constants.BASECALLER_MODULE):
    """
    Run the offline basecaller on a trace file.

    The module init, module load and basecaller executable are chained with
    ``&&`` and the whole command is wrapped in a quoted ``/bin/bash -c``
    invocation.

    :param trc_file: input trace file (``--inputfile``)
    :param baz_file: output baz file (``--outputbazfile``)
    :param nproc: thread count (``--numthreads``)
    :param basecaller_options: extra options, split on single spaces; an
        empty string adds nothing
    :return: exit code of the command; if ``baz_file`` does not exist
        afterwards a diagnostic is also written to ``stderr``
    """
    module_load = ' '.join([Constants.GNU_MODULE_LOAD, basecaller_module])
    exe = " && ".join([Constants.GNU_MODULE_INIT, module_load, basecaller_exe])
    # Wrap with a bash invocation (instead of sh)
    args = ["/bin/bash", "-c", "'", exe]
    args += [
        "--inputfile={i}".format(i=trc_file),
        "--outputbazfile={o}".format(o=baz_file),
        "--numthreads={n}".format(n=nproc),
    ]
    if basecaller_options != "":
        args += basecaller_options.split(' ')
    # finish bash invocation wrap
    args += ["'"]
    command = ' '.join(args)
    logging.info("Command " + command)
    result = run_cmd(command, stdout_fh=stdout, stderr_fh=stderr)
    if op.isfile(baz_file):
        return result.exit_code
    stderr.write("Result {}".format(result))
    stderr.write("Unable to produce Baz file from command: {a}\n".format(a=args))
    return result.exit_code
def run(args):
    """
    Emit tool contract JSON files for the pbcoretools task modules.

    Every public ``*.py`` module in ``pbcoretools/tasks`` (next to this file)
    except ``converters.py`` and ``filters.py`` is run with
    ``--emit-tool-contract``; those two are run afterwards with their
    ``emit-tool-contracts`` sub-command.

    :param args: argument list; if it has exactly one element, that is the
        output directory, otherwise the current directory is used
    """
    output_dir = os.getcwd()
    if len(args) == 1:
        output_dir = args[0]
        assert os.path.isdir(output_dir), "Not a directory: %s"%output_dir
    module_dir = os.path.join(os.path.dirname(__file__), "pbcoretools", "tasks")
    multi_contract_files = ["converters.py", "filters.py"]
    for file_name in os.listdir(module_dir):
        # only public python modules
        if not file_name.endswith(".py") or file_name.startswith("_"):
            continue
        # these emit several contracts at once and are handled below
        if file_name in multi_contract_files:
            continue
        module_name = "pbcoretools.tasks.{m}".format(m=file_name[:-3])
        json_name = "{m}_tool_contract.json".format(m=module_name)
        json_file = os.path.join(output_dir, json_name)
        cmd = "python -m {m} --emit-tool-contract > {j}".format(
            m=module_name, j=json_file)
        run_cmd(cmd, sys.stdout, sys.stderr)
    for task_module in ("converters", "filters"):
        cmd = "python -m pbcoretools.tasks.{t} emit-tool-contracts -o {d}".format(
            t=task_module, d=output_dir)
        run_cmd(cmd, sys.stdout, sys.stderr)
def run(args):
    """
    Write tool contracts for all pbcoretools task modules.

    Single-contract modules found in ``pbcoretools/tasks`` are invoked with
    ``--emit-tool-contract`` and their output redirected to
    ``<module>_tool_contract.json``; ``converters`` and ``filters`` are then
    invoked with ``emit-tool-contracts -o <output dir>``.

    :param args: argument list; a single element selects the output
        directory, otherwise the current working directory is used
    """
    output_dir = os.getcwd()
    if len(args) == 1:
        output_dir = args[0]
        assert os.path.isdir(output_dir), "Not a directory: %s" % output_dir
    here = os.path.dirname(__file__)
    module_dir = os.path.join(here, "pbcoretools", "tasks")
    # public modules, minus the two that emit several contracts themselves
    task_files = [
        name for name in os.listdir(module_dir)
        if name.endswith(".py")
        and not name.startswith("_")
        and name not in ["converters.py", "filters.py"]
    ]
    for task_file in task_files:
        module_name = "pbcoretools.tasks.{m}".format(m=task_file[:-3])
        json_file = os.path.join(
            output_dir, "{m}_tool_contract.json".format(m=module_name))
        emit_one = "python -m {m} --emit-tool-contract > {j}".format(
            m=module_name, j=json_file)
        run_cmd(emit_one, sys.stdout, sys.stderr)
    emit_converters = \
        "python -m pbcoretools.tasks.converters emit-tool-contracts -o {d}".format(
            d=output_dir)
    run_cmd(emit_converters, sys.stdout, sys.stderr)
    emit_filters = \
        "python -m pbcoretools.tasks.filters emit-tool-contracts -o {d}".format(
            d=output_dir)
    run_cmd(emit_filters, sys.stdout, sys.stderr)
def _run_cmd(cmd):
    """
    Run ``cmd`` with stdout/stderr passed through and log failures.

    :return: the result object from ``run_cmd`` (returned on failure too)
    """
    outcome = run_cmd(cmd, sys.stdout, sys.stderr)
    if outcome.exit_code == 0:
        return outcome
    # a non-zero exit code is only logged, never raised
    log.error(outcome)
    return outcome
def run_baz2bam(baz_file, adapter_fa, metadata_xml, output_file,
                nproc=1,
                min_subread_length=Constants.MIN_SUBREAD_LENGTH,
                baz2bam_exe=Constants.BAZ2BAM_EXE,
                ppa_module=Constants.PPA_MODULE,
                stdout=sys.stdout,
                stderr=sys.stderr,
                dataset_name_suffix=None):
    """
    Convert the .baz file from the basecaller to a SubreadSet.

    Note, the emitted SubreadSet will have a new UUID

    :param baz_file: input .baz file
    :param adapter_fa: adapter FASTA; copied next to the output if a file of
        that name is not already there
    :param metadata_xml: metadata XML passed as ``--metadata``
    :param output_file: SubreadSet XML path; must end in ``.subreadset.xml``
        and, with that suffix removed, is the base prefix for output files
    :param nproc: value passed to both ``-j`` and ``-b``
    :param min_subread_length: value passed as ``--minSubLength``
    :param dataset_name_suffix: Will update the dataset name with the
        supplied suffix
    :type dataset_name_suffix: str | None
    :return: 0 on success
    :raises AssertionError: if the command fails or expected files are missing
    """
    xml_ext = ".subreadset.xml"
    assert output_file.endswith(xml_ext)
    # Strip only the trailing extension.  The previous unanchored, unescaped
    # regex also removed matches elsewhere in the path.
    output_base = output_file[:-len(xml_ext)]
    output_dir = op.dirname(output_file)
    exe = " && ".join([Constants.GNU_MODULE_INIT,
                       ' '.join([Constants.GNU_MODULE_LOAD, ppa_module]),
                       baz2bam_exe])
    args = [
        "/bin/bash", "-c", "'",
        exe,
        baz_file,
        "--silent",
        "--minSubLength", str(min_subread_length),
        "--metadata={x}".format(x=metadata_xml),
        "--adapter={f}".format(f=adapter_fa),
        "-o", output_base,
        "-j", str(nproc),
        "-b", str(nproc),
        "'",
    ]
    logging.info(" ".join(args))
    result = run_cmd(' '.join(args), stdout, stderr)
    assert result.exit_code == 0, \
        "Failed with exit code {c}".format(c=result.exit_code)
    subreads_file = output_base + ".subreads.bam"
    scraps_file = output_base + ".scraps.bam"
    assert op.isfile(subreads_file), subreads_file
    assert op.isfile(scraps_file), scraps_file
    subreadset_file = output_base + ".subreadset.xml"
    assert op.isfile(subreadset_file)
    tmp_ds = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    # Must copy the adapters file to new SubreadSet output dir
    # otherwise, the file will be invalid
    new_adapters = op.join(output_dir, op.basename(adapter_fa))
    if not op.exists(new_adapters):
        shutil.copy(adapter_fa, new_adapters)
    # FIXME, This should really update the PA version (SigProcVer) or at a
    # minimum, augment the version
    with SubreadSet(subreadset_file) as ds:
        ds.makePathsAbsolute()
        if dataset_name_suffix is not None:
            name = ds.name
            new_ds_name = "_".join([name, dataset_name_suffix])
            ds.name = new_ds_name
        ds.newUuid(setter=True)
        ds.write(tmp_ds)
        new_uuid = ds.uuid
    shutil.move(tmp_ds, subreadset_file)
    # report the XML that was actually written (not the subreads BAM)
    log.info("Wrote new SubreadSet {u} to {p}".format(u=new_uuid,
                                                      p=subreadset_file))
    return 0
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None,
                      seqid_prefix=None, subreads_in=None):
    """
    Converts a dataset to a set of fastx file, possibly archived.
    Can take a subreadset or consensusreadset as input.
    Will convert to either fasta or fastq.
    If the dataset is barcoded, it will split the fastx files per-barcode.
    If the output file is .zip, the fastx file(s) will be archived accordingly.

    :param program_name: executable name; "bam2" is replaced by "." to obtain
        the fastx file extension
    :param fastx_reader: not used by this implementation
    :param fastx_writer: not used by this implementation
    :param tmp_dir: parent directory for the temporary working directory
    :param seqid_prefix: if not None, passed (shell-quoted) as
        ``--seqid-prefix``
    :param subreads_in: optional SubreadSet used to map barcode labels to
        bio-sample names in the output file names
    :return: 0, or the return value of ``archive_files`` for archive outputs
    :raises AssertionError: if the program fails or no fastx file is found
    """
    import shutil  # local import: only needed to remove the temp directory

    assert isinstance(program_name, str)
    barcode_mode = False
    barcode_sets = set()
    output_is_archive = (output_file_name.endswith(".zip") or
                         output_file_name.endswith(".tar.gz") or
                         output_file_name.endswith(".tgz"))
    if output_is_archive:
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
            if barcode_mode:
                # attempt to collect the labels of barcodes used on this
                # dataset.  assumes that all BAM files used the same barcodes
                for bam in ds_in.externalResources:
                    if bam.barcodes is not None:
                        barcode_sets.add(bam.barcodes)
    barcode_labels = []
    bio_samples_to_bc = None
    if barcode_mode:
        if len(barcode_sets) == 1:
            bc_file = list(barcode_sets)[0]
            log.info("Reading barcode labels from %s", bc_file)
            try:
                with BarcodeSet(bc_file) as bc_in:
                    for bc in bc_in:
                        barcode_labels.append(bc.id)
            except IOError as e:
                log.error("Can't read %s", bc_file)
                log.error(e)
        elif len(barcode_sets) > 1:
            log.warning("Multiple barcode sets used for this SubreadSet:")
            for fn in sorted(list(barcode_sets)):
                log.warning(" %s", fn)
        else:
            log.info("No barcode labels available")
        if subreads_in is not None:
            bio_samples_to_bc = {}
            with SubreadSet(subreads_in, strict=True) as subread_ds:
                if subread_ds.isBarcoded:  # pylint: disable=no-member
                    bio_samples_to_bc = get_barcode_sample_mappings(subread_ds)
    base_ext = re.sub("bam2", ".", program_name)
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = tempfile.mkdtemp(dir=tmp_dir)
    tmp_out_prefix = op.join(tmp_out_dir, "tmp_fastx")
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    if seqid_prefix is not None:
        args.extend(["--seqid-prefix", pipes.quote(seqid_prefix)])
    log.info(" ".join(args))
    remove_files = []
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)

    def _is_fastx_file(fn):
        return fn.startswith(tmp_out_prefix) and fn.endswith(suffix)

    def _label_or_none(x):
        # translate a barcode index to its label; fall back on the raw text
        try:
            bc = int(x)
            if bc < 0:
                return "none"
            elif bc < len(barcode_labels):
                return barcode_labels[bc]
        except ValueError:
            pass
        return x

    try:
        assert result.exit_code == 0, "{p} exited with code {c}".format(
            p=program_name, c=result.exit_code)
        if output_is_archive:
            tc_out_dir = op.dirname(output_file_name)
            fastx_file_names = []
            # find the barcoded FASTX files and un-gzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in walker(tmp_out_dir, _is_fastx_file):
                if barcode_mode:
                    # bam2fastx outputs files with the barcode indices
                    # encoded in the file names; here we attempt to
                    # translate these to barcode labels, falling back on
                    # the original indices if necessary
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    bc_label = "unbarcoded"
                    if (bc_fwd_rev != ["65535", "65535"] and
                            bc_fwd_rev != ["-1", "-1"]):
                        bc_fwd_label = _label_or_none(bc_fwd_rev[0])
                        bc_rev_label = _label_or_none(bc_fwd_rev[1])
                        bc_label = "{f}--{r}".format(f=bc_fwd_label,
                                                     r=bc_rev_label)
                    suffix2 = ".{l}{t}".format(l=bc_label, t=base_ext)
                    if bio_samples_to_bc is not None:
                        sample = bio_samples_to_bc.get(bc_label, "unknown")
                        suffix2 = ".{}".format(sample) + suffix2
                else:
                    suffix2 = base_ext
                base = re.sub(".zip$", "",
                              re.sub(".tar.gz", "",
                                     re.sub(".tgz", "",
                                            op.basename(output_file_name))))
                fn_out = base
                if not fn_out.endswith(suffix2):
                    fn_out = re.sub(base_ext, suffix2, fn_out)
                fastx_out = op.join(tc_out_dir, fn_out)
                _ungzip_fastx(fn, fastx_out)
                fastx_file_names.append(fastx_out)
                remove_files.append(fn)
            assert len(fastx_file_names) > 0
            remove_files.extend(fastx_file_names)
            return archive_files(fastx_file_names, output_file_name)
        else:
            tmp_out = "{p}{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            # register for cleanup first so a failed un-gzip still cleans up
            remove_files.append(tmp_out)
            _ungzip_fastx(tmp_out, output_file_name)
    finally:
        for fn in remove_files:
            # Skip paths that are already gone (duplicates, files that were
            # never created) so cleanup cannot mask the original error.
            if op.exists(fn):
                os.remove(fn)
        # the working directory was created by mkdtemp above; remove it too
        shutil.rmtree(tmp_out_dir, ignore_errors=True)
    return 0
def run_fasta_to_fofn(input_file_name, output_file_name):
    """
    Write a one-line FOFN (file of file names) containing ``input_file_name``.

    The file is written directly instead of through a shell ``echo ... >``
    redirect, so paths containing spaces or shell metacharacters are handled
    correctly.

    :param input_file_name: path to record in the FOFN
    :param output_file_name: FOFN path to create or overwrite
    :return: 0 on success, 1 if the output file could not be written
    """
    log.info("Writing {i} to FOFN {o}".format(i=input_file_name,
                                              o=output_file_name))
    try:
        with open(output_file_name, "w") as fofn:
            # same content the shell's echo produced: the path plus a newline
            fofn.write("{f}\n".format(f=input_file_name))
    except (IOError, OSError) as err:
        # keep the exit-code style contract callers already rely on
        log.error("Could not write {o}: {e}".format(o=output_file_name, e=err))
        return 1
    return 0