def makeNanoporeRead(f5_path):
    # NB: this is a nested helper; `job`, `workdir`, and `readstore_dir` are expected
    # to be bound in the enclosing job function (see the log tag below).
    # load the NanoporeRead and write it to a file
    npRead = NanoporeRead(fast_five_file=f5_path, twoD=False)  # TODO: make twoD a config arg
    ok = npRead.Initialize(job)
    if not ok:
        return None
    _l = npRead.read_label
    tF = job.fileStore.getLocalTempFile()
    fH = open(tF, "w")
    ok = npRead.Write(job, fH, initialize=False)
    if not ok:
        fH.close()
        return None
    fH.close()
    # gzip it, deliver it to the readstore, and return the ledger line
    fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
    fH = open(tF, "rb")
    gz = gzip.open(fn.fullpathGetter(), "wb")
    shutil.copyfileobj(fH, gz)
    fH.close()
    gz.close()
    try:
        deliverOutput(job, fn, readstore_dir)
    except RuntimeError:
        job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l)
        return None
    return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))

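# A minimal sketch (not part of the pipeline) of how makeNanoporeRead is meant to be driven.
# Because it is a nested helper, `job`, `workdir`, and `readstore_dir` must be in scope in the
# enclosing job function; the enclosing-function name and `fast5_paths` argument here are
# hypothetical.
def makeNanoporeReadsJobFunction_sketch(job, fast5_paths, readstore_dir):
    workdir = job.fileStore.getLocalTempDir()
    # ... makeNanoporeRead (above) would be defined here, closing over job/workdir/readstore_dir ...
    entries = [e for e in (makeNanoporeRead(p) for p in fast5_paths) if e is not None]
    # each entry is (read_label, ledger_line); downstream code keys the ledger by read_label
    return dict(entries)
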
def chainSamFileJobFunction(job, config, aln_struct):
    # Cull the files from the job store that we want
    if config["chain"] is None and config["realign"] is None:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Nothing to do.")
        return
    if config["chain"] is not None:
        sam_file  = job.fileStore.readGlobalFile(aln_struct.FileStoreID())
        reference = job.fileStore.readGlobalFile(config["reference_FileStoreID"])
        reads     = job.fileStore.readGlobalFile(config["sample_FileStoreID"])
        workdir   = job.fileStore.getLocalTempDir()
        output_sam = LocalFile(workdir=workdir,
                               filename="{}_chained.bam".format(config["sample_label"]))
        if config["debug"]:
            job.fileStore.logToMaster("[chainSamFileJobFunction] chaining {bwa_out} (locally: {sam})"
                                      "".format(bwa_out=aln_struct.FileStoreID(), sam=sam_file))
        chainSamFile(parent_job=job,
                     samFile=sam_file,
                     outputSamFile=output_sam.fullpathGetter(),
                     readFastqFile=reads,
                     referenceFastaFile=reference)
        chainedSamFileId = job.fileStore.writeGlobalFile(output_sam.fullpathGetter())
        deliverOutput(job, output_sam, config["output_dir"])
        job.addFollowOnJobFn(realignmentRootJobFunction, config, chainedSamFileId)
    else:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Not chaining SAM, passing alignment "
                                  "on to realignment")
        job.addFollowOnJobFn(realignmentRootJobFunction, config, aln_struct.FileStoreID())

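# For reference, a minimal sketch of the config keys chainSamFileJobFunction reads.
# The values below are placeholders for illustration, not defaults from the real pipeline.
example_chain_config_sketch = {
    "chain":                 True,         # None means: skip chaining
    "realign":               True,         # None (together with chain=None) means: nothing to do
    "reference_FileStoreID": "ref-fid",    # FileStoreID of the reference FASTA
    "sample_FileStoreID":    "reads-fid",  # FileStoreID of the read FASTQ
    "sample_label":          "sampleA",
    "debug":                 False,
    "output_dir":            "s3://bucket/out/",
}
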
def deliverLedgerJobFunction(job, config, ledger_fids):
    # merge the per-shard pickled ledgers into a single dict and deliver it
    fHs = [open(job.fileStore.readGlobalFile(f), "rb") for f in ledger_fids]
    ls  = [pickle.load(f) for f in fHs]
    ledger = ls[0]
    for d in ls[1:]:
        ledger.update(d)
    fn = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                   filename="%s_ledger.pkl" % config["ledger_name"])
    _h = open(fn.fullpathGetter(), "wb")  # pickle files should be opened in binary mode
    pickle.dump(ledger, _h)
    _h.close()
    deliverOutput(job, fn, config["readstore_ledger_dir"])

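# A minimal consumer-side sketch: the delivered ledger is one pickled dict mapping read labels
# to readstore URLs (see makeNanoporeRead and _get_url below). The function name and argument
# here are hypothetical.
def loadLedger_sketch(ledger_path):
    with open(ledger_path, "rb") as fh:
        return pickle.load(fh)  # dict: read_label -> readstore URL line
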
def consolidateMethylationCallsJobFunction(job, config, methylation_prob_fids):
    outfile = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                        filename="%s_%s.tsv" % (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    # fileinput.input chains the batch files together into one line iterator
    files = fileinput.input([job.fileStore.readGlobalFile(fid) for fid in methylation_prob_fids])
    # iterate explicitly; a map() over the handle is lazy in Python 3 and would write nothing
    for line in files:
        _handle.write(line)
    files.close()
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
    return

def consolidateVariantCallsJobFunction(job, config, posterior_prob_fids):
    variants  = getVariantCallFunctions(config["degenerate"])
    parser    = variants.parseVariantCalls
    file_iter = (job.fileStore.readGlobalFile(fid) for fid in posterior_prob_fids)
    table     = pd.concat([parser(f) for f in file_iter]).sort_values(["contig", "ref_pos"])
    outfile   = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                          filename="%s_%s.tsv" % (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    variants.writeVariantCalls(table, _handle)
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])

def calculateMethylationProbabilityJobFunction(job, config, cPecan_config, ignore_hmm, batch_number,
                                               signalMachine_image="quay.io/artrand/signalmachine"):
    def _get_url(read_label):
        try:
            return ledger[read_label]
        except KeyError:
            return None

    def _SignalMachine(read_label, cigar, nanopore_read):
        guide_aln = LocalFile(workdir=workdir)
        _handle = open(guide_aln.fullpathGetter(), "w")
        _handle.write(cigar)
        _handle.close()
        require(os.path.exists(guide_aln.fullpathGetter()), "NO guide aln file")
        signalMachine_args = [
            "--sm3Hdp",
            "-s", "1",
            "-o", "%s" % degenerate_enum,
            "-L", "%s" % read_label,
            "-T", "%s%s" % (DOCKER_DIR, models.localFileName(hmmfid)),
            "-q", "%s%s" % (DOCKER_DIR, nanopore_read.filenameGetter()),
            "-f", "%s%s" % (DOCKER_DIR, fw_seqfile.filenameGetter()),
            "-b", "%s%s" % (DOCKER_DIR, bw_seqfile.filenameGetter()),
            "-p", "%s%s" % (DOCKER_DIR, guide_aln.filenameGetter()),
            "-u", "%s%s" % (DOCKER_DIR, posteriors.filenameGetter()),
            "-v", "%s%s" % (DOCKER_DIR, models.localFileName(hdpfid)),
        ]
        try:
            docker_call(job=job,
                        tool=signalMachine_image,
                        parameters=signalMachine_args,
                        work_dir=(workdir + "/"))
        except subprocess.CalledProcessError:
            # a failed read shouldn't kill the batch; it simply contributes no posteriors
            pass

    def _parse_probabilities():
        # int/str builtins here: the np.int/np.str aliases were removed from recent numpy
        return pd.read_table(posteriors.fullpathGetter(),
                             usecols=(1, 2, 3, 6),
                             names=["ref_pos", "base", "posterior", "read_label"],
                             dtype={"ref_pos": int,
                                    "base": str,
                                    "posterior": np.float64,
                                    "read_label": str})

    def _sumExpectationsOverColumns():
        f = LocalFile(workdir=workdir)
        _h = open(f.fullpathGetter(), "w")
        for pos, pos_df in aligned_pairs.groupby(["ref_pos"]):
            for base, base_df in pos_df.groupby("base"):
                marginal_prob = base_df["posterior"].sum()
                coverage = len(base_df["read_label"].unique())
                l = "%s\t%s\t%s\t%s\t%s\n" % (cPecan_config["contig_name"], pos, base,
                                              marginal_prob, coverage)
                _h.write(l)
        _h.close()
        return f

    job.fileStore.logToMaster("[calculateMethylationProbabilityJobFunction]Running on batch %s"
                              % batch_number)
    workdir = job.fileStore.getLocalTempDir()
    fw_seqfile, bw_seqfile, ok = processReferenceSequence(cPecan_config["contig_seq"],
                                                          workdir,
                                                          config["motif_key"],
                                                          config["substitute_char"])
    if not ok:
        raise RuntimeError("[calculateMethylationProbabilityJobFunction]ERROR processing "
                           "reference sequences")
    # get the models
    hmmfid = config["HMM_fid"]
    hdpfid = config["HDP_fid"]
    try:
        models = LocalFileManager(job=job, fileIds_to_get=[hmmfid, hdpfid], workdir=workdir)
    except AssertionError:
        raise RuntimeError("[calculateMethylationProbabilityJobFunction]ERROR getting models locally")
    # download the npRead files
    ledger    = config["ledger"]
    url_iter  = (_get_url(l.strip()) for l in cPecan_config["query_labels"])
    read_urls = [u for u in url_iter if u is not None]
    if config["debug"]:
        job.fileStore.logToMaster("[calculateMethylationProbabilityJobFunction]Got %s URLs"
                                  % len(read_urls))
    npReads = [unzipLocalFile(f)
               for f in [urlDownloadToLocalFile(job, workdir, url) for url in read_urls]
               if f is not None]
    failed = len(read_urls) - len(npReads)
    if failed > 0 and config["stop_at_failed_reads"]:
        raise RuntimeError("[calculateMethylationProbabilityJobFunction]Got %s failed npRead "
                           "downloads and stop_at_failed_reads is True" % failed)
    else:
        if config["debug"]:
            job.fileStore.logToMaster("[calculateMethylationProbabilityJobFunction]"
                                      "Failed to download and unzip %s NanoporeReads" % failed)
    # file to collect the posterior probs
    posteriors = LocalFile(workdir=workdir,
                           filename="%s_%s.dat" % (config["sample_label"], uuid.uuid4()))
    degenerate_enum = getVariantCallFunctions(config["degenerate"]).enum()
    # do the signal alignment, and get the posterior probabilities
    # (a plain loop replaces the Python-2-only tuple-unpacking lambda)
    for label, cigar, np_read in zip(cPecan_config["query_labels"],
                                     cPecan_config["exonerate_cigars"],
                                     npReads):
        _SignalMachine(label.strip(), cigar, np_read)
    # the reads may not produce any posteriors if, for example, they don't align to a region
    # containing any ambiguity characters; in that case the posteriors file is empty and we
    # return None, which is the convention
    if (not os.path.exists(posteriors.fullpathGetter()) or
            os.stat(posteriors.fullpathGetter()).st_size == 0):
        return None
    # reminder: the convention is that 'expectations' are un-normalized posterior probabilities,
    # so this file is a table of expectations; a trailing underscore means `file` or `file-path`
    aligned_pairs = _parse_probabilities()
    expectations_ = _sumExpectationsOverColumns()
    if config["probs_output_dir"] is not None:
        deliverOutput(job, posteriors, config["probs_output_dir"])
    return job.fileStore.writeGlobalFile(expectations_.fullpathGetter())

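# A minimal post-processing sketch (not part of the pipeline): _sumExpectationsOverColumns
# writes a 5-column TSV (contig, ref_pos, base, expectation, coverage), where 'expectation'
# is an un-normalized sum of per-read posteriors. Dividing by coverage gives a rough per-site
# mean posterior. The function name and column labels here are assumptions for illustration.
def summarizeExpectations_sketch(expectations_path):
    table = pd.read_table(expectations_path,
                          names=["contig", "ref_pos", "base", "expectation", "coverage"])
    table["mean_posterior"] = table["expectation"] / table["coverage"]
    return table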