Example #1
def makeNanoporeRead(f5_path):
    # load the NanoporeRead from the .fast5 file and write it to a local temp file;
    # `job`, `workdir`, and `readstore_dir` are closed over from the enclosing job function
    np = NanoporeRead(fast_five_file=f5_path, twoD=False)  # TODO: make twoD a config arg
    ok = np.Initialize(job)
    if not ok:
        return None
    _l = np.read_label
    tF = job.fileStore.getLocalTempFile()
    fH = open(tF, "w")
    ok = np.Write(job, fH, initialize=False)
    if not ok:
        fH.close()
        return None
    fH.close()
    # gzip the temp file, deliver it to the readstore, and return the ledger entry
    fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
    fH = open(tF, "rb")
    gz = gzip.open(fn.fullpathGetter(), "wb")
    shutil.copyfileobj(fH, gz)
    fH.close()
    gz.close()
    try:
        deliverOutput(job, fn, readstore_dir)
    except RuntimeError:
        job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l)
        return None
    return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))
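
Note that `makeNanoporeRead` reads `job`, `workdir`, and `readstore_dir` from an enclosing scope (the log tag points at `makeNanoporeReadsJobFunction`). Below is a minimal sketch of what that enclosing Toil job function might look like; the `fast5_paths` and `readstore_dir` config keys are assumptions, not taken from the original code.

def makeNanoporeReadsJobFunction(job, config):
    # hypothetical driver for the closure above
    workdir       = job.fileStore.getLocalTempDir()
    readstore_dir = config["readstore_dir"]    # assumed key

    def makeNanoporeRead(f5_path):
        pass  # body exactly as in Example #1; closes over job, workdir, readstore_dir

    # keep only the reads that initialized, wrote, and uploaded cleanly
    entries = [e for e in map(makeNanoporeRead, config["fast5_paths"]) if e is not None]
    return entries  # (read_label, ledger_line) tuples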
Example #2
def chainSamFileJobFunction(job, config, aln_struct):
    # Nothing to do if neither chaining nor realignment was requested
    if config["chain"] is None and config["realign"] is None:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Nothing to do.")
        return

    if config["chain"] is not None:
        sam_file   = job.fileStore.readGlobalFile(aln_struct.FileStoreID())
        reference  = job.fileStore.readGlobalFile(config["reference_FileStoreID"])
        reads      = job.fileStore.readGlobalFile(config["sample_FileStoreID"])
        workdir    = job.fileStore.getLocalTempDir()
        output_sam = LocalFile(workdir=workdir, filename="{}_chained.bam".format(config["sample_label"]))

        if config["debug"]:
            job.fileStore.logToMaster("[chainSamFileJobFunction] chaining {bwa_out} (locally: {sam})"
                                      "".format(bwa_out=aln_struct.FileStoreID(), sam=sam_file))

        chainSamFile(parent_job=job,
                     samFile=sam_file,
                     outputSamFile=output_sam.fullpathGetter(),
                     readFastqFile=reads,
                     referenceFastaFile=reference)

        chainedSamFileId = job.fileStore.writeGlobalFile(output_sam.fullpathGetter())
        deliverOutput(job, output_sam, config["output_dir"])
        job.addFollowOnJobFn(realignmentRootJobFunction, config, chainedSamFileId)

    else:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Not chaining SAM, passing alignment "
                                  "on to realignment")
        job.addFollowOnJobFn(realignmentRootJobFunction, config, aln_struct.FileStoreID())
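
The only thing `chainSamFileJobFunction` asks of `aln_struct` is a `FileStoreID()` accessor. A minimal sketch of a compatible wrapper follows; the class name and constructor are assumptions, and the pipeline's real type may carry more fields.

class AlignmentStruct(object):
    # hypothetical stand-in for the pipeline's alignment handle
    def __init__(self, file_store_id):
        self._fid = file_store_id

    def FileStoreID(self):
        return self._fid

With this, a parent job could hand its BWA output on with `job.addFollowOnJobFn(chainSamFileJobFunction, config, AlignmentStruct(bwa_output_fid))`.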
Example #3
def deliverLedgerJobFunction(job, config, ledger_fids):
    fHs    = [open(job.fileStore.readGlobalFile(f), "rb") for f in ledger_fids]
    ls     = [pickle.load(f) for f in fHs]
    ledger = ls[0]
    for d in ls[1:]:
        ledger.update(d)
    for fH in fHs:
        fH.close()

    fn = LocalFile(workdir=job.fileStore.getLocalTempDir(), filename="%s_ledger.pkl" % config["ledger_name"])
    _h = open(fn.fullpathGetter(), "wb")
    pickle.dump(ledger, _h)
    _h.close()
    deliverOutput(job, fn, config["readstore_ledger_dir"])
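
For completeness, a minimal sketch of reading the delivered ledger back, matching how Example #6 queries it (the filename here is hypothetical):

import pickle

with open("mysample_ledger.pkl", "rb") as fh:
    ledger = pickle.load(fh)

# read_label -> readstore URL; reads that never uploaded are simply absent
url = ledger.get("read_0001")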
Example #4
def consolidateMethylationCallsJobFunction(job, config, methylation_prob_fids):
    outfile = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                        filename="%s_%s.tsv" %
                        (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    files = fileinput.input(
        [job.fileStore.readGlobalFile(fid) for fid in methylation_prob_fids])
    # stream every input line into the consolidated output file
    for line in files:
        _handle.write(line)
    files.close()
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
    return
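
Several of these examples lean on the pipeline's `LocalFile` and `deliverOutput` helpers. A minimal sketch of the interface the calls above assume (not the pipeline's actual implementation; the uuid fallback and plain-copy delivery are guesses):

import os
import shutil
import uuid

class LocalFile(object):
    # sketch: pairs a working directory with a filename and exposes the
    # two accessors used throughout these examples
    def __init__(self, workdir, filename=None):
        self.workdir  = workdir
        # Example #6 constructs LocalFile(workdir=workdir) with no filename,
        # so presumably the real helper invents one; a uuid stands in here
        self.filename = filename if filename is not None else str(uuid.uuid4())

    def filenameGetter(self):
        return self.filename

    def fullpathGetter(self):
        return os.path.join(self.workdir, self.filename)

def deliverOutput(job, local_file, destination_dir):
    # sketch: place the finished file at its destination; the real helper
    # evidently also handles remote URLs and raises RuntimeError on failure
    shutil.copy(local_file.fullpathGetter(),
                os.path.join(destination_dir, local_file.filenameGetter()))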
Example #5
def consolidateVariantCallsJobFunction(job, config, posterior_prob_fids):
    variants = getVariantCallFunctions(config["degenerate"])
    parser = variants.parseVariantCalls
    file_iter = (job.fileStore.readGlobalFile(fid)
                 for fid in posterior_prob_fids)
    table = pd.concat([parser(f)
                       for f in file_iter]).sort_values(["contig", "ref_pos"])
    outfile = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                        filename="%s_%s.tsv" %
                        (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    variants.writeVariantCalls(table, _handle)
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
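
`getVariantCallFunctions(degenerate)` also appears in Example #6 (via `.enum()`), and the examples only touch three of its attributes. A hedged stub of that contract follows; the method bodies are placeholders, not the real logic:

class VariantCallFunctions(object):
    # stub of the interface Examples #5 and #6 rely on
    def parseVariantCalls(self, path):
        # expected: return a pandas DataFrame with at least
        # `contig` and `ref_pos` columns
        raise NotImplementedError

    def writeVariantCalls(self, table, handle):
        # expected: write the concatenated, sorted table to an open handle
        raise NotImplementedError

    def enum(self):
        # expected: the degenerate-nucleotide code passed to signalMachine via -o
        raise NotImplementedError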
Example #6
def calculateMethylationProbabilityJobFunction(
        job,
        config,
        cPecan_config,
        ignore_hmm,
        batch_number,
        signalMachine_image="quay.io/artrand/signalmachine"):
    def _get_url(read_label):
        # the ledger maps read labels to readstore URLs; reads that never
        # made it into the readstore come back as None
        return ledger.get(read_label)

    def _SignalMachine(read_label, cigar, nanopore_read):
        guide_aln = LocalFile(workdir=workdir)
        _handle = open(guide_aln.fullpathGetter(), "w")
        _handle.write(cigar)
        _handle.close()
        require(os.path.exists(guide_aln.fullpathGetter()),
                "NO guide aln file")
        signalMachine_args = [
            "--sm3Hdp",
            "-s",
            "1",
            "-o",
            "%s" % degenerate_enum,
            "-L",
            "%s" % read_label,
            "-T",
            "%s%s" % (DOCKER_DIR, models.localFileName(hmmfid)),
            "-q",
            "%s%s" % (DOCKER_DIR, nanopore_read.filenameGetter()),
            "-f",
            "%s%s" % (DOCKER_DIR, fw_seqfile.filenameGetter()),
            "-b",
            "%s%s" % (DOCKER_DIR, bw_seqfile.filenameGetter()),
            "-p",
            "%s%s" % (DOCKER_DIR, guide_aln.filenameGetter()),
            "-u",
            "%s%s" % (DOCKER_DIR, posteriors.filenameGetter()),
            "-v",
            "%s%s" % (DOCKER_DIR, models.localFileName(hdpfid)),
        ]
        try:
            docker_call(job=job,
                        tool=signalMachine_image,
                        parameters=signalMachine_args,
                        work_dir=(workdir + "/"))
        except subprocess.CalledProcessError:
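            # a failed signalMachine run is swallowed here: the read simply
            # contributes no rows to the posteriors file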
            pass

    def _parse_probabilities():
        return pd.read_table(
            posteriors.fullpathGetter(),
            usecols=(1, 2, 3, 6),
            names=["ref_pos", "base", "posterior", "read_label"],
            dtype={
                "ref_pos": np.int64,
                "base": str,
                "posterior": np.float64,
                "read_label": str,
            })

    def _sumExpectationsOverColumns():
        f = LocalFile(workdir=workdir)
        _h = open(f.fullpathGetter(), "w")
        for pos, pos_df in aligned_pairs.groupby("ref_pos"):
            for base, base_df in pos_df.groupby("base"):
                marginal_prob = base_df["posterior"].sum()
                coverage = len(base_df["read_label"].unique())
                l = "%s\t%s\t%s\t%s\t%s\n" % (cPecan_config["contig_name"],
                                              pos, base, marginal_prob,
                                              coverage)
                _h.write(l)
        _h.close()
        return f

    job.fileStore.logToMaster(
        "[calculateMethylationProbabilityJobFunction]Running on batch %s" %
        batch_number)
    workdir = job.fileStore.getLocalTempDir()
    fw_seqfile, bw_seqfile, ok = processReferenceSequence(
        cPecan_config["contig_seq"], workdir, config["motif_key"],
        config["substitute_char"])
    if not ok:
        raise RuntimeError(
            "[calculateMethylationProbabilityJobFunction]ERROR processing reference sequences"
        )
    # get the models
    hmmfid = config["HMM_fid"]
    hdpfid = config["HDP_fid"]
    try:
        models = LocalFileManager(job=job,
                                  fileIds_to_get=[hmmfid, hdpfid],
                                  workdir=workdir)
    except AssertionError:
        raise RuntimeError(
            "[calculateMethylationProbabilityJobFunction]ERROR getting models locally"
        )

    # download the npRead files
    ledger = config["ledger"]
    url_iter = (_get_url(l.strip()) for l in cPecan_config["query_labels"])
    read_urls = [u for u in url_iter if u is not None]

    if config["debug"]:
        job.fileStore.logToMaster(
            "[calculateMethylationProbabilityJobFunction]Got %s URLs" %
            len(read_urls))

    npReads = [
        unzipLocalFile(f) for f in
        [urlDownloadToLocalFile(job, workdir, url) for url in read_urls]
        if f is not None
    ]
    failed = len(read_urls) - len(npReads)

    if failed > 0 and config["stop_at_failed_reads"]:
        raise RuntimeError(
            "[calculateMethylationProbabilityJobFunction]Got %s failed npRead "
            "downloads and stop_at_failed_reads is True" % failed)
    elif config["debug"]:
        job.fileStore.logToMaster(
            "[calculateMethylationProbabilityJobFunction]"
            "Failed to download and unzip %s NanoporeReads" % failed)

    # file to collect the posterior probs
    posteriors = LocalFile(workdir=workdir,
                           filename="%s_%s.dat" %
                           (config["sample_label"], uuid.uuid4()))
    degenerate_enum = getVariantCallFunctions(config["degenerate"]).enum()

    # do the signal alignment to get the posterior probabilities
    # NB: zip truncates to the shortest input, so if any npRead downloads failed
    # the labels/cigars here can fall out of register with npReads
    for label, cigar, np_read in zip(cPecan_config["query_labels"],
                                     cPecan_config["exonerate_cigars"],
                                     npReads):
        _SignalMachine(label.strip(), cigar, np_read)

    # the reads may not produce any posteriors (e.g. none align to a region with
    # ambiguity characters); in that case the posteriors file is empty and we
    # return None, which is the convention
    if not os.path.exists(posteriors.fullpathGetter()) or os.stat(
            posteriors.fullpathGetter()).st_size == 0:
        return None

    # reminder: the convention is that 'expectations' are un-normalized posterior
    # probabilities, so this file is a table of expectations; a trailing
    # underscore means `file` or `file-path`
    aligned_pairs = _parse_probabilities()
    expectations_ = _sumExpectationsOverColumns()
    if config["probs_output_dir"] is not None:
        deliverOutput(job, posteriors, config["probs_output_dir"])

    return job.fileStore.writeGlobalFile(expectations_.fullpathGetter())
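
A hedged sketch of how a parent job might fan out batches of this function and hand the per-batch expectation files to Example #4's consolidator; the batching scheme and the root function's name are assumptions:

def methylationRootJobFunction(job, config, cPecan_configs, ignore_hmm):
    # hypothetical fan-out: one child per batch, promises collected for a follow-on
    prob_fids = [job.addChildJobFn(calculateMethylationProbabilityJobFunction,
                                   config, cPecan_config, ignore_hmm, batch_number).rv()
                 for batch_number, cPecan_config in enumerate(cPecan_configs)]
    # NB: a batch with no posteriors returns None, so a real consolidator
    # would need to filter Nones out of prob_fids first
    job.addFollowOnJobFn(consolidateMethylationCallsJobFunction, config, prob_fids)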