Example #1
def chainSamFileJobFunction(job, config, aln_struct):
    # Cull the files from the job store that we want
    if config["chain"] is None and config["realign"] is None:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Nothing to do.")
        return

    if config["chain"] is not None:
        sam_file   = job.fileStore.readGlobalFile(aln_struct.FileStoreID())
        reference  = job.fileStore.readGlobalFile(config["reference_FileStoreID"])
        reads      = job.fileStore.readGlobalFile(config["sample_FileStoreID"])
        workdir    = job.fileStore.getLocalTempDir()
        output_sam = LocalFile(workdir=workdir, filename="{}_chained.bam".format(config["sample_label"]))

        if config["debug"]:
            job.fileStore.logToMaster("[chainSamFileJobFunction] chaining {bwa_out} (locally: {sam})"
                                      "".format(bwa_out=aln_struct.FileStoreID(), sam=sam_file))

        chainSamFile(parent_job=job,
                     samFile=sam_file,
                     outputSamFile=output_sam.fullpathGetter(),
                     readFastqFile=reads,
                     referenceFastaFile=reference)

        chainedSamFileId = job.fileStore.writeGlobalFile(output_sam.fullpathGetter())
        deliverOutput(job, output_sam, config["output_dir"])
        job.addFollowOnJobFn(realignmentRootJobFunction, config, chainedSamFileId)

    else:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Not chaining SAM, passing alignment "
                                  "on to realignment")
        job.addFollowOnJobFn(realignmentRootJobFunction, config, aln_struct.FileStoreID())
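
A minimal sketch of how a job function like this could be launched with Toil's standard API. The job store path is hypothetical, and the config contents and aln_struct interface are assumed from the snippet above:

from toil.common import Toil
from toil.job import Job

def launchChaining(config, aln_struct):
    # standard Toil boilerplate: build options for a local job store, then start the root job
    options = Job.Runner.getDefaultOptions("./chain-jobstore")
    root = Job.wrapJobFn(chainSamFileJobFunction, config, aln_struct)
    with Toil(options) as workflow:
        workflow.start(root)
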
Example #2
def makeNanoporeRead(f5_path):
    # `job`, `workdir`, and `readstore_dir` are closed over from the enclosing job function
    # here we load the NanoporeRead and write it to a file
    np = NanoporeRead(fast_five_file=f5_path, twoD=False)  # make this a config arg
    ok = np.Initialize(job)
    if not ok:
        return None
    _l = np.read_label
    tF = job.fileStore.getLocalTempFile()
    fH = open(tF, "w")
    ok = np.Write(job, fH, initialize=False)
    if not ok:
        fH.close()
        return None
    fH.close()
    # then we gzip it, deliver it to the readstore, and return the ledger line
    fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
    fH = open(tF, "rb")
    gz = gzip.open(fn.fullpathGetter(), "wb")
    shutil.copyfileobj(fH, gz)
    fH.close()
    gz.close()
    try:
        deliverOutput(job, fn, readstore_dir)
    except RuntimeError:
        job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l)
        return None
    return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))
Example #3
def deliverLedgerJobFunction(job, config, ledger_fids):
    # read each pickled ledger shard from the job store and merge them into a single dict
    fHs    = [open(job.fileStore.readGlobalFile(f), "r") for f in ledger_fids]
    ls     = [pickle.load(f) for f in fHs]
    ledger = ls[0]
    [ledger.update(d) for d in ls[1:]]  # fold the remaining shards into the first

    fn = LocalFile(workdir=job.fileStore.getLocalTempDir(), filename="%s_ledger.pkl" % config["ledger_name"])
    _h = open(fn.fullpathGetter(), "w")
    pickle.dump(ledger, _h)
    _h.close()
    deliverOutput(job, fn, config["readstore_ledger_dir"])
def consolidateMethylationCallsJobFunction(job, config, methylation_prob_fids):
    outfile = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                        filename="%s_%s.tsv" %
                        (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    # stream every line of the per-batch methylation call files into the output file
    files = fileinput.input(
        [job.fileStore.readGlobalFile(fid) for fid in methylation_prob_fids])
    map(lambda l: _handle.write(l), files)  # Python 2 idiom: map() is eager here
    files.close()
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
    return
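
A small standalone sketch of the fileinput concatenation idiom used above; the file names are hypothetical:

import fileinput

# stream every line from a list of input files into a single output file
with open("combined.tsv", "w") as out:
    for line in fileinput.input(["part_0.tsv", "part_1.tsv"]):
        out.write(line)
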
def consolidateVariantCallsJobFunction(job, config, posterior_prob_fids):
    variants = getVariantCallFunctions(config["degenerate"])
    parser = variants.parseVariantCalls
    file_iter = (job.fileStore.readGlobalFile(fid)
                 for fid in posterior_prob_fids)
    table = pd.concat([parser(f)
                       for f in file_iter]).sort_values(["contig", "ref_pos"])
    outfile = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                        filename="%s_%s.tsv" %
                        (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    variants.writeVariantCalls(table, _handle)
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
def processReferenceSequence(ref_seq,
                             workdir,
                             motif_key=None,
                             sub_char="X",
                             parent_job=None):
    # make the forward and backward sequences, substituting the necessary motifs
    if motif_key is not None:
        motif, ok = getMotif(motif_key, ref_seq)
        require(
            ok,
            "[processReferenceSequence]Illegal motif_key given %s" % motif_key)
        if parent_job is not None:
            parent_job.fileStore.logToMaster(
                "[processReferenceSequence]Made %s substitutions" %
                motif.substitutionPositionCount())
        try:
            fw_refseq = motif.forwardSubstitutedSequence(sub_char)
            bw_refseq = motif.complementSubstitutedSequence(sub_char)
        except AssertionError:
            return None, None, False
    else:
        fw_refseq = ref_seq.upper()
        bw_refseq = _reverseComplement(fw_refseq,
                                       reverse=False,
                                       complement=True)

    fw_refseqfile = LocalFile(workdir=workdir)
    bw_refseqfile = LocalFile(workdir=workdir)
    sequences = [fw_refseq, bw_refseq]
    sequence_files = [fw_refseqfile, bw_refseqfile]

    for f, s in zip(sequence_files, sequences):
        _h = open(f.fullpathGetter(), "w")
        _h.write(s + "\n")
        _h.close()

    [
        require(os.path.exists(f.fullpathGetter()),
                "[processReferenceSequence]Missing %s" % f.filenameGetter())
        for f in sequence_files
    ]

    return fw_refseqfile, bw_refseqfile, True
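
A hypothetical usage sketch for processReferenceSequence; the reference string here is a made-up value for illustration:

import tempfile

workdir = tempfile.mkdtemp()
fw_file, bw_file, ok = processReferenceSequence("GATTACAGATTACA", workdir)
assert ok  # both strands were written successfully
fw_path = fw_file.fullpathGetter()  # forward-strand sequence file in workdir
bw_path = bw_file.fullpathGetter()  # complement-strand sequence file in workdir
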
Example #8
def signalAlignRootJobFunction(job, config, sample):
    # download the reference
    config["reference_FileStoreID"] = job.addChildJobFn(urlDownlodJobFunction,
                                                        config["ref"],
                                                        disk=config["ref_size"]).rv()

    # download the BAM, and shard by region
    alignment_fid = job.addChildJobFn(urlDownlodJobFunction, sample.URL, disk=sample.size).rv()

    # download the models
    config["HMM_fid"] = job.addChildJobFn(urlDownlodJobFunction, config["HMM_file"], disk="10M").rv()
    config["HDP_fid"] = job.addChildJobFn(urlDownlodJobFunction, config["HDP_file"], disk="250M").rv()

    # setup labels
    config["sample_label"] = sample.sample_label

    # download and load the ledger
    # TODO use new function here
    ledger = LocalFile(workdir=job.fileStore.getLocalTempDir(), filename="%s.tmp" % uuid.uuid4().hex)
    urlDownload(job, config["ledger_url"], ledger)
    config["ledger"] = cPickle.load(open(ledger.fullpathGetter(), "r"))
    job.addFollowOnJobFn(shardAlignmentJobNode, config, alignment_fid)
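
A minimal sketch of the .rv() promise pattern used above, with hypothetical job functions: the child's return value is not available when the root runs, but the promise is resolved by the time the follow-on executes.

from toil.job import Job

def childJobFn(job, x):
    return x * 2

def followOnJobFn(job, value):
    # by the time this runs, `value` is the concrete return value of childJobFn
    job.fileStore.logToMaster("resolved promise: %s" % value)

def rootJobFn(job):
    promise = job.addChildJobFn(childJobFn, 21).rv()
    job.addFollowOnJobFn(followOnJobFn, promise)
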
Example #9
def prepareFast5Tarfile(job, split_tars_bigger_than_this, batchsize, download_slots, part_size, rs_sample):
    job.fileStore.logToMaster("[prepareFast5Tarfile]Working on sample %s" % rs_sample.sample_label)
    workdir = job.fileStore.getLocalTempDir()
    archive = LocalFile(workdir=workdir, filename="%s.tar" % uuid.uuid4().hex)
    urlDownload(job, rs_sample.URL, archive, download_slots=str(download_slots), part_size=str(part_size))

    _handle = tarfile.open(archive.fullpathGetter(), "r")
    members = _handle.getmembers()[1:]  # the first member is often just the directory with the fast5s
    paths   = [os.path.join(workdir, m.name) for m in members]
    _handle.extractall(path=workdir)

    if rs_sample.size >= split_tars_bigger_than_this:
        _iter    = [paths[i:i + batchsize] for i in range(0, len(paths), batchsize)]
        tar_fids = [archiveBatchAndUploadToFileStore(job, b, workdir) for b in _iter]
        _handle.close()
        job.fileStore.logToMaster("[prepareFast5Tarfile]Split %s into %s smaller tars"
                                  % (rs_sample.sample_label, len(tar_fids)))
        return tar_fids
    else:
        tar_fid = archiveBatchAndUploadToFileStore(job, paths, workdir)
        _handle.close()
        return [tar_fid]
def getFastqFromBam(job,
                    bam_sample,
                    samtools_image="quay.io/ucsc_cgl/samtools"):
    # n.b. this is NOT a jobFunctionWrappingJob, it just takes the parent job as
    # an argument to have access to the job store
    # download the BAM to the local directory, use a uid to avoid conflicts
    uid = uuid.uuid4().hex
    work_dir = job.fileStore.getLocalTempDir()
    local_bam = LocalFile(workdir=work_dir, filename="bam_{}.bam".format(uid))
    fastq_reads = LocalFile(workdir=work_dir,
                            filename="fastq_reads{}.fq".format(uid))

    urlDownload(parent_job=job,
                source_url=bam_sample.URL,
                destination_file=local_bam)

    require(not os.path.exists(fastq_reads.fullpathGetter()),
            "[getFastqFromBam]fastq file already exists")

    # run samtools to get the reads from the BAM
    # TODO use DOCKER_DIR and clean this up. idea: make globals.py or something
    samtools_parameters = [
        "fastq", "/data/{}".format(local_bam.filenameGetter())
    ]
    with open(fastq_reads.fullpathGetter(), 'w') as fH:
        docker_call(job=job,
                    tool=samtools_image,
                    parameters=samtools_parameters,
                    work_dir=work_dir,
                    outfile=fH)

    require(os.path.exists(fastq_reads.fullpathGetter()),
            "[getFastqFromBam]didn't generate reads")

    # upload fastq to fileStore
    return job.fileStore.writeGlobalFile(fastq_reads.fullpathGetter())
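
For reference, a non-Docker sketch of the same extraction step, assuming a samtools binary on PATH and a hypothetical local BAM path:

import subprocess

# `samtools fastq` writes the reads to stdout, which is captured into the fastq file
with open("reads.fq", "w") as out:
    subprocess.check_call(["samtools", "fastq", "local.bam"], stdout=out)
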
def calculateMethylationProbabilityJobFunction(
        job,
        config,
        cPecan_config,
        ignore_hmm,
        batch_number,
        signalMachine_image="quay.io/artrand/signalmachine"):
    def _get_url(read_label):
        try:
            return ledger[read_label]
        except KeyError:
            return None

    def _SignalMachine(read_label, cigar, nanopore_read):
        guide_aln = LocalFile(workdir=workdir)
        _handle = open(guide_aln.fullpathGetter(), "w")
        _handle.write(cigar)
        _handle.close()
        require(os.path.exists(guide_aln.fullpathGetter()),
                "NO guide aln file")
        signalMachine_args = [
            "--sm3Hdp",
            "-s",
            "1",
            "-o",
            "%s" % degenerate_enum,
            "-L",
            "%s" % read_label,
            "-T",
            "%s%s" % (DOCKER_DIR, models.localFileName(hmmfid)),
            "-q",
            "%s%s" % (DOCKER_DIR, nanopore_read.filenameGetter()),
            "-f",
            "%s%s" % (DOCKER_DIR, fw_seqfile.filenameGetter()),
            "-b",
            "%s%s" % (DOCKER_DIR, bw_seqfile.filenameGetter()),
            "-p",
            "%s%s" % (DOCKER_DIR, guide_aln.filenameGetter()),
            "-u",
            "%s%s" % (DOCKER_DIR, posteriors.filenameGetter()),
            "-v",
            "%s%s" % (DOCKER_DIR, models.localFileName(hdpfid)),
        ]
        try:
            docker_call(job=job,
                        tool=signalMachine_image,
                        parameters=signalMachine_args,
                        work_dir=(workdir + "/"))
        except subprocess.CalledProcessError:
            pass

    def _parse_probabilities():
        return pd.read_table(
            posteriors.fullpathGetter(),
            usecols=(1, 2, 3, 6),
            names=["ref_pos", "base", "posterior", "read_label"],
            dtype={
                "ref_pos": np.int,
                "base": np.str,
                "posterior": np.float64,
                "read_label": np.str
            })

    def _sumExpectationsOverColumns():
        f = LocalFile(workdir=workdir)
        _h = open(f.fullpathGetter(), "w")
        for pos, pos_df in aligned_pairs.groupby(["ref_pos"]):
            for base, base_df in pos_df.groupby("base"):
                marginal_prob = base_df["posterior"].sum()
                coverage = len(base_df["read_label"].unique())
                l = "%s\t%s\t%s\t%s\t%s\n" % (cPecan_config["contig_name"],
                                              pos, base, marginal_prob,
                                              coverage)
                _h.write(l)
        _h.close()
        return f

    job.fileStore.logToMaster(
        "[calculateMethylationProbabilityJobFunction]Running on batch %s" %
        batch_number)
    workdir = job.fileStore.getLocalTempDir()
    fw_seqfile, bw_seqfile, ok = processReferenceSequence(
        cPecan_config["contig_seq"], workdir, config["motif_key"],
        config["substitute_char"])
    if not ok:
        raise RuntimeError(
            "[calculateMethylationProbabilityJobFunction]ERROR processing reference sequences"
        )
    # get the models
    hmmfid = config["HMM_fid"]
    hdpfid = config["HDP_fid"]
    try:
        models = LocalFileManager(job=job,
                                  fileIds_to_get=[hmmfid, hdpfid],
                                  workdir=workdir)
    except AssertionError:
        raise RuntimeError(
            "[calculateMethylationProbabilityJobFunction]ERROR getting models locally"
        )

    # download the npRead files
    ledger = config["ledger"]
    url_iter = (_get_url(l.strip()) for l in cPecan_config["query_labels"])
    read_urls = [u for u in url_iter if u is not None]

    if config["debug"]:
        job.fileStore.logToMaster(
            "[calculateMethylationProbabilityJobFunction]Got %s URLs" %
            len(read_urls))

    # entries that failed to download (None) are filtered out before unzipping
    npReads = [
        unzipLocalFile(f) for f in
        [urlDownloadToLocalFile(job, workdir, url) for url in read_urls]
        if f is not None
    ]
    failed = len(read_urls) - len(npReads)

    if failed > 0 and config["stop_at_failed_reads"]:
        raise RuntimeError(
            "[calculateMethylationProbabilityJobFunction]Got %s failed npRead"
            "downloads and stop_at_failed_reads is True" % failed)
    else:
        if config["debug"]:
            job.fileStore.logToMaster(
                "[calculateMethylationProbabilityJobFunction]"
                "Failed to download and upzip %s NanoporeReads" % failed)

    # file to collect the posterior probs
    posteriors = LocalFile(workdir=workdir,
                           filename="%s_%s.dat" %
                           (config["sample_label"], uuid.uuid4()))
    degenerate_enum = getVariantCallFunctions(config["degenerate"]).enum()

    # do the signal alignment, and get the posterior probabilities
    # (Python 2 idiom: an eager map() with a tuple-unpacking lambda runs
    # _SignalMachine over each (label, cigar, npRead) triple)
    map(
        lambda (l, c, n): _SignalMachine(l.strip(), c, n),
        zip(cPecan_config["query_labels"], cPecan_config["exonerate_cigars"],
            npReads))

    # The reads may not produce any posteriors, for example if they don't align to a region
    # containing ambiguity characters. In that case the posteriors file is empty and we
    # return None, which is the convention.
    if not os.path.exists(posteriors.fullpathGetter()) or os.stat(
            posteriors.fullpathGetter()).st_size == 0:
        return None

    # reminder: the convention is that 'expectations' are un-normalized posterior probabilities,
    # so this file is a table of expectations. I also use the convention that a trailing
    # underscore means `file` or `file-path`.
    aligned_pairs = _parse_probabilities()
    expectations_ = _sumExpectationsOverColumns()
    if config["probs_output_dir"] is not None:
        deliverOutput(job, posteriors, config["probs_output_dir"])

    return job.fileStore.writeGlobalFile(expectations_.fullpathGetter())