Code Example #1
File: bash5tools.py  Project: zkennedy/pbh5tools
class FastqEmitter(object):
    """Write ZMW reads to a FASTQ file via pbcore's FastqWriter."""

    def __init__(self, filename):
        self.writer = FastqWriter(filename)

    def emit(self, zmwRead):
        # Write one record: read name, basecalls, and per-base quality values.
        self.writer.writeRecord(zmwRead.readName,
                                zmwRead.basecalls(),
                                zmwRead.QualityValue())
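
For context, here is a minimal sketch of how this emitter might be driven. It assumes pbcore's BasH5Reader, whose ZMWs expose subreads with the readName / basecalls() / QualityValue() interface used above; the file paths are hypothetical:

from pbcore.io import BasH5Reader

emitter = FastqEmitter("movie.fastq")    # hypothetical output path
reader = BasH5Reader("movie.bas.h5")     # hypothetical input path
for zmw in reader:
    for subread in zmw.subreads:         # each subread is a ZmwRead
        emitter.emit(subread)
reader.close()
emitter.writer.close()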
Code Example #2
File: CombineUtils.py  Project: lpp1985/lpp_Script
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) files and save to combined_dir.
    Dumping hq|lq prefix dictionary to pickle.
    Return an instance of CombinedFiles.
    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)
    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()
    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
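
A hypothetical invocation with three cluster bins, following the file layout shown in the docstring; every path below is invented for illustration:

combine_polished_isoforms(
    split_indices=[0, 1, 2],
    split_hq_fns=["bin%d/all_quivered_hq.100_30_0.99.fastq" % i for i in range(3)],
    split_lq_fns=["bin%d/all_quivered_lq.fastq" % i for i in range(3)],
    combined_hq_fa="combined/all.polished_hq.fasta",
    combined_hq_fq="combined/all.polished_hq.fastq",
    combined_lq_fa="combined/all.polished_lq.fasta",
    combined_lq_fq="combined/all.polished_lq.fastq",
    hq_lq_prefix_dict_pickle="combined/hq_lq_prefix_dict.pickle",
    sample_name="sample1")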
Code Example #3
File: FastqSplitter.py  Project: ksahlin/cDNA_primer
    def split(self, first_split=None):
        """Split `input_fastq` into smaller files each containing
        `reads_per_split` reads. Return splitted fastq."""
        split_index = 0
        self.out_fns = []
        writer = FastqWriter(self._out_fn(split_index))
        self.out_fns.append(self._out_fn(split_index))

        if first_split is None:
            first_split = self.reads_per_split
        with FastqReader(self.input_fastq) as reader:
            for ridx, r in enumerate(reader):
                if ((split_index == 0 and ridx == first_split) or (split_index > 0 and ridx % self.reads_per_split == 0)) \
                    and ridx != 0:
                    split_index += 1
                    writer.close()
                    writer = FastqWriter(self._out_fn(split_index))
                    self.out_fns.append(self._out_fn(split_index))
                writer.writeRecord(r.name, r.sequence, r.quality)

        writer.close()
        return list(self.out_fns)
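
For context, a minimal sketch of the surrounding class this method could live in. The constructor and the _out_fn naming scheme are assumptions for illustration, not the project's actual code:

import os.path as op

class FastqSplitter(object):
    def __init__(self, input_fastq, reads_per_split, out_dir, out_prefix):
        self.input_fastq = input_fastq
        self.reads_per_split = reads_per_split
        self.out_dir = out_dir
        self.out_prefix = out_prefix
        self.out_fns = []

    def _out_fn(self, split_index):
        # e.g. out_dir/prefix.split.000.fastq -- the naming is an assumption
        return op.join(self.out_dir,
                       "%s.split.%03d.fastq" % (self.out_prefix, split_index))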
Code Example #4
class ResultCollector(object):
    """
    Gathers results and writes to a file.
    """
    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        self.onStart()

        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is None:
                sentinelsReceived += 1
            else:
                self.onResult(result)

        self.onFinish()

    def run(self):
        if options.doProfiling:
            cProfile.runctx("self._run()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(
                                options.temporaryDirectory,
                                "profile-%s.out" % (self.name)))
        else:
            self._run()

    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # open file writers
        self.fastaWriter = None
        self.fastqWriter = None
        self.gffWriter = None
        self.vcfWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())
        if options.vcfOutputFilename:
            self.vcfWriter = VariantsVcfWriter(options.vcfOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        window, cssAndVariants = result
        css, variants = cssAndVariants
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        logging.info("Analysis completed.")
        if self.fastaWriter: self.fastaWriter.close()
        if self.fastqWriter: self.fastqWriter.close()
        if self.gffWriter: self.gffWriter.close()
        if self.vcfWriter: self.vcfWriter.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId,
                                                    options.referenceWindows)
        if basesProcessed == requiredBases:
            # This contig is done, so we can dump to file and delete
            # the data structures.
            if self.gffWriter or self.vcfWriter:
                variants = sorted(self.variantsByRefId[refId])
                if self.gffWriter:
                    self.gffWriter.writeVariants(variants)
                if self.vcfWriter:
                    self.vcfWriter.writeVariants(variants)
            del self.variantsByRefId[refId]

            #
            # If the user asked to analyze a window or a set of
            # windows, we output a FAST[AQ] contig per analyzed
            # window.  Otherwise we output a fasta contig per
            # reference contig.
            #
            # We try to be intelligent about naming the output
            # contigs, to include window information where applicable.
            #
            for span in reference.enumerateSpans(refId,
                                                 options.referenceWindows):
                _, s, e = span
                if (s == 0) and (e == refEntry.length):
                    spanName = refName
                else:
                    spanName = refName + "_%d_%d" % (s, e)
                cssName = consensus.consensusContigName(
                    spanName, self._algorithmName)
                # Gather just the chunks pertaining to this span
                chunksThisSpan = [
                    chunk for chunk in self.consensusChunksByRefId[refId]
                    if windows.windowsIntersect(chunk.refWindow, span)
                ]
                css = consensus.join(chunksThisSpan)

                if self.fastaWriter:
                    self.fastaWriter.writeRecord(cssName, css.sequence)
                if self.fastqWriter:
                    self.fastqWriter.writeRecord(cssName, css.sequence,
                                                 css.confidence)

            del self.consensusChunksByRefId[refId]
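
The _run loop above uses a common sentinel protocol: each worker puts None on the results queue when it finishes, and the collector exits once it has received one sentinel per worker. A self-contained sketch of that protocol, independent of this codebase:

import multiprocessing as mp

def worker(results):
    for i in range(3):
        results.put(("result", i))   # stand-in for real work
    results.put(None)                # sentinel: this worker is finished

if __name__ == "__main__":
    num_workers = 2
    results = mp.Queue()
    procs = [mp.Process(target=worker, args=(results,))
             for _ in range(num_workers)]
    for p in procs:
        p.start()
    sentinels = 0
    while sentinels < num_workers:
        item = results.get()
        if item is None:
            sentinels += 1           # one worker finished
        else:
            print(item)              # a real result
    for p in procs:
        p.join()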
Code Example #5
class ResultCollector(object):
    """
    Gathers results and writes to a file.
    """
    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        self.onStart()

        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is None:
                sentinelsReceived += 1
            else:
                self.onResult(result)

        self.onFinish()

    def run(self):
        if options.doProfiling:
            cProfile.runctx("self._run()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(options.temporaryDirectory,
                                                  "profile-%s.out" % (self.name)))
        else:
            self._run()


    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId             = defaultdict(list)
        self.consensusChunksByRefId      = defaultdict(list)

        # open file writers
        self.fastaWriter = self.fastqWriter = self.gffWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        window, cssAndVariants = result
        css, variants = cssAndVariants
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        logging.info("Analysis completed.")
        if self.fastaWriter: self.fastaWriter.close()
        if self.fastqWriter: self.fastqWriter.close()
        if self.gffWriter:   self.gffWriter.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId, options.referenceWindows)
        if basesProcessed == requiredBases:
            # This contig is done, so we can dump to file and delete
            # the data structures.
            if self.gffWriter:
                self.gffWriter.writeVariants(sorted(self.variantsByRefId[refId]))
            del self.variantsByRefId[refId]

            #
            # If the user asked to analyze a window or a set of
            # windows, we output a FAST[AQ] contig per analyzed
            # window.  Otherwise we output a fasta contig per
            # reference contig.
            #
            # We try to be intelligent about naming the output
            # contigs, to include window information where applicable.
            #
            for span in reference.enumerateSpans(refId, options.referenceWindows):
                _, s, e = span
                if (s == 0) and (e == refEntry.length):
                    spanName = refName
                else:
                    spanName = refName + "_%d_%d" % (s, e)
                cssName = consensus.consensusContigName(spanName, self._algorithmName)
                # Gather just the chunks pertaining to this span
                chunksThisSpan = [ chunk for chunk in self.consensusChunksByRefId[refId]
                                   if windows.windowsIntersect(chunk.refWindow, span) ]
                css = consensus.join(chunksThisSpan)

                if self.fastaWriter:
                    self.fastaWriter.writeRecord(cssName,
                                                 css.sequence)
                if self.fastqWriter:
                    self.fastqWriter.writeRecord(cssName,
                                                 css.sequence,
                                                 css.confidence)

            del self.consensusChunksByRefId[refId]
Code Example #6
File: CollapsingUtils.py  Project: lpp1985/lpp_Script
def pick_rep(isoform_filename, gff_filename,
             group_filename, output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError("%s must contain either indexed FASTA files or " % isoform_filename +
                              "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml": # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # Expected number of base errors from Phred QVs: 10 ** (-Q/10).
                err = sum(10**-(i/10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
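
The least-error branch relies on the standard Phred relation: a base with quality Q has error probability 10 ** (-Q / 10), so a read's expected number of base errors is the sum of those probabilities over its quality values, exactly as computed in the loop above. A quick standalone check:

def expected_errors(quals):
    # Expected base errors from Phred QVs: P(error) = 10 ** (-Q / 10).
    return sum(10 ** -(q / 10.0) for q in quals)

# Ten Q20 bases -> 10 * 0.01 = 0.1 expected errors.
print(expected_errors([20] * 10))   # 0.1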