Example #1
0
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename, in_rep_filename,
                                  out_abundance_filename, out_gff_filename, out_rep_filename,
                                  good):
    """Copy the records whose ids are in `good` from the inputs to the outputs.

    Three input/output file pairs are filtered in turn:
      * the collapsed GFF, keeping records with `r.seqid in good`;
      * the representative FASTA/FASTQ, keeping records whose name, up to the
        first '|', is in `good`;
      * the abundance file, keeping records with `r.pbid in good`.

    Parameters:
      in_abundance_filename, out_abundance_filename -- abundance file pair.
      in_gff_filename, out_gff_filename -- collapsed GFF file pair.
      in_rep_filename, out_rep_filename -- representative sequence file pair;
          both must be of the same format, either FASTA or FASTQ.
      good -- container of ids to keep; only membership tests are done on it.

    Raises:
      ValueError -- if the rep files differ in format, or the format is
          neither "fasta" nor "fastq".
    """
    # Element [1] of parse_ds_filename's result is used as the file format.
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input %s and output %s must be either FASTA or FASTQ." %
                         (in_rep_filename, out_rep_filename))

    # then read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    is_fasta = in_suffix == "fasta"
    reader_cls = FastaReader if is_fasta else FastqReader
    writer_cls = FastaWriter if is_fasta else FastqWriter
    with reader_cls(in_rep_filename) as rep_reader:
        rep_writer = writer_cls(out_rep_filename)
        try:
            for r in rep_reader:
                # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
                if r.name.split('|')[0] in good:
                    rep_writer.writeRecord(r)
        finally:
            # Always close so buffered records reach disk and the handle is released.
            rep_writer.close()

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
Example #2
0
class FastaEmitter(object):
    """Emit reads as FASTA records through a FastaWriter."""

    def __init__(self, filename):
        """Create the FastaWriter that `emit` writes to."""
        self.writer = FastaWriter(filename)

    def emit(self, zmwRead):
        """Write one record: the read's name as header, its basecalls as sequence."""
        header = zmwRead.readName
        sequence = zmwRead.basecalls()
        self.writer.writeRecord(header, sequence)
Example #3
0
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) FASTQ files into combined FASTA and FASTQ files.

    Every read is renamed with combined_cid_hq_name / combined_cid_lq_name
    and written to both the FASTA and the FASTQ output of its class.
    The hq|lq prefix dictionary {'HQ': {prefix: dir}, 'LQ': {prefix: dir}},
    mapping each combined prefix to the directory of its split file, is
    dumped to a pickle. Returns None.

    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
      combined_hq_fa, combined_hq_fq -- output FASTA/FASTQ for hq reads.
      combined_lq_fa, combined_lq_fq -- output FASTA/FASTQ for lq reads.
      hq_lq_prefix_dict_pickle -- output pickle file for the prefix dictionary.
      sample_name -- passed to the prefix and name helpers.
    """
    from contextlib import closing

    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all(f.endswith(".fastq") for f in split_hq_fns + split_lq_fns)

    hq_pre_dict, lq_pre_dict = {}, {}

    # closing() guarantees every writer is closed, even if a read fails midway.
    with closing(FastaWriter(combined_hq_fa)) as hq_fa_writer, \
            closing(FastqWriter(combined_hq_fq)) as hq_fq_writer, \
            closing(FastaWriter(combined_lq_fa)) as lq_fa_writer, \
            closing(FastqWriter(combined_lq_fq)) as lq_fq_writer:
        for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
            logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
            hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                        sample_name=sample_name)
            lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                        sample_name=sample_name)

            hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
            lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

            with FastqReader(split_hq) as reader:
                for read in reader:
                    name = combined_cid_hq_name(cluster_bin_index=i,
                                                name=read.name, sample_name=sample_name)
                    sequence = read.sequence[:]
                    hq_fa_writer.writeRecord(name, sequence)
                    hq_fq_writer.writeRecord(name, sequence, read.quality)

            with FastqReader(split_lq) as reader:
                for read in reader:
                    name = combined_cid_lq_name(cluster_bin_index=i,
                                                name=read.name, sample_name=sample_name)
                    sequence = read.sequence[:]
                    lq_fa_writer.writeRecord(name, sequence)
                    lq_fq_writer.writeRecord(name, sequence, read.quality)

    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
Example #4
0
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) FASTQ files into combined FASTA and FASTQ files.

    Every read is renamed with combined_cid_hq_name / combined_cid_lq_name
    and written to both the FASTA and the FASTQ output of its class.
    The hq|lq prefix dictionary {'HQ': {prefix: dir}, 'LQ': {prefix: dir}},
    mapping each combined prefix to the directory of its split file, is
    dumped to a pickle. Returns None.

    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
      combined_hq_fa, combined_hq_fq -- output FASTA/FASTQ for hq reads.
      combined_lq_fa, combined_lq_fq -- output FASTA/FASTQ for lq reads.
      hq_lq_prefix_dict_pickle -- output pickle file for the prefix dictionary.
      sample_name -- passed to the prefix and name helpers.
    """
    from contextlib import closing

    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all(f.endswith(".fastq") for f in split_hq_fns + split_lq_fns)

    hq_pre_dict, lq_pre_dict = {}, {}

    # closing() guarantees every writer is closed, even if a read fails midway.
    with closing(FastaWriter(combined_hq_fa)) as hq_fa_writer, \
            closing(FastqWriter(combined_hq_fq)) as hq_fq_writer, \
            closing(FastaWriter(combined_lq_fa)) as lq_fa_writer, \
            closing(FastqWriter(combined_lq_fq)) as lq_fq_writer:
        for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
            logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
            hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                        sample_name=sample_name)
            lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                        sample_name=sample_name)

            hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
            lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

            with FastqReader(split_hq) as reader:
                for read in reader:
                    name = combined_cid_hq_name(cluster_bin_index=i,
                                                name=read.name, sample_name=sample_name)
                    sequence = read.sequence[:]
                    hq_fa_writer.writeRecord(name, sequence)
                    hq_fq_writer.writeRecord(name, sequence, read.quality)

            with FastqReader(split_lq) as reader:
                for read in reader:
                    name = combined_cid_lq_name(cluster_bin_index=i,
                                                name=read.name, sample_name=sample_name)
                    sequence = read.sequence[:]
                    lq_fa_writer.writeRecord(name, sequence)
                    lq_fq_writer.writeRecord(name, sequence, read.quality)

    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
Example #5
0
 def test_contigset_write(self):
     """Write every record of a ContigSet with FastaWriter and re-read the file."""
     lambdaFasta = upstreamData.getLambdaFasta()
     contigs = ContigSet(lambdaFasta)
     firstReader = contigs.resourceReaders()[0]
     assert isinstance(firstReader, IndexedFastaReader)
     tmpDir = tempfile.mkdtemp(suffix="dataset-unittest")
     outPath = os.path.join(tmpDir, 'test.fasta')
     writer = FastaWriter(outPath)
     for contig in contigs:
         writer.writeRecord(contig)
     writer.close()
     for written in FastaReader(outPath):
         # a leading '<' would mean an object __repr__ was written
         # instead of the actual sequence
         assert not written.sequence.startswith('<')
 def test_contigset_write(self):
     """Write every record of a ContigSet with FastaWriter and re-read the file."""
     lambdaFasta = upstreamData.getLambdaFasta()
     contigs = ContigSet(lambdaFasta)
     firstReader = contigs.resourceReaders()[0]
     self.assertTrue(isinstance(firstReader, IndexedFastaReader))
     tmpDir = tempfile.mkdtemp(suffix="dataset-unittest")
     outPath = os.path.join(tmpDir, 'test.fasta')
     writer = FastaWriter(outPath)
     for contig in contigs:
         writer.writeRecord(contig)
     writer.close()
     for written in FastaReader(outPath):
         # a leading '<' would mean an object __repr__ was written
         # instead of the actual sequence
         self.assertFalse(written.sequence.startswith('<'))
Example #7
0
    def split(self):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta.

        Output file names come from `self._out_fn(index)` and are also stored
        in `self.out_fns`; a copy of that list is returned. The first output
        file is created even when the input holds no reads.
        """
        split_index = 0
        self.out_fns = []
        out_fn = self._out_fn(split_index)
        writer = FastaWriter(out_fn)
        self.out_fns.append(out_fn)
        try:
            with FastaReader(self.input_fasta) as reader:
                for ridx, r in enumerate(reader):
                    # Start a new output file every `reads_per_split` reads.
                    if ridx % self.reads_per_split == 0 and ridx != 0:
                        split_index += 1
                        writer.close()
                        out_fn = self._out_fn(split_index)
                        writer = FastaWriter(out_fn)
                        self.out_fns.append(out_fn)
                    writer.writeRecord(r.name, r.sequence)
        finally:
            # Release the current output file even if reading/writing failed.
            writer.close()
        return list(self.out_fns)
Example #8
0
    def split(self):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta.

        Output file names come from `self._out_fn(index)` and are also stored
        in `self.out_fns`; a copy of that list is returned. The first output
        file is created even when the input holds no reads.
        """
        split_index = 0
        self.out_fns = []
        out_fn = self._out_fn(split_index)
        writer = FastaWriter(out_fn)
        self.out_fns.append(out_fn)
        try:
            with FastaReader(self.input_fasta) as reader:
                for ridx, r in enumerate(reader):
                    # Start a new output file every `reads_per_split` reads.
                    if ridx % self.reads_per_split == 0 and ridx != 0:
                        split_index += 1
                        writer.close()
                        out_fn = self._out_fn(split_index)
                        writer = FastaWriter(out_fn)
                        self.out_fns.append(out_fn)
                    writer.writeRecord(r.name, r.sequence)
        finally:
            # Release the current output file even if reading/writing failed.
            writer.close()
        return list(self.out_fns)
Example #9
0
    def save(self, dir):
        """
        Save this ArrowEvidence to a directory.  The directory will be
        *created* by this method.

        Format of evidence dump:
        evidence_dump/
          ref000001/
            0-1005/
              consensus.fa
              arrow-scores.h5
            995-2005/
            ...

        Raises Exception if `dir` already exists.
        """
        logging.info("Dumping evidence to %s" % (dir, ))
        join = os.path.join
        if os.path.exists(dir):
            raise Exception(
                "Evidence dump does not expect directory %s to exist." % dir)
        os.makedirs(dir)

        windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
        consensusFasta = FastaWriter(join(dir, "consensus.fa"))
        try:
            consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
        finally:
            consensusFasta.close()

        import h5py
        # Open explicitly for writing: recent h5py releases default to
        # read-only mode, which cannot create the file. The directory was
        # just created, so there is nothing to overwrite.
        arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"), "w")
        try:
            arrowScoreFile.create_dataset("Scores", data=self.scores)
            vlen_str = h5py.special_dtype(vlen=str)
            arrowScoreFile.create_dataset("RowNames",
                                          data=self.rowNames,
                                          dtype=vlen_str)
            arrowScoreFile.create_dataset("ColumnNames",
                                          data=self.colNames,
                                          dtype=vlen_str)
            arrowScoreFile.create_dataset("BaselineScores",
                                          data=self.baselineScores)
        finally:
            arrowScoreFile.close()
Example #10
0
def main():
    """Cut contigs at breakpoints; write the pieces and their lengths.

    Command-line options:
      -b/--breakpoint  file containing breakpoints
      -a/--assembly    fasta file containing contigs
      -o/--outfile     new assembly file (FASTA of the pieces)
      -l/--lenfile     output file of "<piece id>\\t<length>" lines

    Breakpoint file, as parsed here: a line with a single token names the
    current contig; any other non-blank line gives a start and an end in its
    first two whitespace-separated tokens. Each such line yields a piece
    named "<contig>_<start>_<end>" holding seq[start:end].

    Raises:
      KeyError -- a breakpoint names a contig absent from the assembly.
      ValueError -- coordinates appear before any contig name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--breakpoint",
                        help="file containing breakpoints")
    parser.add_argument("-a",
                        "--assembly",
                        help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")

    args = parser.parse_args()

    # Load every contig first so a breakpoint may refer to any of them.
    id2seq = {}
    with FastaReader(args.assembly) as reader:
        for record in reader:
            # The last 10 bases of every contig are dropped.
            # NOTE(review): the reason is not visible here -- confirm.
            id2seq[record.id] = record.sequence[0:-10]

    new_seq = {}
    lenmap = {}
    curr_contig = None
    seq = None
    with open(args.breakpoint, 'r') as breakpoints:
        for line in breakpoints:
            attrs = line.split()
            if not attrs:
                continue  # tolerate blank lines
            if len(attrs) == 1:
                curr_contig = attrs[0]
                seq = id2seq[curr_contig]
                continue
            if curr_contig is None:
                raise ValueError(
                    "Breakpoint coordinates found before any contig name: %r"
                    % line)
            start = int(attrs[0])
            end = int(attrs[1])
            new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
            new_seq[new_id] = seq[start:end]
            # NOTE(review): recorded length is end - start + 1 although the
            # slice above holds end - start bases -- kept as is, confirm.
            lenmap[new_id] = end - start + 1

    writer = FastaWriter(args.outfile)
    try:
        for key in new_seq:
            writer.writeRecord(key, new_seq[key])
    finally:
        writer.close()

    with open(args.lenfile, 'w') as lenfile:
        for key in lenmap:
            lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
Example #11
0
def main(parser):
    """Write the CCS reads of poorly-barcoded ZMWs to a FASTA or FASTQ file.

    `parser.parse_args()` must provide: outFile, fasta, barcode_fofn,
    ccs_fofn, minAvgBarcodeScore and minMaxInsertLength.

    A ZMW counts as not barcoded when `row[3] / row[1]` of its `bestDS` row
    is below `minAvgBarcodeScore`. Its CCS read is written when the basecalls
    are at least `minMaxInsertLength` long. The output defaults to
    'nobarcode.fasta' or 'nobarcode.fastq' when `outFile` is None.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): with integer columns this is floor division
            # under Python 2 -- confirm that is intended.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    try:
        for ccsFile in ccsFofn:
            ccsH5 = BasH5Reader(ccsFile)
            for ccsRead in ccsH5.ccsReads():
                zmwName = ccsRead.zmw.zmwName
                if zmwName not in no_barcode:
                    continue
                basecalls = ccsRead.basecalls()
                if len(basecalls) < args.minMaxInsertLength:
                    continue
                if args.fasta:
                    outh.writeRecord(FastaRecord(zmwName, basecalls))
                else:
                    outh.writeRecord(
                        FastqRecord(zmwName, basecalls,
                                    ccsRead.QualityValue()))
    finally:
        # Close the output even when a read fails, so written records are kept.
        outh.close()
Example #12
0
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa,
                               sample_name):
    """Concatenate the consensus isoforms of all bins into one FASTA file.

    Every read is renamed with combined_cid_ice_name before being written.

    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_files -- consensus isoforms in each splitted cluster bin.
      combined_consensus_isoforms_fa -- output FASTA file.
      sample_name -- passed to combined_cid_ice_name when renaming reads.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    try:
        for i, split_fn in zip(split_indices, split_files):
            logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
            with ContigSetReaderWrapper(split_fn) as reader:
                for read in reader:
                    name = combined_cid_ice_name(name=read.name, cluster_bin_index=i,
                                                 sample_name=sample_name)
                    writer.writeRecord(name, read.sequence[:])
    finally:
        # Close the output even if one of the split files cannot be read.
        writer.close()
    logging.info("Consensus isoforms output combined to:%s",
                 combined_consensus_isoforms_fa)
Example #13
0
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa,
                               sample_name):
    """Concatenate the consensus isoforms of all bins into one FASTA file.

    Every read is renamed with combined_cid_ice_name before being written.

    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_files -- consensus isoforms in each splitted cluster bin.
      combined_consensus_isoforms_fa -- output FASTA file.
      sample_name -- passed to combined_cid_ice_name when renaming reads.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    try:
        for i, split_fn in zip(split_indices, split_files):
            logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
            with ContigSetReaderWrapper(split_fn) as reader:
                for read in reader:
                    name = combined_cid_ice_name(name=read.name, cluster_bin_index=i,
                                                 sample_name=sample_name)
                    writer.writeRecord(name, read.sequence[:])
    finally:
        # Close the output even if one of the split files cannot be read.
        writer.close()
    logging.info("Consensus isoforms output combined to:%s",
                 combined_consensus_isoforms_fa)
Example #14
0
def main(parser):
    """Write the subreads of poorly-barcoded ZMWs to a FASTA or FASTQ file.

    `parser.parse_args()` must provide: outFile, fasta, barcode_fofn,
    bax_fofn, minAvgBarcodeScore, minMaxInsertLength and minSubreadLength.

    A hole number counts as not barcoded when `row[3] / row[1]` of its
    `bestDS` row is below `minAvgBarcodeScore`. For such a ZMW whose longest
    subread reaches `minMaxInsertLength`, every subread of at least
    `minSubreadLength` bases is written.
    """
    args = parser.parse_args()

    # Output name: explicit option, else a default matching the format.
    outfile = args.outFile
    if outfile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'

    # File-of-file-names inputs, one path per line.
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # movie name -> hole numbers that are not barcoded
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    writerClass = FastaWriter if args.fasta else FastqWriter
    outh = writerClass(outfile)

    for baxFile in baxFofn:
        baxH5 = BasH5Reader(baxFile)
        for holeNum in baxH5.sequencingZmws:
            if holeNum not in no_barcode[baxH5.movieName]:
                continue
            zmw = baxH5[holeNum]
            if not len(zmw.subreads):
                continue
            longest = max(len(sr.basecalls()) for sr in zmw.subreads)
            if longest < args.minMaxInsertLength:
                continue
            for subread in zmw.subreads:
                if len(subread.basecalls()) < args.minSubreadLength:
                    continue
                if args.fasta:
                    record = FastaRecord(subread.readName,
                                         subread.basecalls())
                else:
                    record = FastqRecord(subread.readName,
                                         subread.basecalls(),
                                         subread.QualityValue())
                outh.writeRecord(record)

    outh.close()
Example #15
0
    def split(self, reads_in_first_split=None):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta.

        Parameters:
          reads_in_first_split -- read index at which the first output file
              ends; defaults to `self.reads_per_split`. Later files are cut
              whenever the read index is a multiple of `reads_per_split`.

        Output file names come from `self._out_fn(index)` and are also stored
        in `self.out_fns`; a copy of that list is returned.
        """
        split_index = 0
        self.out_fns = []
        out_fn = self._out_fn(split_index)
        writer = FastaWriter(out_fn)
        self.out_fns.append(out_fn)
        if reads_in_first_split is None:
            reads_in_first_split = self.reads_per_split
        try:
            with ContigSetReaderWrapper(self.input_fasta) as reader:
                for ridx, r in enumerate(reader):
                    if ((split_index == 0 and ridx == reads_in_first_split) or
                            (split_index > 0 and ridx % self.reads_per_split == 0)) \
                        and ridx != 0:
                        split_index += 1
                        writer.close()
                        out_fn = self._out_fn(split_index)
                        writer = FastaWriter(out_fn)
                        self.out_fns.append(out_fn)
                    writer.writeRecord(r.name, r.sequence[:])
        finally:
            # Release the current output file even if reading/writing failed.
            writer.close()
        return list(self.out_fns)
Example #16
0
    def split(self, reads_in_first_split=None):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta.

        Parameters:
          reads_in_first_split -- read index at which the first output file
              ends; defaults to `self.reads_per_split`. Later files are cut
              whenever the read index is a multiple of `reads_per_split`.

        Output file names come from `self._out_fn(index)` and are also stored
        in `self.out_fns`; a copy of that list is returned.
        """
        split_index = 0
        self.out_fns = []
        out_fn = self._out_fn(split_index)
        writer = FastaWriter(out_fn)
        self.out_fns.append(out_fn)
        if reads_in_first_split is None:
            reads_in_first_split = self.reads_per_split
        try:
            with ContigSetReaderWrapper(self.input_fasta) as reader:
                for ridx, r in enumerate(reader):
                    if ((split_index == 0 and ridx == reads_in_first_split) or
                            (split_index > 0 and ridx % self.reads_per_split == 0)) \
                        and ridx != 0:
                        split_index += 1
                        writer.close()
                        out_fn = self._out_fn(split_index)
                        writer = FastaWriter(out_fn)
                        self.out_fns.append(out_fn)
                    writer.writeRecord(r.name, r.sequence[:])
        finally:
            # Release the current output file even if reading/writing failed.
            writer.close()
        return list(self.out_fns)
Example #17
0
def main(parser):
    """Write the CCS reads of poorly-barcoded ZMWs to a FASTA or FASTQ file.

    `parser.parse_args()` must provide: outFile, fasta, barcode_fofn,
    ccs_fofn, minAvgBarcodeScore and minMaxInsertLength.

    A ZMW counts as not barcoded when `row[3] / row[1]` of its `bestDS` row
    is below `minAvgBarcodeScore`; its CCS read is written when the basecalls
    are at least `minMaxInsertLength` long.
    """
    args = parser.parse_args()

    # Output name: explicit option, else a default matching the format.
    outfile = args.outFile
    if outfile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'

    # File-of-file-names inputs, one path per line.
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Names ("<movie>/<hole>") of the ZMWs that are not barcoded.
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    writerClass = FastaWriter if args.fasta else FastqWriter
    outh = writerClass(outfile)

    for ccsFile in ccsFofn:
        ccsH5 = BasH5Reader(ccsFile)
        for ccsRead in ccsH5.ccsReads():
            if ccsRead.zmw.zmwName not in no_barcode:
                continue
            if len(ccsRead.basecalls()) < args.minMaxInsertLength:
                continue
            if args.fasta:
                record = FastaRecord(ccsRead.zmw.zmwName,
                                     ccsRead.basecalls())
            else:
                record = FastqRecord(ccsRead.zmw.zmwName,
                                     ccsRead.basecalls(),
                                     ccsRead.QualityValue())
            outh.writeRecord(record)
    outh.close()
Example #18
0
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Copy the records whose ids are in `good` from the inputs to the outputs.

    Three input/output file pairs are filtered in turn:
      * the collapsed GFF, keeping records with `r.seqid in good`;
      * the representative FASTA/FASTQ, keeping records whose name, up to the
        first '|', is in `good`;
      * the abundance file, keeping records with `r.pbid in good`.

    Parameters:
      in_abundance_filename, out_abundance_filename -- abundance file pair.
      in_gff_filename, out_gff_filename -- collapsed GFF file pair.
      in_rep_filename, out_rep_filename -- representative sequence file pair;
          both must be of the same format, either FASTA or FASTQ.
      good -- container of ids to keep; only membership tests are done on it.

    Raises:
      ValueError -- if the rep files differ in format, or the format is
          neither "fasta" nor "fastq".
    """
    # Element [1] of parse_ds_filename's result is used as the file format.
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input %s and output %s must be either FASTA or FASTQ." %
            (in_rep_filename, out_rep_filename))

    # then read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    is_fasta = in_suffix == "fasta"
    reader_cls = FastaReader if is_fasta else FastqReader
    writer_cls = FastaWriter if is_fasta else FastqWriter
    with reader_cls(in_rep_filename) as rep_reader:
        rep_writer = writer_cls(out_rep_filename)
        try:
            for r in rep_reader:
                # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
                if r.name.split('|')[0] in good:
                    rep_writer.writeRecord(r)
        finally:
            # Always close so buffered records reach disk and the handle is released.
            rep_writer.close()

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
Example #19
0
    def save(self, dir):
        """
        Save this ArrowEvidence to a directory.  The directory will be
        *created* by this method.

        Format of evidence dump:
        evidence_dump/
          ref000001/
            0-1005/
              consensus.fa
              arrow-scores.h5
            995-2005/
            ...

        Raises Exception if `dir` already exists.
        """
        logging.info("Dumping evidence to %s" % (dir,))
        join = os.path.join
        if os.path.exists(dir):
            # Call form of raise: valid in both Python 2 and Python 3.
            raise Exception(
                "Evidence dump does not expect directory %s to exist." % dir)
        os.makedirs(dir)

        windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
        consensusFasta = FastaWriter(join(dir, "consensus.fa"))
        try:
            consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
        finally:
            consensusFasta.close()

        # Open explicitly for writing: recent h5py releases default to
        # read-only mode, which cannot create the file. The directory was
        # just created, so there is nothing to overwrite.
        arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"), "w")
        try:
            arrowScoreFile.create_dataset("Scores", data=self.scores)
            vlen_str = h5py.special_dtype(vlen=str)
            arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str)
            arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
            arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
        finally:
            arrowScoreFile.close()
Example #20
0
def dumpEvidence(evidenceDumpBaseDirectory, refWindow, refSequence, alns,
                 quiverConsensus):
    """Dump the evidence for one reference window to a new directory.

    This will import h5py at runtime.

    Parameters:
      evidenceDumpBaseDirectory -- root directory of the evidence dump.
      refWindow -- (refId, refStart, refEnd) tuple.
      refSequence -- reference sequence written to reference.fa.
      alns -- alignments; each read is written to reads.fa under its
          row number.
      quiverConsensus -- object providing `.sequence` (written to
          consensus.fa) and `.mms` (passed to scoreMatrix).

    Raises Exception if the window directory already exists.
    """
    # Format of evidence dump:
    # evidence_dump/
    #   ref000001/
    #     0-1005/
    #       reference.fa
    #       reads.fa
    #       consensus.fa
    #       quiver-scores.h5
    #     995-2005/
    #       ...
    join = os.path.join
    refId, refStart, refEnd = refWindow
    refName = reference.idToName(refId)
    windowDirectory = join(evidenceDumpBaseDirectory, refName,
                           "%d-%d" % (refStart, refEnd))
    logging.info("Dumping evidence to %s" % (windowDirectory, ))

    if os.path.exists(windowDirectory):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." %
            windowDirectory)
    os.makedirs(windowDirectory)

    windowName = refName + (":%d-%d" % (refStart, refEnd))

    # Each output is closed in a finally block so a failure in one step
    # never leaves another file handle open.
    refFasta = FastaWriter(join(windowDirectory, "reference.fa"))
    try:
        refFasta.writeRecord(windowName, refSequence)
    finally:
        refFasta.close()

    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
    try:
        consensusFasta.writeRecord(windowName + "|quiver",
                                   quiverConsensus.sequence)
    finally:
        consensusFasta.close()

    rowNames, columnNames, baselineScores, scores = scoreMatrix(
        quiverConsensus.mms)
    import h5py
    # Open explicitly for writing: recent h5py releases default to read-only
    # mode, which cannot create the file. The directory is brand new, so
    # there is nothing to overwrite.
    quiverScoreFile = h5py.File(join(windowDirectory, "quiver-scores.h5"), "w")
    try:
        quiverScoreFile.create_dataset("Scores", data=scores)
        vlen_str = h5py.special_dtype(vlen=str)
        quiverScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
        quiverScoreFile.create_dataset("ColumnNames",
                                       data=columnNames,
                                       dtype=vlen_str)
        quiverScoreFile.create_dataset("BaselineScores", data=baselineScores)
    finally:
        quiverScoreFile.close()

    readsFasta = FastaWriter(join(windowDirectory, "reads.fa"))
    try:
        for aln in alns:
            readsFasta.writeRecord(str(aln.rowNumber),
                                   aln.read(orientation="genomic", aligned=False))
    finally:
        readsFasta.close()
Example #21
0
def dumpEvidence(evidenceDumpBaseDirectory,
                 refWindow, refSequence, alns,
                 quiverConsensus):
    """Dump the evidence for one reference window to a new directory.

    Parameters:
      evidenceDumpBaseDirectory -- root directory of the evidence dump.
      refWindow -- (refId, refStart, refEnd) tuple.
      refSequence -- reference sequence written to reference.fa.
      alns -- alignments; each read is written to reads.fa under its
          row number.
      quiverConsensus -- object providing `.sequence` (written to
          consensus.fa) and `.mms` (passed to scoreMatrix).

    Raises Exception if the window directory already exists.
    """
    # Format of evidence dump:
    # evidence_dump/
    #   ref000001/
    #     0-1005/
    #       reference.fa
    #       reads.fa
    #       consensus.fa
    #       quiver-scores.h5
    #     995-2005/
    #       ...
    join = os.path.join
    refId, refStart, refEnd = refWindow
    refName = reference.idToName(refId)
    windowDirectory = join(evidenceDumpBaseDirectory,
                           refName,
                           "%d-%d" % (refStart, refEnd))
    logging.info("Dumping evidence to %s" % (windowDirectory,))

    if os.path.exists(windowDirectory):
        # Call form of raise: valid in both Python 2 and Python 3.
        raise Exception(
            "Evidence dump does not expect directory %s to exist." % windowDirectory)
    os.makedirs(windowDirectory)

    windowName = refName + (":%d-%d" % (refStart, refEnd))

    # Each output is closed in a finally block so a failure in one step
    # never leaves another file handle open.
    refFasta = FastaWriter(join(windowDirectory, "reference.fa"))
    try:
        refFasta.writeRecord(windowName, refSequence)
    finally:
        refFasta.close()

    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
    try:
        consensusFasta.writeRecord(windowName + "|quiver", quiverConsensus.sequence)
    finally:
        consensusFasta.close()

    rowNames, columnNames, baselineScores, scores = scoreMatrix(quiverConsensus.mms)
    # Open explicitly for writing: recent h5py releases default to read-only
    # mode, which cannot create the file. The directory is brand new, so
    # there is nothing to overwrite.
    quiverScoreFile = h5py.File(join(windowDirectory, "quiver-scores.h5"), "w")
    try:
        quiverScoreFile.create_dataset("Scores", data=scores)
        vlen_str = h5py.special_dtype(vlen=str)
        quiverScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
        quiverScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str)
        quiverScoreFile.create_dataset("BaselineScores", data=baselineScores)
    finally:
        quiverScoreFile.close()

    readsFasta = FastaWriter(join(windowDirectory, "reads.fa"))
    try:
        for aln in alns:
            readsFasta.writeRecord(str(aln.rowNumber),
                                   aln.read(orientation="genomic", aligned=False))
    finally:
        readsFasta.close()
Example #22
0
def split_imgt_drb( input_file ):
    """Split an IMGT FASTA file into one file per DRB3/DRB4/DRB5 locus.

    Records are written to DRB3_nuc.fasta, DRB4_nuc.fasta and DRB5_nuc.fasta
    in the current working directory, chosen by the locus name, which is the
    second '_'-separated field of the record header.

    Raises:
      AssertionError -- a record header does not start with 'HLA:'.
      Exception -- a record's locus is not DRB3, DRB4 or DRB5.
    """
    writers = {}
    try:
        for prefix in ("DRB3", "DRB4", "DRB5"):
            writers[prefix] = FastaWriter(prefix + "_nuc.fasta")

        with FastaReader( input_file ) as reader:
            for record in reader:
                # Check that this is an IMGT-formatted FASTA record
                assert record.header.startswith('HLA:')

                # Split the locus name out of the header
                locus = record.header.strip().split('_')[1]

                # Write the record to the appropriate file; all three
                # prefixes are four characters long.
                writer = writers.get(locus[:4])
                if writer is None:
                    raise Exception("Locus {0} is not DRB3, 4, or 5!".format(locus))
                writer.writeRecord( record )
    finally:
        # Close every writer that was opened so the output is flushed.
        for writer in writers.values():
            writer.close()
Example #23
0
def ice_fq2fa(in_fq, out_fa):
    """Convert FASTQ file `in_fq` to FASTA file `out_fa`.

    Each record's name and sequence are copied; qualities are not written.
    """
    handle = FastaWriter(out_fa)
    try:
        with FastqReader(in_fq) as reader:
            for r in reader:
                handle.writeRecord(r.name, r.sequence)
    finally:
        # Close explicitly so the output is flushed and the handle released.
        handle.close()
 def test_writeFasta2(self):
     """Records rewritten as (header, sequence) pairs reproduce the input text."""
     buf = StringIO()
     writer = FastaWriter(buf)
     for rec in FastaReader(self.fasta1):
         writer.writeRecord(rec.header, rec.sequence)
     expected = self.fasta1.getvalue()
     assert_equal(expected, buf.getvalue())
Example #25
0
        if len(insert) < 500: continue
        smrtBellReads = simSmrtBellReads(ccRng, seqParams, insert, readLength)

        for subreadId, subread in enumerate(smrtBellReads):
            yield moleculeId, subreadId, (strand + subreadId) % 2, \
                chromosomeId, tStart, tEnd, subread


if __name__ == '__main__':
    # Run one simulated experiment: every subread goes to /tmp/reads.fa and
    # one CSV row describing it goes to /tmp/reads.csv.
    fw = FastaWriter("/tmp/reads.fa")
    try:
        csvOut = open("/tmp/reads.csv", "w")
        try:
            csvOut.write("MoleculeId,SubReadId,Chromosome,tStart,tEnd,Strand\n")

            genome = [r.sequence for r in FastaReader("~/Data/lambdaNEB.fa")]
            #genome = [r.sequence for r in FastaReader("~/Data/Diploid/diploidLambda.fa")]
            seqParams = cc.SequencingParameters.C2()
            for item in simExperiment(42, seqParams,
                                      4000, 1000, genome, 50000, False):
                moleculeId, subreadId, strand, chromosomeId, tStart, tEnd, subread = item

                # All subreads of one molecule share the same read name.
                readName = "mSimulator/%d" % moleculeId
                fw.writeRecord(readName, subread)

                csvOut.write("%d,%d,%d,%d,%d,%d\n" %
                             (moleculeId, subreadId, chromosomeId, tStart, tEnd, strand))
        finally:
            csvOut.close()
    finally:
        # Close both outputs even if the simulation fails part-way.
        fw.close()
 def test_writeFasta1(self):
     """Records rewritten as whole record objects reproduce the input text."""
     buf = StringIO()
     writer = FastaWriter(buf)
     for rec in FastaReader(self.fasta1):
         writer.writeRecord(rec)
     expected = self.fasta1.getvalue()
     assert expected == buf.getvalue()
class ResultCollector(object):
    """
    Drains a queue of per-window results and writes the requested
    FASTA / FASTQ / GFF / VCF output files.
    """
    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        """Consume results until options.numWorkers None sentinels arrive."""
        self.onStart()

        sentinels = 0
        while sentinels < options.numWorkers:
            item = self._resultsQueue.get()
            if item is not None:
                self.onResult(item)
                continue
            # A None entry marks one finished worker.
            sentinels += 1

        self.onFinish()

    def run(self):
        """Run the collector, under cProfile when options.doProfiling is set."""
        if not options.doProfiling:
            self._run()
            return
        profilePath = os.path.join(options.temporaryDirectory,
                                   "profile-%s.out" % (self.name))
        cProfile.runctx("self._run()",
                        globals=globals(),
                        locals=locals(),
                        filename=profilePath)

    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        """Reset the per-reference bookkeeping and open the requested writers."""
        self.referenceBasesProcessedById = OrderedDict(
            (refId, 0) for refId in reference.byName)
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # A writer stays None unless its output filename option is set.
        self.fastaWriter = self.fastqWriter = None
        self.gffWriter = self.vcfWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(
                options.gffOutputFilename,
                vars(options),
                reference.byName.values())
        if options.vcfOutputFilename:
            self.vcfWriter = VariantsVcfWriter(
                options.vcfOutputFilename,
                vars(options),
                reference.byName.values())

    def onResult(self, result):
        """Record one (window, (css, variants)) result and flush if complete."""
        window, (css, variants) = result
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        """Close every writer that was opened."""
        logging.info("Analysis completed.")
        openedWriters = (self.fastaWriter, self.fastqWriter,
                         self.gffWriter, self.vcfWriter)
        for writer in openedWriters:
            if writer:
                writer.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        """Accumulate the consensus chunk, variants and base count for a window."""
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        windowLength = refEnd - refStart
        self.referenceBasesProcessedById[refId] += windowLength

    def _flushContigIfCompleted(self, window):
        """Write out and discard a reference's data once all its bases are in."""
        refId, _start, _end = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId,
                                                    options.referenceWindows)
        if basesProcessed != requiredBases:
            # Still waiting on more windows for this contig.
            return

        # This contig is done, so we can dump to file and delete
        # the data structures.
        if self.gffWriter or self.vcfWriter:
            sortedVariants = sorted(self.variantsByRefId[refId])
            for variantWriter in (self.gffWriter, self.vcfWriter):
                if variantWriter:
                    variantWriter.writeVariants(sortedVariants)
        del self.variantsByRefId[refId]

        #
        # One FAST[AQ] contig is written per span returned by
        # reference.enumerateSpans.  A span that covers the whole
        # reference keeps the plain reference name; any other span
        # gets "_<start>_<end>" appended.
        #
        for span in reference.enumerateSpans(refId,
                                             options.referenceWindows):
            _, spanStart, spanEnd = span
            isWholeContig = (spanStart == 0) and (spanEnd == refEntry.length)
            if isWholeContig:
                spanName = refName
            else:
                spanName = refName + "_%d_%d" % (spanStart, spanEnd)
            cssName = consensus.consensusContigName(
                spanName, self._algorithmName)

            # Keep only the chunks whose window intersects this span.
            chunksThisSpan = []
            for chunk in self.consensusChunksByRefId[refId]:
                if windows.windowsIntersect(chunk.refWindow, span):
                    chunksThisSpan.append(chunk)
            joined = consensus.join(chunksThisSpan)

            if self.fastaWriter:
                self.fastaWriter.writeRecord(cssName, joined.sequence)
            if self.fastqWriter:
                self.fastqWriter.writeRecord(cssName, joined.sequence,
                                             joined.confidence)

        del self.consensusChunksByRefId[refId]
Example #28
0
def main():
    """Build scaffolds from a cleaned assembly plus scored links and write FASTA.

    Command-line arguments:
        -a/--cleaned   assembly FASTA, read with FastaReader
        -f/--scaffold  output FASTA, written with FastaWriter
        -l/--links     whitespace-separated link rows
        -n/--length    contig length table (first column id, last column length)

    Relies on names defined outside this function: id2seq, nodes_to_edges,
    contig_length, edgemap, existing_nodes, the graphs G, H and oriented,
    and revcompl.

    NOTE(review): this function uses Python 2 syntax (print statements,
    long()).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--cleaned", help="cleaned assembly")
    parser.add_argument("-f", "--scaffold", help="final scaffold file")
    parser.add_argument("-l", "--links", help="links sorted by score")
    parser.add_argument("-n", "--length", help="contig length")
    args = parser.parse_args()

    f = FastaReader(args.cleaned)

    # Cache every contig sequence by id, minus its last 10 characters.
    # NOTE(review): the reason for trimming 10 characters is not visible
    # here -- confirm.
    for record in f:
        id = record.id
        print id
        id2seq[id] = record.sequence[0:-10]

    def break_cycle(nodes):
        # For a component that touches exactly two contigs, pick one link
        # between them from nodes_to_edges and return the 4-node path
        # [far end of start contig, start, end, far end of end contig].
        # Falls through (returns None) when the component has a different
        # number of contigs or no link between the two is found.
        nodeset = set()
        for node in nodes:
            nodeset.add(node.split(":")[0])
        nodeset = list(nodeset)
        # NOTE(review): weight starts as "" and is later compared with
        # numeric link weights; the outcome depends on Python 2 mixed-type
        # ordering -- confirm this is intended.
        weight = ""
        chosen_edge = ""
        if len(nodeset) == 2:
            u = nodeset[0]
            v = nodeset[1]
            if u in nodes_to_edges:
                edges = nodes_to_edges[u]
                sorted_edges = sorted(edges,
                                      key=operator.itemgetter(2),
                                      reverse=True)
                #print sorted_edges
                # No break here, so the last matching edge in descending
                # weight order is the one kept.
                for each in sorted_edges:
                    if each[1].split(":")[0] == v:
                        weight = each[2]
                        chosen_edge = each

            if v in nodes_to_edges:
                edges = nodes_to_edges[v]
                sorted_edges = sorted(edges,
                                      key=operator.itemgetter(2),
                                      reverse=True)
                for each in sorted_edges:
                    if each[1].split(":")[0] == u:
                        if each[2] > weight:
                            weight = each[2]
                            chosen_edge = each
                        if each[2] < weight:
                            break

            if chosen_edge != "":
                start = chosen_edge[0]
                end = chosen_edge[1]
                path = []
                # Prepend the opposite end of the start contig.
                if start.split(":")[1] == 'B':
                    path.append(start.split(":")[0] + ":E")
                    #path.append(start.split(":")[0]+":B")
                else:
                    path.append(start.split(":")[0] + ":B")
                    #path.append(start.split(":")[0]+":E")

                path.append(start)
                path.append(end)

                # Append the opposite end of the end contig.
                if end.split(":")[1] == 'B':
                    path.append(end.split(":")[0] + ":E")
                    #path.append(end.split(":")[0]+":B")
                else:
                    path.append(end.split(":")[0] + ":B")
                    #path.append(end.split(":")[0]+":E")

                return path

    # Contig lengths: first column is the contig id, last column its length.
    with open(args.length, 'r') as f:
        lines = f.readlines()
        for line in lines:
            attrs = line.split()
            contig_length[attrs[0]] = long(attrs[-1])

    # Link rows: columns 0 and 1 are "<contig>:<B|E>" nodes, column 3 is
    # read as `count` and the last column as `score`.
    contigs = set()
    with open(args.links, 'r') as f:
        for row in f:
            row = row.strip().split()
            v1, v2 = row[0:2]
            score = float(row[-1])
            count = float(row[3])
            c1 = v1.split(":")[0]
            c2 = v2.split(":")[0]
            contigs.add(c1)
            contigs.add(c2)

            # The link is stored under c1 only; c2 just gets an (empty)
            # entry created if it had none.
            if c1 not in nodes_to_edges:
                nodes_to_edges[c1] = []
            if c2 not in nodes_to_edges:
                nodes_to_edges[c2] = []
            nodes_to_edges[c1].append((v1, v2, float(row[3])))

            key = c1 + '$' + c2
            # Only links with count >= 60 enter the contig-level graph H
            # and the node-level graph `oriented`.
            if count >= 60:
                H.add_edge(c1, c2, weight=int(row[-1]))
                oriented.add_edge(v1, v2, weight=int(row[-1]))
            #H.add_edge(c1,c2,weight=int(row[-1]))
            #oriented.add_edge(v1,v2,weight=int(row[-1]),count=float(row[3]))

            # Accumulate int(last column) per contig pair, in both directions.
            #print key
            if key not in edgemap:
                edgemap[key] = int(row[-1])
            else:
                edgemap[key] += int(row[-1])

            key = c2 + '$' + c1
            if key not in edgemap:
                edgemap[key] = int(row[-1])
            else:
                edgemap[key] += int(row[-1])

            # A link is added to G only if neither endpoint has already
            # been used by an earlier row and count >= 150.  The --links
            # help text says rows are sorted by score, so earlier rows
            # presumably take priority -- confirm.
            if v1 not in existing_nodes and v2 not in existing_nodes:
                if count < 150:
                    continue
                G.add_edge(v1, v2, score=score, t="x")

                existing_nodes.add(v1)
                existing_nodes.add(v2)

    # Join the two ends of every contig seen in the links file.
    for ctg in list(contigs):
        G.add_edge(ctg + ":B", ctg + ":E", t="c", score=0)

    # NOTE(review): g_idx, recs and assigned are written below but never
    # read in this function.
    g_idx = 1
    recs = []
    to_merge = set()

    backbone_paths = {}
    path_id = 1
    assigned = {}
    # for u,v,data in G.edges(data=True):
    #     if data['score'] == 0:
    #         G[u][v]['score'] = 1000000
    #         continue
    #     G[u][v]['score'] = 1.0/data['score']
    # NOTE(review): connected_component_subgraphs is not available in
    # recent NetworkX releases -- confirm the pinned version.
    for subg in nx.connected_component_subgraphs(G):
        # Degree-1 nodes are the candidate endpoints of a linear path.
        p0 = []

        for v in subg.nodes():
            if subg.degree(v) == 1:
                p0.append(v)

        if len(p0) != 2:
            # Not a simple chain: try break_cycle, otherwise queue every
            # contig of the component for later placement.
            path = break_cycle(subg.nodes())
            if path != None:
                #print path
                # NOTE(review): break_cycle as written returns either None
                # or a 4-element list, so this branch looks unreachable --
                # confirm.
                if len(path) == 2:
                    assigned[path[0].split(':')[0]] = False
                    to_merge.add(path[0].split(':')[0])
                    continue
                backbone_paths[path_id] = path
                path_id += 1
            else:
                #print 'here'
                #print subg.nodes()
                for each in subg.nodes():
                    to_merge.add(each.split(':')[0])
                    assigned[each.split(':')[0]] = False
            continue
        else:
            path = nx.shortest_path(subg, p0[0], p0[1])
            # A 2-node path is a lone contig (its B and E ends only).
            if len(path) == 2:
                to_merge.add(path[0].split(':')[0])
                continue
            backbone_paths[path_id] = path
            #print path
            path_id += 1
            curr_contig = ""

        g_idx += 1

    #now for each separate contig, find a maximum likely backbone path
    # The chosen path is the one with the largest summed H edge weight
    # between the contig and the path's nodes.
    assignment = {}
    for each in to_merge:
        max_sum = -1
        max_path = -1
        for key in backbone_paths:
            path = backbone_paths[key]
            cur_sum = 0
            cnt = 0
            # Each contig appears in the path as both its :B and :E node,
            # so an H edge to it is counted once per node.
            for node in path:
                if H.has_edge(each, node.split(':')[0]):
                    cur_sum += H[each][node.split(':')[0]]['weight']
                    cnt += 1
            if cnt != 0 and cur_sum > max_sum:
                max_sum = cur_sum
                max_path = key

        if max_sum != -1:
            assignment[each] = (max_path, max_sum, contig_length[each])

    #now that we have found the path, try putting contig at best position in the path

    count = len(assignment)

    # Group the assigned contigs by backbone path id as
    # (contig, summed weight, contig length) tuples.
    path_to_contig = {}

    for each in assignment:
        key = assignment[each][0]
        if key not in path_to_contig:
            path_to_contig[key] = []
        path_to_contig[key].append(
            (each, assignment[each][1], assignment[each][2]))

    # Order each group by summed weight, largest first.
    for each in path_to_contig:
        contigs = path_to_contig[each]
        contigs_sorted = sorted(contigs,
                                key=operator.itemgetter(1),
                                reverse=True)
        path_to_contig[each] = contigs_sorted
        #print contigs_sorted

    # NOTE(review): ofile is opened here but never written to or closed
    # within this function.
    ofile = open('ambigous_contigs', 'w')

    for path_id in path_to_contig:
        path = backbone_paths[path_id]
        temp_path = list(path)

        contigs = path_to_contig[path_id]
        contigs = [str(i[0]) for i in contigs]
        explored = {}
        cnt = len(contigs)
        #print 'contig_length = ' + str(cnt)
        prev_len = -1
        curr_len = 0
        # Each pass picks the single best (contig, position, orientation)
        # among the candidates and marks it explored.  The loop stops when
        # every contig is explored or a pass adds nothing new to `explored`.
        while True:
            final_max = -1
            final_pos = -1
            final_orient = ''
            final_contig = ''
            final_begin = ''
            final_end = ''
            #print len(explored)
            if len(explored) == len(contigs) or prev_len == len(explored):
                break
            prev_len = len(explored)
            for contig in contigs:
                if contig not in explored:
                    begin = contig + ":B"
                    end = contig + ":E"
                    total_max = -1
                    orientation = ''
                    pos = -1
                    #check for positions in the middle of the path
                    # Candidate slots are the pairs (path[i], path[i + 1])
                    # at odd i.
                    for i in range(1, len(path) - 1, 2):
                        score_fow = -1
                        score_rev = -1

                        # Forward: path[i] -> begin and end -> path[i + 1].
                        if oriented.has_edge(path[i],
                                             begin) and oriented.has_edge(
                                                 end, path[i + 1]):
                            score_fow = oriented[
                                path[i]][begin]['weight'] + oriented[end][path[
                                    i + 1]]['weight']

                        # Reverse: path[i] -> end and begin -> path[i + 1].
                        if oriented.has_edge(path[i],
                                             end) and oriented.has_edge(
                                                 begin, path[i + 1]):
                            score_rev = oriented[
                                path[i]][end]['weight'] + oriented[begin][path[
                                    i + 1]]['weight']

                        if score_fow >= score_rev:
                            if score_fow > total_max:
                                total_max = score_fow
                                orientation = 'fow'
                                pos = i
                        else:
                            if score_rev > total_max:
                                total_max = score_rev
                                orientation = 'rev'
                                pos = i

                        #check for start and end
                        # NOTE(review): these four checks sit inside the
                        # for-i loop, so they repeat for every i and are
                        # skipped entirely when the loop body never runs --
                        # confirm the indentation is intended.
                        if oriented.has_edge(begin, path[0]):
                            score_fow = oriented[begin][path[0]]['weight']
                            if score_fow > total_max:
                                total_max = score_fow
                                orientation = 'fow'
                                pos = 0

                        if oriented.has_edge(end, path[0]):
                            score_rev = oriented[end][path[0]]['weight']
                            if score_rev > total_max:
                                total_max = score_rev
                                orientation = 'rev'
                                pos = 0

                        if oriented.has_edge(path[-1], begin):
                            score_fow = oriented[path[-1]][begin]['weight']
                            if score_fow > total_max:
                                total_max = score_fow
                                orientation = 'fow'
                                pos = len(path)

                        if oriented.has_edge(path[-1], end):
                            score_rev = oriented[path[-1]][end]['weight']
                            if score_rev > total_max:
                                total_max = score_rev
                                orientation = 'rev'
                                pos = len(path)

                # NOTE(review): this comparison is outside the
                # `if contig not in explored` block, so for an already
                # explored contig it reuses total_max/pos/begin/end left
                # over from an earlier contig -- confirm.
                if total_max > final_max:
                    final_max = total_max
                    final_pos = pos
                    final_orient = orientation
                    final_contig = contig
                    final_begin = begin
                    final_end = end

            # Insert only when the best score exceeds 70; either way the
            # chosen contig is marked explored.
            if final_max > 70:
                #prev_len = len(explored)
                explored[final_contig] = 1
                if final_orient == 'fow':
                    if final_pos == 0:
                        path.insert(0, final_begin)
                        path.insert(0, final_end)
                    else:
                        if final_pos == len(path):
                            path.append(final_begin)
                            path.append(final_end)
                        else:
                            path.insert(final_pos + 1, final_begin)
                            path.insert(final_pos + 2, final_end)

                else:
                    # NOTE(review): the final_pos == 0 case below inserts in
                    # the same order as the 'fow' branch above -- confirm.
                    if final_pos == 0:
                        path.insert(0, final_begin)
                        path.insert(0, final_end)
                    else:
                        if final_pos == len(path):
                            path.append(final_end)
                            path.append(final_begin)
                        else:
                            path.insert(final_pos + 1, final_end)
                            path.insert(final_pos + 2, final_begin)

            else:
                explored[final_contig] = 1

        backbone_paths[path_id] = path

    # for key in backbone_paths:
    #     if len(backbone_paths[key]) >= 4:
    #         print backbone_paths[key]

    # for key1 in backbone_paths:
    #     max_weight = 0
    #     max_path = ''
    #     for key2 in backbone_paths:
    #         if key1 != key2:
    #             path1 = backbone_paths[key1]
    #             path2 = backbone_paths[key2]
    #             weight = 0
    #             for contig1 in path1:
    #                 ctg1 = contig1.split(':')[0]
    #                 for contig2 in path2:
    #                     ctg2 = contig2.split(':')[0]
    #                     if H.has_edge(ctg1,ctg2):
    #                         weight += H[ctg1][ctg2]['weight']
    #             if weight > max_weight:
    #                 max_weight = weight
    #                 max_path = key2

    #     if max_path != '' and 1000 < max_weight < 4000:
    #         print backbone_paths[key1], backbone_paths[max_path], max_weight

    # Emit one scaffold per backbone path with at least 4 nodes.
    # NOTE(review): writer is never closed within this function.
    c_id = 1
    writer = FastaWriter(args.scaffold)
    for key in backbone_paths:
        if len(backbone_paths[key]) >= 4:
            path = backbone_paths[key]
            curr_contig = ""
            print c_id
            # Walk the path two nodes at a time: a B->E pair appends the
            # contig as stored, an E->B pair appends revcompl() of it.
            for i in range(0, len(path) - 1, 2):
                curr = path[i]
                next = path[i + 1]
                curr = curr.split(':')
                next = next.split(':')
                print curr
                if curr[1] == 'B' and next[1] == 'E':
                    curr_contig += id2seq[curr[0]]
                if curr[1] == 'E' and next[1] == 'B':
                    #print id2seq[curr[0]]
                    curr_contig += revcompl(id2seq[curr[0]])
                # Separate consecutive contigs with 500 'N' characters
                # (none after the last pair).
                if i != len(path) - 2:
                    for j in range(0, 500):
                        curr_contig += 'N'
            # rec = SeqRecord(Seq(curr_contig,generic_dna),id='scaffold_'+str(c_id))
            # recs.append(rec)
            print c_id
            writer.writeRecord('scaffold_' + str(c_id), curr_contig)
            c_id += 1
class ResultCollector(object):
    """
    Pulls per-window results off a queue and writes the requested
    FASTA / FASTQ / GFF output files.
    """
    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        """Process queue entries until options.numWorkers None sentinels arrive."""
        self.onStart()

        finishedWorkers = 0
        while finishedWorkers < options.numWorkers:
            entry = self._resultsQueue.get()
            if entry is not None:
                self.onResult(entry)
                continue
            # A None entry marks one finished worker.
            finishedWorkers += 1

        self.onFinish()

    def run(self):
        """Run the collector, under cProfile when options.doProfiling is set."""
        if not options.doProfiling:
            self._run()
            return
        profilePath = os.path.join(options.temporaryDirectory,
                                   "profile-%s.out" % (self.name))
        cProfile.runctx("self._run()",
                        globals=globals(),
                        locals=locals(),
                        filename=profilePath)


    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        """Reset the per-reference bookkeeping and open the requested writers."""
        self.referenceBasesProcessedById = OrderedDict(
            (refId, 0) for refId in reference.byName)
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # A writer stays None unless its output filename option is set.
        self.fastaWriter = None
        self.fastqWriter = None
        self.gffWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(
                options.gffOutputFilename,
                vars(options),
                reference.byName.values())

    def onResult(self, result):
        """Record one (window, (css, variants)) result and flush if complete."""
        window, (css, variants) = result
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        """Close every writer that was opened."""
        logging.info("Analysis completed.")
        for writer in (self.fastaWriter, self.fastqWriter, self.gffWriter):
            if writer:
                writer.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        """Accumulate the consensus chunk, variants and base count for a window."""
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        windowLength = refEnd - refStart
        self.referenceBasesProcessedById[refId] += windowLength

    def _flushContigIfCompleted(self, window):
        """Write out and discard a reference's data once all its bases are in."""
        refId, _start, _end = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId, options.referenceWindows)
        if basesProcessed != requiredBases:
            # Still waiting on more windows for this contig.
            return

        # This contig is done, so we can dump to file and delete
        # the data structures.
        if self.gffWriter:
            sortedVariants = sorted(self.variantsByRefId[refId])
            self.gffWriter.writeVariants(sortedVariants)
        del self.variantsByRefId[refId]

        #
        # One FAST[AQ] contig is written per span returned by
        # reference.enumerateSpans.  A span that covers the whole
        # reference keeps the plain reference name; any other span
        # gets "_<start>_<end>" appended.
        #
        for span in reference.enumerateSpans(refId, options.referenceWindows):
            _, spanStart, spanEnd = span
            if (spanStart == 0) and (spanEnd == refEntry.length):
                spanName = refName
            else:
                spanName = refName + "_%d_%d" % (spanStart, spanEnd)
            cssName = consensus.consensusContigName(spanName, self._algorithmName)

            # Keep only the chunks whose window intersects this span.
            chunksThisSpan = []
            for chunk in self.consensusChunksByRefId[refId]:
                if windows.windowsIntersect(chunk.refWindow, span):
                    chunksThisSpan.append(chunk)
            joined = consensus.join(chunksThisSpan)

            if self.fastaWriter:
                self.fastaWriter.writeRecord(cssName, joined.sequence)
            if self.fastqWriter:
                self.fastqWriter.writeRecord(cssName,
                                             joined.sequence,
                                             joined.confidence)

        del self.consensusChunksByRefId[refId]
Example #30
0
def pick_rep(isoform_filename,
             gff_filename,
             group_filename,
             output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    Parameters:
        isoform_filename: input isoforms (fasta, fastq or contigset.xml).
        gff_filename: collapsed GFF supplying coordinates per transcript id.
        group_filename: group file listing the members of each collapsed id.
        output_filename: output (fasta, fastq or contigset.xml).
        pick_least_err_instead: see above; only honoured for FASTQ input.
        bad_gff_filename: optional second GFF whose coordinates are also loaded.

    Raises:
        IOError: input or output file type is not recognized, or a
            contigset input is neither a single FASTQ nor indexed.
        ValueError: FASTQ output requested for non-FASTQ input, or a group
            id has no coordinates in the GFF file(s).
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        # Both extensions must be tested under the "exactly one file"
        # condition; the tuple form also avoids indexing an empty list.
        if len(_fns) == 1 and _fns[0].endswith((".fq", ".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError(
                    "%s must contain either indexed FASTA files or " %
                    isoform_filename + "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    try:
        # Load coordinates from the good GFF and, when given, the bad GFF.
        gff_filenames = [gff_filename]
        if bad_gff_filename is not None:
            gff_filenames.append(bad_gff_filename)

        coords = {}
        for fn in gff_filenames:
            for r in CollapseGffReader(fn):
                coords[r.transcript_id] = "{0}:{1}-{2}({3})".format(
                    r.seqid, r.start, r.end, r.strand)

        use_least_err = is_fq and pick_least_err_instead

        for group in GroupReader(group_filename):
            pb_id, members = group.name, group.members
            if pb_id not in coords:
                raise ValueError("Could not find %s in %s and %s" %
                                 (pb_id, gff_filename, bad_gff_filename))
            #logging.info("Picking representative sequence for %s", pb_id)
            best_id = None
            best_seq = None
            best_qual = None
            best_err = 9999999
            err = 9999999
            max_len = 0

            for x in members:
                rec = fd[x]  # look the record up once per member
                if use_least_err:
                    # Expected number of base errors: the sum of the Phred
                    # error probabilities 10^(-Q/10).
                    err = sum(10 ** -(q / 10.) for q in rec.quality)
                    is_better = err < best_err
                else:
                    # ">=" means the last of several equally long members wins.
                    is_better = len(rec.sequence) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.sequence
                    if is_fq:
                        best_qual = rec.quality
                        best_err = err
                    max_len = len(rec.sequence)

            _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
            _seq_ = best_seq
            if fq_writer is not None:
                fq_writer.writeRecord(_id_, _seq_, best_qual)
            if fa_writer is not None:
                fa_writer.writeRecord(_id_, _seq_)
    finally:
        # Close the outputs on failure as well as on success.
        if fa_writer is not None:
            fa_writer.close()
        if fq_writer is not None:
            fq_writer.close()

    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
 def test_writeFasta2(self):
     """Round-trip fasta2 through FastaWriter using (header, sequence) pairs."""
     sink = StringIO()
     writer = FastaWriter(sink)
     for rec in FastaReader(self.fasta2):
         header, sequence = rec.header, rec.sequence
         writer.writeRecord(header, sequence)
     # The rewritten text must match the fixture exactly.
     expected = self.fasta2.getvalue()
     assert expected == sink.getvalue()
Example #32
0
 def test_writeFasta2(self):
     """Round-trip fasta1 through FastaWriter using (name, sequence) pairs."""
     sink = StringIO()
     writer = FastaWriter(sink)
     for rec in FastaReader(self.fasta1):
         name, sequence = rec.name, rec.sequence
         writer.writeRecord(name, sequence)
     # The rewritten text must match the fixture exactly.
     expected = self.fasta1.getvalue()
     assert_equal(expected, sink.getvalue())
Example #33
0
def pick_rep(isoform_filename, gff_filename,
             group_filename, output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    Parameters:
        isoform_filename: input isoforms (fasta, fastq or contigset.xml).
        gff_filename: collapsed GFF supplying coordinates per transcript id.
        group_filename: group file listing the members of each collapsed id.
        output_filename: output (fasta, fastq or contigset.xml).
        pick_least_err_instead: see above; only honoured for FASTQ input.
        bad_gff_filename: optional second GFF whose coordinates are also loaded.

    Raises:
        IOError: input or output file type is not recognized, or a
            contigset input is neither a single FASTQ nor indexed.
        ValueError: FASTQ output requested for non-FASTQ input, or a group
            id has no coordinates in the GFF file(s).
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        # Both extensions must be tested under the "exactly one file"
        # condition; the tuple form also avoids indexing an empty list.
        if len(_fns) == 1 and _fns[0].endswith((".fq", ".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError("%s must contain either indexed FASTA files or " % isoform_filename +
                              "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml": # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    try:
        # Load coordinates from the good GFF and, when given, the bad GFF.
        gff_filenames = [gff_filename]
        if bad_gff_filename is not None:
            gff_filenames.append(bad_gff_filename)

        coords = {}
        for fn in gff_filenames:
            for r in CollapseGffReader(fn):
                coords[r.transcript_id] = "{0}:{1}-{2}({3})".format(
                    r.seqid, r.start, r.end, r.strand)

        use_least_err = is_fq and pick_least_err_instead

        for group in GroupReader(group_filename):
            pb_id, members = group.name, group.members
            if pb_id not in coords:
                raise ValueError("Could not find %s in %s and %s" %
                                 (pb_id, gff_filename, bad_gff_filename))
            #logging.info("Picking representative sequence for %s", pb_id)
            best_id = None
            best_seq = None
            best_qual = None
            best_err = 9999999
            err = 9999999
            max_len = 0

            for x in members:
                rec = fd[x]  # look the record up once per member
                if use_least_err:
                    # Expected number of base errors: the sum of the Phred
                    # error probabilities 10^(-Q/10).
                    err = sum(10 ** -(q / 10.) for q in rec.quality)
                    is_better = err < best_err
                else:
                    # ">=" means the last of several equally long members wins.
                    is_better = len(rec.sequence) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.sequence
                    if is_fq:
                        best_qual = rec.quality
                        best_err = err
                    max_len = len(rec.sequence)

            _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
            _seq_ = best_seq
            if fq_writer is not None:
                fq_writer.writeRecord(_id_, _seq_, best_qual)
            if fa_writer is not None:
                fa_writer.writeRecord(_id_, _seq_)
    finally:
        # Close the outputs on failure as well as on success.
        if fa_writer is not None:
            fa_writer.close()
        if fq_writer is not None:
            fq_writer.close()

    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
Example #34
0
class FastaEmitter(object):
    """Emits ZMW reads as FASTA records through a FastaWriter."""

    def __init__(self, filename):
        self.writer = FastaWriter(filename)

    def emit(self, zmwRead):
        """Write one record: zmwRead.readName as name, zmwRead.basecalls() as sequence."""
        name = zmwRead.readName
        bases = zmwRead.basecalls()
        self.writer.writeRecord(name, bases)