Example #1
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) files and save to combined_dir.
    Dumping hq|lq prefix dictionary to pickle.
    Return an instance of CombinedFiles.
    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)
    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()
    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
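For context, a minimal sketch of reading the dumped prefix dictionary back
(the file name, prefix, and path below are hypothetical):

import cPickle

with open("hq_lq_prefix_dict.pickle", "rb") as reader:
    d = cPickle.load(reader)
# d maps isoform type to {combined_prefix: cluster_bin_dir}, e.g.,
# d['HQ']['i0_HQ_mysample'] -> '/path/to/0to1kb_part0/cluster_out'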
Example #2
def read_fastq_dict(fastq_input):
    """Read FASTQ records from a file name, or a list of file names, into a
    dict mapping the first word of each read name to its record."""
    records = {}
    if isinstance(fastq_input, str):
        for rec in FastqReader(fastq_input):
            name = rec.name.strip().split()[0]
            assert name not in records
            records[name] = rec
    elif isinstance(fastq_input, list):
        for filename in fastq_input:
            for rec in FastqReader(filename):
                name = rec.name.strip().split()[0]
                assert name not in records
                records[name] = rec
    return records
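A minimal usage sketch of read_fastq_dict (file names and the read name are
hypothetical); both a single file name and a list of file names are accepted:

single = read_fastq_dict("sample.fastq")
merged = read_fastq_dict(["run1.fastq", "run2.fastq"])
rec = merged["movie/4194369/ccs"]  # keyed by the first word of the read name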
Example #3
    def __init__(self, isoseq_output_fn, reference_transcripts_fn,
                 output_analysis_fn, min_true_positive, max_false_positive,
                 min_seq_similarity, max_fuzzy_junction):
        self.isoseq_output_fn = isoseq_output_fn
        self.reference_transcripts_fn = reference_transcripts_fn
        self.output_analysis_fn = output_analysis_fn

        if isoseq_output_fn.endswith(".fasta") or isoseq_output_fn.endswith(
                ".fa"):
            self.isoforms = [r for r in FastaReader(isoseq_output_fn)]
            self.isoseq_output_fa = self.isoseq_output_fn
        elif isoseq_output_fn.endswith(".fastq") or isoseq_output_fn.endswith(
                ".fq"):
            self.isoforms = [r for r in FastqReader(isoseq_output_fn)]
            self.isoseq_output_fa = self.output_analysis_fn + ".isoseq.fa"
            with FastaWriter(self.isoseq_output_fa) as writer:
                for r in self.isoforms:
                    writer.writeRecord(r.name, r.sequence)
        else:
            raise ValueError("Unsupported isoseq output file format: %s" %
                             isoseq_output_fn)

        self.reference_transcripts = [
            r for r in FastaReader(reference_transcripts_fn)
        ]

        self.min_true_positive = min_true_positive
        self.max_false_positive = max_false_positive
        self.min_seq_similarity = min_seq_similarity if min_seq_similarity <= 1 \
                                  else min_seq_similarity / 100.0
        self.max_fuzzy_junction = max_fuzzy_junction

        self.alns = self.filter_alns(
            self.map_isoforms_to_reference_transcripts())
Example #4
    def test_runner(self):
        """Test CombineRunner."""
        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
        d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
        split_dirs = [op.join(d, b, "cluster_out") for b in
                      ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0", "3to4kb_part0", "4to5kb_part0")]
        print(split_dirs)
        out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
        rmpath(out_combined_dir)
        mkdir(out_combined_dir)
        obj = CombineRunner(combined_dir=out_combined_dir,
                            sample_name="mysample",
                            split_dirs=split_dirs,
                            ipq_opts=ipq_opts)
        obj.run()

        expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa, obj.all_lq_fq,
                            obj.all_consensus_isoforms_fa,
                            obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
        self.assertTrue(all([op.exists(f) for f in expected_out_fns]))

        expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826', 'i2_HQ_mysample|c2/f9p14/2470',
                                'i2_HQ_mysample|c5/f7p19/2472', 'i2_HQ_mysample|c10/f8p16/2457',
                                'i2_HQ_mysample|c98/f2p10/2081', 'i2_HQ_mysample|c108/f23p28/2471']
        self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)], expected_hq_isoforms)
        self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)], expected_hq_isoforms)

        expected_lq_isoforms_num = 73
        self.assertEqual(len([r for r in FastaReader(obj.all_lq_fa)]), expected_lq_isoforms_num)

        expected_consensus_isoforms_num = 79
        self.assertEqual(len([r for r in FastaReader(obj.all_consensus_isoforms_fa)]), expected_consensus_isoforms_num)
Example #5
def split_laa_fastq(input_file_name,
                    output_file_base,
                    subreads_file_name,
                    bio_samples_by_bc=None):
    """
    Split an LAA FASTQ file into one file per barcode.
    """
    if op.getsize(input_file_name) == 0:
        return []
    records = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for rec in fastq_in:
            bc_id = re.sub("^Barcode", "", rec.id.split("_")[0])
            records[bc_id].append(rec)
    if bio_samples_by_bc is None:
        bio_samples_by_bc = {}
        with SubreadSet(subreads_file_name, strict=True) as ds:
            if ds.isBarcoded:  # pylint: disable=no-member
                bio_samples_by_bc = get_barcode_sample_mappings(ds)
    outputs = []
    for bc_id in sorted(records.keys()):
        bio_sample = bio_samples_by_bc.get(bc_id, "unknown")
        ofn = "{b}.{s}.{i}.fastq".format(b=output_file_base,
                                         s=bio_sample,
                                         i=bc_id)
        with FastqWriter(ofn) as fastq_out:
            for rec in records[bc_id]:
                fastq_out.writeRecord(rec)
        outputs.append(ofn)
    return outputs
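A hedged usage sketch (paths are hypothetical); the output files follow the
{base}.{sample}.{barcode}.fastq pattern built above:

outputs = split_laa_fastq("amplicon_analysis.fastq", "by_barcode",
                          "subreads.subreadset.xml")
# e.g., ['by_barcode.Alice.lbc1--lbc1.fastq', 'by_barcode.Charles.lbc3--lbc3.fastq']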
Example #6
def isValidFastq(filename):
    if not isValidFile(filename) or not isFastqFile(filename):
        return False
    try:
        list(FastqReader(filename))
    except Exception:
        return False
    return True
Example #7
def ReadSeqFiles(fns):
    seqs = {}
    for fn in fns:
        for record in FastqReader(fn):
            if record.id in seqs:
                raise SystemExit(
                    "ERROR: Duplicate sequence id '{0}'".format(record.id))
            seqs[record.id] = record
    return seqs
Example #8
    def precache_fastq(self, fastq_filename):
        """
        Cache each sequence in the FASTQ file into self.qv
        """
        for r in FastqReader(fastq_filename):
            seqid = r.name.split()[0]
            self.qv[seqid] = {}
            c_basQV.fastq_precache_helper(seqid, r.quality, self.qv)
            self.make_qv_mean([seqid])
Example #9
def write_temp_fasta(fastq_file):
    """
    Write a temporary Fasta file from a Fastq
    """
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    with FastaWriter(temp.name) as handle:
        for record in FastqReader(fastq_file):
            temp_record = FastaRecord(record.name, record.sequence)
            handle.writeRecord(temp_record)
    return temp
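Because the temporary file is created with delete=False, the caller owns the
cleanup; a minimal sketch (the downstream step is hypothetical):

import os

temp = write_temp_fasta("reads.fastq")
try:
    run_downstream_step(temp.name)  # hypothetical consumer of the FASTA
finally:
    os.remove(temp.name)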
Example #10
def _fastq_to_fasta(fastq_path, fasta_path):
    """Convert a fastq file to  fasta file"""
    with FastqReader(fastq_path) as r:
        with FastaWriter(fasta_path) as w:
            for fastq_record in r:
                fasta_record = FastaRecord(fastq_record.name, fastq_record.sequence)
                w.writeRecord(fasta_record)

    log.info("Completed converting {q} to {f}".format(q=fastq_path, f=fasta_path))
    return 0
Example #11
def combine_amplicon_analysis_files(directory):
    output_file = os.path.join(directory, 'amplicon_analysis.all.fastq')
    with FastqWriter(output_file) as handle:
        for input_file in [
                'amplicon_analysis.fastq',
                'amplicon_analysis_chimeras_noise.fastq'
        ]:
            input_path = os.path.join(directory, input_file)
            for record in FastqReader(input_path):
                handle.writeRecord(record)
    return output_file
Example #12
def _parse_input_records(input_file):
    """
    Parse the input sequence records with the appropriate pbcore Reader
    """
    input_type = get_file_type(input_file)
    if input_type == 'fasta':
        return list(FastaReader(input_file))
    elif input_type == 'fastq':
        return list(FastqReader(input_file))
    else:
        msg = 'Input file must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
Example #13
def main(parser):
    args = parser.parse_args()

    fx = args.inFastx if args.inFastx else sys.stdin
    #sortedRecs    = sorted(pysam.FastxFile(fx),key=lambda r:-len(r.sequence))
    try:
        recs = list(FastqReader(fx))
    except ValueError:
        #this will fail if fasta is streamed
        recs = list(FastaReader(fx))
    if not len(recs):
        print(f'No records in {fx}')
        return None

    counts = [rec.sequence.count(args.motif) for rec in recs]
    xlabel = f'{args.motif} Repeat Copies (exclusive)'
    labelMotif = list(map(
        eval, args.labelMotif.split(','))) if args.labelMotif else None
    f, c, b = countPlot(counts,
                        args.name,
                        xlabel,
                        args.ylabel,
                        labelValues=labelMotif,
                        binsize=args.binsize,
                        bandwidth=args.bandwidth,
                        plotKde=args.plotKde)
    f.savefig(f'{args.out}.motifcount.{args.format}',
              format=args.format,
              dpi=args.dpi)
    counts = list(map(len, recs))
    xlabel = 'Target Insert Length (bp)'
    labelLength = list(map(
        eval, args.labelLength.split(','))) if args.labelLength else None
    f, c, b = countPlot(counts,
                        args.name,
                        xlabel,
                        args.ylabel,
                        labelValues=labelLength,
                        binsize=len(args.motif) * args.binsize,
                        bandwidth=len(args.motif) * args.bandwidth,
                        plotKde=args.plotKde)
    f.savefig(f'{args.out}.insertSize.{args.format}',
              format=args.format,
              dpi=args.dpi)
    if args.exportBincounts:
        oname = f'{args.out}.histogramBins.csv'
        with open(oname, 'w') as ofile:
            ofile.write('Length,Reads\n')
            for bn, cnt in zip(b, c):
                ofile.write(f'{bn},{cnt}\n')

    print('Done')
    return f
Example #14
def readSequenceRecords(filename):
    """
    Parse the input sequence records with the appropriate pbcore Reader
    """
    fileType = getFileType(filename)
    if fileType == 'fasta':
        return list(FastaReader(filename))
    elif fileType == 'fastq':
        return list(FastqReader(filename))
    else:
        msg = 'Input file must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)
Example #15
    def run_after(self, rtc, output_dir):
        rep_fn = rtc.task.output_files[0]
        gff_fn = rtc.task.output_files[1]
        abundance_fn = rtc.task.output_files[2]
        group_fn = rtc.task.output_files[3]
        read_stat_fn = rtc.task.output_files[4]
        from pbcore.io import FastqReader
        from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader, ReadStatReader
        self.assertEqual(len([r for r in FastqReader(rep_fn)]), 65)
        self.assertEqual(len([r for r in CollapseGffReader(gff_fn)]), 65)
        self.assertEqual(len([r for r in AbundanceReader(abundance_fn)]), 65)
        self.assertEqual(len([r for r in GroupReader(group_fn)]), 86)
        self.assertEqual(len([r for r in ReadStatReader(read_stat_fn)]), 10873)
Example #16
    def test_split_laa_fastq(self):
        ifn = self.input_file_name
        ofb = tempfile.NamedTemporaryFile().name
        ofs = split_laa_fastq(ifn, ofb, self._subreads)
        assert len(ofs) == 2
        suffixes = sorted([".".join(of.split('.')[1:]) for of in ofs])
        assert suffixes == [
            'Alice.lbc1--lbc1.fastq', 'Charles.lbc3--lbc3.fastq'
        ]
        for i, ofn in enumerate(ofs):
            with FastqReader(ofn) as fastq_in:
                recs = [rec for rec in fastq_in]
                for j in range(2):
                    assert str(recs[j]) == str(self._records[(i * 2) + j])
Example #17
    def _open_files(self, *input_filenames):
        """Open file handles and return them."""
        readers = []
        for fn in input_filenames:
            if ContigSetReaderWrapper.get_file_type(fn) == "FASTA":
                readers.append(FastaReader(fn))
            elif ContigSetReaderWrapper.get_file_type(fn) == "FASTQ":
                readers.append(FastqReader(fn))
            elif ContigSetReaderWrapper.get_file_type(fn) == "CONTIGSET":
                readers.append(ContigSet(fn))
            else:
                raise IOError(
                    "Could not read %s as FASTA/FASTQ/CONTIGSET file." % fn)
        return readers
Example #18
def _get_contigs(fastq):
    """
    Digests the polished contigs into a dict of ContigInfo objects
    :param fastq: (str) path to polished fastq file
    :return: (dict) contig id -> ContigInfo object
    """
    contigs = {}
    fqr = FastqReader(fastq)
    for rec in fqr:
        # remove quiver/arrow appended string, otherwise we can't cross
        # reference the name in the gff
        cinf = ContigInfo(rec)
        contigs[cinf.name] = cinf

    return contigs
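ContigInfo is defined elsewhere; based on the comment above, a hypothetical
minimal stand-in might look like this (illustration only):

class ContigInfo(object):
    """Hypothetical minimal stand-in for the real ContigInfo."""
    def __init__(self, rec):
        # strip the "|quiver"/"|arrow" suffix so the name matches the gff
        self.name = rec.name.split("|")[0]
        self.length = len(rec.sequence)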
Example #19
def read_sequences_by_barcode(sequence_file, trim):
    paths = defaultdict(list)
    with FastqReader(sequence_file) as handle:
        for record in handle:
            barcode = record.name.split('_Cluster')[0][7:]  # [7:] drops the leading "Barcode" label
            if barcode == "65535--65535":
                continue
            if trim:
                trimSeq = record.sequence[trim:-trim]
                trimQual = record.qualityString[trim:-trim]
                rec = FastqRecord(record.name, trimSeq, trimQual)
            else:
                rec = record
            paths[barcode].append(rec)
    return paths
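Note the symmetric trimming above: with trim=2, record.sequence[trim:-trim]
drops two bases (and their quality values) from each end. A quick check:

seq = "AACGTACGTT"
trim = 2
assert seq[trim:-trim] == "CGTACG"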
Example #20
    def test_len_fastq(self):
        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
              'lambda/2590980/0008/Analysis_Results/'
              'm141115_075238_ethan_c100699872550000001'
              '823139203261572_s1_p0.1.subreads.fastq')
        fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
        with open(fq_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 24):
                    fqh.write(line)
        cset = ContigSet(fq_out)
        assert not cset.isIndexed
        assert isinstance(cset.resourceReaders()[0], FastqReader)
        assert sum(1 for _ in cset) == sum(1 for _ in FastqReader(fq_out))
        assert sum(1 for _ in cset) == 6
Example #21
def split_laa_fastq(input_file_name, output_file_base):
    """
    Split an LAA FASTQ file into one file per barcode.
    """
    if op.getsize(input_file_name) == 0:
        return []
    records = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for rec in fastq_in:
            bc_id = rec.id.split("_")[0]
            records[bc_id].append(rec)
    outputs = []
    for bc_id in sorted(records.keys()):
        ofn = "{b}.{i}.fastq".format(b=output_file_base, i=bc_id)
        with FastqWriter(ofn) as fastq_out:
            for rec in records[bc_id]:
                fastq_out.writeRecord(rec)
        outputs.append(ofn)
    return outputs
Example #22
    def test_filter_by_count(self):
        """Test filter_by_count"""
        out_abundance_fn = op.join(_OUT_DIR_, "filter_by_count.abundance.txt")
        out_gff_fn = op.join(_OUT_DIR_, "filter_by_count.gff")
        out_rep_fn = op.join(_OUT_DIR_, "filter_by_count.rep.fastq")
        filter_by_count(in_group_filename=GROUP_FN, in_abundance_filename=ABUNDANCE_FN,
                        in_gff_filename=GFF_FN, in_rep_filename=REP_FN,
                        out_abundance_filename=out_abundance_fn,
                        out_gff_filename=out_gff_fn,
                        out_rep_filename=out_rep_fn,
                        min_count=20)

        out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
        self.assertEqual(out_abundance_ids, self.expected_good)

        out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
        self.assertEqual(out_gff_ids, self.expected_good)

        out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
        self.assertEqual(out_rep_ids, self.expected_good)
Example #23
    def testAll(self):
        """Test FastqRandomReader.keys() and __getitem__."""
        reads = [r for r in FastqReader(self.inFq)]
        names = [r.name for r in reads]

        frr = FastqRandomReader(self.inFq)
        self.assertTrue(set(frr.keys()) == set(names))

        self.assertTrue(
            False not in [frr[r.name].name == r.name for r in reads])

        self.assertTrue(
            False not in [frr[r.name].sequence == r.sequence for r in reads])

        self.assertTrue(
            False not in
            [frr[r.name].quality.all() == r.quality.all() for r in reads])

        self.assertTrue(
            False not in
            [frr[r.name].qualityString == r.qualityString for r in reads])
Example #24
    def test_filter_out_subsets(self):
        """Test filter_out_subsets"""
        out_abundance_fn = op.join(_OUT_DIR_, "filter_out_subsets.abundance.txt")
        out_gff_fn = op.join(_OUT_DIR_, "filter_out_subsets.gff")
        out_rep_fn = op.join(_OUT_DIR_, "filter_out_subsets.rep.fastq")
        filter_out_subsets(in_abundance_filename=ABUNDANCE_FN,
                           in_gff_filename=GFF_FN, in_rep_filename=REP_FN,
                           out_abundance_filename=out_abundance_fn,
                           out_gff_filename=out_gff_fn, out_rep_filename=out_rep_fn,
                           max_fuzzy_junction=5)

        all_ids = [r.seqid for r in CollapseGffReader(GFF_FN)]
        expected_good = set(all_ids) - set(self.expected_diff)
        out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
        self.assertEqual(set(out_abundance_ids), expected_good)

        out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
        self.assertEqual(set(out_gff_ids), expected_good)

        out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
        self.assertEqual(set(out_rep_ids), expected_good)
Example #25
    def presmooth(self, seqids, window_size, fastq_filename=None):
        """
        precache() MUST already have been called; otherwise this raises a
        KeyError, unless fastq_filename is given as a fallback source of QVs.
        """
        self.window_size = window_size
        for seqid in seqids:
            try:
                self.qv[seqid]['smoothed'] = c_basQV.maxval_per_window(
                    self.qv[seqid]['unsmoothed'], window_size)
            except KeyError:
                if fastq_filename is None:
                    raise KeyError("Qvs of {seqid} ".format(seqid=seqid) +
                                   "must be precached.")
                for r in FastqReader(fastq_filename):
                    if r.name.split()[0] == seqid:
                        self.qv[seqid] = {}
                        c_basQV.fastq_precache_helper(seqid, r.quality,
                                                      self.qv)
                        break
                else:
                    # only fail if the seqid was never found in the fastq
                    raise KeyError("Qvs of {seqid} ".format(seqid=seqid) +
                                   "could not be read from {fq}".format(
                                       fq=fastq_filename))
                # retry smoothing now that the QVs are cached
                self.qv[seqid]['smoothed'] = c_basQV.maxval_per_window(
                    self.qv[seqid]['unsmoothed'], window_size)
Example #26
def main(parser):
    args = parser.parse_args()

    fx = args.inFastx if args.inFastx else sys.stdin
    #sortedRecs    = sorted(pysam.FastxFile(fx),key=lambda r:-len(r.sequence))
    keyfunc = (lambda s: -len(s)) if args.reverse else len
    try:
        sortedRecs = sorted(FastqReader(fx), key=keyfunc)
    except ValueError:
        #this will fail if fasta is streamed
        sortedRecs = sorted(FastaReader(fx), key=keyfunc)
    if len(sortedRecs) == 0:
        # a weird single fastx record spanning three lines can parse to zero
        # records as fastq; retry as fasta
        sortedRecs = sorted(FastaReader(fx), key=keyfunc)

    motifs = args.motifs.split(',')
    motifCols = args.sep.join(map('{{{}}}'.format, motifs))
    if args.blockCounts:
        outFormat = f'{{readName}}{args.sep}{motifCols}{args.sep}{{blockCount}}{args.sep}{{totalLength}}'
    else:
        outFormat = f'{{readName}}{args.sep}{motifCols}{args.sep}{{totalLength}}'
    getCounts = countMotifs(motifs,
                            lengthField='totalLength',
                            blocks='blockCount' if args.blockCounts else False,
                            collapseHP=args.collapseHP)

    oFile = open(args.out, 'w') if args.out else sys.stdout
    #column names
    oFile.write(re.sub('{|}', '', outFormat) + '\n')
    for rec in sortedRecs:
        counts = getCounts(rec.sequence)
        counts['readName'] = rec.name
        oFile.write(outFormat.format(**counts) + '\n')
    oFile.close()

    return None
Example #27
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms."""
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input %s and output %s must be either FASTA or FASTQ." %
            (in_rep_filename, out_rep_filename))

    # then read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    rep_reader = FastaReader(in_rep_filename) if in_suffix == "fasta" \
                 else FastqReader(in_rep_filename)
    rep_writer = FastaWriter(out_rep_filename) if in_suffix == "fasta" \
                 else FastqWriter(out_rep_filename)
    for r in rep_reader:
        # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
        if r.name.split('|')[0] in good:
            rep_writer.writeRecord(r)

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
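A hedged usage sketch; file names are hypothetical, and `good` holds collapsed
isoform ids such as "PB.1.1" (the part of r.name before the first '|'):

write_good_collapsed_isoforms(in_abundance_filename="in.abundance.txt",
                              in_gff_filename="in.gff",
                              in_rep_filename="in.rep.fastq",
                              out_abundance_filename="out.abundance.txt",
                              out_gff_filename="out.gff",
                              out_rep_filename="out.rep.fastq",
                              good=set(["PB.1.1", "PB.2.3"]))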
Example #28
    def test_fastq_consolidate(self):
        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
              'lambda/2590980/0008/Analysis_Results/'
              'm141115_075238_ethan_c100699872550000001'
              '823139203261572_s1_p0.1.subreads.fastq')
        fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
        cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
        cset_out = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
        with open(fq_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        cset = ContigSet(fq_out)
        cset_l = sum(1 for _ in cset)
        assert cset_l == 60
        cset.filters.addRequirement(length=[('>', 1000)])
        cset_l = sum(1 for _ in cset)
        assert cset_l == 23
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        assert cset_l == 23
        assert cset_l == sum(1 for _ in cfq)
        cset.write(cset_out)
Example #29
    def pickup_best_clusters(self):
        """Pick up hiqh QV clusters."""
        self.add_log(
            "Picking up the best clusters according to QVs from {fs}.".format(
                fs=", ".join(self.fq_filenames)))
        a = load(open(self.final_pickle_fn))
        uc = a['uc']
        # check if the uc cids are integers
        uc_keys_are_int = type(uc.keys()[0]) is int

        polished = {}  # cid --> FastqRecord

        for fq in self.fq_filenames:
            self.add_log("Looking at arrowed fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID #1: c0|arrow (a single Ice2 directory)
                # possible ID #2: b112_c0|arrow (after collecting several Ice2 directory)
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                i = cid.find('/')
                if i > 0:
                    cid = cid[:i]
                if uc_keys_are_int:
                    # only convert when uc keys are integers, e.g., 'c10' -> 10;
                    # with possible ID #2 the keys stay strings, so don't convert
                    cid = int(cid[1:])
                polished[cid] = r

        expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
        good = []  # contains all the cids that are HQ

        # calculate expected QV given 5'/3' trimming
        # for sequences that are shorter than the trimming, use the length itself
        for cid, r in polished.iteritems():
            qv_len = max(len(r.quality),
                         len(r.quality) - self.qv_trim_5 - self.qv_trim_3)
            q = [phred_to_qv(x) for x in r.quality]
            err_sum = sum(q[self.qv_trim_5:-self.qv_trim_3])
            expected_acc_dict[cid] = 1.0 - (err_sum / float(qv_len))
            if expected_acc_dict[cid] >= self.hq_arrow_min_accuracy and \
                len(uc[cid]) >= self.hq_min_full_length_reads :
                good.append(cid)

        partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc']
        partial_uc2 = defaultdict(lambda: [])
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc,
                              partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
            f=self.arrowed_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".format(
            f=self.arrowed_bad_fa))
        with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
                FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
                FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
                FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
            for cid in polished:
                r = polished[cid]
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                    format(cid=cid,
                           flnc_num=len(uc[cid]),
                           nfl_num=len(partial_uc2[cid]),
                           read_len=len(r.sequence))
                newname = cid_with_annotation2(
                    newname, expected_acc=expected_acc_dict[cid])

                if cid in good:
                    self.add_log(
                        "processing arrowed cluster {c} --> good.".format(
                            c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence[:])
                    good_fq_writer.writeRecord(newname, r.sequence[:],
                                               r.quality)
                else:
                    self.add_log(
                        "processing arrowed cluster {c} --> bad.".format(
                            c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence[:])
                    bad_fq_writer.writeRecord(newname, r.sequence[:],
                                              r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log(
            "High-quality Arrowed consensus written " +
            "to:\n{0}\n{1}".format(self.arrowed_good_fa, self.arrowed_good_fq),
            level=logging.INFO)
        self.add_log(
            "Low-quality Arrowed consensus written " +
            "to:\n{0}\n{1}".format(self.arrowed_bad_fa, self.arrowed_bad_fq),
            level=logging.INFO)
        self.add_log("-" * 60, level=logging.INFO)
Example #30
    def test_empty_fastq_consolidate(self):
        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
              'lambda/2590980/0008/Analysis_Results/'
              'm141115_075238_ethan_c100699872550000001'
              '823139203261572_s1_p0.1.subreads.fastq')
        fq1_out = tempfile.NamedTemporaryFile(suffix="1.fastq").name
        fq2_out = tempfile.NamedTemporaryFile(suffix="2.fastq").name
        cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name

        # Two full
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240, 480):
                    fqh.write(line)
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        assert cset_l == 120
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        assert cset_l == 120
        assert cset_l == sum(1 for _ in cfq)

        # one full one empty
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        assert cset_l == 60
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        assert cset_l == 60
        assert cset_l == sum(1 for _ in cfq)

        # one empty one full
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        assert cset_l == 60
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        assert cset_l == 60
        assert cset_l == sum(1 for _ in cfq)

        # both empty
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        assert cset_l == 0
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        assert cset_l == 0
        assert cset_l == sum(1 for _ in cfq)