def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split HQ (LQ) files and save to combined_dir.

    Dump the HQ|LQ prefix dictionary to a pickle.

    Parameters:
      split_indices -- indices of split cluster bins.
      split_hq_fns -- HQ files, e.g. ['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- LQ files, e.g. ['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all(f.endswith(".fastq") for f in split_hq_fns + split_lq_fns)

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name,
                                            sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name,
                                            sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()

    logging.info("HQ polished output combined to: %s", combined_hq_fq)
    logging.info("LQ polished output combined to: %s", combined_lq_fq)

    logging.info("Dumping HQ|LQ prefix dictionary to: %s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)

def read_fastq_dict(fastq_input):
    """Index FASTQ records by the first whitespace-delimited token of their name."""
    if isinstance(fastq_input, str):
        fastq_input = [fastq_input]
    elif not isinstance(fastq_input, list):
        raise ValueError("fastq_input must be a filename or a list of filenames.")
    records = {}
    for filename in fastq_input:
        for rec in FastqReader(filename):
            name = rec.name.strip().split()[0]
            assert name not in records
            records[name] = rec
    return records

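# Usage sketch (hypothetical paths, not from the original source): build a
# name -> FastqRecord lookup from one file or from several; read_fastq_dict
# asserts that the first token of each read name is unique across all inputs.
def _demo_read_fastq_dict():
    by_name = read_fastq_dict("example.fastq")
    merged = read_fastq_dict(["part1.fastq", "part2.fastq"])
    print("%d reads in one file, %d reads merged" % (len(by_name), len(merged)))
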
def __init__(self, isoseq_output_fn, reference_transcripts_fn,
             output_analysis_fn, min_true_positive, max_false_positive,
             min_seq_similarity, max_fuzzy_junction):
    self.isoseq_output_fn = isoseq_output_fn
    self.reference_transcripts_fn = reference_transcripts_fn
    self.output_analysis_fn = output_analysis_fn

    if isoseq_output_fn.endswith((".fasta", ".fa")):
        self.isoforms = [r for r in FastaReader(isoseq_output_fn)]
        self.isoseq_output_fa = self.isoseq_output_fn
    elif isoseq_output_fn.endswith((".fastq", ".fq")):
        self.isoforms = [r for r in FastqReader(isoseq_output_fn)]
        self.isoseq_output_fa = self.output_analysis_fn + ".isoseq.fa"
        with FastaWriter(self.isoseq_output_fa) as writer:
            for r in self.isoforms:
                writer.writeRecord(r.name, r.sequence)
    else:
        raise ValueError("IsoSeq output %s must be FASTA or FASTQ." %
                         isoseq_output_fn)

    self.reference_transcripts = [r for r in
                                  FastaReader(reference_transcripts_fn)]

    self.min_true_positive = min_true_positive
    self.max_false_positive = max_false_positive
    # accept similarity either as a fraction or as a percentage
    self.min_seq_similarity = (min_seq_similarity if min_seq_similarity <= 1
                               else min_seq_similarity / 100.0)
    self.max_fuzzy_junction = max_fuzzy_junction
    self.alns = self.filter_alns(self.map_isoforms_to_reference_transcripts())

def test_runner(self):
    """Test CombineRunner."""
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
    d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
    split_dirs = [op.join(d, b, "cluster_out")
                  for b in ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0",
                            "3to4kb_part0", "4to5kb_part0")]
    print(split_dirs)
    out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
    rmpath(out_combined_dir)
    mkdir(out_combined_dir)
    obj = CombineRunner(combined_dir=out_combined_dir,
                        sample_name="mysample",
                        split_dirs=split_dirs,
                        ipq_opts=ipq_opts)
    obj.run()

    expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa,
                        obj.all_lq_fq, obj.all_consensus_isoforms_fa,
                        obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
    self.assertTrue(all(op.exists(f) for f in expected_out_fns))

    expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826',
                            'i2_HQ_mysample|c2/f9p14/2470',
                            'i2_HQ_mysample|c5/f7p19/2472',
                            'i2_HQ_mysample|c10/f8p16/2457',
                            'i2_HQ_mysample|c98/f2p10/2081',
                            'i2_HQ_mysample|c108/f23p28/2471']
    self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)],
                     expected_hq_isoforms)
    self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)],
                     expected_hq_isoforms)

    expected_lq_isoforms_num = 73
    self.assertEqual(len([r for r in FastaReader(obj.all_lq_fa)]),
                     expected_lq_isoforms_num)

    expected_consensus_isoforms_num = 79
    self.assertEqual(len([r for r in FastaReader(obj.all_consensus_isoforms_fa)]),
                     expected_consensus_isoforms_num)

def split_laa_fastq(input_file_name, output_file_base, subreads_file_name,
                    bio_samples_by_bc=None):
    """
    Split an LAA FASTQ file into one file per barcode.
    """
    if op.getsize(input_file_name) == 0:
        return []
    records = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for rec in fastq_in:
            bc_id = re.sub("^Barcode", "", rec.id.split("_")[0])
            records[bc_id].append(rec)
    if bio_samples_by_bc is None:
        bio_samples_by_bc = {}
        with SubreadSet(subreads_file_name, strict=True) as ds:
            if ds.isBarcoded:  # pylint: disable=no-member
                bio_samples_by_bc = get_barcode_sample_mappings(ds)
    outputs = []
    for bc_id in sorted(records.keys()):
        bio_sample = bio_samples_by_bc.get(bc_id, "unknown")
        ofn = "{b}.{s}.{i}.fastq".format(b=output_file_base, s=bio_sample,
                                         i=bc_id)
        with FastqWriter(ofn) as fastq_out:
            for rec in records[bc_id]:
                fastq_out.writeRecord(rec)
        outputs.append(ofn)
    return outputs

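# Usage sketch (hypothetical file names): split an LAA FASTQ by barcode,
# resolving biosample names from a barcoded SubreadSet; outputs follow the
# "<base>.<sample>.<barcode>.fastq" pattern built above.
def _demo_split_laa_fastq():
    outputs = split_laa_fastq("amplicon_analysis.fastq",
                              "amplicon_analysis.split",
                              "movie.subreadset.xml")
    for ofn in outputs:
        print("wrote %s" % ofn)
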
def isValidFastq(filename):
    if not isValidFile(filename) or not isFastqFile(filename):
        return False
    try:
        list(FastqReader(filename))
    except Exception:
        return False
    return True

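# Usage sketch: isValidFastq parses the entire file, so the check is linear in
# file size; handy as a guard before batch-processing a set of inputs.
def _demo_filter_valid_fastqs(paths):
    return [p for p in paths if isValidFastq(p)]
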
def ReadSeqFiles(fns):
    seqs = {}
    for fn in fns:
        for record in FastqReader(fn):
            if record.id in seqs:
                print("ERROR: Duplicate sequence id '{0}'".format(record.id))
                raise SystemExit
            seqs[record.id] = record
    return seqs

def precache_fastq(self, fastq_filename):
    """
    Cache each sequence in the FASTQ file into self.qv
    """
    for r in FastqReader(fastq_filename):
        seqid = r.name.split()[0]
        self.qv[seqid] = {}
        c_basQV.fastq_precache_helper(seqid, r.quality, self.qv)
        self.make_qv_mean([seqid])

def write_temp_fasta(fastq_file):
    """
    Write a temporary FASTA file from a FASTQ
    """
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    with FastaWriter(temp.name) as handle:
        for record in FastqReader(fastq_file):
            temp_record = FastaRecord(record.name, record.sequence)
            handle.writeRecord(temp_record)
    return temp

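# Usage sketch (hypothetical path): write_temp_fasta creates the temp file with
# delete=False, so the caller owns the file and should unlink it when finished.
def _demo_write_temp_fasta():
    import os
    temp = write_temp_fasta("reads.fastq")
    try:
        print("FASTA written to %s" % temp.name)
    finally:
        os.unlink(temp.name)
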
def _fastq_to_fasta(fastq_path, fasta_path):
    """Convert a FASTQ file to a FASTA file."""
    with FastqReader(fastq_path) as r, FastaWriter(fasta_path) as w:
        for fastq_record in r:
            fasta_record = FastaRecord(fastq_record.name, fastq_record.sequence)
            w.writeRecord(fasta_record)
    log.info("Completed converting {q} to {f}".format(q=fastq_path,
                                                      f=fasta_path))
    return 0

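# Usage sketch (hypothetical paths): records are streamed one at a time, so the
# conversion stays memory-safe even for FASTQ files larger than RAM.
def _demo_fastq_to_fasta():
    _fastq_to_fasta("reads.fastq", "reads.fasta")
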
def combine_amplicon_analysis_files(directory):
    output_file = os.path.join(directory, 'amplicon_analysis.all.fastq')
    with FastqWriter(output_file) as handle:
        for input_file in ['amplicon_analysis.fastq',
                           'amplicon_analysis_chimeras_noise.fastq']:
            input_path = os.path.join(directory, input_file)
            for record in FastqReader(input_path):
                handle.writeRecord(record)
    return output_file

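# Usage sketch (hypothetical directory): concatenate the standard and
# chimeras/noise amplicon analysis FASTQs into a single file for downstream use.
def _demo_combine_amplicon_analysis_files():
    combined = combine_amplicon_analysis_files("laa_output_dir")
    print("combined FASTQ at %s" % combined)
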
def _parse_input_records(input_file):
    """
    Parse the input sequence records with the appropriate pbcore Reader
    """
    input_type = get_file_type(input_file)
    if input_type == 'fasta':
        return list(FastaReader(input_file))
    elif input_type == 'fastq':
        return list(FastqReader(input_file))
    else:
        msg = 'Input file must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)

def main(parser):
    args = parser.parse_args()
    fx = args.inFastx if args.inFastx else sys.stdin
    try:
        recs = list(FastqReader(fx))
    except ValueError:  # FastqReader raises ValueError on FASTA input
        recs = list(FastaReader(fx))
    if not len(recs):
        print(f'No records in {fx}')
        return None

    counts = [rec.sequence.count(args.motif) for rec in recs]
    xlabel = f'{args.motif} Repeat Copies (exclusive)'
    labelMotif = (list(map(eval, args.labelMotif.split(',')))
                  if args.labelMotif else None)
    f, c, b = countPlot(counts, args.name, xlabel, args.ylabel,
                        labelValues=labelMotif,
                        binsize=args.binsize,
                        bandwidth=args.bandwidth,
                        plotKde=args.plotKde)
    f.savefig(f'{args.out}.motifcount.{args.format}',
              format=args.format, dpi=args.dpi)

    counts = list(map(len, recs))
    xlabel = 'Target Insert Length (bp)'
    labelLength = (list(map(eval, args.labelLength.split(',')))
                   if args.labelLength else None)
    f, c, b = countPlot(counts, args.name, xlabel, args.ylabel,
                        labelValues=labelLength,
                        binsize=len(args.motif) * args.binsize,
                        bandwidth=len(args.motif) * args.bandwidth,
                        plotKde=args.plotKde)
    f.savefig(f'{args.out}.insertSize.{args.format}',
              format=args.format, dpi=args.dpi)

    if args.exportBincounts:
        oname = f'{args.out}.histogramBins.csv'
        with open(oname, 'w') as ofile:
            ofile.write('Length,Reads\n')
            for bn, cnt in zip(b, c):
                ofile.write(f'{bn},{cnt}\n')

    print('Done')
    return f

def readSequenceRecords(filename):
    """
    Parse the input sequence records with the appropriate pbcore Reader
    """
    fileType = getFileType(filename)
    if fileType == 'fasta':
        return list(FastaReader(filename))
    elif fileType == 'fastq':
        return list(FastqReader(filename))
    else:
        msg = 'Input file must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)

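# Usage sketch (hypothetical path): readSequenceRecords dispatches on the
# detected file type, so callers can accept FASTA or FASTQ transparently and
# get a TypeError for anything else.
def _demo_read_sequence_records():
    records = readSequenceRecords("isoforms.fastq")
    print("parsed %d records" % len(records))
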
def run_after(self, rtc, output_dir):
    rep_fn = rtc.task.output_files[0]
    gff_fn = rtc.task.output_files[1]
    abundance_fn = rtc.task.output_files[2]
    group_fn = rtc.task.output_files[3]
    read_stat_fn = rtc.task.output_files[4]
    from pbcore.io import FastqReader
    from pbtranscript.io import (CollapseGffReader, AbundanceReader,
                                 GroupReader, ReadStatReader)
    self.assertEqual(len([r for r in FastqReader(rep_fn)]), 65)
    self.assertEqual(len([r for r in CollapseGffReader(gff_fn)]), 65)
    self.assertEqual(len([r for r in AbundanceReader(abundance_fn)]), 65)
    self.assertEqual(len([r for r in GroupReader(group_fn)]), 86)
    self.assertEqual(len([r for r in ReadStatReader(read_stat_fn)]), 10873)

def test_split_laa_fastq(self):
    ifn = self.input_file_name
    ofb = tempfile.NamedTemporaryFile().name
    ofs = split_laa_fastq(ifn, ofb, self._subreads)
    assert len(ofs) == 2
    suffixes = sorted([".".join(of.split('.')[1:]) for of in ofs])
    assert suffixes == ['Alice.lbc1--lbc1.fastq', 'Charles.lbc3--lbc3.fastq']
    for i, ofn in enumerate(ofs):
        with FastqReader(ofn) as fastq_in:
            recs = [rec for rec in fastq_in]
            for j in range(2):
                assert str(recs[j]) == str(self._records[(i * 2) + j])

def _open_files(self, *input_filenames):
    """Open file handles and return them."""
    readers = []
    for fn in input_filenames:
        if ContigSetReaderWrapper.get_file_type(fn) == "FASTA":
            readers.append(FastaReader(fn))
        elif ContigSetReaderWrapper.get_file_type(fn) == "FASTQ":
            readers.append(FastqReader(fn))
        elif ContigSetReaderWrapper.get_file_type(fn) == "CONTIGSET":
            readers.append(ContigSet(fn))
        else:
            raise IOError("Could not read %s as FASTA/FASTQ/CONTIGSET file." % fn)
    return readers

def _get_contigs(fastq):
    """
    Digest the polished contigs into a dict of ContigInfo objects

    :param fastq: (str) path to polished FASTQ file
    :return: (dict) contig id -> ContigInfo object
    """
    contigs = {}
    with FastqReader(fastq) as fqr:
        for rec in fqr:
            # ContigInfo strips the quiver/arrow-appended string; otherwise we
            # can't cross-reference the name in the gff
            cinf = ContigInfo(rec)
            contigs[cinf.name] = cinf
    return contigs

def read_sequences_by_barcode(sequence_file, trim):
    paths = defaultdict(list)
    with FastqReader(sequence_file) as handle:
        for record in handle:
            # drop the 7-character "Barcode" prefix of the text before
            # "_Cluster" to recover the barcode label (e.g. "lbc1--lbc1")
            barcode = record.name.split('_Cluster')[0][7:]
            if barcode == "65535--65535":  # unbarcoded reads
                continue
            if trim:
                trimSeq = record.sequence[trim:-trim]
                trimQual = record.qualityString[trim:-trim]
                rec = FastqRecord(record.name, trimSeq, trimQual)
            else:
                rec = record
            paths[barcode].append(rec)
    return paths

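# Usage sketch (hypothetical path): group consensus reads by barcode label,
# trimming 20 bases from both ends of each sequence before storing it.
def _demo_read_sequences_by_barcode():
    by_bc = read_sequences_by_barcode("consensus.fastq", trim=20)
    for bc in sorted(by_bc):
        print("%s: %d records" % (bc, len(by_bc[bc])))
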
def test_len_fastq(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 24):
                fqh.write(line)
    cset = ContigSet(fq_out)
    assert not cset.isIndexed
    assert isinstance(cset.resourceReaders()[0], FastqReader)
    assert sum(1 for _ in cset) == sum(1 for _ in FastqReader(fq_out))
    assert sum(1 for _ in cset) == 6

def split_laa_fastq(input_file_name, output_file_base):
    """
    Split an LAA FASTQ file into one file per barcode.
    """
    if op.getsize(input_file_name) == 0:
        return []
    records = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for rec in fastq_in:
            bc_id = rec.id.split("_")[0]
            records[bc_id].append(rec)
    outputs = []
    for bc_id in sorted(records.keys()):
        ofn = "{b}.{i}.fastq".format(b=output_file_base, i=bc_id)
        with FastqWriter(ofn) as fastq_out:
            for rec in records[bc_id]:
                fastq_out.writeRecord(rec)
        outputs.append(ofn)
    return outputs

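# Usage sketch (hypothetical names): unlike the SubreadSet-aware variant above,
# this version keys output files by the raw barcode label parsed from each
# read id, producing "<base>.<barcode>.fastq" files.
def _demo_split_laa_fastq_simple():
    for ofn in split_laa_fastq("amplicon_analysis.fastq", "by_barcode"):
        print(ofn)
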
def test_filter_by_count(self):
    """Test filter_by_count"""
    out_abundance_fn = op.join(_OUT_DIR_, "filter_by_count.abundance.txt")
    out_gff_fn = op.join(_OUT_DIR_, "filter_by_count.gff")
    out_rep_fn = op.join(_OUT_DIR_, "filter_by_count.rep.fastq")
    filter_by_count(in_group_filename=GROUP_FN,
                    in_abundance_filename=ABUNDANCE_FN,
                    in_gff_filename=GFF_FN,
                    in_rep_filename=REP_FN,
                    out_abundance_filename=out_abundance_fn,
                    out_gff_filename=out_gff_fn,
                    out_rep_filename=out_rep_fn,
                    min_count=20)
    out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
    self.assertEqual(out_abundance_ids, self.expected_good)
    out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
    self.assertEqual(out_gff_ids, self.expected_good)
    out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
    self.assertEqual(out_rep_ids, self.expected_good)

def testAll(self):
    """Test FastqRandomReader.keys() and __getitem__."""
    reads = [r for r in FastqReader(self.inFq)]
    names = [r.name for r in reads]
    frr = FastqRandomReader(self.inFq)
    self.assertTrue(set(frr.keys()) == set(names))
    self.assertTrue(all(frr[r.name].name == r.name for r in reads))
    self.assertTrue(all(frr[r.name].sequence == r.sequence for r in reads))
    # compare QV arrays elementwise
    self.assertTrue(all((frr[r.name].quality == r.quality).all()
                        for r in reads))
    self.assertTrue(all(frr[r.name].qualityString == r.qualityString
                        for r in reads))

def test_filter_out_subsets(self):
    """Test filter_out_subsets"""
    out_abundance_fn = op.join(_OUT_DIR_, "filter_out_subsets.abundance.txt")
    out_gff_fn = op.join(_OUT_DIR_, "filter_out_subsets.gff")
    out_rep_fn = op.join(_OUT_DIR_, "filter_out_subsets.rep.fastq")
    filter_out_subsets(in_abundance_filename=ABUNDANCE_FN,
                       in_gff_filename=GFF_FN,
                       in_rep_filename=REP_FN,
                       out_abundance_filename=out_abundance_fn,
                       out_gff_filename=out_gff_fn,
                       out_rep_filename=out_rep_fn,
                       max_fuzzy_junction=5)
    all_ids = [r.seqid for r in CollapseGffReader(GFF_FN)]
    expected_good = set(all_ids) - set(self.expected_diff)
    out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
    self.assertEqual(set(out_abundance_ids), expected_good)
    out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
    self.assertEqual(set(out_gff_ids), expected_good)
    out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
    self.assertEqual(set(out_rep_ids), expected_good)

def presmooth(self, seqids, window_size, fastq_filename=None):
    """
    precache MUST already have been called, unless fastq_filename is
    provided as a fallback source of QVs.
    """
    self.window_size = window_size
    for seqid in seqids:
        try:
            self.qv[seqid]['smoothed'] = c_basQV.maxval_per_window(
                self.qv[seqid]['unsmoothed'], window_size)
        except KeyError:
            if fastq_filename is None:
                raise KeyError("Qvs of {seqid} ".format(seqid=seqid) +
                               "must be precached.")
            for r in FastqReader(fastq_filename):
                if r.name.split()[0] == seqid:
                    self.qv[seqid] = {}
                    c_basQV.fastq_precache_helper(seqid, r.quality, self.qv)
                    break
            else:
                # seqid was not found in the fallback FASTQ
                raise KeyError("Qvs of {seqid} ".format(seqid=seqid) +
                               "could not be read from {fq}".format(
                                   fq=fastq_filename))

def main(parser):
    args = parser.parse_args()
    fx = args.inFastx if args.inFastx else sys.stdin
    keyfunc = (lambda s: -len(s)) if args.reverse else len
    try:
        sortedRecs = sorted(FastqReader(fx), key=keyfunc)
    except ValueError:  # FastqReader raises ValueError if FASTA is streamed
        sortedRecs = sorted(FastaReader(fx), key=keyfunc)
    if len(sortedRecs) == 0:
        # a single odd fastx record spanning three lines can parse to nothing;
        # retry as FASTA
        sortedRecs = sorted(FastaReader(fx), key=keyfunc)

    motifs = args.motifs.split(',')
    motifCols = args.sep.join(map('{{{}}}'.format, motifs))
    if args.blockCounts:
        outFormat = (f'{{readName}}{args.sep}{motifCols}{args.sep}'
                     f'{{blockCount}}{args.sep}{{totalLength}}')
    else:
        outFormat = f'{{readName}}{args.sep}{motifCols}{args.sep}{{totalLength}}'
    getCounts = countMotifs(motifs,
                            lengthField='totalLength',
                            blocks='blockCount' if args.blockCounts else False,
                            collapseHP=args.collapseHP)

    oFile = open(args.out, 'w') if args.out else sys.stdout
    # column names
    oFile.write(re.sub('{|}', '', outFormat) + '\n')
    for rec in sortedRecs:
        counts = getCounts(rec.sequence)
        counts['readName'] = rec.name
        oFile.write(outFormat.format(**counts) + '\n')
    if args.out:  # don't close sys.stdout
        oFile.close()
    return None

def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms."""
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input %s and output %s must be "
                         "either FASTA or FASTQ." %
                         (in_rep_filename, out_rep_filename))

    # first read GFF, and write good GFF records.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep FASTA/FASTQ, and write good rep records.
    rep_reader = (FastaReader(in_rep_filename) if in_suffix == "fasta"
                  else FastqReader(in_rep_filename))
    rep_writer = (FastaWriter(out_rep_filename) if in_suffix == "fasta"
                  else FastqWriter(out_rep_filename))
    with rep_reader, rep_writer:
        for r in rep_reader:
            # r.name is e.g.
            # PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
            if r.name.split('|')[0] in good:
                rep_writer.writeRecord(r)

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
         AbundanceWriter(out_abundance_filename,
                         comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)

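# Usage sketch (hypothetical file names): `good` holds the PB ids to keep (the
# text before the first '|' of each rep read name, e.g. "PB.1.1"); the rep
# input and output must share a FASTA/FASTQ suffix, per the checks above.
def _demo_write_good_collapsed_isoforms():
    write_good_collapsed_isoforms(in_abundance_filename="in.abundance.txt",
                                  in_gff_filename="in.gff",
                                  in_rep_filename="in.rep.fastq",
                                  out_abundance_filename="out.abundance.txt",
                                  out_gff_filename="out.gff",
                                  out_rep_filename="out.rep.fastq",
                                  good={"PB.1.1", "PB.2.3"})
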
def test_fastq_consolidate(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cset_out = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    cset = ContigSet(fq_out)
    cset_l = sum(1 for _ in cset)
    assert cset_l == 60
    cset.filters.addRequirement(length=[('>', 1000)])
    cset_l = sum(1 for _ in cset)
    assert cset_l == 23
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    assert cset_l == 23
    assert cset_l == sum(1 for _ in cfq)
    cset.write(cset_out)

def pickup_best_clusters(self):
    """Pick up high-QV clusters."""
    self.add_log("Picking up the best clusters according to QVs from {fs}.".format(
        fs=", ".join(self.fq_filenames)))
    a = load(open(self.final_pickle_fn))
    uc = a['uc']
    # check whether the uc cids are integers
    uc_keys_are_int = isinstance(next(iter(uc.keys())), int)

    polished = {}  # cid --> FastqRecord
    for fq in self.fq_filenames:
        self.add_log("Looking at arrowed fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # possible ID #1: c0|arrow (a single Ice2 directory)
            # possible ID #2: b112_c0|arrow (after collecting several Ice2 directories)
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            i = cid.find('/')
            if i > 0:
                cid = cid[:i]
            if uc_keys_are_int:
                # uc keys are plain ints (e.g. 10) while the read name carries
                # 'c10'; strip the leading 'c' and convert
                cid = int(cid[1:])
            polished[cid] = r

    expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
    good = []  # contains all the cids that are HQ

    # calculate expected QV given 5'/3' trimming; for sequences shorter than
    # the trim lengths, use the full length itself
    for cid, r in polished.items():
        q = [phred_to_qv(x) for x in r.quality]
        qv_len = len(r.quality) - self.qv_trim_5 - self.qv_trim_3
        if qv_len > 0:
            err_sum = sum(q[self.qv_trim_5:-self.qv_trim_3])
        else:
            qv_len = len(r.quality)
            err_sum = sum(q)
        expected_acc_dict[cid] = 1.0 - (err_sum / float(qv_len))
        if expected_acc_dict[cid] >= self.hq_arrow_min_accuracy and \
           len(uc[cid]) >= self.hq_min_full_length_reads:
            good.append(cid)

    partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc']
    partial_uc2 = defaultdict(lambda: [])
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn, uc=uc, partial_uc=partial_uc2)

    self.add_log("Writing high-quality isoforms to {f}|fq".format(
        f=self.arrowed_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.arrowed_bad_fa))
    with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
         FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
         FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
         FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
        for cid in polished:
            r = polished[cid]
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".format(
                cid=cid,
                flnc_num=len(uc[cid]),
                nfl_num=len(partial_uc2[cid]),
                read_len=len(r.sequence))
            newname = cid_with_annotation2(newname,
                                           expected_acc=expected_acc_dict[cid])
            if cid in good:
                self.add_log("processing arrowed cluster {c} --> good.".format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence[:])
                good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
            else:
                self.add_log("processing arrowed cluster {c} --> bad.".format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence[:])
                bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Arrowed consensus written to:\n{0}\n{1}".format(
        self.arrowed_good_fa, self.arrowed_good_fq), level=logging.INFO)
    self.add_log("Low-quality Arrowed consensus written to:\n{0}\n{1}".format(
        self.arrowed_bad_fa, self.arrowed_bad_fq), level=logging.INFO)
    self.add_log("-" * 60, level=logging.INFO)

def test_empty_fastq_consolidate(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq1_out = tempfile.NamedTemporaryFile(suffix="1.fastq").name
    fq2_out = tempfile.NamedTemporaryFile(suffix="2.fastq").name
    cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name

    # two full files
    with open(fq1_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    with open(fq2_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240, 480):
                fqh.write(line)
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    assert cset_l == 120
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    assert cset_l == 120
    assert cset_l == sum(1 for _ in cfq)

    # one full, one empty
    with open(fq1_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    with open(fq2_out, 'w') as fqh:
        fqh.write("")
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    assert cset_l == 60
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    assert cset_l == 60
    assert cset_l == sum(1 for _ in cfq)

    # one empty, one full
    with open(fq1_out, 'w') as fqh:
        fqh.write("")
    with open(fq2_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    assert cset_l == 60
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    assert cset_l == 60
    assert cset_l == sum(1 for _ in cfq)

    # both empty
    with open(fq1_out, 'w') as fqh:
        fqh.write("")
    with open(fq2_out, 'w') as fqh:
        fqh.write("")
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    assert cset_l == 0
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    assert cset_l == 0
    assert cset_l == sum(1 for _ in cfq)