def trim_fastq(outdir, files, start, end, length):
    """
    Trim every read of each FASTQ file in *files* and write the trimmed
    reads to a new file in *outdir*.

    Each output file is named after its input, with ``.trim`` inserted
    before the extension (for example ``reads.fq`` -> ``reads.trim.fq``).

    The region to keep is described either by *length* (optionally with
    *start*) or by *start*/*end*. When *length* is combined with *end* the
    values must agree: ``end - start + 1 == length`` if *start* is given,
    otherwise ``length == end``. *start* defaults to 1 whenever it is
    omitted.

    Argument problems are reported on stderr and the function returns
    without writing anything.

    :param outdir: directory that receives the trimmed files
    :param files: paths of the FASTQ files to trim
    :param start: first position to keep, or None
    :param end: last position to keep, or None
    :param length: number of positions to keep, or None
    """
    if not files:
        print("Error: no files provided", file=stderr)
        return

    length_mode = length is not None
    if length_mode and end is not None:
        if start is not None:
            if (end - start + 1) != length:
                print("Error: start/end/length do not agree", file=stderr)
                return
        elif length != end:
            # start is None, so length should equal end
            print("Error: length/end do not agree", file=stderr)
            return

    # Default the start position in every mode. Previously it stayed None
    # when length and end were both given, and None was then handed to
    # read.trim_length().
    if start is None:
        start = 1

    for f in files:
        name, ext = os.path.splitext(os.path.basename(f))
        outfile_name = os.path.join(outdir, name + ".trim" + ext)
        with open(outfile_name, "w") as handle:
            for read in read_fastq(f):
                if length_mode:
                    read.trim_length(length, start)
                else:
                    read.trim(start, end)
                print(read, file=handle)
def calculate(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    barcodes.

    Every read is first trimmed to ``self.bc_length`` positions starting at
    ``self.bc_start``. Reads that trip any enabled filter are tallied in
    ``self.filter_stats`` (and reported when ``self.verbose`` is set); the
    remaining reads are counted by upper-cased sequence.

    On return ``self.counts['barcodes']`` is a data frame with a single
    'count' column holding only the barcodes whose count is greater than
    ``self.min_count``.

    Raises EnrichError when no barcode was counted at all.
    """
    tally = self.counts['barcodes'] = dict()

    # flags for verbose output of filtered reads, one per configured filter
    filter_flags = dict.fromkeys(self.filters, False)

    # count all the barcodes
    for fq in read_fastq(self.reads):
        fq.trim_length(self.bc_length, start=self.bc_start)
        if self.revcomp_reads:
            fq.revcomp()

        # forget the flags raised by the previous read
        for key in filter_flags:
            filter_flags[key] = False

        # filter the barcode based on specified quality settings
        if self.filters['chastity'] and not fq.is_chaste():
            self.filter_stats['chastity'] += 1
            filter_flags['chastity'] = True
        if (self.filters['min quality'] > 0
                and fq.min_quality() < self.filters['min quality']):
            self.filter_stats['min quality'] += 1
            filter_flags['min quality'] = True
        if (self.filters['avg quality'] > 0
                and fq.mean_quality() < self.filters['avg quality']):
            self.filter_stats['avg quality'] += 1
            filter_flags['avg quality'] = True

        if any(filter_flags.values()):
            # failed quality filtering
            self.filter_stats['total'] += 1
            if self.verbose:
                self.report_filtered_read(fq, filter_flags)
            continue

        # passed quality filtering
        barcode = fq.sequence.upper()
        tally[barcode] = tally.get(barcode, 0) + 1

    frame = pd.DataFrame.from_dict(tally, orient="index", dtype="int32")
    self.counts['barcodes'] = frame
    if len(frame) == 0:
        raise EnrichError("Failed to count barcodes", self.name)
    frame.columns = ['count']

    kept = frame[frame['count'] > self.min_count]
    self.counts['barcodes'] = kept

    message = "Counted %d barcodes (%d unique) [%s]" % (
        kept['count'].sum(), len(kept.index), self.name)
    logging.info(message)

    if not self.barcodevariant:
        self.report_filter_stats()
def calculate(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    variants.

    Reads that trip any enabled quality filter are tallied in
    ``self.filter_stats``. Reads that pass are handed to
    ``self.count_variant``; a ``None`` result is treated as "too many
    mutations" and tallied under 'max mutations'. Rejected reads are
    reported when ``self.report_filtered`` is set.

    On return ``self.df_dict['variants']`` is a data frame with a single
    'count' column, sorted by descending count.

    NOTE(review): the counts are presumably written into
    ``self.df_dict['variants']`` by ``count_variant`` -- this method never
    fills the dictionary itself; confirm against that helper.

    Raises EnrichError when no variant was counted.
    """
    self.df_dict['variants'] = dict()

    # one flag per configured filter, reset for every read
    filter_flags = dict.fromkeys(self.filters, False)

    logging.info("Counting variants [{name}]".format(name=self.name))
    for fq in read_fastq(self.reads):
        if self.revcomp_reads:
            fq.revcomp()

        for key in filter_flags:
            filter_flags[key] = False

        # filter the read based on specified quality settings
        if self.filters['chastity']:
            if not fq.is_chaste():
                self.filter_stats['chastity'] += 1
                filter_flags['chastity'] = True
        if self.filters['min quality'] > 0:
            if fq.min_quality() < self.filters['min quality']:
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True
        if self.filters['avg quality'] > 0:
            if fq.mean_quality() < self.filters['avg quality']:
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

        if not any(filter_flags.values()):
            # passed quality filtering
            mutations = self.count_variant(fq.sequence)
            if mutations is None:
                # read has too many mutations
                self.filter_stats['max mutations'] += 1
                filter_flags['max mutations'] = True

        if any(filter_flags.values()):
            self.filter_stats['total'] += 1
            if self.report_filtered:
                self.report_filtered_read(fq, filter_flags)

    self.df_dict['variants'] = pd.DataFrame.from_dict(
        self.df_dict['variants'], orient="index", dtype="int32")
    if len(self.df_dict['variants']) == 0:
        raise EnrichError("Failed to count variants", self.name)
    self.df_dict['variants'].columns = ['count']
    # DataFrame.sort() no longer exists in current pandas; sort_values() is
    # the supported equivalent and keeps the same in-place semantics.
    self.df_dict['variants'].sort_values('count', ascending=False,
                                         inplace=True)

    logging.info("Counted {n} variants ({u} unique) [{name}]".format(
        n=self.df_dict['variants']['count'].sum(),
        u=len(self.df_dict['variants'].index),
        name=self.name))
    if self.aligner is not None:
        logging.info("Aligned {n} variants [{name}]".format(
            n=self.aligner.calls, name=self.name))
        self.aligner_cache = None
    self.report_filter_stats()
def calculate(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    variants.

    For each read the enabled filters ('chastity', 'min quality',
    'avg quality') are evaluated and failures are tallied in
    ``self.filter_stats``. A read that passes them is given to
    ``self.count_variant``; if that returns ``None`` the read is recorded
    as having too many mutations. Any rejected read increments the 'total'
    statistic and is reported when ``self.report_filtered`` is set.

    Afterwards ``self.df_dict['variants']`` is converted to a data frame
    with one 'count' column and sorted by descending count.

    NOTE(review): this method only creates the empty dictionary; the
    per-variant counts are presumably added by ``count_variant`` -- confirm.

    Raises EnrichError when the resulting data frame is empty.
    """
    self.df_dict['variants'] = dict()

    # one flag per configured filter, reset for every read
    filter_flags = dict.fromkeys(self.filters, False)

    logging.info("Counting variants [{name}]".format(name=self.name))
    for fq in read_fastq(self.reads):
        if self.revcomp_reads:
            fq.revcomp()

        for key in filter_flags:
            filter_flags[key] = False

        # filter the read based on specified quality settings
        if self.filters['chastity'] and not fq.is_chaste():
            self.filter_stats['chastity'] += 1
            filter_flags['chastity'] = True
        if (self.filters['min quality'] > 0
                and fq.min_quality() < self.filters['min quality']):
            self.filter_stats['min quality'] += 1
            filter_flags['min quality'] = True
        if (self.filters['avg quality'] > 0
                and fq.mean_quality() < self.filters['avg quality']):
            self.filter_stats['avg quality'] += 1
            filter_flags['avg quality'] = True

        if not any(filter_flags.values()):
            # passed quality filtering
            if self.count_variant(fq.sequence) is None:
                # read has too many mutations
                self.filter_stats['max mutations'] += 1
                filter_flags['max mutations'] = True

        if any(filter_flags.values()):
            self.filter_stats['total'] += 1
            if self.report_filtered:
                self.report_filtered_read(fq, filter_flags)

    variants = pd.DataFrame.from_dict(self.df_dict['variants'],
                                      orient="index", dtype="int32")
    self.df_dict['variants'] = variants
    if len(variants) == 0:
        raise EnrichError("Failed to count variants", self.name)
    variants.columns = ['count']
    # DataFrame.sort() no longer exists in current pandas; sort_values() is
    # the supported equivalent and keeps the same in-place semantics.
    variants.sort_values('count', ascending=False, inplace=True)

    logging.info("Counted {n} variants ({u} unique) [{name}]".format(
        n=variants['count'].sum(),
        u=len(variants.index),
        name=self.name))
    if self.aligner is not None:
        logging.info("Aligned {n} variants [{name}]".format(
            n=self.aligner.calls, name=self.name))
        self.aligner_cache = None
    self.report_filter_stats()
def count(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    barcodes.

    Each read is trimmed to ``self.bc_length`` positions starting at
    ``self.bc_start`` before filtering. Reads failing an enabled filter are
    tallied in ``self.filter_stats`` and, when ``self.verbose`` is set,
    reported through ``self.report_filtered_read``. Passing reads are
    counted by upper-cased sequence.

    On return ``self.counts["barcodes"]`` is a data frame with a single
    "count" column holding only barcodes whose count is greater than
    ``self.min_count``.

    Raises EnrichError when no barcode was counted at all.
    """
    tally = self.counts["barcodes"] = dict()

    # flags for verbose output of filtered reads, one per configured filter
    filter_flags = dict.fromkeys(self.filters, False)

    # count all the barcodes
    for fq in read_fastq(self.reads):
        fq.trim_length(self.bc_length, start=self.bc_start)
        if self.revcomp_reads:
            fq.revcomp()

        # forget the flags raised by the previous read
        for key in filter_flags:
            filter_flags[key] = False

        # filter the barcode based on specified quality settings
        if self.filters["chastity"] and not fq.is_chaste():
            self.filter_stats["chastity"] += 1
            filter_flags["chastity"] = True
        if (self.filters["min quality"] > 0
                and fq.min_quality() < self.filters["min quality"]):
            self.filter_stats["min quality"] += 1
            filter_flags["min quality"] = True
        if (self.filters["avg quality"] > 0
                and fq.mean_quality() < self.filters["avg quality"]):
            self.filter_stats["avg quality"] += 1
            filter_flags["avg quality"] = True

        if any(filter_flags.values()):
            # failed quality filtering
            self.filter_stats["total"] += 1
            if self.verbose:
                self.report_filtered_read(self.log, fq, filter_flags)
            continue

        # passed quality filtering
        barcode = fq.sequence.upper()
        tally[barcode] = tally.get(barcode, 0) + 1

    frame = pd.DataFrame.from_dict(tally, orient="index", dtype="int32")
    self.counts["barcodes"] = frame
    if len(frame) == 0:
        raise EnrichError("Failed to count barcodes", self.name)
    frame.columns = ["count"]
    self.counts["barcodes"] = frame[frame["count"] > self.min_count]
def count(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    variants.

    Reads failing an enabled quality filter are tallied in
    ``self.filter_stats``. Reads that pass are given to
    ``self.count_variant``; a ``None`` result marks the read as having too
    many mutations. Rejected reads bump the 'total' statistic and are
    reported when ``self.verbose`` is set.

    On return ``self.counts['variants']`` is a data frame with a single
    'count' column.

    NOTE(review): the dictionary created here is presumably populated by
    ``count_variant`` -- this method never adds entries itself; confirm.

    Raises EnrichError when the resulting data frame is empty.
    """
    self.counts['variants'] = dict()

    # flags for verbose output of filtered reads, one per configured filter
    filter_flags = dict.fromkeys(self.filters, False)

    for fq in read_fastq(self.reads):
        if self.revcomp_reads:
            fq.revcomp()

        # forget the flags raised by the previous read
        for key in filter_flags:
            filter_flags[key] = False

        # filter the read based on specified quality settings
        if self.filters['chastity'] and not fq.is_chaste():
            self.filter_stats['chastity'] += 1
            filter_flags['chastity'] = True
        if (self.filters['min quality'] > 0
                and fq.min_quality() < self.filters['min quality']):
            self.filter_stats['min quality'] += 1
            filter_flags['min quality'] = True
        if (self.filters['avg quality'] > 0
                and fq.mean_quality() < self.filters['avg quality']):
            self.filter_stats['avg quality'] += 1
            filter_flags['avg quality'] = True

        rejected = any(filter_flags.values())
        if not rejected and self.count_variant(fq.sequence) is None:
            # passed quality filtering, but the read has too many mutations
            self.filter_stats['max mutations'] += 1
            filter_flags['max mutations'] = True
            rejected = True

        if rejected:
            self.filter_stats['total'] += 1
            if self.verbose:
                self.report_filtered_read(self.log, fq, filter_flags)

    frame = pd.DataFrame.from_dict(self.counts['variants'],
                                   orient="index", dtype="int32")
    self.counts['variants'] = frame
    if len(frame) == 0:
        raise EnrichError("Failed to count variants", self.name)
    frame.columns = ['count']
def trim_fastq(outdir, files, start, end, length, compression):
    """
    Trim every read of each FASTQ file in *files* and write the trimmed
    reads to a new file in *outdir*.

    The output name is built from the base name and extension returned by
    ``split_fastq_path`` with ``.trim`` inserted between them, and the file
    is opened through ``create_compressed_outfile``.

    The region to keep is described either by *length* (optionally with
    *start*) or by *start*/*end*. When *length* is combined with *end* the
    values must agree: ``end - start + 1 == length`` if *start* is given,
    otherwise ``length == end``. *start* defaults to 1 whenever it is
    omitted.

    Argument problems are reported on stderr and the function returns
    without writing anything.

    :param outdir: directory that receives the trimmed files
    :param files: paths of the FASTQ files to trim
    :param start: first position to keep, or None
    :param end: last position to keep, or None
    :param length: number of positions to keep, or None
    :param compression: passed unchanged to ``create_compressed_outfile``
    """
    if not files:
        print("Error: no files provided", file=stderr)
        return

    length_mode = length is not None
    if length_mode and end is not None:
        if start is not None:
            if (end - start + 1) != length:
                print("Error: start/end/length do not agree", file=stderr)
                return
        elif length != end:
            # start is None, so length should equal end
            print("Error: length/end do not agree", file=stderr)
            return

    # Default the start position in every mode. Previously it stayed None
    # when length and end were both given, and None was then handed to
    # read.trim_length().
    if start is None:
        start = 1

    for f in files:
        # open the output file
        _, base, ext, _ = split_fastq_path(f)
        outname = os.path.join(outdir, base + ".trim" + ext)
        handle = create_compressed_outfile(outname, compression)

        # trim the reads and write them; the handle is closed even when
        # reading or trimming fails part-way through the file
        try:
            for read in read_fastq(f):
                if length_mode:
                    read.trim_length(length, start)
                else:
                    read.trim(start, end)
                print(read, file=handle)
        finally:
            handle.close()
def calculate(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    barcodes.

    Each read is trimmed to ``self.bc_length`` positions starting at
    ``self.bc_start``. Reads failing an enabled filter are tallied in
    ``self.filter_stats`` (and reported when ``self.report_filtered`` is
    set); the rest are counted by upper-cased sequence.

    The counts are stored in ``self.df_dict['barcodes']`` as a data frame
    with one 'count' column, sorted by descending count. When
    ``self.df_dict`` has a 'barcodes_low_abundance' entry, barcodes with a
    count below ``self.min_count`` are moved into that entry, written out
    with ``self.dump_data``, and removed from the main table.

    Raises EnrichError when no barcode was counted at all.
    """
    counts = self.df_dict['barcodes'] = dict()

    # one flag per configured filter, reset for every read
    filter_flags = dict.fromkeys(self.filters, False)

    # count all the barcodes
    logging.info("Counting barcodes [{name}]".format(name=self.name))
    for fq in read_fastq(self.reads):
        fq.trim_length(self.bc_length, start=self.bc_start)
        if self.revcomp_reads:
            fq.revcomp()

        for key in filter_flags:
            filter_flags[key] = False

        # filter the barcode based on specified quality settings
        if self.filters['chastity']:
            if not fq.is_chaste():
                self.filter_stats['chastity'] += 1
                filter_flags['chastity'] = True
        if self.filters['min quality'] > 0:
            if fq.min_quality() < self.filters['min quality']:
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True
        if self.filters['avg quality'] > 0:
            if fq.mean_quality() < self.filters['avg quality']:
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

        if any(filter_flags.values()):
            # failed quality filtering
            self.filter_stats['total'] += 1
            if self.report_filtered:
                self.report_filtered_read(fq, filter_flags)
        else:
            # passed quality filtering
            barcode = fq.sequence.upper()
            counts[barcode] = counts.get(barcode, 0) + 1

    self.df_dict['barcodes'] = pd.DataFrame.from_dict(
        counts, orient="index", dtype="int32")
    if len(self.df_dict['barcodes']) == 0:
        raise EnrichError("Failed to count barcodes", self.name)
    self.df_dict['barcodes'].columns = ['count']
    # DataFrame.sort() no longer exists in current pandas; sort_values() is
    # the supported equivalent and keeps the same in-place semantics.
    self.df_dict['barcodes'].sort_values('count', ascending=False,
                                         inplace=True)

    if 'barcodes_low_abundance' in self.df_dict:  # min count is set
        self.df_dict['barcodes_low_abundance'] = self.df_dict['barcodes'][
            self.df_dict['barcodes']['count'] < self.min_count]
        logging.info(
            "Writing counts for {n} unique low-abundance barcodes to disk [{name}]"
            .format(n=len(self.df_dict['barcodes_low_abundance']),
                    name=self.name))
        self.dump_data(keys=['barcodes_low_abundance'])
        self.df_dict['barcodes'] = self.df_dict['barcodes'][
            self.df_dict['barcodes']['count'] >= self.min_count]

    logging.info(
        "Retained counts for {n} barcodes ({u} unique) [{name}]".format(
            n=self.df_dict['barcodes']['count'].sum(),
            u=len(self.df_dict['barcodes'].index),
            name=self.name))
    if not self.barcodevariant:
        self.report_filter_stats()
def calculate(self):
    """
    Read the forward or reverse FASTQ file (reverse reads are
    reverse-complemented), apply quality-based filtering, and count the
    barcodes.

    Reads are trimmed to ``self.bc_length`` positions starting at
    ``self.bc_start`` before the enabled filters ('chastity',
    'min quality', 'avg quality') are evaluated. Failures are tallied in
    ``self.filter_stats`` and reported when ``self.report_filtered`` is
    set; passing reads are counted by upper-cased sequence.

    ``self.df_dict['barcodes']`` ends up as a data frame with one 'count'
    column, sorted by descending count. If ``self.df_dict`` contains a
    'barcodes_low_abundance' entry, barcodes counted fewer than
    ``self.min_count`` times are split off into it, dumped with
    ``self.dump_data``, and dropped from the main table.

    Raises EnrichError when no barcode was counted at all.
    """
    counts = self.df_dict['barcodes'] = dict()

    # one flag per configured filter, reset for every read
    filter_flags = dict.fromkeys(self.filters, False)

    # count all the barcodes
    logging.info("Counting barcodes [{name}]".format(name=self.name))
    for fq in read_fastq(self.reads):
        fq.trim_length(self.bc_length, start=self.bc_start)
        if self.revcomp_reads:
            fq.revcomp()

        for key in filter_flags:
            filter_flags[key] = False

        # filter the barcode based on specified quality settings
        if self.filters['chastity'] and not fq.is_chaste():
            self.filter_stats['chastity'] += 1
            filter_flags['chastity'] = True
        if (self.filters['min quality'] > 0
                and fq.min_quality() < self.filters['min quality']):
            self.filter_stats['min quality'] += 1
            filter_flags['min quality'] = True
        if (self.filters['avg quality'] > 0
                and fq.mean_quality() < self.filters['avg quality']):
            self.filter_stats['avg quality'] += 1
            filter_flags['avg quality'] = True

        if any(filter_flags.values()):
            # failed quality filtering
            self.filter_stats['total'] += 1
            if self.report_filtered:
                self.report_filtered_read(fq, filter_flags)
        else:
            # passed quality filtering
            barcode = fq.sequence.upper()
            counts[barcode] = counts.get(barcode, 0) + 1

    barcodes = pd.DataFrame.from_dict(counts, orient="index", dtype="int32")
    self.df_dict['barcodes'] = barcodes
    if len(barcodes) == 0:
        raise EnrichError("Failed to count barcodes", self.name)
    barcodes.columns = ['count']
    # DataFrame.sort() no longer exists in current pandas; sort_values() is
    # the supported equivalent and keeps the same in-place semantics.
    barcodes.sort_values('count', ascending=False, inplace=True)

    if 'barcodes_low_abundance' in self.df_dict:  # min count is set
        self.df_dict['barcodes_low_abundance'] = self.df_dict['barcodes'][
            self.df_dict['barcodes']['count'] < self.min_count]
        logging.info(
            "Writing counts for {n} unique low-abundance barcodes to disk [{name}]".format(
                n=len(self.df_dict['barcodes_low_abundance']),
                name=self.name))
        self.dump_data(keys=['barcodes_low_abundance'])
        self.df_dict['barcodes'] = self.df_dict['barcodes'][
            self.df_dict['barcodes']['count'] >= self.min_count]

    logging.info("Retained counts for {n} barcodes ({u} unique) [{name}]".format(
        n=self.df_dict['barcodes']['count'].sum(),
        u=len(self.df_dict['barcodes'].index),
        name=self.name))
    if not self.barcodevariant:
        self.report_filter_stats()