def read_length_histogram(in_fastq, out_hist):
    """Draw a bar-chart histogram of read lengths and save it to a file.

    Args:
        in_fastq: path of a gzip-compressed FASTQ file; it is opened with
            ``gzip.open`` and handed to ``parseFastq``.
        out_hist: destination passed straight to ``pyplot.savefig``.

    The x axis covers every length from 1 up to the longest read seen;
    lengths that never occur are drawn with a count of zero.  If the input
    contains no reads (or only empty ones) an empty figure is saved instead
    of failing.
    """
    length_count = defaultdict(int)
    # Use a context manager so the gzip handle is closed even if parsing fails.
    with gzip.open(in_fastq) as infile:
        for header, seq, qual in parseFastq(infile):
            length_count[len(seq)] += 1

    # max() on an empty mapping raises ValueError, so guard the no-reads case.
    longest = max(length_count) if length_count else 0
    lengths = range(1, longest + 1)

    figure = pyplot.figure()
    try:
        if longest:
            # .get() avoids inserting zero entries into the defaultdict.
            counts = [length_count.get(length, 0) for length in lengths]
            pyplot.bar(lengths, counts, align='center')
        pyplot.savefig(out_hist)
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot.
        pyplot.close(figure)
def read_length_histogram(in_fastq, out_hist):
    """Draw a bar-chart histogram of read lengths and save it to a file.

    Args:
        in_fastq: path of a gzip-compressed FASTQ file; it is opened with
            ``gzip.open`` and handed to ``parseFastq``.
        out_hist: destination passed straight to ``pyplot.savefig``.

    The x axis covers every length from 1 up to the longest read seen;
    lengths that never occur are drawn with a count of zero.  If the input
    contains no reads (or only empty ones) an empty figure is saved instead
    of failing.
    """
    length_count = defaultdict(int)
    # Use a context manager so the gzip handle is closed even if parsing fails.
    with gzip.open(in_fastq) as infile:
        for header, seq, qual in parseFastq(infile):
            length_count[len(seq)] += 1

    # max() on an empty mapping raises ValueError, so guard the no-reads case.
    longest = max(length_count) if length_count else 0
    lengths = range(1, longest + 1)

    figure = pyplot.figure()
    try:
        if longest:
            # .get() avoids inserting zero entries into the defaultdict.
            counts = [length_count.get(length, 0) for length in lengths]
            pyplot.bar(lengths, counts, align='center')
        pyplot.savefig(out_hist)
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot.
        pyplot.close(figure)
def trim_regex(in_fastq, out_fastq, trim_pattern, min_length=10):
    """Trim each read at the right-most match of a regular expression.

    For every read in which ``trim_pattern`` matches, the sequence and the
    quality string are cut at the start of the right-most match, and the
    removed tail is prepended to the read header (joined with ``'_'``).

    Args:
        in_fastq: path of the gzip-compressed FASTQ file to read.
        out_fastq: path of the gzip-compressed FASTQ file to write.
        trim_pattern: regular expression (string) searched in each sequence.
        min_length: minimum length a read must still have (after any
            trimming) to be written.  Defaults to 10, the value that used
            to be hard-coded.

    Reads without a match are written untrimmed unless the
    ``require_regex`` option in the ``filtering`` section of ``cfg`` is
    true, in which case they are dropped.
    """
    pattern = re.compile(trim_pattern)
    with gzip.open(in_fastq) as infile, gzip.open(out_fastq, 'w') as outfile:
        for header, seq, qual in parseFastq(infile):
            hits = list(pattern.finditer(seq))
            if hits:
                # Cut at the right-most hit and keep the removed tail in
                # the read ID.
                cut = hits[-1].start()
                header = seq[cut:] + '_' + header
                seq = seq[:cut]
                qual = qual[:cut]
            elif cfg.getboolean('filtering', 'require_regex'):
                # The option is only consulted for unmatched reads.
                continue

            if len(seq) >= min_length:
                outfile.write('@%s\n%s\n+%s\n%s\n'
                              % (header, seq, header, qual))
def trim_regex(in_fastq, out_fastq, trim_pattern, min_length=10):
    """Trim each read at the right-most match of a regular expression.

    For every read in which ``trim_pattern`` matches, the sequence and the
    quality string are cut at the start of the right-most match, and the
    removed tail is prepended to the read header (joined with ``'_'``).

    Args:
        in_fastq: path of the gzip-compressed FASTQ file to read.
        out_fastq: path of the gzip-compressed FASTQ file to write.
        trim_pattern: regular expression (string) searched in each sequence.
        min_length: minimum length a read must still have (after any
            trimming) to be written.  Defaults to 10, the value that used
            to be hard-coded.

    Reads without a match are written untrimmed unless the
    ``require_regex`` option in the ``filtering`` section of ``cfg`` is
    true, in which case they are dropped.
    """
    pattern = re.compile(trim_pattern)
    with gzip.open(in_fastq) as infile, gzip.open(out_fastq, 'w') as outfile:
        for header, seq, qual in parseFastq(infile):
            hits = list(pattern.finditer(seq))
            if hits:
                # Cut at the right-most hit and keep the removed tail in
                # the read ID.
                cut = hits[-1].start()
                header = seq[cut:] + '_' + header
                seq = seq[:cut]
                qual = qual[:cut]
            elif cfg.getboolean('filtering', 'require_regex'):
                # The option is only consulted for unmatched reads.
                continue

            if len(seq) >= min_length:
                outfile.write('@%s\n%s\n+%s\n%s\n'
                              % (header, seq, header, qual))