def qual_trim(args):
    """Trim low-quality tails from FASTQ reads and print the reads that survive.

    Each record yielded by ``fastq_record`` is split into lines; line 1 is
    treated as the sequence and line 3 as the quality string.  The quality
    string is scanned from its last character towards its first while a
    running sum of ``cutoff - score`` is kept.  The scan stops as soon as the
    sum drops below zero, and the read is cut at the position where the sum
    peaked.  A trimmed record is printed to standard output only if its
    sequence is at least ``args.l`` long and is not made up entirely of
    ``N``/``n`` characters.  Records whose quality string is already shorter
    than ``args.l`` are skipped without being scanned.

    Args:
        args: Options object.  The attributes read are ``solexa``,
            ``illumina5``, ``b`` (quality score base), ``l`` (minimum read
            length), ``p`` (error probability cutoff), ``q`` (quality score
            cutoff) and ``FASTQ`` (input handed to ``get_file_obj``).
            NOTE: ``args.b`` is overwritten with 64 when ``args.solexa`` is
            truthy.

    Raises:
        Exception: If the quality base is neither 33 nor 64, the minimum
            length is below 1, the probability cutoff is outside [0, 1], or
            the quality cutoff is negative for non-Solexa input.
    """
    # Choose the converter applied to each offset-corrected quality value.
    # Only the selected helper name is looked up.
    if args.solexa:
        args.b = 64
        func = solexa_to_phred
    elif args.illumina5:
        func = illumina_64B
    else:
        func = illumina_33

    # Option validation; order and messages are part of the behaviour.
    if args.b not in (33, 64):
        raise Exception("wrong quality score base")
    if args.l < 1:
        raise Exception("specify longer read length")
    if args.p < 0 or args.p > 1:
        raise Exception("bad error probability cutoff")
    if not args.solexa and args.q < 0:
        raise Exception("bad quality score cutoff")

    # A falsy quality cutoff (e.g. 0) means "derive it from the probability".
    cutoff = args.q if args.q else calc_qual_score(args.p, args.solexa)
    base = args.b
    minlen = args.l
    ns = re.compile('N', re.IGNORECASE)

    fq_obj = get_file_obj(args.FASTQ)
    try:
        for record in fastq_record(fq_obj):
            fields = record.rstrip().split("\n")
            qual = fields[3]
            qual_len = len(qual)
            if minlen > qual_len:
                continue

            # Walk backwards from the last base; ``cut`` stays at the full
            # length (no trimming) unless a positive peak is found.
            running, best = 0, 0
            cut = qual_len
            for i in range(qual_len - 1, -1, -1):
                running += cutoff - func(ord(qual[i]) - base)
                if running < 0:
                    break
                if running > best:
                    best, cut = running, i

            fields[1] = fields[1][:cut]
            fields[3] = qual[:cut]

            seq = fields[1]
            n_num = len(ns.findall(seq))
            if n_num < len(seq) and len(seq) >= minlen:
                print("\n".join(fields))
    finally:
        # Release the input handle even if a record is malformed or the
        # consumer of stdout goes away mid-run.
        fq_obj.close()
def fastq_input_prep(fastq, ratio, temp_dir):
    """Write a (subsampled) copy of FASTQ into the temporary directory.

    Every ``int(1 / ratio)``-th record produced by ``fastq_record`` (starting
    with the first) is written to ``<temp_dir>/input.fq``.  Read-length
    statistics are collected over the records that were written, using the
    second line of each record as the read sequence.

    Args:
        fastq: Input handed to ``get_file_obj``.
        ratio: Subsampling rate.  A rate whose magnitude exceeds 1 is treated
            as "keep every record".
        temp_dir: Directory that receives ``input.fq``.

    Returns:
        A tuple ``(fq_out, read_count, sd)``: the path of the written FASTQ,
        the number of records written (as a float), and the population
        standard deviation of their read lengths.  When no record was
        written, ``read_count`` and ``sd`` are both ``0.0``.

    Raises:
        ZeroDivisionError: If ``ratio`` is 0.
    """
    # int() truncates towards zero, so a ratio above 1 would give a step of 0
    # and make the modulo below fail; fall back to a step of 1.
    num = int(1 / ratio) or 1
    read_count = 0.0
    stats = {}  # read length -> number of written records with that length
    fq_out = "{}/input.fq".format(temp_dir)

    fq_obj = get_file_obj(fastq)
    try:
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num:
                    continue
                fout.write(rec)
                read_count += 1
                length = len(rec.split("\n")[1])
                stats[length] = stats.get(length, 0) + 1
    finally:
        # Always release the input handle, even if writing fails part-way.
        fq_obj.close()

    # No records: the mean and deviation are undefined, so report zeros
    # instead of dividing by zero.
    if not read_count:
        return fq_out, read_count, 0.0

    mean = sum(length * count for length, count in stats.items()) / read_count
    sum_square = sum(
        (length - mean) ** 2 * count for length, count in stats.items()
    )
    sd = (sum_square / read_count) ** 0.5
    return fq_out, read_count, sd
def fastq_input_prep(fastq, ratio, temp_dir):
    """Write a (subsampled) copy of FASTQ into the temporary directory.

    Every ``int(1 / ratio)``-th record produced by ``fastq_record`` (starting
    with the first) is written to ``<temp_dir>/input.fq``.  Read-length
    statistics are collected over the records that were written, using the
    second line of each record as the read sequence.

    Args:
        fastq: Input handed to ``get_file_obj``.
        ratio: Subsampling rate.  A rate whose magnitude exceeds 1 is treated
            as "keep every record".
        temp_dir: Directory that receives ``input.fq``.

    Returns:
        A tuple ``(fq_out, read_count, sd)``: the path of the written FASTQ,
        the number of records written (as a float), and the population
        standard deviation of their read lengths.  When no record was
        written, ``read_count`` and ``sd`` are both ``0.0``.

    Raises:
        ZeroDivisionError: If ``ratio`` is 0.
    """
    # int() truncates towards zero, so a ratio above 1 would give a step of 0
    # and make the modulo below fail; fall back to a step of 1.
    num = int(1 / ratio) or 1
    read_count = 0.0
    stats = {}  # read length -> number of written records with that length
    fq_out = "{}/input.fq".format(temp_dir)

    fq_obj = get_file_obj(fastq)
    try:
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num:
                    continue
                fout.write(rec)
                read_count += 1
                length = len(rec.split("\n")[1])
                stats[length] = stats.get(length, 0) + 1
    finally:
        # Always release the input handle, even if writing fails part-way.
        fq_obj.close()

    # No records: the mean and deviation are undefined, so report zeros
    # instead of dividing by zero.
    if not read_count:
        return fq_out, read_count, 0.0

    mean = sum(length * count for length, count in stats.items()) / read_count
    sum_square = sum(
        (length - mean) ** 2 * count for length, count in stats.items()
    )
    sd = (sum_square / read_count) ** 0.5
    return fq_out, read_count, sd