def iterative_adapter_prediction(fastq, ratios, kmer_lens, sample_num,
                                 keep_len=12):
    """Return a list of predicted adapters.

    Iteratively predict 3' adapter sequence with different
    combinations of k and R.

    The first ``sample_num`` sequences of ``fastq`` are loaded once and
    reused for every k-mer length in ``kmer_lens``.  For each k-mer length
    the k-mers are counted, then filtered with every ratio in ``ratios``
    and assembled.  Each assembled sequence is truncated to ``keep_len``
    characters and the largest value seen for a truncated sequence is
    kept.  Those values are summed across k-mer lengths (rounded to four
    decimals) and the pooled sequences are assembled one final time.

    Returns:
        The final assembly sorted by the second item of each entry in
        descending order, or an empty list when no candidate sequence was
        collected.
    """
    fq_seq = []
    fq_obj = get_file_obj(fastq)
    try:
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Release the handle even when reading fails part-way through.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        curated = {}
        # Counting depends only on k, so do it once per k-mer length.
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    if not collection:
        # min() below would raise ValueError on an empty collection.
        return []

    asmbl_min_len = min(map(len, collection))
    return sorted(assemble_kmers(list(collection.items()), asmbl_min_len // 2),
                  key=itemgetter(1), reverse=True)
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

    Predict 3' adapter sequence with a combination of k and R.

    K-mers of length ``kmer_len`` are counted from the sequences of
    ``fastq``, filtered with ``ratio`` and assembled.

    Returns:
        The assembly sorted by the second item of each entry in
        descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        return sorted(assemble_kmers(clean, kmer_len // 2),
                      key=itemgetter(1), reverse=True)
    finally:
        # Close the input even if counting, filtering or assembly raises.
        fq_obj.close()
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

    Predict 3' adapter sequence with a combination of k and R.

    K-mers of length ``kmer_len`` are counted from the sequences of
    ``fastq``, filtered with ``ratio`` and assembled.

    Returns:
        The assembly sorted by the second item of each entry in
        descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        return sorted(assemble_kmers(clean, kmer_len // 2),
                      key=itemgetter(1), reverse=True)
    finally:
        # Close the input even if counting, filtering or assembly raises.
        fq_obj.close()
def to_fasta(args):
    """Print collapsed FASTA of adapter-clipped reads from ``args.FASTQ``.

    Every read is matched against the 5' adapter (``args.f``) and the
    3' adapter (``args.b``).  The slice between the two indices returned
    by ``match_adapters``, shortened by ``args.trim_5p`` and
    ``args.trim_3p``, is the insert.  Inserts that satisfy the adapter
    requirement and whose length lies within ``args.m`` .. ``args.x``
    (inclusive) are counted, and each distinct insert is printed in sorted
    order as ``>SEQ_COUNT`` followed by ``SEQ``.

    Raises:
        Exception: when an option on ``args`` fails validation; the
            message names the offending option.
    """
    import sys

    if args.m <= 0:
        raise Exception("bad value: -m")
    if args.x <= 0:
        raise Exception("bad value: -x")
    if args.m == args.x:
        raise Exception("bad read length cutoff range")
    if not args.f and not args.b:
        raise Exception("input adapter sequence")
    if args.f == args.b:
        raise Exception("5' and 3' adapters are same sequences")
    if args.seed_5p <= 0:
        raise Exception("bad value: --seed-5p")
    if args.seed_3p <= 0:
        raise Exception("bad value: --seed-3p")
    if args.trim_3p < 0:
        raise Exception("input positive value for 3'trimming")
    if args.trim_5p < 0:
        raise Exception("input positive value for 5'trimming")

    f_seq, f_len = args.f, args.seed_5p
    b_seq, b_len = args.b, args.seed_3p

    # Decide which adapter results a read needs in order to be kept.
    # NOTE(review): '*' looks like match_adapters' "not found" marker --
    # confirm against that helper.  A truthy args.a accepts every read.
    # At least one adapter is set here (validated above), so the three
    # branches are exhaustive.
    if f_seq and b_seq:
        if args.B:
            # Both adapters must be found.
            req = lambda x, y: x != '*' and y != '*' or args.a
        else:
            # Either adapter is enough.
            req = lambda x, y: x != '*' or y != '*' or args.a
    elif b_seq:
        req = lambda x, y: y != '*' or args.a
    else:
        req = lambda x, y: x != '*' or args.a

    f_pp, f_mp = make_regex(f_seq, f_len, False, args.s)
    b_pp, b_mp = make_regex(b_seq, b_len, True, args.s)

    fas = {}
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for seq in fastq_sequence(fq_obj):
            seq_len = len(seq)
            f_i, f_mm = match_adapters(seq, f_pp, f_mp, args.s)
            b_i, b_mm = match_adapters(seq, b_pp, b_mp, args.s,
                                       b_len, b_len + 1, seq_len)
            ins = seq[f_i + args.trim_5p:b_i - args.trim_3p]
            if req(f_mm, b_mm) and args.m <= len(ins) <= args.x:
                fas[ins] = fas.get(ins, 0) + 1
    finally:
        # Release the input handle, but never close the interpreter's
        # stdin in case get_file_obj handed it back.
        if fq_obj is not sys.stdin:
            fq_obj.close()

    for seq, cnt in sorted(fas.items(), key=itemgetter(0)):
        print(">{0}_{1}\n{0}".format(seq, cnt))
def qual_trim(args):
    """Trim low-quality 3' ends from FASTQ records and print the survivors.

    The quality string of each record is scanned from the last position
    backwards while accumulating ``cutoff - score``.  The record is cut at
    the position where that running sum was largest, and scanning stops as
    soon as the sum drops below zero.  Records shorter than ``args.l``
    (before or after trimming), or whose trimmed sequence consists only of
    ``N``/``n``, are not printed.

    ``cutoff`` is ``args.q`` when it is non-zero, otherwise it is derived
    from ``args.p`` via ``calc_qual_score``.

    Side effects:
        Sets ``args.b`` to 64 when ``args.solexa`` is true.

    Raises:
        Exception: when an option on ``args`` fails validation.
    """
    import sys

    # Pick the converter applied to each offset-subtracted quality code.
    if args.solexa:
        args.b = 64
        func = solexa_to_phred
    elif args.illumina5:
        func = illumina_64B
    else:
        func = illumina_33

    if args.b not in (33, 64):
        raise Exception("wrong quality score base")
    if args.l < 1:
        raise Exception("specify longer read length")
    if args.p < 0 or args.p > 1:
        raise Exception("bad error probability cutoff")
    if not args.solexa and args.q < 0:
        raise Exception("bad quality score cutoff")

    if args.q:
        cutoff = args.q
    else:
        cutoff = calc_qual_score(args.p, args.solexa)
    base = args.b
    minlen = args.l

    fq_obj = get_file_obj(args.FASTQ)
    try:
        for read in fastq_record(fq_obj):
            read = read.rstrip().split("\n")
            qual = read[3]
            s, max_s = 0, 0
            max_i = len(qual)
            if minlen > max_i:
                continue
            # Walk from the 3' end; max_i tracks the cut position with the
            # largest running sum seen so far.
            for i in reversed(range(max_i)):
                q = func(ord(qual[i]) - base)
                s += cutoff - q
                if s < 0:
                    break
                if s > max_s:
                    max_s, max_i = s, i
            read[1] = read[1][:max_i]
            read[3] = read[3][:max_i]
            seq = read[1]
            # Case-insensitive count of N, without a regex.
            n_num = seq.count("N") + seq.count("n")
            if n_num < len(seq) and len(seq) >= minlen:
                print("\n".join(read))
    finally:
        # Release the input handle, but never close the interpreter's
        # stdin in case get_file_obj handed it back.
        if fq_obj is not sys.stdin:
            fq_obj.close()
def guess_qual_offset(args):
    """Guess the quality-score encoding used by ``args.FASTQ``.

    The distinct quality characters of at most the first 50000 quality
    strings are collected.  The second-lowest and second-highest character
    codes (not the extremes) are then compared with each platform's
    range, and the first platform that contains both wins.

    Returns:
        A string of the form ``"<platform>:base=<offset>"``.

    Raises:
        Exception: when fewer than two distinct quality characters were
            seen or no platform range fits.
    """
    import sys

    # (platform name, lowest code, highest code, base offset)
    platform = [
        ('Sanger/Illumina-1.8+', 33, 76, 33),
        ('Illumina-1.5+', 67, 104, 64),
        ('Illumina-1.3+', 54, 104, 64),
        ('Solexa', 59, 104, 64),
    ]
    sample_num = 50000

    q_chars = set()
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for i, quality in enumerate(fastq_quality(fq_obj)):
            if i == sample_num:
                break
            # In-place update avoids rebuilding the set for every record.
            q_chars.update(quality)
    finally:
        # Release the input handle, but never close the interpreter's
        # stdin in case get_file_obj handed it back.
        if fq_obj is not sys.stdin:
            fq_obj.close()

    q_int = sorted(map(ord, q_chars))
    if len(q_int) <= 1:
        raise Exception("unknown quality encoding")
    for name, low, high, base in platform:
        if low <= q_int[1] and q_int[-2] <= high:
            return "{}:base={}".format(name, base)
    raise Exception("unknown quality encoding")
def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return the number of
    the reads.

    When ``aseed`` is a prefix of ``"RAW_INPUT"`` the sequences of
    ``fastq`` are used unchanged; otherwise they come from
    ``clip_adapter``.  Identical sequences are collapsed, and each is
    written to ``fasta`` as ``>SEQ_COUNT`` followed by ``SEQ``.

    Returns:
        The sum of the counts written, i.e. the number of sequences read
        from the iterator.
    """
    fq_obj = get_file_obj(fastq)
    try:
        if "RAW_INPUT".startswith(aseed):
            iterator = fastq_sequence(fq_obj)
        else:
            iterator = clip_adapter(fq_obj, aseed, tm5, tm3,
                                    min_len, max_len)

        fas = {}
        for seq in iterator:
            fas[seq] = fas.get(seq, 0) + 1

        clean_read_count = 0
        # The output is opened only after all input has been consumed, so
        # a failure while reading never leaves a partial FASTA behind.
        with open(fasta, "w") as fa_obj:
            for seq, cnt in fas.items():
                clean_read_count += cnt
                fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    finally:
        # Close the input even if clipping or writing raises.
        fq_obj.close()
    return clean_read_count
def guess_qual_offset(args):
    """Guess the quality-score encoding used by ``args.FASTQ``.

    The distinct quality characters of at most the first 50000 quality
    strings are collected.  The second-lowest and second-highest character
    codes (not the extremes) are then compared with each platform's
    range, and the first platform that contains both wins.

    Returns:
        A string of the form ``"<platform>:base=<offset>"``.

    Raises:
        Exception: when fewer than two distinct quality characters were
            seen or no platform range fits.
    """
    import sys

    # (platform name, lowest code, highest code, base offset)
    platform = [
        ('Sanger/Illumina-1.8+', 33, 76, 33),
        ('Illumina-1.5+', 67, 104, 64),
        ('Illumina-1.3+', 54, 104, 64),
        ('Solexa', 59, 104, 64),
    ]
    sample_num = 50000

    q_chars = set()
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for i, quality in enumerate(fastq_quality(fq_obj)):
            if i == sample_num:
                break
            # In-place update avoids rebuilding the set for every record.
            q_chars.update(quality)
    finally:
        # Release the input handle, but never close the interpreter's
        # stdin in case get_file_obj handed it back.
        if fq_obj is not sys.stdin:
            fq_obj.close()

    q_int = sorted(map(ord, q_chars))
    if len(q_int) <= 1:
        raise Exception("unknown quality encoding")
    for name, low, high, base in platform:
        if low <= q_int[1] and q_int[-2] <= high:
            return "{}:base={}".format(name, base)
    raise Exception("unknown quality encoding")
def fastq_input_prep(fastq, ratio, temp_dir):
    """Write FASTQ in the temporary directory, and return (subsampled)
    FASTQ name, the total read count, standard deviation of read lengths.

    Every ``int(1 / ratio)``-th record of ``fastq`` is copied to
    ``<temp_dir>/input.fq``.  The read count and the length statistics
    cover all records, not only the copied ones.

    Returns:
        A tuple ``(fq_out, read_count, sd)`` where ``read_count`` is a
        float and ``sd`` is the population standard deviation of the
        sequence-line lengths (``0.0`` when the input has no records).
    """
    # A ratio above 1 truncates to 0, which would make ``i % num`` divide
    # by zero; fall back to keeping every record.
    num = int(1 / ratio) or 1
    read_count = 0.0
    stats = {}
    fq_out = "{}/input.fq".format(temp_dir)

    fq_obj = get_file_obj(fastq)
    try:
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num == 0:
                    fout.write(rec)
                read_count += 1
                # The second line of a record is the sequence.
                L = len(rec.split("\n")[1])
                stats[L] = stats.get(L, 0) + 1
    finally:
        # Close the input even if reading or writing raises.
        fq_obj.close()

    if not read_count:
        # No records: avoid dividing by zero below.
        return fq_out, read_count, 0.0

    mean = sum(L * c for L, c in stats.items()) / read_count
    sum_square = sum((L - mean) ** 2 * c for L, c in stats.items())
    sd = (sum_square / read_count) ** 0.5
    return fq_out, read_count, sd
def iterative_adapter_prediction(fastq, ratios, kmer_lens, sample_num,
                                 keep_len=12):
    """Return a list of predicted adapters.

    Iteratively predict 3' adapter sequence with different
    combinations of k and R.

    The first ``sample_num`` sequences of ``fastq`` are loaded once and
    reused for every k-mer length in ``kmer_lens``.  For each k-mer length
    the k-mers are counted, then filtered with every ratio in ``ratios``
    and assembled.  Each assembled sequence is truncated to ``keep_len``
    characters and the largest value seen for a truncated sequence is
    kept.  Those values are summed across k-mer lengths (rounded to four
    decimals) and the pooled sequences are assembled one final time.

    Returns:
        The final assembly sorted by the second item of each entry in
        descending order, or an empty list when no candidate sequence was
        collected.
    """
    fq_seq = []
    fq_obj = get_file_obj(fastq)
    try:
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Release the handle even when reading fails part-way through.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        curated = {}
        # Counting depends only on k, so do it once per k-mer length.
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    if not collection:
        # min() below would raise ValueError on an empty collection.
        return []

    asmbl_min_len = min(map(len, collection))
    return sorted(assemble_kmers(list(collection.items()), asmbl_min_len // 2),
                  key=itemgetter(1), reverse=True)
def fastq_input_prep(fastq, ratio, temp_dir):
    """Write FASTQ in the temporary directory, and return (subsampled)
    FASTQ name, the total read count, standard deviation of read lengths.

    Every ``int(1 / ratio)``-th record of ``fastq`` is copied to
    ``<temp_dir>/input.fq``.  The read count and the length statistics
    cover all records, not only the copied ones.

    Returns:
        A tuple ``(fq_out, read_count, sd)`` where ``read_count`` is a
        float and ``sd`` is the population standard deviation of the
        sequence-line lengths (``0.0`` when the input has no records).
    """
    # A ratio above 1 truncates to 0, which would make ``i % num`` divide
    # by zero; fall back to keeping every record.
    num = int(1 / ratio) or 1
    read_count = 0.0
    stats = {}
    fq_out = "{}/input.fq".format(temp_dir)

    fq_obj = get_file_obj(fastq)
    try:
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num == 0:
                    fout.write(rec)
                read_count += 1
                # The second line of a record is the sequence.
                L = len(rec.split("\n")[1])
                stats[L] = stats.get(L, 0) + 1
    finally:
        # Close the input even if reading or writing raises.
        fq_obj.close()

    if not read_count:
        # No records: avoid dividing by zero below.
        return fq_out, read_count, 0.0

    mean = sum(L * c for L, c in stats.items()) / read_count
    sum_square = sum((L - mean) ** 2 * c for L, c in stats.items())
    sd = (sum_square / read_count) ** 0.5
    return fq_out, read_count, sd