def iterative_adapter_prediction(fastq, ratios, kmer_lens, sample_num,
                                 keep_len=12):
    """Return a list of predicted adapters.

    Iteratively predict 3' adapter sequence with different
    combinations of k and R.

    At most `sample_num` reads are loaded from `fastq` once and reused
    for every k-mer length in `kmer_lens`.  For each k, the sequences
    assembled under every ratio in `ratios` are truncated to their first
    `keep_len` characters and the largest value seen per truncated
    sequence is kept.  Those per-k maxima are then summed across all k
    (rounded to 4 decimal places), and the pooled sequences are passed
    through `assemble_kmers` one final time.

    Returns the final assembly as a list sorted by its second tuple
    element in descending order, or an empty list when no sequence was
    assembled for any (k, R) combination.
    """
    fq_seq = []
    fq_obj = get_file_obj(fastq)
    try:
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Release the handle even when reading or parsing fails.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        curated = {}
        # k-mer counting depends only on k, so do it once per k and
        # reuse the result for every ratio.
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    # min() raises ValueError on an empty iterable; nothing assembled
    # simply means there is nothing to predict.
    if not collection:
        return []

    asmbl_min_len = min(map(len, collection))
    return sorted(
        assemble_kmers(list(collection.items()), asmbl_min_len // 2),
        key=itemgetter(1),
        reverse=True,
    )
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

    Predict 3' adapter sequence with a combination of k and R.

    The reads of `fastq` are fed through `count_kmers` (with `kmer_len`
    and `sample_num`), `filter_kmers` (with `ratio`) and
    `assemble_kmers` (with a second argument of `kmer_len // 2`).  The
    assembled items are returned as a list sorted by their second
    element in descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        return sorted(
            assemble_kmers(clean, kmer_len // 2),
            key=itemgetter(1),
            reverse=True,
        )
    finally:
        # Close the input even if any stage of the pipeline raises.
        fq_obj.close()
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

    Predict 3' adapter sequence with a combination of k and R.

    Pipeline: read sequences from `fastq`, count k-mers of length
    `kmer_len` (passing `sample_num` through to `count_kmers`), filter
    them with `ratio`, then assemble with `kmer_len // 2` as the second
    argument to `assemble_kmers`.  The result is a list ordered by the
    second element of each assembled item, largest first.
    """
    fq_obj = get_file_obj(fastq)
    try:
        freq = count_kmers(fastq_sequence(fq_obj), kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        assembl = sorted(
            assemble_kmers(clean, kmer_len // 2),
            key=itemgetter(1),
            reverse=True,
        )
    finally:
        # Guarantee the handle is released on both success and failure.
        fq_obj.close()
    return assembl
def to_fasta(args):
    """Print adapter-trimmed, collapsed reads to stdout in FASTA form.

    `args` is an options object whose attributes are read as follows:

    * ``m`` / ``x`` -- minimum / maximum accepted insert length; both
      must be positive and ``m`` must be strictly smaller than ``x``.
    * ``f`` / ``b`` -- 5' / 3' adapter sequences; at least one is
      required and they must differ.
    * ``seed_5p`` / ``seed_3p`` -- positive lengths passed to
      `make_regex` together with ``f`` / ``b``.
    * ``trim_5p`` / ``trim_3p`` -- non-negative number of extra bases
      removed from each end of the insert.
    * ``B`` -- when both adapters are given, require both (rather than
      either one) to be found.
    * ``a`` -- when truthy, accept reads regardless of adapter matching.
    * ``s`` -- forwarded unchanged to `make_regex` and `match_adapters`.
    * ``FASTQ`` -- input passed to `get_file_obj`.

    Each accepted insert is counted; the distinct inserts are printed in
    sorted order as ``>SEQ_COUNT`` followed by ``SEQ``.

    Raises:
        Exception: if any option fails validation.
    """
    if args.m <= 0:
        raise Exception("bad value: -m")
    if args.x <= 0:
        raise Exception("bad value: -x")
    # An inverted range (m > x) can never accept a read, so reject it up
    # front instead of silently producing empty output.
    if args.m >= args.x:
        raise Exception("bad read length cutoff range")
    if not args.f and not args.b:
        raise Exception("input adapter sequence")
    if args.f == args.b:
        raise Exception("5' and 3' adapters are same sequences")
    if args.seed_5p <= 0:
        raise Exception("bad value: --seed-5p")
    if args.seed_3p <= 0:
        raise Exception("bad value: --seed-3p")
    if args.trim_3p < 0:
        raise Exception("input positive value for 3'trimming")
    if args.trim_5p < 0:
        raise Exception("input positive value for 5'trimming")

    f_seq, f_len = args.f, args.seed_5p
    b_seq, b_len = args.b, args.seed_3p

    # Acceptance predicate over the second values returned by
    # match_adapters for the 5' (x) and 3' (y) adapters.
    # NOTE(review): '*' presumably marks "adapter not found" -- confirm
    # against match_adapters.
    if not f_seq and b_seq:
        req = lambda x, y: y != '*' or args.a
    elif f_seq and b_seq:
        if args.B:
            req = lambda x, y: x != '*' and y != '*' or args.a
        else:
            req = lambda x, y: x != '*' or y != '*' or args.a
    elif f_seq and not b_seq:
        req = lambda x, y: x != '*' or args.a

    f_pp, f_mp = make_regex(f_seq, f_len, False, args.s)
    b_pp, b_mp = make_regex(b_seq, b_len, True, args.s)

    fas = {}
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for seq in fastq_sequence(fq_obj):
            seq_len = len(seq)
            f_i, f_mm = match_adapters(seq, f_pp, f_mp, args.s)
            b_i, b_mm = match_adapters(seq, b_pp, b_mp, args.s,
                                       b_len, b_len + 1, seq_len)
            # NOTE(review): if b_i can be smaller than trim_3p the stop
            # index goes negative and the slice wraps from the read end
            # -- confirm the range of b_i in match_adapters.
            ins = seq[f_i + args.trim_5p:b_i - args.trim_3p]
            ins_len = len(ins)
            if req(f_mm, b_mm) and (args.m <= ins_len and ins_len <= args.x):
                fas[ins] = fas.get(ins, 0) + 1
    finally:
        # The input handle was previously never closed.
        fq_obj.close()

    fas = sorted(fas.items(), key=itemgetter(0))
    for seq, cnt in fas:
        print(">{0}_{1}\n{0}".format(seq, cnt))
def clip_adapter(fp, aseed, tm5, tm3, min_len, max_len):
    """Yield adapter-clipped clean reads.

    For every sequence produced by `fastq_sequence(fp)`, the adapter
    seed `aseed` is searched case-insensitively.  Reads without the seed
    are skipped.  For the others, everything from the seed onwards is
    removed, then `tm5` bases are trimmed from the start and `tm3` bases
    from the end of what remains.  The clipped read is yielded only if
    its length lies within [`min_len`, `max_len`].

    Raises:
        Exception: if a read is shorter than `tm5` or `tm3`.
    """
    seed_len = len(aseed)
    # The greedy "(.*)" prefix makes the search settle on the last
    # occurrence of the seed in the read.
    pp = re.compile("(.*)" + aseed, re.IGNORECASE)
    for seq in fastq_sequence(fp):
        if len(seq) < tm5 or len(seq) < tm3:
            raise Exception("trimming length is too large")
        match = pp.search(seq)
        if not match:
            continue
        end = match.end() - seed_len
        # Clamp at 0: when the seed starts fewer than tm3 bases into the
        # read, a negative stop would index from the read's end and
        # return a sequence that still contains the adapter.
        stop = max(end - tm3, 0)
        clipped_seq = seq[tm5:stop]
        L = len(clipped_seq)
        if min_len <= L and L <= max_len:
            yield clipped_seq
def clip_adapter(fp, aseed, tm5, tm3, min_len, max_len):
    """Yield adapter-clipped clean reads.

    Each sequence from `fastq_sequence(fp)` is searched
    (case-insensitively) for the adapter seed `aseed`; sequences that do
    not contain it are dropped.  The portion before the seed is kept,
    `tm5` bases are removed from its start and `tm3` from its end, and
    the result is yielded when `min_len <= len(result) <= max_len`.

    Raises:
        Exception: if a read is shorter than `tm5` or `tm3`.
    """
    seed_len = len(aseed)
    # "(.*)" is greedy, so the match ends at the last occurrence of the
    # seed in the read.
    pp = re.compile("(.*)" + aseed, re.IGNORECASE)
    for seq in fastq_sequence(fp):
        if len(seq) < tm5 or len(seq) < tm3:
            raise Exception("trimming length is too large")
        match = pp.search(seq)
        if not match:
            continue
        end = match.end() - seed_len
        # A seed found within the first tm3 bases would make the stop
        # index negative, which Python interprets as "from the end";
        # clamp it so the clipped read is empty instead of wrong.
        stop = max(end - tm3, 0)
        clipped_seq = seq[tm5:stop]
        L = len(clipped_seq)
        if min_len <= L and L <= max_len:
            yield clipped_seq
def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return the number of
    the reads.

    Reads are taken from `fastq`.  When `aseed` is a prefix of the
    literal string "RAW_INPUT" (this includes the empty string), the
    sequences are used as they are; otherwise they are passed through
    `clip_adapter` with `aseed`, `tm5`, `tm3`, `min_len` and `max_len`.
    Identical sequences are collapsed and written to `fasta` as
    ``>SEQ_COUNT`` followed by ``SEQ``.

    Returns the total number of reads written, i.e. the sum of the
    counts over all distinct sequences.
    """
    fas = {}
    fq_obj = get_file_obj(fastq)
    try:
        if "RAW_INPUT".startswith(aseed):
            iterator = fastq_sequence(fq_obj)
        else:
            iterator = clip_adapter(fq_obj, aseed, tm5, tm3,
                                    min_len, max_len)
        for seq in iterator:
            fas[seq] = fas.get(seq, 0) + 1
    finally:
        # Close the input even when reading or clipping raises.
        fq_obj.close()

    clean_read_count = 0
    # The context manager closes the output even if a write fails.
    with open(fasta, "w") as fa_obj:
        for seq, cnt in fas.items():
            clean_read_count += cnt
            fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    return clean_read_count
def iterative_adapter_prediction(fastq, ratios, kmer_lens, sample_num,
                                 keep_len=12):
    """Return a list of predicted adapters.

    Iteratively predict 3' adapter sequence with different
    combinations of k and R.

    Up to `sample_num` reads are read from `fastq` a single time and
    shared by all k-mer lengths.  For each k in `kmer_lens` and each R
    in `ratios`, k-mers are counted, filtered and assembled; every
    assembled sequence is cut to its first `keep_len` characters and
    the maximum value per cut sequence is retained for that k.  The
    per-k maxima are accumulated across k (rounded to 4 decimals) and
    the accumulated sequences are assembled once more.

    Returns the final assembly sorted by its second tuple element,
    largest first; an empty list if nothing was assembled at all.
    """
    fq_obj = get_file_obj(fastq)
    fq_seq = []
    try:
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Do not leak the handle when the FASTQ cannot be parsed.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        # Counting is independent of the ratio; compute it once per k.
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        curated = {}
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    # Guard against min() raising ValueError on an empty collection.
    if not collection:
        return []

    asmbl_min_len = min(map(len, collection))
    assembl = sorted(
        assemble_kmers(list(collection.items()), asmbl_min_len // 2),
        key=itemgetter(1),
        reverse=True,
    )
    return assembl