Esempio n. 1
0
def iterative_adapter_prediction(fastq, ratios, kmer_lens,
                                 sample_num, keep_len=12):
    """Return a list of predicted adapters.

       Iteratively predict 3' adapter sequence with different
       combinations of k and R.

       Arguments:
         fastq      -- value handed to get_file_obj() to open the reads
         ratios     -- iterable of R values, each passed to filter_kmers();
                       it is traversed once per k, so it must be
                       re-iterable (e.g. a list)
         kmer_lens  -- iterable of k values, each passed to count_kmers()
         sample_num -- reading stops once this many sequences were taken
         keep_len   -- length of the prefix of every assembled sequence
                       that is used as the key when merging results

       Returns the result of the final assemble_kmers() call sorted by
       its second field in descending order.  An empty list is returned
       when no combination of k and R produced any candidate.
    """
    fq_seq = []
    fq_obj = get_file_obj(fastq)
    try:
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Release the handle even when reading the input fails midway.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        # The counts depend only on k, so they are computed once per k
        # and shared by every ratio.
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        curated = {}
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                # Candidates sharing the same prefix are merged; for one
                # k only the best value over all ratios is kept.
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        # Across different k the per-k best values are accumulated.
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    if not collection:
        # min() below would raise ValueError on an empty sequence.
        return []

    asmbl_min_len = min(map(len, collection))
    return sorted(assemble_kmers(list(collection.items()),
                                 asmbl_min_len // 2),
                  key=itemgetter(1), reverse=True)
Esempio n. 2
0
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

       Predict 3' adapter sequence with a combination of k and R.

       Arguments:
         fastq      -- value handed to get_file_obj() to open the reads
         ratio      -- R value passed to filter_kmers()
         kmer_len   -- k value passed to count_kmers(), filter_kmers()
                       and (halved) to assemble_kmers()
         sample_num -- passed through to count_kmers()

       Returns the result of assemble_kmers() sorted by its second
       field in descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        # Sorting happens while the handle is still open, in case any
        # stage of the pipeline reads from the file lazily.
        return sorted(assemble_kmers(clean, kmer_len // 2),
                      key=itemgetter(1), reverse=True)
    finally:
        # Close the input even if one of the stages above raises.
        fq_obj.close()
Esempio n. 3
0
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

       Predict 3' adapter sequence with a combination of k and R.

       Arguments:
         fastq      -- value handed to get_file_obj() to open the reads
         ratio      -- R value passed to filter_kmers()
         kmer_len   -- k value passed to count_kmers(), filter_kmers()
                       and (halved) to assemble_kmers()
         sample_num -- passed through to count_kmers()

       Returns the result of assemble_kmers() sorted by its second
       field in descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        # Keep the sort inside the guarded region so that a lazily
        # evaluated pipeline is consumed before the handle is closed.
        assembl = sorted(assemble_kmers(clean, kmer_len // 2),
                         key=itemgetter(1),
                         reverse=True)
    finally:
        # Runs on success and on failure, so the handle never leaks.
        fq_obj.close()
    return assembl
Esempio n. 4
0
def to_fasta(args):
    """Print collapsed, adapter-clipped reads in FASTA format.

       `args` is an attribute container (e.g. an argparse namespace)
       that provides: m / x (minimum / maximum insert length), f / b
       (5' / 3' adapter sequences), seed_5p / seed_3p, trim_5p /
       trim_3p, s, a, B and FASTQ.

       Every distinct insert that passes the adapter requirement and
       the length window is printed once, in ascending sequence order,
       as ">{sequence}_{count}" followed by the sequence.

       Raises Exception when an option value is rejected by the checks
       at the top of the function.
    """
    if args.m <= 0:
        raise Exception("bad value: -m")
    if args.x <= 0:
        raise Exception("bad value: -x")
    if args.m == args.x:
        raise Exception("bad read length cutoff range")
    if not args.f and not args.b:
        raise Exception("input adapter sequence")
    if args.f == args.b:
        raise Exception("5' and 3' adapters are same sequences")
    if args.seed_5p <= 0:
        raise Exception("bad value: --seed-5p")
    if args.seed_3p <= 0:
        raise Exception("bad value: --seed-3p")
    if args.trim_3p < 0:
        raise Exception("input positive value for 3'trimming")
    if args.trim_5p < 0:
        raise Exception("input positive value for 5'trimming")

    f_seq, f_len = args.f, args.seed_5p
    b_seq, b_len = args.b, args.seed_3p

    # `req` decides from the two values returned by match_adapters()
    # whether a read is kept; args.a switches the requirement off.
    # NOTE(review): '*' presumably marks "adapter not found" - confirm
    # against match_adapters().
    if not f_seq and b_seq:
        req = lambda x, y: y != '*' or args.a
    elif f_seq and b_seq:
        if args.B:
            # Both adapters have to be present.
            req = lambda x, y: x != '*' and y != '*' or args.a
        else:
            # Either adapter is sufficient.
            req = lambda x, y: x != '*' or y != '*' or args.a
    elif f_seq and not b_seq:
        req = lambda x, y: x != '*' or args.a

    f_pp, f_mp = make_regex(f_seq, f_len, False, args.s)
    b_pp, b_mp = make_regex(b_seq, b_len, True,  args.s)

    fas = {}
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for seq in fastq_sequence(fq_obj):
            seq_len = len(seq)
            f_i, f_mm = match_adapters(seq, f_pp, f_mp, args.s)
            b_i, b_mm = match_adapters(seq, b_pp, b_mp, args.s,
                                       b_len, b_len+1, seq_len)
            ins = seq[f_i+args.trim_5p : b_i-args.trim_3p]
            if req(f_mm, b_mm) and args.m <= len(ins) <= args.x:
                fas[ins] = fas.get(ins, 0) + 1
    finally:
        # The input handle used to be left open; close it on every path.
        fq_obj.close()

    for seq, cnt in sorted(fas.items(), key=itemgetter(0)):
        print(">{0}_{1}\n{0}".format(seq, cnt))
Esempio n. 5
0
def clip_adapter(fp, aseed, tm5, tm3, min_len, max_len):
    """Yield adapter-clipped clean reads.

       Arguments:
         fp      -- handle passed to fastq_sequence() to obtain the reads
         aseed   -- adapter seed; it is inserted into a regular
                    expression as-is and matched case-insensitively
         tm5     -- number of bases removed from the 5' end
         tm3     -- number of bases removed just upstream of the adapter
         min_len -- shortest clipped read that is still reported
         max_len -- longest clipped read that is still reported

       Reads without the adapter seed are skipped.  Raises Exception
       when a read is shorter than either trimming length.
    """
    seed_len = len(aseed)
    # The greedy "(.*)" prefix makes the search settle on the last
    # occurrence of the seed within the read.
    pp = re.compile("(.*)" + aseed, re.IGNORECASE)
    for seq in fastq_sequence(fp):
        if len(seq) < tm5 or len(seq) < tm3:
            raise Exception("trimming length is too large")
        match = pp.search(seq)
        if not match:
            continue
        end = match.end() - seed_len
        # If the adapter starts fewer than tm3 bases into the read,
        # end - tm3 is negative and would be interpreted as an offset
        # from the END of the read, returning sequence that still
        # contains the adapter.  Clamp so the insert is empty instead.
        stop = max(end - tm3, 0)
        clipped_seq = seq[tm5:stop]
        if min_len <= len(clipped_seq) <= max_len:
            yield clipped_seq
Esempio n. 6
0
def clip_adapter(fp, aseed, tm5, tm3, min_len, max_len):
    """Yield adapter-clipped clean reads.

       Arguments:
         fp      -- handle passed to fastq_sequence() to obtain the reads
         aseed   -- adapter seed; it is inserted into a regular
                    expression as-is and matched case-insensitively
         tm5     -- number of bases removed from the 5' end
         tm3     -- number of bases removed just upstream of the adapter
         min_len -- shortest clipped read that is still reported
         max_len -- longest clipped read that is still reported

       Reads without the adapter seed are skipped.  Raises Exception
       when a read is shorter than either trimming length.
    """
    seed_len = len(aseed)
    # Greedy "(.*)" pushes the match to the last occurrence of the seed.
    pp = re.compile("(.*)"+aseed, re.IGNORECASE)
    for seq in fastq_sequence(fp):
        if len(seq) < tm5 or len(seq) < tm3:
            raise Exception("trimming length is too large")
        match = pp.search(seq)
        if not match:
            continue
        # Position where the adapter seed begins.
        end = match.end() - seed_len
        # A negative stop index would wrap around and count from the
        # end of the read, so the 3' cut point is clamped at zero.
        stop = max(end - tm3, 0)
        clipped_seq = seq[tm5:stop]
        L = len(clipped_seq)
        if min_len <= L <= max_len:
            yield clipped_seq
Esempio n. 7
0
def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return
       the number of the reads.

       Arguments:
         fastq   -- value handed to get_file_obj() to open the reads
         fasta   -- path of the FASTA file to create (overwritten)
         aseed   -- adapter seed forwarded to clip_adapter(); when it is
                    a prefix of the literal "RAW_INPUT" the reads are
                    written without clipping
         tm5, tm3, min_len, max_len
                 -- forwarded unchanged to clip_adapter()

       Identical sequences are collapsed into one record written as
       ">{sequence}_{count}" followed by the sequence.  The return
       value is the sum of all counts.
    """
    fas = {}
    fq_obj = get_file_obj(fastq)
    try:
        # str.startswith() is also true for the empty string, so an
        # empty seed selects the unclipped path as well.
        if "RAW_INPUT".startswith(aseed):
            iterator = fastq_sequence(fq_obj)
        else:
            iterator = clip_adapter(fq_obj, aseed, tm5, tm3,
                                    min_len, max_len)
        for seq in iterator:
            fas[seq] = fas.get(seq, 0) + 1
    finally:
        # Close the input even if reading or clipping raises.
        fq_obj.close()

    # The output is opened only after all reads were consumed, so a
    # failure while reading never truncates an existing file.
    with open(fasta, "w") as fa_obj:
        for seq, cnt in fas.items():
            fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    return sum(fas.values())
Esempio n. 8
0
def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return
       the number of the reads.

       Arguments:
         fastq   -- value handed to get_file_obj() to open the reads
         fasta   -- path of the FASTA file to create (overwritten)
         aseed   -- adapter seed forwarded to clip_adapter(); when it is
                    a prefix of the literal "RAW_INPUT" the reads are
                    written without clipping
         tm5, tm3, min_len, max_len
                 -- forwarded unchanged to clip_adapter()

       Identical sequences are collapsed into one record written as
       ">{sequence}_{count}" followed by the sequence.  The return
       value is the sum of all counts.
    """
    fq_obj = get_file_obj(fastq)
    try:
        # Note that "RAW_INPUT".startswith("") is True as well.
        if "RAW_INPUT".startswith(aseed):
            iterator = fastq_sequence(fq_obj)
        else:
            iterator = clip_adapter(fq_obj, aseed, tm5, tm3,
                                    min_len, max_len)
        fas = {}
        for seq in iterator:
            fas[seq] = fas.get(seq, 0) + 1
    finally:
        # Guarantees the input handle is released on every exit path.
        fq_obj.close()

    clean_read_count = 0
    # `with` closes the output even if a write fails.
    with open(fasta, "w") as fa_obj:
        for seq, cnt in fas.items():
            clean_read_count += cnt
            fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    return clean_read_count
Esempio n. 9
0
def iterative_adapter_prediction(fastq,
                                 ratios,
                                 kmer_lens,
                                 sample_num,
                                 keep_len=12):
    """Return a list of predicted adapters.

       Iteratively predict 3' adapter sequence with different
       combinations of k and R.

       Arguments:
         fastq      -- value handed to get_file_obj() to open the reads
         ratios     -- iterable of R values, each passed to filter_kmers();
                       it is traversed once per k, so it must be
                       re-iterable (e.g. a list)
         kmer_lens  -- iterable of k values, each passed to count_kmers()
         sample_num -- reading stops once this many sequences were taken
         keep_len   -- length of the prefix of every assembled sequence
                       that is used as the key when merging results

       Returns the result of the final assemble_kmers() call sorted by
       its second field in descending order.  An empty list is returned
       when no combination of k and R produced any candidate.
    """
    fq_seq = []
    fq_obj = get_file_obj(fastq)
    try:
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Close the input whether or not sampling succeeded.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        curated = {}
        # Counting is independent of the ratio, hence done once per k.
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            assembl = assemble_kmers(clean, kmer_len // 2)
            for s, c in assembl:
                # Merge candidates by prefix; keep the best value seen
                # for this k over all ratios.
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        # Values obtained with different k are summed per prefix.
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    if not collection:
        # Nothing was assembled; min() on an empty sequence would
        # raise ValueError, so report "no prediction" explicitly.
        return []

    asmbl_min_len = min(map(len, collection))
    assembl = sorted(assemble_kmers(list(collection.items()),
                                    asmbl_min_len // 2),
                     key=itemgetter(1),
                     reverse=True)
    return assembl