Beispiel #1
0
def iterative_adapter_prediction(fastq, ratios, kmer_lens,
                                 sample_num, keep_len=12):
    """Return a list of predicted adapters.

    Iteratively predict 3' adapter sequence with different
    combinations of k and R.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        ratios: Filtering ratios (R) passed to ``filter_kmers``. Iterated
            once per k-mer length, so it must be re-iterable.
        kmer_lens: K-mer lengths (k) passed to ``count_kmers``.
        sample_num: Number of leading reads kept for the prediction.
        keep_len: Assembled sequences are cut to this many leading
            characters before their values are merged.

    Returns:
        The ``(sequence, value)`` pairs from the final ``assemble_kmers``
        pass, sorted by value in descending order. An empty list is
        returned when no candidate sequence was assembled at all.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = []
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Release the input handle even if reading the reads fails.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        # Best value seen per truncated sequence for this k, over all ratios.
        curated = {}
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        # Accumulate the per-k maxima across the different k-mer lengths.
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    if not collection:
        # Nothing was assembled; min() below would raise on an empty input.
        return []

    asmbl_min_len = min(map(len, collection))
    return sorted(
        assemble_kmers(list(collection.items()), asmbl_min_len // 2),
        key=itemgetter(1), reverse=True)
Beispiel #2
0
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

    Predict 3' adapter sequence with a combination of k and R.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        ratio: Filtering ratio (R) passed to ``filter_kmers``.
        kmer_len: K-mer length (k) passed to ``count_kmers``; half of it
            (integer division) is passed to ``assemble_kmers``.
        sample_num: Passed through to ``count_kmers``.

    Returns:
        The items produced by ``assemble_kmers``, sorted by their second
        element in descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        # The whole pipeline stays inside the try block: the sequence
        # iterator reads from fq_obj, so the handle must remain open until
        # the result has been fully materialised by sorted().
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        return sorted(assemble_kmers(clean, kmer_len//2),
                      key=itemgetter(1), reverse=True)
    finally:
        # Close the input even when one of the steps above raises.
        fq_obj.close()
Beispiel #3
0
def adapter_prediction(fastq, ratio, kmer_len, sample_num):
    """Return a list of predicted adapters.

    Predict 3' adapter sequence with a combination of k and R.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        ratio: Filtering ratio (R) passed to ``filter_kmers``.
        kmer_len: K-mer length (k) passed to ``count_kmers``; half of it
            (integer division) is passed to ``assemble_kmers``.
        sample_num: Passed through to ``count_kmers``.

    Returns:
        The items produced by ``assemble_kmers``, sorted by their second
        element in descending order.
    """
    fq_obj = get_file_obj(fastq)
    try:
        # Keep the handle open for the full pipeline; the sequence iterator
        # pulls its data from fq_obj while the k-mers are being counted.
        fq_seq = fastq_sequence(fq_obj)
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        clean = filter_kmers(freq, kmer_len, ratio)
        assembl = sorted(assemble_kmers(clean, kmer_len // 2),
                         key=itemgetter(1),
                         reverse=True)
    finally:
        # Guarantee the input is closed on both success and failure.
        fq_obj.close()
    return assembl
Beispiel #4
0
def to_fasta(args):
    """Clip adapters from FASTQ reads and print the collapsed inserts as FASTA.

    Each distinct insert is printed once, sorted by sequence, with a header
    of the form ``>SEQUENCE_COUNT``.

    Args:
        args: Parsed command-line options. The attributes read here are:
            ``m`` / ``x`` (minimum / maximum insert length to keep),
            ``f`` / ``b`` (5' / 3' adapter sequences, at least one needed),
            ``seed_5p`` / ``seed_3p`` (seed lengths given to ``make_regex``),
            ``trim_5p`` / ``trim_3p`` (extra bases removed from each end of
            the insert), ``B`` (require both adapters when both are given),
            ``a`` (keep reads regardless of adapter match status),
            ``s`` (forwarded to ``make_regex`` and ``match_adapters``) and
            ``FASTQ`` (input handed to ``get_file_obj``).

    Raises:
        Exception: If any option fails the validation below.
    """
    if args.m <= 0:
        raise Exception("bad value: -m")
    if args.x <= 0:
        raise Exception("bad value: -x")
    if args.m == args.x:
        raise Exception("bad read length cutoff range")
    if not args.f and not args.b:
        raise Exception("input adapter sequence")
    if args.f == args.b:
        raise Exception("5' and 3' adapters are same sequences")
    if args.seed_5p <= 0:
        raise Exception("bad value: --seed-5p")
    if args.seed_3p <= 0:
        raise Exception("bad value: --seed-3p")
    if args.trim_3p < 0:
        raise Exception("input positive value for 3'trimming")
    if args.trim_5p < 0:
        raise Exception("input positive value for 5'trimming")

    f_seq, f_len = args.f, args.seed_5p
    b_seq, b_len = args.b, args.seed_3p

    # Pick the acceptance rule for a read from the adapters that were given.
    # x / y are the second values returned by match_adapters for the 5' / 3'
    # adapter; '*' is presumably the "no match" marker - confirm against
    # match_adapters. At least one adapter is set (validated above).
    if f_seq and b_seq:
        if args.B:
            def req(x, y):
                return x != '*' and y != '*' or args.a
        else:
            def req(x, y):
                return x != '*' or y != '*' or args.a
    elif b_seq:
        def req(x, y):
            return y != '*' or args.a
    else:
        def req(x, y):
            return x != '*' or args.a

    f_pp, f_mp = make_regex(f_seq, f_len, False, args.s)
    b_pp, b_mp = make_regex(b_seq, b_len, True,  args.s)

    fas = {}
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for seq in fastq_sequence(fq_obj):
            seq_len = len(seq)
            f_i, f_mm = match_adapters(seq, f_pp, f_mp, args.s)
            b_i, b_mm = match_adapters(seq, b_pp, b_mp, args.s,
                                       b_len, b_len+1, seq_len)
            # Clamp the slice end at 0: a negative end would be read by
            # Python as an offset from the end of the read and keep bases
            # that should have been trimmed away.
            # NOTE(review): assumes b_i is a non-negative index - confirm.
            end = max(b_i - args.trim_3p, 0)
            ins = seq[f_i+args.trim_5p : end]
            ins_len = len(ins)
            if req(f_mm, b_mm) and (args.m <= ins_len and ins_len <= args.x):
                fas[ins] = fas.get(ins, 0) + 1
    finally:
        # The original never closed the input handle.
        fq_obj.close()

    for seq, cnt in sorted(fas.items(), key=itemgetter(0)):
        print(">{0}_{1}\n{0}".format(seq, cnt))
Beispiel #5
0
def qual_trim(args):
    """Trim low-quality 3' ends of FASTQ reads and print the surviving reads.

    For every record the quality string is scanned from the last base
    towards the first while summing ``cutoff - quality``. The scan stops
    once the running sum becomes negative, and the read is cut at the
    position where the sum was largest. Records are printed when the
    trimmed read is at least ``args.l`` long and does not consist solely
    of ``N``/``n`` characters.

    Args:
        args: Parsed command-line options. The attributes read here are:
            ``solexa`` / ``illumina5`` (select the score conversion
            function; ``solexa`` also forces ``args.b`` to 64),
            ``b`` (quality score base, 33 or 64), ``l`` (minimum read
            length), ``p`` (error probability cutoff, 0..1), ``q`` (quality
            score cutoff; when falsy the cutoff is derived from ``p`` via
            ``calc_qual_score``) and ``FASTQ`` (input handed to
            ``get_file_obj``).

    Raises:
        Exception: If any option fails the validation below.
    """
    if args.solexa:
        args.b = 64
        func = solexa_to_phred
    elif args.illumina5:
        func = illumina_64B
    else:
        func = illumina_33

    if args.b not in (33, 64):
        raise Exception("wrong quality score base")
    if args.l < 1:
        raise Exception("specify longer read length")
    if args.p < 0 or args.p > 1:
        raise Exception("bad error probability cutoff")
    if not args.solexa and args.q < 0:
        raise Exception("bad quality score cutoff")

    if args.q:
        cutoff = args.q
    else:
        cutoff = calc_qual_score(args.p, args.solexa)

    base = args.b
    minlen = args.l
    ns = re.compile('N', re.IGNORECASE)
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for read in fastq_record(fq_obj):
            # Indices 1 and 3 are used as the sequence and quality lines;
            # assumes four-line FASTQ records - confirm with fastq_record.
            read = read.rstrip().split("\n")
            qual = read[3]
            s, max_s = 0, 0
            max_i = len(qual)
            if minlen > max_i:
                # Already too short before any trimming.
                continue
            # range() is evaluated once, so reassigning max_i inside the
            # loop does not affect which positions are visited.
            for i in reversed(range(max_i)):
                q = func(ord(qual[i]) - base)
                s += cutoff - q
                if s < 0:
                    break
                if s > max_s:
                    max_s, max_i = s, i
            read[1] = read[1][:max_i]
            read[3] = read[3][:max_i]
            n_num = len(ns.findall(read[1]))
            if n_num < len(read[1]) and len(read[1]) >= minlen:
                print("\n".join(read))
    finally:
        # The original never closed the input handle.
        fq_obj.close()
Beispiel #6
0
def guess_qual_offset(args):
    """Guess the quality encoding of a FASTQ file from its quality characters.

    The distinct quality characters of the first 50000 quality strings are
    collected, and the second-smallest and second-largest code points are
    compared with the range of each known platform, in table order.

    Args:
        args: Parsed command-line options; only ``args.FASTQ`` is read and
            handed to ``get_file_obj``.

    Returns:
        A string of the form ``"<platform>:base=<offset>"`` for the first
        platform whose range matches.

    Raises:
        Exception: If fewer than two distinct quality characters were seen
            or no platform range matches.
    """
    # (name, lowest code point, highest code point, offset)
    # NOTE(review): the 'Solexa' row can never be selected, because every
    # range it accepts is accepted earlier by the 'Illumina-1.3+' row.
    platform = [('Sanger/Illumina-1.8+', 33, 76, 33),
                ('Illumina-1.5+', 67, 104, 64), ('Illumina-1.3+', 54, 104, 64),
                ('Solexa', 59, 104, 64)]
    sample_num = 50000
    q_chars = set()
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for i, quality in enumerate(fastq_quality(fq_obj)):
            if i == sample_num:
                break
            # In-place update avoids building a new set for every read.
            q_chars.update(quality)
    finally:
        # The original never closed the input handle.
        fq_obj.close()

    q_int = sorted(map(ord, q_chars))
    if len(q_int) <= 1:
        raise Exception("unknown quality encoding")
    for pl in platform:
        # The single lowest and highest characters are skipped on purpose
        # by indexing [1] and [-2].
        if pl[1] <= q_int[1] and q_int[-2] <= pl[2]:
            return "{}:base={}".format(pl[0], pl[3])

    raise Exception("unknown quality encoding")
def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return
       the number of the reads.

    Identical sequences are collapsed into one FASTA entry whose header is
    ``>SEQUENCE_COUNT``.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        fasta: Path of the FASTA file to write.
        aseed: Adapter seed. When it is a prefix of the literal
            ``"RAW_INPUT"`` the reads are used as they are; otherwise they
            are produced by ``clip_adapter``.
        tm5, tm3, min_len, max_len: Forwarded unchanged to ``clip_adapter``.

    Returns:
        The total number of reads written, i.e. the sum of the counts of
        all collapsed sequences.
    """
    fas = {}
    fq_obj = get_file_obj(fastq)
    try:
        if "RAW_INPUT".startswith(aseed):
            iterator = fastq_sequence(fq_obj)
        else:
            iterator = clip_adapter(fq_obj, aseed, tm5, tm3, min_len, max_len)
        for seq in iterator:
            fas[seq] = fas.get(seq, 0) + 1
    finally:
        # Close the input even if reading or clipping raises.
        fq_obj.close()

    clean_read_count = 0
    # The context manager closes the output even if a write fails.
    with open(fasta, "w") as fa_obj:
        for seq, cnt in fas.items():
            clean_read_count += cnt
            fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    return clean_read_count
Beispiel #8
0
def guess_qual_offset(args):
    """Guess the quality encoding of a FASTQ file from its quality characters.

    The distinct quality characters of the first 50000 quality strings are
    collected, and the second-smallest and second-largest code points are
    compared with the range of each known platform, in table order.

    Args:
        args: Parsed command-line options; only ``args.FASTQ`` is read and
            handed to ``get_file_obj``.

    Returns:
        A string of the form ``"<platform>:base=<offset>"`` for the first
        platform whose range matches.

    Raises:
        Exception: If fewer than two distinct quality characters were seen
            or no platform range matches.
    """
    # (name, lowest code point, highest code point, offset)
    # NOTE(review): the 'Solexa' row can never be selected, because every
    # range it accepts is accepted earlier by the 'Illumina-1.3+' row.
    platform = [ ('Sanger/Illumina-1.8+', 33,  76, 33),
                 ('Illumina-1.5+', 67, 104, 64),
                 ('Illumina-1.3+', 54, 104, 64),
                 ('Solexa', 59, 104, 64) ]
    sample_num = 50000
    q_chars = set()
    fq_obj = get_file_obj(args.FASTQ)
    try:
        for i, quality in enumerate(fastq_quality(fq_obj)):
            if i == sample_num:
                break
            # In-place update avoids building a new set for every read.
            q_chars.update(quality)
    finally:
        # The original never closed the input handle.
        fq_obj.close()

    q_int = sorted(map(ord, q_chars))
    if len(q_int) <= 1:
        raise Exception("unknown quality encoding")
    for pl in platform:
        # The single lowest and highest characters are skipped on purpose
        # by indexing [1] and [-2].
        if pl[1] <= q_int[1] and q_int[-2] <= pl[2]:
            return "{}:base={}".format(pl[0], pl[3])

    raise Exception("unknown quality encoding")
Beispiel #9
0
def to_fasta(fastq, fasta, aseed, tm5, tm3, min_len, max_len):
    """Write FASTA containing clean reads, and return
       the number of the reads.

    Identical sequences are collapsed into one FASTA entry whose header is
    ``>SEQUENCE_COUNT``.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        fasta: Path of the FASTA file to write.
        aseed: Adapter seed. When it is a prefix of the literal
            ``"RAW_INPUT"`` the reads are used as they are; otherwise they
            are produced by ``clip_adapter``.
        tm5, tm3, min_len, max_len: Forwarded unchanged to ``clip_adapter``.

    Returns:
        The total number of reads written, i.e. the sum of the counts of
        all collapsed sequences.
    """
    fas = {}
    fq_obj = get_file_obj(fastq)
    try:
        if "RAW_INPUT".startswith(aseed):
            iterator = fastq_sequence(fq_obj)
        else:
            iterator = clip_adapter(fq_obj, aseed, tm5, tm3, min_len, max_len)
        for seq in iterator:
            fas[seq] = fas.get(seq, 0) + 1
    finally:
        # Close the input even if reading or clipping raises.
        fq_obj.close()

    clean_read_count = 0
    # The context manager closes the output even if a write fails.
    with open(fasta, "w") as fa_obj:
        for seq, cnt in fas.items():
            clean_read_count += cnt
            fa_obj.write(">{0}_{1}\n{0}\n".format(seq, cnt))
    return clean_read_count
Beispiel #10
0
def fastq_input_prep(fastq, ratio, temp_dir):
    """Write a (subsampled) FASTQ into the temporary directory.

    Every ``int(1 / ratio)``-th record, starting with the first, is copied
    to ``<temp_dir>/input.fq``.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        ratio: Subsampling ratio; must be non-zero. A ratio whose inverse
            truncates to 0 (magnitude above 1) keeps every record.
        temp_dir: Directory in which ``input.fq`` is created.

    Returns:
        A tuple ``(fq_out, read_count, sd)``: the path of the written
        FASTQ, the number of records written (as a float), and the
        population standard deviation of the written read lengths.
        ``sd`` is 0.0 when no record was written.
    """
    # int(1 / ratio) is 0 for |ratio| > 1, which would make the modulo
    # below raise ZeroDivisionError; fall back to keeping every record.
    num = int(1 / ratio) or 1
    read_count = 0.0
    stats = {}  # read length -> number of written reads with that length
    fq_out = "{}/input.fq".format(temp_dir)
    fq_obj = get_file_obj(fastq)
    try:
        # The context manager closes the output even if a write fails.
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num == 0:
                    fout.write(rec)
                    read_count += 1
                    # The second line of a record is taken as the sequence.
                    L = len(rec.split("\n")[1])
                    stats[L] = stats.get(L, 0) + 1
    finally:
        # Close the input even if reading or writing raises.
        fq_obj.close()

    if not read_count:
        # Empty input: avoid dividing by zero in the statistics below.
        return fq_out, read_count, 0.0

    mean = sum([L * c for L, c in stats.items()]) / read_count
    sum_square = sum([(L - mean)**2 * c for L, c in stats.items()])
    sd = (sum_square / read_count)**0.5
    return fq_out, read_count, sd
Beispiel #11
0
def iterative_adapter_prediction(fastq,
                                 ratios,
                                 kmer_lens,
                                 sample_num,
                                 keep_len=12):
    """Return a list of predicted adapters.

    Iteratively predict 3' adapter sequence with different
    combinations of k and R.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        ratios: Filtering ratios (R) passed to ``filter_kmers``. Iterated
            once per k-mer length, so it must be re-iterable.
        kmer_lens: K-mer lengths (k) passed to ``count_kmers``.
        sample_num: Number of leading reads kept for the prediction.
        keep_len: Assembled sequences are cut to this many leading
            characters before their values are merged.

    Returns:
        The ``(sequence, value)`` pairs from the final ``assemble_kmers``
        pass, sorted by value in descending order. An empty list is
        returned when no candidate sequence was assembled at all.
    """
    fq_obj = get_file_obj(fastq)
    try:
        fq_seq = []
        for i, s in enumerate(fastq_sequence(fq_obj)):
            if i == sample_num:
                break
            fq_seq.append(s)
    finally:
        # Release the input handle even if reading the reads fails.
        fq_obj.close()

    collection = {}
    for kmer_len in kmer_lens:
        # Best value seen per truncated sequence for this k, over all ratios.
        curated = {}
        freq = count_kmers(fq_seq, kmer_len, sample_num)
        for ratio in ratios:
            clean = filter_kmers(freq, kmer_len, ratio)
            for s, c in assemble_kmers(clean, kmer_len // 2):
                key = s[:keep_len]
                curated[key] = max(curated.get(key, 0), c)
        # Accumulate the per-k maxima across the different k-mer lengths.
        for s, c in curated.items():
            collection[s] = round(collection.get(s, 0) + c, 4)

    if not collection:
        # Nothing was assembled; min() below would raise on an empty input.
        return []

    asmbl_min_len = min(map(len, collection))
    assembl = sorted(assemble_kmers(list(collection.items()),
                                    asmbl_min_len // 2),
                     key=itemgetter(1),
                     reverse=True)
    return assembl
Beispiel #12
0
def fastq_input_prep(fastq, ratio, temp_dir):
    """Write a (subsampled) FASTQ into the temporary directory.

    Every ``int(1/ratio)``-th record, starting with the first, is copied
    to ``<temp_dir>/input.fq``.

    Args:
        fastq: Input handed to ``get_file_obj`` to open the reads.
        ratio: Subsampling ratio; must be non-zero. A ratio whose inverse
            truncates to 0 (magnitude above 1) keeps every record.
        temp_dir: Directory in which ``input.fq`` is created.

    Returns:
        A tuple ``(fq_out, read_count, sd)``: the path of the written
        FASTQ, the number of records written (as a float), and the
        population standard deviation of the written read lengths.
        ``sd`` is 0.0 when no record was written.
    """
    # int(1/ratio) is 0 for |ratio| > 1, which would make the modulo
    # below raise ZeroDivisionError; fall back to keeping every record.
    num = int(1/ratio) or 1
    read_count = 0.0
    stats = {}  # read length -> number of written reads with that length
    fq_out = "{}/input.fq".format(temp_dir)
    fq_obj = get_file_obj(fastq)
    try:
        # The context manager closes the output even if a write fails.
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num == 0:
                    fout.write(rec)
                    read_count += 1
                    # The second line of a record is taken as the sequence.
                    L = len(rec.split("\n")[1])
                    stats[L] = stats.get(L, 0) + 1
    finally:
        # Close the input even if reading or writing raises.
        fq_obj.close()

    if not read_count:
        # Empty input: avoid dividing by zero in the statistics below.
        return fq_out, read_count, 0.0

    mean = sum([L*c for L, c in stats.items()]) / read_count
    sum_square = sum([(L-mean)**2 * c for L, c in stats.items()])
    sd = (sum_square / read_count)**0.5
    return fq_out, read_count, sd