Ejemplo n.º 1
0
def qual_trim(args):
    """Quality-trim the tail of every FASTQ record and print the survivors.

    Attributes read from ``args``: ``solexa``, ``illumina5``, ``b`` (quality
    score base), ``l`` (minimum read length), ``p`` (error probability
    cutoff), ``q`` (quality score cutoff) and ``FASTQ`` (input handed to
    ``get_file_obj``).  When ``args.solexa`` is truthy, ``args.b`` is
    overwritten with 64.

    For each record the quality string is walked from its last character
    towards the first while accumulating ``cutoff - score``.  The walk stops
    as soon as the running sum drops below zero, and the record is cut at
    the position where the sum was largest.  Records shorter than ``args.l``
    (before or after trimming), or consisting only of N/n bases after
    trimming, are not printed.

    Raises:
        Exception: if the base, minimum length, probability cutoff or
            quality cutoff fails validation.
    """
    # Choose the per-character score converter; the helpers are defined
    # elsewhere in the project.
    if args.solexa:
        args.b = 64
        decode = solexa_to_phred
    elif args.illumina5:
        decode = illumina_64B
    else:
        decode = illumina_33

    # Validate the options in a fixed order so the first problem is reported.
    if args.b not in (33, 64):
        raise Exception("wrong quality score base")
    if args.l < 1:
        raise Exception("specify longer read length")
    if args.p < 0 or args.p > 1:
        raise Exception("bad error probability cutoff")
    if not args.solexa and args.q < 0:
        raise Exception("bad quality score cutoff")

    # An explicit (truthy) quality cutoff wins; otherwise it is derived
    # from the error probability.
    cutoff = args.q if args.q else calc_qual_score(args.p, args.solexa)

    offset = args.b
    min_len = args.l
    n_pattern = re.compile('N', re.IGNORECASE)

    for record in fastq_record(get_file_obj(args.FASTQ)):
        fields = record.rstrip().split("\n")
        quals = fields[3]
        trim_at = len(quals)
        if min_len > trim_at:
            continue

        running = 0
        best = 0
        # Scan from the last quality character down to the first.
        for pos in range(trim_at - 1, -1, -1):
            running += cutoff - decode(ord(quals[pos]) - offset)
            if running < 0:
                break
            if running > best:
                best = running
                trim_at = pos

        fields[1] = fields[1][:trim_at]
        fields[3] = quals[:trim_at]

        seq = fields[1]
        n_count = len(n_pattern.findall(seq))
        if n_count < len(seq) and len(seq) >= min_len:
            print("\n".join(fields))
Ejemplo n.º 2
0
def fastq_input_prep(fastq, ratio, temp_dir):
    """Subsample a FASTQ file into the temporary directory.

    Every ``int(1 / ratio)``-th record yielded by ``fastq_record`` is
    written to ``<temp_dir>/input.fq``, starting with the first record.

    Args:
        fastq: Input handed to ``get_file_obj``.
        ratio: Sampling ratio.  Values above 1 keep every record.
        temp_dir: Directory in which ``input.fq`` is created.

    Returns:
        A tuple ``(fq_out, read_count, sd)``: the path of the written file,
        the number of records written (as a float), and the population
        standard deviation of their sequence lengths.  ``sd`` is ``0.0``
        when no record was written.
    """
    # int() truncates, so a ratio above 1 would give a step of 0 and make
    # the modulo below raise ZeroDivisionError; fall back to a step of 1.
    num = int(1 / ratio) or 1
    read_count = 0.0  # stays a float: callers receive it unchanged
    stats = {}  # sequence length -> number of sampled records
    fq_out = "{}/input.fq".format(temp_dir)
    fq_obj = get_file_obj(fastq)
    try:
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num == 0:
                    fout.write(rec)
                    read_count += 1
                    # The sequence is the second line of the record.
                    L = len(rec.split("\n")[1])
                    stats[L] = stats.get(L, 0) + 1
    finally:
        # Release the input handle even if reading or writing failed.
        fq_obj.close()

    if not read_count:
        # Empty input: there is no length distribution to summarise.
        return fq_out, read_count, 0.0

    mean = sum(L * c for L, c in stats.items()) / read_count
    sum_square = sum((L - mean) ** 2 * c for L, c in stats.items())
    sd = (sum_square / read_count) ** 0.5
    return fq_out, read_count, sd
Ejemplo n.º 3
0
def fastq_input_prep(fastq, ratio, temp_dir):
    """Subsample a FASTQ file into the temporary directory.

    Every ``int(1 / ratio)``-th record yielded by ``fastq_record`` is
    written to ``<temp_dir>/input.fq``, starting with the first record.

    Args:
        fastq: Input handed to ``get_file_obj``.
        ratio: Sampling ratio.  Values above 1 keep every record.
        temp_dir: Directory in which ``input.fq`` is created.

    Returns:
        A tuple ``(fq_out, read_count, sd)``: the path of the written file,
        the number of records written (as a float), and the population
        standard deviation of their sequence lengths.  ``sd`` is ``0.0``
        when no record was written.
    """
    # int() truncates, so a ratio above 1 would give a step of 0 and make
    # the modulo below raise ZeroDivisionError; fall back to a step of 1.
    num = int(1 / ratio) or 1
    read_count = 0.0  # stays a float: callers receive it unchanged
    stats = {}  # sequence length -> number of sampled records
    fq_out = "{}/input.fq".format(temp_dir)
    fq_obj = get_file_obj(fastq)
    try:
        with open(fq_out, "w") as fout:
            for i, rec in enumerate(fastq_record(fq_obj)):
                if i % num == 0:
                    fout.write(rec)
                    read_count += 1
                    # The sequence is the second line of the record.
                    L = len(rec.split("\n")[1])
                    stats[L] = stats.get(L, 0) + 1
    finally:
        # Release the input handle even if reading or writing failed.
        fq_obj.close()

    if not read_count:
        # Empty input: there is no length distribution to summarise.
        return fq_out, read_count, 0.0

    mean = sum(L * c for L, c in stats.items()) / read_count
    sum_square = sum((L - mean) ** 2 * c for L, c in stats.items())
    sd = (sum_square / read_count) ** 0.5
    return fq_out, read_count, sd