Exemple #1
0
def check_sequence_name(path_R1, path_R2):
    """Check that two FASTQ files carry matching paired-end read titles.

    Records are read in lockstep from both files with ``read_fastq``. Every
    pair of titles must have the same length and differ at exactly one
    position, and at that position both characters must be digits with the
    R2 digit exactly one greater than the R1 digit (e.g. ``1`` vs ``2``).

    Args:
        path_R1: Path of the FASTQ file expected to hold the first reads.
        path_R2: Path of the FASTQ file expected to hold the second reads.

    Returns:
        True if every compared title pair satisfies the rule above (also
        when there are no records to compare), False otherwise.
    """
    with open(path_R1) as path_inf_R1, open(path_R2) as path_inf_R2:
        fastq_gen_R1 = read_fastq(path_inf_R1)
        fastq_gen_R2 = read_fastq(path_inf_R2)
        # zip stops at the shorter input, so surplus records in the longer
        # file are not examined.
        for gen_R1, gen_R2 in zip(fastq_gen_R1, fastq_gen_R2):
            title_R1, title_R2 = gen_R1[0], gen_R2[0]
            if len(title_R1) != len(title_R2):
                return False
            diff_chars = [
                (char_R1, char_R2)
                for char_R1, char_R2 in zip(title_R1, title_R2)
                if char_R1 != char_R2
            ]
            if len(diff_chars) != 1:
                return False
            char_R1, char_R2 = diff_chars[0]
            try:
                step = int(char_R2) - int(char_R1)
            except ValueError:
                # The differing character is not a digit, so the titles do
                # not follow the R1/R2 numbering scheme.
                return False
            if step != 1:
                return False
    return True
Exemple #2
0
def subsample_fastqs(path_fastqs, num_files=10, num_sequences=1000):
    """Yield a size-limited view of each of the first ``num_files`` FASTQs.

    Args:
        path_fastqs: Iterable of FASTQ file paths.
        num_files: Maximum number of files to open.
        num_sequences: Passed to ``limit_fastq`` as ``num_sequences``.

    Yields:
        The result of ``limit_fastq(read_fastq(handle), num_sequences=...)``
        for each file, in input order.

    Note:
        The file handle is closed as soon as iteration resumes after a yield.
        If ``limit_fastq`` returns a lazy iterator (presumably it does --
        confirm), each yielded item must be consumed before requesting the
        next one.
    """
    for i, path_fastq in enumerate(path_fastqs):
        # Check the limit before opening, so no file beyond the limit is
        # opened only to be closed again.
        if i >= num_files:
            break
        with open(path_fastq) as fastq_inf:
            fastq_gen = read_fastq(fastq_inf)
            yield limit_fastq(fastq_gen, num_sequences=num_sequences)
Exemple #3
0
def trimmer_learning(flash_output_filenames):
    """Derive filter and trim quality thresholds from FASTQ quality strings.

    Each quality character is converted to a score as ``ord(char) - 33``
    (presumably the Phred+33 encoding -- confirm against the inputs).

    * ``filter_q`` is the floor of the mean score over all bases.
    * ``trim_q`` is the floor of the mean score over the first and last ten
      bases of every read with at least 20 quality values, minus one, but
      never lower than ``filter_q - 3``.

    Args:
        flash_output_filenames: Iterable of FASTQ file paths to scan.

    Returns:
        Tuple ``(filter_q, trim_q)``.

    Raises:
        ValueError: If the files contain no quality values at all, so no
            mean can be computed.
    """
    filter_q_sum = 0
    trim_q_sum = 0
    totbases = 0
    tottrim = 0
    num = 0
    for fq_path in flash_output_filenames:
        with open(fq_path) as fq_inf:
            for gen in read_fastq(fq_inf):
                num += 1
                qualities = [ord(qual) - 33 for qual in gen[2]]
                totbases += len(qualities)
                filter_q_sum += sum(qualities)
                if len(qualities) >= 20:
                    # Only the ten bases at each end contribute to trim_q.
                    trim_q_sum += sum(qualities[:10]) + sum(qualities[-10:])
                    tottrim += 20
    logging.info('num seqs: %d' % num)
    logging.info('filter_q_sum: %d' % filter_q_sum)
    logging.info('trim_q_sum: %d' % trim_q_sum)
    logging.info('total bases considered: %d (trim: %d)' % (totbases, tottrim))

    if totbases == 0:
        # Without this guard the divisions below fail with an opaque
        # ZeroDivisionError.
        msg = 'No bases found in the given FASTQ files; cannot learn quality thresholds.'
        logging.critical(msg)
        raise ValueError(msg)

    logging.info('filter_q: %d' % (filter_q_sum / totbases))
    filter_q = math.floor(filter_q_sum / totbases)

    if tottrim == 0:
        # No read had at least 20 quality values, so there is nothing to
        # average; use the lower bound that is applied to trim_q anyway.
        logging.info('trim_q: no reads with at least 20 bases, using filter_q - 3')
        return filter_q, filter_q - 3

    logging.info('trim_q: %d' % (trim_q_sum / tottrim))
    trim_q = math.floor(trim_q_sum / tottrim) - 1
    trim_q = max(trim_q, filter_q - 3)

    return filter_q, trim_q
Exemple #4
0
 def test(self):
     """Check that ``read_fastq`` yields one record per four input lines.

     Every file in the ``testfq`` directory whose name ends with ``fastq``
     is read twice: once through ``read_fastq`` to count records and once
     line by line to count raw lines. The totals over all files must
     satisfy ``records == lines / 4``.
     """
     fastq_files = []
     for name in os.listdir('testfq'):
         if name.endswith('fastq'):
             fastq_files.append(os.path.join('testfq', name))

     num_records = 0
     num_lines = 0
     for fastq_file in fastq_files:
         with open(fastq_file) as handle:
             # Unpack each record into three fields, as the reader is
             # expected to yield (title, data, qualities) triples.
             num_records += sum(
                 1 for _title, _data, _qualities in read_fastq(handle))
         with open(fastq_file) as handle:
             num_lines += sum(1 for _line in handle)
     assert num_records == num_lines / 4
Exemple #5
0
def main():
    """Learn SHI7 parameters from a folder of FASTQ files.

    Steps, in order:

    1. Parse the command line, create the output folder, start the log file
       ``shi7_learning.log`` and (re)create ``<output>/temp``.
    2. Write a ``limit_fastq`` subsample of every ``*fastq`` / ``*fq`` input
       file to ``temp/subsampled`` and compute the average read length.
    3. Run the project helpers that detect paired-end layout, adaptors,
       stitching and quality thresholds, collecting the command-line
       fragments they return in ``learning_params`` and human-readable
       label/value pairs in ``learning_pretty``.
    4. If stitching is enabled and the coefficient of variation returned by
       ``flash_check_cv`` is below 0.1, derive amplicon stitch bounds.
    5. Write ``shi7_cmd.sh`` and ``learning_params.txt`` to the output
       folder, print the command, and remove ``temp`` unless ``args.debug``
       is set.

    Raises:
        IOError: If the input folder holds no FASTQ files, or the FASTQ
            files it holds yield no sequences.
    """
    start_time = datetime.now()

    parser = make_arg_parser()
    args = parser.parse_args()

    learning_params = ["shi7.py"]
    learning_pretty = ["SHI7 version", VERSION]

    # Absolute paths are used throughout so every step refers to the same
    # directories regardless of how the arguments were spelled.
    input_dir = os.path.abspath(args.input)
    output_dir = os.path.abspath(args.output)
    temp_dir = os.path.join(output_dir, 'temp')

    # Make output folder
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Put in the logging file
    logging.basicConfig(
        filename=os.path.join(output_dir, 'shi7_learning.log'),
        filemode='w',
        level=logging.DEBUG,
        format='%(asctime)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')

    # Make temp outfolder, discarding leftovers from a previous run
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
        logging.info('Existing temp directory deleted.')
    os.makedirs(temp_dir)

    path_fastqs = [
        os.path.join(input_dir, f) for f in os.listdir(input_dir)
        if f.endswith(('fastq', 'fq'))
    ]

    if not path_fastqs:
        msg = "No FASTQS found in input folder {}".format(input_dir)
        logging.critical(msg)
        raise IOError(msg)

    # Record the input
    results, addon = template_input(input_dir)
    logging.info(results)
    if addon:
        learning_params.extend(addon)

    # Write temp subsampled fastqs
    subsampled_fastq_path = os.path.join(temp_dir, 'subsampled')
    os.makedirs(subsampled_fastq_path)
    totbases = totseqs = 0
    for path_fastq in path_fastqs:
        basename = os.path.basename(path_fastq)
        subsampled_file = os.path.join(subsampled_fastq_path, basename)
        with open(path_fastq) as fastq_inf, \
                open(subsampled_file, 'w') as outf:
            fastq_gen = read_fastq(fastq_inf)
            for header, seq, quality in limit_fastq(fastq_gen):
                outf.write("@{header}\n{seq}\n+\n{quality}\n".format(
                    header=header, seq=seq, quality=quality))
                totbases += len(seq)
                totseqs += 1

    if totseqs == 0:
        # Without this guard the average below fails with an opaque
        # ZeroDivisionError when every FASTQ is empty.
        msg = "No sequences found in the FASTQS in input folder {}".format(
            input_dir)
        logging.critical(msg)
        raise IOError(msg)
    avlen = totbases / totseqs
    path_fastqs = glob(os.path.join(subsampled_fastq_path, "*"))

    # Detect if paired end
    paired_end, pair_obj = detect_paired_end(path_fastqs)
    path_fastqs = pair_obj[0]
    link_outdir = os.path.join(temp_dir, 'link')
    os.makedirs(link_outdir)
    snames = [os.path.basename(n) for n in path_fastqs]
    path_fastqs = link_manicured_names(path_fastqs, snames, link_outdir,
                                       not paired_end, pair_obj[1:])

    results, addon = template_paired_end(paired_end)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    learning_pretty += ["Paired end", paired_end]

    # Detect adapters
    axe_adaptors_path = os.path.join(temp_dir, 'axe_adaptors')
    os.makedirs(axe_adaptors_path)
    threads = int(args.threads)
    best_adap, best_size, fastq_paths = choose_axe_adaptors(
        path_fastqs, paired_end, axe_adaptors_path, threads)
    results, addon = template_choose_axe_adaptors(best_adap, best_size)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    learning_pretty += ["Detected adaptors", best_adap]

    # Detect output folder
    results, addon = template_output(output_dir)
    logging.info(results)
    if addon:
        learning_params.extend(addon)

    # Detect stitching
    stitched_path = os.path.join(temp_dir, 'flash')
    os.makedirs(stitched_path)
    if paired_end:
        stitches, do_outies, fastq_paths = flash_stitchable_and_check_outies(
            fastq_paths, stitched_path, threads)
    else:
        stitches, do_outies = False, False
    results, addon = template_flash(stitches, do_outies)
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    if paired_end:
        learning_pretty += ["Stitching", stitches]
        if stitches:
            learning_pretty += ["Outies allowed", do_outies]

    # Learn the quality thresholds
    filt_q, trim_q = trimmer_learning(fastq_paths)
    results, addon = template_trim(int(filt_q), int(trim_q))
    logging.info(results)
    if addon:
        learning_params.extend(addon)
    learning_pretty += ["Filter quality", filt_q, "Trimming quality", trim_q]

    # Check whether to implement stitching bounds
    if stitches:
        cv, mean = flash_check_cv(stitched_path)
        if cv < 0.1:
            learning_pretty += ["Amplicon mode", True]
            logging.info("CV: %f, Mean: %f, Avlen: %f" % (cv, mean, avlen))
            if avlen > mean:
                avlen = mean
            # cv * mean is logged below as the standard deviation.
            mr = math.ceil(cv * mean)
            logging.info("SD was: %d" % mr)
            minstitch = int(2 * avlen - mean - mr)
            maxstitch = int(2 * avlen - mean + mr)
            if minstitch < 8:
                minstitch = 8
            logging.info("Amplicon mode: stitch range [%d, %d]" %
                         (minstitch, maxstitch))
            results, addon = template_cv(minstitch, maxstitch)
            logging.info(results)
            if addon:
                learning_params.extend(addon)
            learning_pretty += ["Amplicon stitch minimum", minstitch]
            learning_pretty += ["Amplicon stitch maximum", maxstitch]
        else:
            learning_pretty += ["Amplicon mode", False]

    # Write the learned command line and echo it to stdout
    cmd = " ".join(learning_params)
    with open(os.path.join(output_dir, "shi7_cmd.sh"), "w") as cmd_file:
        cmd_file.write(cmd)
    print(cmd)

    # learning_pretty is a flat list of alternating labels and values
    with open(os.path.join(output_dir, "learning_params.txt"),
              "w") as params_file:
        for ix in range(0, len(learning_pretty), 2):
            params_file.write(
                str(learning_pretty[ix]) + "\t" +
                str(learning_pretty[ix + 1]) + "\n")

    if not args.debug:
        shutil.rmtree(temp_dir)
    logging.info('Execution time: %s' % (datetime.now() - start_time))