def split(inp_fn, out_nm, num_splits=30):
    """Print the shell commands that cut ``inp_fn`` into ``num_splits`` chunks.

    The chunk size is the input's line count divided by ``num_splits``,
    rounded up, and then rounded up again to a multiple of 4 so a four-line
    FASTQ record is never cut in half.  Chunk ``k`` targets the file
    ``out_dir + out_nm + '_<k>.fastq'``.

    NOTE: the ``tail | head`` commands are only printed, never executed.

    Args:
        inp_fn: Path of the file to split; its length comes from
            ``util.line_count``.
        out_nm: Base name of the output files (appended to the module-level
            ``out_dir``).
        num_splits: Maximum number of chunks to produce.  Defaults to 30.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError('num_splits must be at least 1, got %s' %
                         (num_splits))

    inp_fn_numlines = util.line_count(inp_fn)

    # Ceiling division in pure integer arithmetic, then round up to the next
    # multiple of 4.
    split_size = -(-inp_fn_numlines // num_splits)
    split_size += -split_size % 4
    print('Using split size %s' % (split_size))

    if split_size == 0:
        # Empty input: there is nothing to split, and range() rejects a
        # step of zero.
        return

    # `tail -n +K` is 1-based.  The stop value is numlines + 1 so that a chunk
    # starting exactly on the last line is not lost.
    chunk_starts = range(1, inp_fn_numlines + 1, split_size)
    for split_num, start in enumerate(chunk_starts):
        out_fn = out_dir + out_nm + '_%s.fastq' % (split_num)
        command = 'tail -n +%s %s | head -n %s > %s' % (start, inp_fn,
                                                        split_size, out_fn)
        print(command)

    return
Example #2
0
def demultiplex(split, filename):
    """Sort the read pairs of one FASTQ split into per-sample FASTA files.

    Picks the R1/R2 files in ``inp_dir`` whose name contains ``filename`` and
    whose split number (the digits directly before ``.fastq``) equals
    ``split``, then reads both files in lockstep.  A read pair is dropped
    when the mean quality (``ord(char) - 33``) of either read is below 30.
    Every remaining pair is assigned to the first entry of the experiment's
    test-string table whose string occurs in the R1 header, or to ``'other'``
    when none does, and is appended as a two-line FASTA record to
    ``<out_dir>/<filename>/<sample>/R1_<split>.fa`` and ``R2_<split>.fa``.

    The counts are printed at the end; the last line printed is a
    ``<json>...</json>`` record with ``num_bad_q``, ``num_tot``,
    ``num_other`` and ``num_mapped``.

    Args:
        split: Split number to process.  Compared as a string, so ``3`` and
            ``'3'`` select the same files.
        filename: Substring identifying the run's files.  Names containing
            ``"AH3W5GBGX9"`` use the ``*_2955`` design tables, all others the
            ``*_3447`` ones.

    Raises:
        FileNotFoundError: If ``inp_dir`` holds no R1 or no R2 file for this
            ``filename``/``split`` combination.
        Exception: If an R1 FASTA record would span more than two lines.
    """
    if "AH3W5GBGX9" in filename:
        print()
        exp_design = exp_design_2955
        exp_test_strs = exp_test_strs_2955
    else:
        exp_design = exp_design_3447
        exp_test_strs = exp_test_strs_3447

    def out_path(read_num, sample):
        # Single source of truth for output paths, so the files emptied
        # up front are exactly the files appended to later.
        return os.path.join(
            out_dir,
            '%s/%s/R%s_%s.fa' % (filename, sample, read_num, split))

    def low_quality(qual_line):
        return np.mean([ord(ch) - 33 for ch in qual_line]) < 30

    for name in list(exp_design["Name"]) + ['other']:
        util.ensure_dir_exists(os.path.join(out_dir, filename, name))
        util.exists_empty_fn(out_path(1, name))
        util.exists_empty_fn(out_path(2, name))

    print(os.path.join(out_dir, name, filename))

    split_re = re.compile(r"(\d+)\.fastq")
    read_re = re.compile(r"R(\d+)")
    wanted_split = str(split)

    # Entries that do not look like "<digits>.fastq" are skipped rather than
    # crashing the whole run.
    fns = []
    for fn in os.listdir(inp_dir):
        found = split_re.search(fn)
        if (found is not None and found.group(1) == wanted_split
                and filename in fn):
            fns.append(fn)

    print(("LANE: {0}, FILES: {1}".format(wanted_split, fns)))

    # NOTE(review): the read number is taken from the FIRST "R<digits>" in
    # the file name -- confirm run identifiers never contain such a pattern.
    read_files = {}
    for fn in fns:
        found = read_re.search(fn)
        if found is not None:
            read_files[int(found.group(1))] = fn

    missing = [n for n in (1, 2) if n not in read_files]
    if missing:
        raise FileNotFoundError(
            'No R%s file for %r, split %s in %s' %
            ('/R'.join(str(n) for n in missing), filename, wanted_split,
             inp_dir))

    inp_fn1 = os.path.join(inp_dir, read_files[1])
    inp_fn2 = os.path.join(inp_dir, read_files[2])

    lc = util.line_count(inp_fn1)
    num_bad_q, num_tot, num_other, num_mapped = 0, 0, 0, 0
    h1 = h2 = r1 = r2 = ''

    with open(inp_fn1) as f1, open(inp_fn2) as f2:
        print(inp_fn1)
        print(inp_fn2)
        # zip() stops at the shorter file, like the original paired next().
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            if i % 10000 == 0:
                pct = 100 * float(i) / lc if lc else 0.0
                print(("{0} records, ({1}%) [{2} bad] [{3} other]".format(
                    i // 4, pct, num_bad_q, num_other)))

            phase = i % 4
            if phase == 0:
                h1 = line1.strip()
                h2 = line2.strip()
            elif phase == 1:
                r1 = line1.strip()
                r2 = line2.strip()
            elif phase == 3:
                # The quality line closes the record: filter, classify, write.
                num_tot += 1
                if low_quality(line1.strip()) or low_quality(line2.strip()):
                    num_bad_q += 1
                    continue

                demultiplex_id = next(
                    (k for k, v in exp_test_strs.items() if v in h1),
                    'other')
                if demultiplex_id == 'other':
                    num_other += 1

                # FASTA header: swap the FASTQ header's first character
                # for '>'.
                rec1 = '>' + h1[1:] + '\n' + r1 + '\n'
                if len(rec1.splitlines()) > 2:
                    raise Exception(
                        'R1 record spans more than two lines: %r' % (rec1))
                with open(out_path(1, demultiplex_id), 'a') as f:
                    f.write(rec1)

                with open(out_path(2, demultiplex_id), 'a') as f:
                    f.write('>' + h2[1:] + '\n' + r2 + '\n')
                num_mapped += 1

    # Guard the ratio: an empty split has no reads to reject.
    rejected = num_bad_q / num_tot if num_tot else 0.0
    print(('Rejected %s fraction of reads' % (rejected)))
    print("<json>" + json.dumps({
        "num_bad_q": num_bad_q,
        "num_tot": num_tot,
        "num_other": num_other,
        "num_mapped": num_mapped,
    }) + "</json>")

    return
Example #3
0
def matchmaker(nm, split):
    """Align the R2 reads of one split against the designed targets.

    Reads ``inp_dir + '<nm>_R2_<split>.fastq'`` record by record.  Reads whose
    mean quality (``ord(char) - 33``) is below 28 are counted and skipped.
    Each remaining read is reverse-complemented, candidate targets are looked
    up with ``find_best_designed_target``, the read is aligned with
    ``alignment`` and the result is stored in an alignment buffer.  The buffer
    is flushed to ``<out_root_dir>/<nm>/<split>`` roughly every 1% of the
    input lines and once more at the end; each periodic flush also appends a
    progress entry to ``<LOGS_DIR>/nh_c_<nm>_<split>.out``.

    Args:
        nm: Sample name; part of the input file name and of the output path.
        split: Split identifier; part of the input file name and of the
            output path (passed to ``os.path.join``, so it must be a string).
    """
    print(nm, split)
    stdout_fn = os.path.join(_config.LOGS_DIR, 'nh_c_%s_%s.out' % (nm, split))
    util.exists_empty_fn(stdout_fn)
    out_dir = os.path.join(out_root_dir, nm, split)
    util.ensure_dir_exists(out_dir)

    inp_fn = inp_dir + '%s_R2_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    qf = 0
    print(inp_fn)
    # Despite the name, this is the number of LINES in the file.
    tot_reads = util.line_count(inp_fn)
    # Lines per 1% of the input.  Clamped to 1 so inputs shorter than 100
    # lines do not cause a modulo-by-zero below.
    flush_every = max(1, tot_reads // 100)

    with open(inp_fn) as f:
        for i, line in enumerate(f):
            phase = i % 4
            if phase == 1:
                l2 = line.strip()
            elif phase == 3:
                # Quality filter
                q2 = line.strip()
                qs = [ord(s) - 33 for s in q2]
                if np.mean(qs) < 28:
                    qf += 1
                    continue

                l2 = reverse_complement(l2)

                align_header = '>1'

                # Try to find designed target from LSH
                cand_idxs = find_best_designed_target(l2, lsh_dict)
                if len(cand_idxs) == 0:
                    continue

                # Run alignment
                best_idx, align = alignment(l2, cand_idxs)
                align = align.decode("utf-8")

                # Store alignment into buffer
                store_alignment(alignment_buffer, best_idx, align_header,
                                align)

            if i % flush_every == 1 and i > 1:
                # Flush alignment buffer
                alignment_buffer = flush_alignments(alignment_buffer, out_dir)

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / flush_every))
                    outf.write('Quality filtered pct: %s\n' % (qf / (i / 4)))

    # Final flush
    alignment_buffer = flush_alignments(alignment_buffer, out_dir)

    return