Example #1
import os
from collections import defaultdict

def main(nm='', start='', end=''):
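    """Remaster alignments for experiments `start`..`end` under `nm`.

    Called with no arguments, generates qsub scripts instead (gen_qsubs).
    Assumes module-level `out_place`, `inp_place`, `util`, and the helper
    functions called below.
    """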

    if nm == '' and start == '' and end == '':
        gen_qsubs()
        return

    start, end = int(start), int(end)
    out_dir = out_place + nm + '/'
    util.ensure_dir_exists(out_dir)

    print('Preparing alignment output directories...')
    prepare_align_outdirs(out_dir, start, end)
    print('Done')

    global expected_cutsite
    expected_cutsite = 30

    inp_dir = inp_place + nm + '/'

    timer = util.Timer(total=end - start + 1)
    for iter_exp in range(start, end + 1):  # inclusive of `end`, matching the timer total
        data = defaultdict(list)
        for split in os.listdir(inp_dir):
            if split == 'aligns':
                continue
            inp_fn = inp_dir + '%s/%s.txt' % (split, iter_exp)
            remaster_aligns(inp_fn, data)
        save_alignments(data, out_dir, iter_exp)
        timer.update()

    return
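
# The examples in this file lean on a small `util` module. The following is a
# minimal sketch consistent with how it is called here; an assumption for
# readability, not the original implementation.
import os
import time

def ensure_dir_exists(path):
    # Create `path` (and any missing parents) if it does not already exist
    os.makedirs(path, exist_ok=True)

def line_count(fn):
    # Number of lines in a text file
    with open(fn) as f:
        return sum(1 for _ in f)

def exists_empty_fn(fn):
    # Ensure `fn` exists and is empty (truncates if present)
    d = os.path.dirname(fn)
    if d:
        ensure_dir_exists(d)
    open(fn, 'w').close()

class Timer:
    # Minimal progress timer: call update() once per unit of work
    def __init__(self, total):
        self.total, self.done, self.t0 = total, 0, time.time()

    def update(self):
        self.done += 1
        if self.done % max(1, self.total // 100) == 0:
            print('%d/%d (%.1fs elapsed)' % (
                self.done, self.total, time.time() - self.t0))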
Example #2
import os
import subprocess

def prepare_align_outdirs(out_plc, start, end):
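    """Create one output subdirectory per experiment in [start, end],
    emptying any that already have contents."""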
    util.ensure_dir_exists(out_plc)
    timer = util.Timer(total=end - start + 1)
    for exp in range(start, end + 1):
        out_idx_dir = out_plc + str(exp) + '/'
        util.ensure_dir_exists(out_idx_dir)
        if len(os.listdir(out_idx_dir)) > 0:
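            # Clear stale outputs left over from a previous run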
            subprocess.check_output('rm -rf %s*' % (out_idx_dir), shell=True)
        timer.update()
    return
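
# The shell 'rm -rf' above works but spawns a subshell per directory; an
# equivalent standard-library sketch (an alternative, not the original code):
import shutil

def clear_dir(path):
    # Remove every entry inside `path`, keeping `path` itself
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        if os.path.isdir(full):
            shutil.rmtree(full)
        else:
            os.remove(full)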
Example #3
import os

import pandas as pd

def genotype_data(inp_dir, out_dir, nm, start, end):
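    """Tally genotype categories for experiments start..end and write one CSV.

    Assumes module-level LIBRARY_DF, util, and the per-category get_*
    helpers called below.
    """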
    start, end = int(start), int(end)
    master_df = pd.DataFrame()

    global crispr_cutsite
    # crispr_cutsite is the 0-based cutsite position within the designed
    # sequence (as recorded in reduced.csv); it is set per experiment below.

    timer = util.Timer(total=end - start + 1)
    for iter_exp in range(start, end + 1):  # inclusive of `end`, matching the timer total
        exp = iter_exp

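        # 0-based cutsite position, constant across this library (see the
        # 61-bp designed sequence column below)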
        crispr_cutsite = 34

        exp_dir = '%s%s/' % (inp_dir, iter_exp)
        if not os.path.isdir(exp_dir):
            break  # stop at the first missing experiment, but still write results

        # Noise categories
        master_df = get_homopolymer(master_df, exp, exp_dir)
        master_df = get_hasN(master_df, exp, exp_dir)
        master_df = get_pcr_recombination(master_df, exp, exp_dir)
        master_df = get_poormatches(master_df, exp, exp_dir)
        master_df = get_cutsite_not_sequenced(master_df, exp, exp_dir)
        master_df = get_read_too_short(master_df, exp, exp_dir)

        # Primary categories
        master_df = get_deletions(master_df, exp, exp_dir)
        master_df = get_insertions(master_df, exp, exp_dir)
        master_df = get_combination_indels(master_df, exp, exp_dir)

        # Secondary categories
        master_df = get_forgiven_indels(master_df, exp, exp_dir)
        master_df = get_forgiven_combination_indels(master_df, exp, exp_dir)

        # Other categories
        master_df = get_combination_indels_notcrispr(master_df, exp, exp_dir)
        master_df = get_other(master_df, exp, exp_dir)

        # Wildtypes
        master_df = get_wildtype(master_df, exp, exp_dir)

        timer.update()

    seq_contexts = []
    for s in master_df['_Experiment']:
        crit = (LIBRARY_DF['Name'] == s)
        if 'Designed sequence (61-bp, cutsite at position 34 by 0-index)' in LIBRARY_DF.columns:
            seq = LIBRARY_DF[crit][
                'Designed sequence (61-bp, cutsite at position 34 by 0-index)'].iloc[0]
        elif 'targetseq_61' in LIBRARY_DF.columns:
            seq = LIBRARY_DF[crit]['targetseq_61'].iloc[0]
        else:
            raise ValueError('LIBRARY_DF lacks a designed-sequence column')
        seq_contexts.append(seq)
    master_df['_Sequence Context'] = seq_contexts

    master_df['_Cutsite'] = crispr_cutsite

    master_df.to_csv(out_dir + '%s_genotypes_%s.csv' % (nm, start))
    return
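
# Each get_* call above follows the same contract: read that category's data
# from exp_dir, tag the rows with the experiment, and append them to
# master_df. A hypothetical sketch of that shape (the file layout and column
# names here are assumptions, not the original helpers):
def get_category(master_df, exp, exp_dir, category):
    fn = os.path.join(exp_dir, '%s.csv' % category)  # assumed layout
    if not os.path.isfile(fn):
        return master_df
    df = pd.read_csv(fn)
    df['_Experiment'] = exp
    return pd.concat([master_df, df], ignore_index=True)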
Example #4
import itertools as it
import json
import os
import re

import numpy as np

def demultiplex(split, filename):
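    """Demultiplex one lane (`split`) of paired FASTQ reads into
    per-experiment FASTA files, quality-filtering as we go.

    Assumes module-level inp_dir, out_dir, util, and the exp_design_* /
    exp_test_strs_* tables selected below.
    """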

    if "AH3W5GBGX9" in filename:
        print()
        exp_design = exp_design_2955
        exp_test_strs = exp_test_strs_2955
    else:

        exp_design = exp_design_3447
        exp_test_strs = exp_test_strs_3447

    for name in list(exp_design["Name"]) + ['other']:
        util.ensure_dir_exists(os.path.join(out_dir, '%s' % (filename), name))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R1_%s.fa' % (filename, name, split)))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R2_%s.fa' % (filename, name, split)))

    def lane_key(fn):
        # Split/lane number: the digits immediately before '.fastq'
        return re.search(r"(\d+)\.fastq", fn).group(1)

    for snum, sgroup in it.groupby(sorted(os.listdir(inp_dir), key=lane_key),
                                   key=lane_key):

        if snum != split:
            continue
        files = list(sgroup)
        fns = [sf for sf in files if filename in sf]

        print(("LANE: {0}, FILES: {1}".format(snum, fns)))
        read_files = dict([[int(re.compile("R(\d+)").search(e).group(1)), e]
                           for e in fns])

        inp_fn1 = os.path.join(inp_dir, read_files[1])
        inp_fn2 = os.path.join(inp_dir, read_files[2])

        lc = util.line_count(inp_fn1)
        num_bad_q, num_tot, num_other, num_mapped = 0, 0, 0, 0
        timer = util.Timer(total=lc)
        i = -1

        ##
        # Functions
        ##
        def match(r1, r2, h1, h2):
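            # Try each experiment's test string against the R1 header;
            # fall back to 'other'. (r2 and h2 are accepted but unused.)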
            for k, v in list(exp_test_strs.items()):
                try:
                    idx = h1.index(v)
                    return k, r1
                except ValueError:
                    continue
            return "other", r1

        with open(inp_fn1) as f1:
            with open(inp_fn2) as f2:
                print(inp_fn1)
                print(inp_fn2)
                while True:
                    i += 1
                    if i % 10000 == 0:
                        print("{0} records, ({1:.1f}%) [{2} bad] [{3} other]".format(
                            i // 4, 100 * float(i) / lc, num_bad_q, num_other))

                    try:
                        line1 = next(f1)
                        line2 = next(f2)
                    except StopIteration:
                        break

                    if i % 4 == 0:
                        h1 = line1.strip()
                        h2 = line2.strip()
                    if i % 4 == 1:
                        r1 = line1.strip()
                        r2 = line2.strip()
                    if i % 4 == 3:
                        num_tot += 1
                        qs1 = line1.strip()
                        qs2 = line2.strip()

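                        # Reject the pair if either read's mean Phred+33
                        # quality is below 30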
                        markbad = False
                        for qs in [qs1, qs2]:
                            quals = [ord(s) - 33 for s in qs]
                            if np.mean(quals) < 30:
                                markbad = True

                        if markbad:
                            num_bad_q += 1
                            continue

                        demultiplex_id, trimmed_read = match(r1, r2, h1, h2)
                        if demultiplex_id == 'other':
                            num_other += 1

                        out1_fn = out_dir + '%s/%s/R1_%s.fa' % (
                            filename, demultiplex_id, split)
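                        # Sanity check: each FASTA record written must be
                        # exactly two lines (header + sequence)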
                        if len(('>' + h1[1:] + '\n' + r1 +
                                '\n').splitlines()) > 2:
                            print('>' + h1[1:] + '\n' + r1 + '\n')
                            raise Exception('FASTA record spans more than two lines')
                        with open(out1_fn, 'a') as f:
                            f.write('>' + h1[1:] + '\n' + r1 + '\n')

                        out2_fn = out_dir + '%s/%s/R2_%s.fa' % (
                            filename, demultiplex_id, split)
                        with open(out2_fn, 'a') as f:
                            f.write('>' + h2[1:] + '\n' + r2 + '\n')
                        num_mapped += 1

                    #timer.update()

    print('Rejected %s fraction of reads' % (num_bad_q / num_tot))
    print("<json>" + json.dumps({
        "num_bad_q": num_bad_q,
        "num_tot": num_tot,
        "num_other": num_other,
        "num_mapped": num_mapped,
    }) + "</json>")

    return
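
# For reference, the mean-quality filter used above, factored into a
# standalone helper; the threshold and Phred+33 offset match the code above.
def passes_quality(qual_str, min_mean_q=30, offset=33):
    # Mean Phred score of one quality string
    quals = [ord(c) - offset for c in qual_str]
    return sum(quals) / len(quals) >= min_mean_q

# e.g. passes_quality('IIII') is True, since 'I' encodes Q40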
Example #5
import datetime
import os

import numpy as np

def matchmaker(nm, split):
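    """Quality-filter, LSH-match, and align R2 reads for (nm, split).

    Assumes module-level _config, util, inp_dir, out_root_dir,
    reverse_complement, and the LSH/alignment helpers called below.
    """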
    print(nm, split)
    stdout_fn = os.path.join(_config.LOGS_DIR, 'nh_c_%s_%s.out' % (nm, split))
    util.exists_empty_fn(stdout_fn)
    out_dir = os.path.join(out_root_dir, nm, split)
    util.ensure_dir_exists(out_dir)

    inp_fn = inp_dir + '%s_R2_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    qf = 0
    print(inp_fn)
    tot_reads = util.line_count(inp_fn)
    timer = util.Timer(total=tot_reads)
    with open(inp_fn) as f:
        for i, line in enumerate(f):
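            # FASTQ records are 4 lines: header, sequence, '+', quality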
            if i % 4 == 0:
                pass
            if i % 4 == 1:
                l2 = line.strip()
            if i % 4 == 3:
                # Quality filter
                q2 = line.strip()
                qs = [ord(s) - 33 for s in q2]
                if np.mean(qs) < 28:
                    qf += 1
                    continue

                # Reverse-complement so the read matches the designed-target
                # orientation (earlier revisions also trimmed the read to the
                # 61-bp window here)
                l2 = reverse_complement(l2)

                align_header = '>1'

                # Try to find designed target from LSH
                cand_idxs = find_best_designed_target(l2, lsh_dict)
                if len(cand_idxs) == 0:
                    continue

                # Run alignment
                best_idx, align = alignment(l2, cand_idxs)
                align = align.decode("utf-8")

                # Store alignment into buffer
                store_alignment(alignment_buffer, best_idx, align_header,
                                align)

            if i > 1 and i % max(1, tot_reads // 100) == 1:  # roughly every 1% of reads
                # Flush alignment buffer
                alignment_buffer = flush_alignments(alignment_buffer, out_dir)

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / max(1, tot_reads // 100)))
                    outf.write('Quality filtered fraction: %s\n' % (qf / (i / 4)))

            #timer.update()

    # Final flush
    alignment_buffer = flush_alignments(alignment_buffer, out_dir)

    return
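
# Example #5 assumes a module-level reverse_complement(); a minimal sketch:
_COMPLEMENT = str.maketrans('ACGTNacgtn', 'TGCANtgcan')

def reverse_complement(seq):
    # Complement each base, then reverse the string
    return seq.translate(_COMPLEMENT)[::-1]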