Beispiel #1
0
def remaster_aligns(inp_fn, data):
  """Re-anchor alignments on the expected cutsite and bin them by category.

  inp_fn is read as 4-line records: header, read, genome (reference), and a
  fourth line that is ignored.  The last '_'-separated field of the header is
  the 1-based left-most reference position of the alignment (SAM convention).

  A record is skipped when the expected cutsite falls outside the ungapped
  reference, or when fewer than 6 ungapped reference bases remain from the
  detected cutsite onward.

  Reads the module globals master_expected_cutsite and reverse_flag and
  writes the module global expected_cutsite.

  Args:
    inp_fn: path of the alignment file.
    data: mapping of category name -> list; every kept record extends
      data[category] with [header, read, genome, ''].
  """
  global expected_cutsite   # 1-based
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      slot = i % 4
      if slot == 0:
        header = line.strip()
        # Left-most reference position of this alignment (1-based).
        read_start = int(header.split('_')[-1])
        continue
      if slot == 1:
        read = line.strip()
        continue
      if slot != 2:
        continue

      genome = line.strip().upper()

      # Count along the reference from read_start to the master cutsite.
      expected_cutsite = master_expected_cutsite - read_start
      if not 0 < expected_cutsite < len(genome.replace('-', '')):
        continue

      # Contexts that were reversed in the library are flipped back here,
      # and the expected cutsite is mirrored to match.
      if reverse_flag:
        read = compbio.reverse_complement(read)
        genome = compbio.reverse_complement(genome)
        expected_cutsite = len(genome.replace('-', '')) - expected_cutsite

      # The GG of the PAM must sit 4-5 ungapped bases past the cutsite.
      cutsite = get_cutsite_idx(read, genome)
      gg = genome[cutsite:].replace('-', '')[4:6]
      if len(gg) != 2:
        continue
      assert gg == 'GG', 'No GG!'

      # Cutsite must be placed identically for normal and reversed records.
      assert genome[cutsite - 1] != '-', 'Inconsistent cutsite!'

      # Main -- find indel category, assuming end gaps are meaningless.
      category = categorize_alignment(read, genome)
      if category in ['del_notatcut', 'del_notcrispr']:
        read, genome, category = shift_single_deletion(read, genome, category)
      if category in ['ins_notatcut', 'ins_notcrispr']:
        read, genome, category = shift_single_insertion(read, genome, category)

      header += '_%s' % (cutsite)
      data[category] += [header, read, genome, '']
  return
Beispiel #2
0
def nts_to_aas(seq_30nt, aa_frame, path_pos_wrt_grna,
               aa_strand_relative_to_seq):
    """Translate the in-frame codons of a nucleotide window to amino acids.

    Args:
        seq_30nt: nucleotide sequence to translate.
        aa_frame: 'intronic', or the reading frame (1, 2 or 3, anything
            accepted by int()) of the position of interest.
        path_pos_wrt_grna: position of interest relative to the gRNA; it is
            offset by 9 to index into seq_30nt.
        aa_strand_relative_to_seq: '-' to translate the reverse complement.

    Returns:
        '' for intronic positions, otherwise the concatenated values of
        triplet_to_aa for every complete in-frame triplet.
    """
    if aa_frame == 'intronic':
        return ''

    path_idx = path_pos_wrt_grna + 9
    if aa_strand_relative_to_seq == '-':
        seq_30nt = compbio.reverse_complement(seq_30nt)
        path_idx = len(seq_30nt) - path_idx - 1

    # aa_frame in [1, 2, 3] -> [0, 1, 2], then shift back to the window start.
    frame_at_window_start = (int(aa_frame) - 1 - path_idx) % 3
    first_codon = (3 - frame_at_window_start) % 3

    # Only offsets that leave room for a complete triplet are visited.
    codon_starts = range(first_codon, len(seq_30nt) - 2, 3)
    return ''.join(triplet_to_aa[seq_30nt[pos:pos + 3]] for pos in codon_starts)
def create_gt_with_mutations(dfs):
  """Apply the point mutations listed in dfs to the wild-type genotype.

  The strand is taken from the first row's 'Target strand' value: 0 selects
  the module-level wt_gt, anything else selects rc_wt_gt.  For every row the
  base at 'Position' is checked against 'Reference nucleotide' and then
  overwritten with 'Mutated nucleotide'.

  Returns the mutated sequence as a string.  When the rc_wt_gt template was
  edited, the result is reverse-complemented before being returned.
  """
  plus_strand = bool(dfs['Target strand'].iloc[0] == 0)
  gt = list(wt_gt) if plus_strand else list(rc_wt_gt)

  for idx, row in dfs.iterrows():
    try:
      assert gt[row['Position']] == row['Reference nucleotide'], 'Error: Probably bad strand'
    except:
      # NOTE(review): any failure in the check above (mismatch, bad index,
      # missing column) opens an interactive console with all globals and
      # locals instead of raising.  Looks like a debugging aid -- confirm it
      # is wanted in batch runs.
      import code; code.interact(local=dict(globals(), **locals()))
    # The mutation is applied whether or not the reference check passed.
    gt[row['Position']] = row['Mutated nucleotide']
  gt = ''.join(gt)
  return gt if plus_strand else compbio.reverse_complement(gt)
def find_matching_sequence(target, rows):
    """Return the first row whose cutsite-centred window equals target.

    For each row the 'Alternative Sequence' is oriented by
    'gRNA Orientation' ('-' rows are reverse-complemented and their
    'Cutsite' mirrored), and the 55-nt window
    seq[cutsite - 27:cutsite + 28] is compared with target.

    Args:
        target: sequence window to look for.
        rows: DataFrame with 'gRNA Orientation', 'Alternative Sequence'
            and 'Cutsite' columns.

    Returns:
        The matching row, as yielded by rows.iterrows().

    Raises:
        AssertionError: if no row matches.  Raised explicitly rather than
            with `assert False`, so the failure is still reported when
            Python runs with -O.
    """
    for _, row in rows.iterrows():
        seq = row['Alternative Sequence']
        cutsite = row['Cutsite']
        if row['gRNA Orientation'] == '-':
            seq = compbio.reverse_complement(seq)
            cutsite = len(seq) - cutsite
        if seq[cutsite - 27:cutsite + 28] == target:
            return row
    raise AssertionError('Not found')
def run_align_needleman_wunsch(srr, nm):
    """Align the high-quality reads of one FASTQ file against the target.

    Reads inp_dir + '<srr>.fastq' as 4-line records.  Every read whose mean
    Phred+33 quality is at least 30 is reverse-complemented and aligned to
    the target (second line of the reference FASTA) with the external
    needleman_wunsch tool.  Results are appended to out_dir + '<nm>.fa' as
    '>read_name' followed by the tool output minus its last two characters.

    Args:
        srr: run identifier; selects the input FASTQ file.
        nm: name used for the output file.

    Raises:
        subprocess.CalledProcessError: if the alignment tool exits non-zero.
    """
    inp_fn = inp_dir + f'{srr}.fastq'
    genome_fn = inp_dir + 'SP055-rpoZ-cMyc-Cry1Ac1-d123.fa'

    # Close the reference file once its sequence line has been read.
    with open(genome_fn) as genome_f:
        target = genome_f.readlines()[1].strip()

    seq_align_tool = '/ahg/regevdata/projects/CRISPR-libraries/tools/seq-align/bin/needleman_wunsch'
    # Argument list (no shell): read content is never interpreted by a shell.
    base_command = [
        seq_align_tool,
        '--match', '1',
        '--mismatch', '-1',
        '--gapopen', '-5',
        '--gapextend', '-1',
        '--freestartgap',
        '--freeendgap',
    ]

    out_fn = out_dir + f'{nm}.fa'
    # Truncate any previous output; all later writes append.
    with open(out_fn, 'w'):
        pass

    def dump_buffer(buffered):
        """Append the buffered alignment records to the output file."""
        print('Dumping alignment buffer...')
        with open(out_fn, 'a') as out_f:
            out_f.writelines(buffered)

    alignment_buffer = []

    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 0:
                header = line.strip()
                read_nm = header.split()[0].replace('@', '')
            if i % 4 == 1:
                read = line.strip()
            if i % 4 == 3:
                qs = [ord(s) - 33 for s in line.strip()]
                if np.mean(qs) >= 30:
                    read = compbio.reverse_complement(read)

                    align = subprocess.check_output(
                        base_command + [read, target]).decode('utf-8')
                    # Drop the last two characters of the tool output.
                    align = align[:-2]

                    alignment_buffer.append(f'>{read_nm}\n{align}\n')

                    if len(alignment_buffer) > 100:
                        dump_buffer(alignment_buffer)
                        alignment_buffer = []

            timer.update()

    dump_buffer(alignment_buffer)
    return
Beispiel #6
0
def create_gt_with_mutations(dfs):
    """Apply the point mutations listed in dfs to the wild-type genotype.

    plus_strand is hard-coded to True, so the module-level wt_gt is always
    the template and the rc_wt_gt / reverse-complement branches are never
    taken.  For every row the base at 'Position (0 based)' is checked
    against 'Reference nt' and then overwritten with 'Mutated nt'.

    Returns the mutated sequence as a string.
    """
    plus_strand = True
    gt = list(wt_gt) if plus_strand else list(rc_wt_gt)

    for idx, row in dfs.iterrows():
        try:
            assert gt[row['Position (0 based)']] == row[
                'Reference nt'], 'Error: Probably bad strand'
        except:
            # NOTE(review): any failure in the check above opens an
            # interactive console with all globals and locals instead of
            # raising.  Looks like a debugging aid -- confirm it is wanted.
            import code
            code.interact(local=dict(globals(), **locals()))
        # The mutation is applied whether or not the reference check passed.
        gt[row['Position (0 based)']] = row['Mutated nt']
    gt = ''.join(gt)
    return gt if plus_strand else compbio.reverse_complement(gt)
Beispiel #7
0
def find_ulmi(line1):
    """Locate the 15-nt ULMI that follows the constant region of a read.

    The constant region is expected at offset 61.  Offsets 56..65 are
    scanned, and the first one where the read matches the constant with at
    most 3 mismatches anchors the ULMI.  Without a match the default offset
    is used.

    Two defects of the earlier version are fixed:
      * the ULMI was sliced before the index was updated, so the returned
        sequence came from the default position while the returned index
        pointed at the detected one;
      * a query truncated by the end of the read was scored only over its
        own (short) length and could match spuriously.

    Args:
        line1: read sequence.

    Returns:
        (ulmi, ulmi_idx): reverse complement of line1[ulmi_idx:ulmi_idx + 15]
        and the index where the ULMI starts in line1.
    """
    constant = 'ATGACGCGTCGCACCCATC'
    expected_start = 61
    ulmi_len = 15

    def count_mismatches(query, ref):
        return sum(1 for q_nt, r_nt in zip(query, ref) if q_nt != r_nt)

    ulmi_idx = expected_start + len(constant)
    for start_pos in range(expected_start - 5, expected_start + 5):
        query = line1[start_pos:start_pos + len(constant)]
        # Only a full-length window can be a real hit of the constant region.
        if len(query) != len(constant):
            continue
        if count_mismatches(query, constant) <= 3:
            ulmi_idx = start_pos + len(constant)
            break

    # Slice after the index is final so sequence and index agree.
    best_ulmi = line1[ulmi_idx:ulmi_idx + ulmi_len]
    return compbio.reverse_complement(best_ulmi), ulmi_idx
Beispiel #8
0
def set_master_expected_cutsite(srr_id):
    """Look up the target locus of srr_id and set master_expected_cutsite.

    Finds the rows of _config.d.TABLE whose 'Run' equals srr_id, parses the
    'chromosome_loc' value as '<build>_<chrom>:<start>-<end>', fetches that
    interval with twoBitToFa and decides the gRNA orientation from the
    first/last two bases (leading CC vs. trailing GG).  It then fetches the
    interval [master_expected_cutsite - 101, master_expected_cutsite + 99)
    as sequence context.

    NOTE: this is Python 2 code (print statement).

    Returns:
        (False, None) when no row matches srr_id; otherwise (True, context),
        where context is the upper-cased sequence around the cutsite,
        reverse-complemented in the CC-only case.

    Side effects:
        Sets the module global master_expected_cutsite, writes
        temp_<srr_id>.fa to the working directory, and prints an error and
        calls sys.exit(0) when the locus has neither a leading CC nor a
        trailing GG.
    """
    global master_expected_cutsite
    T = _config.d.TABLE
    srr_row = T[T['Run'] == srr_id]
    if len(srr_row) == 0:
        return False, None
    # Takes the second whitespace-separated token of the column's string
    # rendering -- presumably the value of the first matching row; verify.
    cloc = str(srr_row['chromosome_loc']).split()[1]
    genome_build = str(cloc.split('_')[0])
    cloc = cloc.split('_')[1]
    chrm = str(cloc.split(':')[0])
    start = int(cloc.split(':')[1].split('-')[0])
    end = int(cloc.split(':')[1].split('-')[1])

    tool = '/cluster/mshen/tools/2bit/twoBitToFa'
    twobit_db = '/cluster/mshen/tools/2bit/%s.2bit' % (genome_build)
    # Start is shifted down by one before being passed to twoBitToFa.
    twobit_start = start - 1
    command = '%s -seq=%s -start=%s -end=%s %s temp_%s.fa; cat temp_%s.fa' % (
        tool, chrm, twobit_start, end, twobit_db, srr_id, srr_id)
    query = subprocess.check_output(command, shell=True)
    # Drop the first token (the FASTA header) and join the sequence lines.
    genome = ''.join(query.split()[1:]).upper()
    reverse_context = False
    if genome[:2] == 'CC' and genome[-2:] != 'GG':
        master_expected_cutsite = start + 6
        reverse_context = True
    elif genome[:2] != 'CC' and genome[-2:] == 'GG':
        master_expected_cutsite = start + 23 - 6
    elif genome[:2] == 'CC' and genome[-2:] == 'GG':
        # If both CC and GG are available, default to GG.
        # Three out of 96 spacers have both CC/GG, all three are GG.
        master_expected_cutsite = start + 23 - 6
    else:
        print 'ERROR: Expected gRNA lacks NGG on both strands'
        # NOTE(review): exits with status 0 even though this is an error.
        sys.exit(0)

    context = ''
    command = '%s -seq=%s -start=%s -end=%s %s temp_%s.fa; cat temp_%s.fa' % (
        tool, chrm, master_expected_cutsite - 101,
        master_expected_cutsite + 99, twobit_db, srr_id, srr_id)
    query = subprocess.check_output(command, shell=True)
    context = ''.join(query.split()[1:]).upper()

    if reverse_context:
        context = compbio.reverse_complement(context)

    # Sanity check: the PAM's GG must sit at a fixed offset in the context.
    assert context[104:106] == 'GG', 'Bad GG'

    return True, context
Beispiel #9
0
def make_bt_index():
  """Build one bowtie2 index per row of target_df.

  For every target, the locus plus 1000 nt of flanking sequence on each side
  is fetched with twoBitToFa, reverse-complemented for '-' strand targets,
  written to _config.DATA_DIR + '<Name>.fa', and indexed with bowtie2-build
  into _config.DATA_DIR + 'bowtie2_index/'.

  Side effects: creates the index directory, writes temp.fa to the working
  directory, writes one FASTA and one index per target, prints the sequence
  length and name of each target.
  """
  bt_fold = _config.DATA_DIR + f'bowtie2_index/'
  util.ensure_dir_exists(bt_fold)

  for idx, row in target_df.iterrows():
    nm = row['Name']
    # seq = row['Sequence context']
    assembly = row['Assembly']
    chrom = row['Chromosome']
    strand = row['Strand']
    start = row['Start']
    end = row['End']

    twobit = '/ahg/regevdata/projects/CRISPR-libraries/tools/2bit/twoBitToFa'
    twobit_ref = f'/ahg/regevdata/projects/CRISPR-libraries/tools/2bit/{assembly}.2bit'

    # Radius = 1000 needs to be longer than any single read for bowtie2 to work without local alignment
    command = f'{twobit} -seq={chrom} -start={start - 1001} -end={end + 1000} {twobit_ref} temp.fa; cat temp.fa'
    seq = subprocess.check_output(command, shell = True).decode('utf-8')
    # Drop the first token (the FASTA header) and join the sequence lines.
    seq = ''.join(seq.split()[1:])
    seq = seq.upper()

    if strand == '-':
      seq = compbio.reverse_complement(seq)

    # Sanity check: the spacer must start right after the 1000-nt left flank.
    try:
      assert seq.index(row['Spacer (20 nt)']) == 1000
    except:
      # NOTE(review): opens an interactive console on failure (debugging
      # aid).  If the spacer is absent, the seq.index() call in this handler
      # raises ValueError before the console opens.
      print(seq.index(row['Spacer (20 nt)']))
      import code; code.interact(local=dict(globals(), **locals()))

    print(len(seq))
    print(nm)

    ref_fn = _config.DATA_DIR + f'{nm}.fa'
    with open(ref_fn, 'w') as f:
      f.write(f'>{nm}\n{seq}\n')

    bt2_build = f'/ahg/regevdata/projects/CRISPR-libraries/tools/bowtie2-2.3.5.1-linux-x86_64/bowtie2-build'
    # bt_fold already ends in '/', so the prefix contains a double slash.
    command = f'{bt2_build} {ref_fn} {bt_fold}/{nm}'
    result = subprocess.check_output(command, shell = True)

  return
Beispiel #10
0
def search_region(nm, spc, chrm, startpos, endpos):
    """Enumerate SpCas9 gRNA sites (GG / CC dinucleotides) in a region.

    Args:
        nm: name used as the first header field.
        spc: species/assembly passed to the sequence fetcher.
        chrm: chromosome name.
        startpos, endpos: region bounds (converted with int()).

    Returns:
        (headers, sqs): parallel lists.  Each header is
        '>' + nm__chrm__start__end__cutsite__orient in genomic coordinates;
        each sequence is the 23-nt site (reverse-complemented for '-'),
        adjusted so that it begins with G.
    """
    startpos, endpos = int(startpos), int(endpos)
    sq = compbio.get_genomic_seq_twoBitToFa(spc, chrm, str(startpos),
                                            str(endpos))

    headers, sqs = [], []
    timer = util.Timer(total=len(sq))
    for j in range(len(sq) - 3):
        dinuc = sq[j:j + 2]
        if dinuc == 'GG':
            orient = '+'
            start, end, cut_site = j - 21, j + 2, j - 4
        elif dinuc == 'CC':
            orient = '-'
            start, end, cut_site = j, j + 23, j + 5
        else:
            timer.update()
            continue

        s = sq[start:end]
        if len(s) != 23:
            # Incomplete site at the region edge (no timer tick here).
            continue

        if orient == '-':
            start, end = end, start
            s = compbio.reverse_complement(s)

        # Make the site start with G: trim one base if the second is G,
        # otherwise prepend a G.
        if s[0] != 'G':
            s = s[1:] if s[1] == 'G' else 'G' + s

        fields = [
            nm, chrm,
            str(startpos + start),
            str(startpos + end),
            str(startpos + cut_site), orient
        ]
        headers.append('>' + '__'.join(fields))
        sqs.append(s)
        timer.update()
    return headers, sqs
def add_del_profiles(header, sequence, df_buffer):
    """Predict deletion profiles for every Cas9 site in sequence.

    For each CC / GG dinucleotide a 60-nt context centred on the cutsite is
    extracted (reverse-complemented for CC, so that GG sits at positions
    34-35).  Two predictions are appended to df_buffer per site: one for
    the real context ('shuffled' == 'no') and one for a control in which
    the bases on either side of the GG are shuffled ('shuffled' == 'yes').

    The earlier version built the shuffled context but ran the control
    prediction on the unshuffled sequence; the control now really uses
    shuffled_seq.

    Args:
        header: identifier stored in the 'header' column.
        sequence: nucleotide sequence to scan.
        df_buffer: DataFrame the predictions are appended to.

    Returns:
        The DataFrame with the new rows appended.
    """
    local_cutsite = 30
    for idx in range(len(sequence)):
        pam = sequence[idx:idx + 2]
        if pam == 'CC':
            cutsite = idx + 6
            seq = compbio.reverse_complement(
                sequence[cutsite - 30:cutsite + 30])
        elif pam == 'GG':
            cutsite = idx - 4
            seq = sequence[cutsite - 30:cutsite + 30]
        else:
            continue

        # Sites too close to either end do not yield a full 60-nt context.
        if len(seq) != 60:
            continue

        pred_df = _predict2.predict_mhdel(seq, local_cutsite)
        pred_df['header'] = header
        pred_df['seq'] = sequence
        pred_df['pam'] = pam
        pred_df['cutsite'] = cutsite
        pred_df['shuffled'] = 'no'
        df_buffer = df_buffer.append(pred_df, ignore_index=True)

        # Control: shuffle each flank independently, keep the GG in place.
        pre, post = list(seq[:34]), list(seq[36:])
        random.shuffle(pre)
        random.shuffle(post)
        shuffled_seq = ''.join(pre) + 'GG' + ''.join(post)
        shuffled_pred_df = _predict2.predict_mhdel(shuffled_seq,
                                                   local_cutsite)

        shuffled_pred_df['header'] = header
        shuffled_pred_df['seq'] = sequence
        shuffled_pred_df['pam'] = pam
        shuffled_pred_df['cutsite'] = cutsite
        shuffled_pred_df['shuffled'] = 'yes'
        df_buffer = df_buffer.append(shuffled_pred_df, ignore_index=True)
    return df_buffer
Beispiel #12
0
def check_ins_templated(read, genome, is_pos, ins_len):
    """Test whether an insertion looks copied from the surrounding context.

    The inserted bases read[is_pos:is_pos + ins_len] are extended base by
    base, first to the 5' side and then to the 3' side, for as long as the
    extended string still occurs in the ungapped genome or in its reverse
    complement.

    Args:
        read: aligned read sequence.
        genome: aligned reference sequence (gaps as '-').
        is_pos: index in read where the insertion starts.
        ins_len: length of the insertion.

    Returns:
        (template_len, template_orientation, p2, mh2).  (0, 'na', '', '')
        when the insertion is absent from both strands or when no flanking
        sequence could be included on either side.  Otherwise the length of
        the longest template, its orientation ('+', '-' or 'both'), and the
        two flanking sequences.
    """
    # if the inserted sequence and some of the neighboring sequence is present in the wildtype sequence context, it's templated.

    def find_all_instances(query, seq):
        # All start indices at which query occurs in seq.
        idxs = []
        for i in range(len(seq)):
            if seq[i:i + len(query)] == query:
                idxs.append(i)
        return idxs

    imer = read[is_pos:is_pos + ins_len]
    designed_genome = genome.replace('-', '')
    rc_designed_genome = compbio.reverse_complement(designed_genome)
    if imer not in designed_genome and imer not in rc_designed_genome:
        return 0, 'na', '', ''

    # try extending 5' side
    for idx in range(is_pos - 1, -1, -1):
        new_imer = read[idx:is_pos + ins_len]
        if new_imer not in designed_genome and new_imer not in rc_designed_genome:
            break
        # Template cannot be only where we are
        if new_imer in designed_genome and new_imer not in rc_designed_genome:
            inst = find_all_instances(new_imer, designed_genome)
            if len(inst) == 1 and idx in inst:
                break
    # idx is the first position that failed, so the template starts after it.
    # NOTE(review): if the loop finishes without a break, idx is 0 and
    # fiveside becomes 1 even though position 0 also matched; if is_pos is 0
    # the loop never runs and idx is unbound here -- confirm intended.
    fiveside = idx + 1

    # try extending 3' side
    for idx in range(is_pos + ins_len + 1, len(read)):
        new_imer = read[fiveside:idx]
        if new_imer not in designed_genome and new_imer not in rc_designed_genome:
            break
        # Template cannot be only where we are
        if new_imer in designed_genome and new_imer not in rc_designed_genome:
            inst = find_all_instances(new_imer, designed_genome)
            if len(inst) == 1 and fiveside in inst:
                break
    # NOTE(review): if this loop never runs, idx still holds the value left
    # by the 5' loop -- confirm intended.
    threeside = idx - 1

    fiveside_seq = read[fiveside:is_pos]
    threeside_seq = read[is_pos + ins_len:threeside]

    # If no neighboring sequence is included in template, it's not templated.
    if len(fiveside_seq) == 0 or len(threeside_seq) == 0:
        return 0, 'na', '', ''

    template = read[fiveside:threeside]

    # get p2 and mh2
    # Which flank is p2 depends on whether the template is found in the
    # (gapped) genome before the insertion, or in the reverse complement of
    # that ungapped prefix.
    if template in genome[:is_pos] or template in compbio.reverse_complement(
            genome[:is_pos].replace('-', '')):
        p2 = fiveside_seq
        mh2 = threeside_seq
    else:
        p2 = threeside_seq
        mh2 = fiveside_seq

    # Get template orientation
    # NOTE(review): template_orientation stays unbound if the template is in
    # neither strand -- confirm this cannot happen at this point.
    if template in designed_genome and template not in rc_designed_genome:
        template_orientation = '+'
    if template not in designed_genome and template in rc_designed_genome:
        template_orientation = '-'
    if template in designed_genome and template in rc_designed_genome:
        template_orientation = 'both'

    # a random 5mer occurs in 55 bp at 5% rate. To threshold at various false positive rates, defer decision and just return length of longest template match.
    return len(template), template_orientation, p2, mh2
def search_region(nm, spc, chrm, startpos, endpos, RepeatMasker):
    """Enumerate spaced, non-repeat SpCas9 gRNA sites in a genomic region.

    Sites are found at GG / CC dinucleotides and dropped when their cutsite
    is closer than _config.d.MIN_DIST to the previously kept cutsite, when
    the 23-nt site is incomplete, or when RepeatMasker.search() reports a
    hit for the 20-nt window around the dinucleotide.

    Args:
        nm: name used as the first header field.
        spc: species/assembly passed to the sequence fetcher.
        chrm: chromosome name.
        startpos, endpos: region bounds (converted with int()).
        RepeatMasker: object exposing search(chrm, start, end).

    Returns:
        (headers, sqs, too_close_filtered, num_repeats_found, dists):
        parallel header / sequence lists, the two filter counters, and the
        cutsite-to-previous-cutsite distance of every kept site.
    """
    startpos, endpos = int(startpos), int(endpos)
    sq = compbio.get_genomic_seq_twoBitToFa(spc, chrm, str(startpos),
                                            str(endpos))

    headers, sqs, dists = [], [], []
    prev = 0
    too_close_filtered = 0
    num_repeats_found = 0
    timer = util.Timer(total=len(sq))
    for j in range(len(sq) - 3):
        dinuc = sq[j:j + 2]
        if dinuc == 'GG':
            orient = '+'
            start, end, cut_site = j - 21, j + 2, j - 4
        elif dinuc == 'CC':
            orient = '-'
            start, end, cut_site = j, j + 23, j + 5
        else:
            timer.update()
            continue

        # None of the three filters below ticks the timer.

        # filter grnas that are too close
        if cut_site - prev < _config.d.MIN_DIST:
            too_close_filtered += 1
            continue

        # filter incomplete grnas
        s = sq[start:end]
        if len(s) != 23:
            continue

        # filter repeats
        if RepeatMasker.search(chrm, startpos + j - 10, startpos + j + 10):
            num_repeats_found += 1
            continue

        if orient == '-':
            start, end = end, start
            s = compbio.reverse_complement(s)

        # G-N19-NGG, G-N18-NGG, N20-NGG
        if s[0] != 'G':
            s = s[1:] if s[1] == 'G' else 'G' + s

        fields = [
            nm, chrm,
            str(startpos + start),
            str(startpos + end),
            str(startpos + cut_site), orient
        ]
        hdr = '>' + '__'.join(fields)
        if hdr not in headers:
            headers.append(hdr)
            sqs.append(s)
        dists.append(cut_site - prev)
        prev = cut_site

        timer.update()
    return headers, sqs, too_close_filtered, num_repeats_found, dists
def bulk_predict(header, sequence, dd, dd_shuffled, df_out_dir):
    """Run predictions for a random 5% of the Cas9 sites in sequence.

    For each CC / GG dinucleotide a 60-nt context centred on the cutsite is
    extracted (reverse-complemented for CC so that GG sits at positions
    34-35).  Contexts containing anything but A/C/G/T are skipped.  For each
    sampled site, metadata and prediction statistics are appended to dd for
    the real context and to dd_shuffled for a shuffled control.

    The earlier version rebuilt the shuffled control with seq_nogg[36:],
    dropping two bases and yielding a 58-nt control; it now uses
    seq_nogg[34:] so the control is 60 nt with GG at positions 34-35.

    Args:
        header: FASTA-style header, parsed by parse_header(); the function
            returns silently when parsing fails.
        sequence: nucleotide sequence to scan.
        dd: dict of column name -> list, for wild-type contexts.
        dd_shuffled: dict of column name -> list, for shuffled contexts.
        df_out_dir: unused (the per-site CSV export is commented out).

    Reads the module globals rate_model, bp_model, normalizer and their
    *_nm names.
    """
    # Input: A specific sequence
    # Find all Cas9 cutsites, gather metadata, and run inDelphi
    try:
        ans = parse_header(header)
        gene_kgid, chrom, start, end = ans
    except Exception:
        # Unparseable header: skip (KeyboardInterrupt still propagates).
        return

    for idx in range(len(sequence)):
        seq = ''
        if sequence[idx:idx + 2] == 'CC':
            cutsite = idx + 6
            seq = sequence[cutsite - 30:cutsite + 30]
            seq = compbio.reverse_complement(seq)
            orientation = '-'
        if sequence[idx:idx + 2] == 'GG':
            cutsite = idx - 4
            seq = sequence[cutsite - 30:cutsite + 30]
            orientation = '+'
        if seq == '':
            continue
        if len(seq) != 60:
            continue

        # Sanitize input
        seq = seq.upper()
        if 'N' in seq:
            continue
        if not re.match('^[ACGT]*$', seq):
            continue

        # Randomly query subset for broad shallow coverage
        r = np.random.random()
        if r > 0.05:
            continue

        # Shuffle everything but GG: 58 shuffled bases, with GG re-inserted
        # at positions 34-35 so the control is again 60 nt long.
        seq_nogg = list(seq[:34] + seq[36:])
        random.shuffle(seq_nogg)
        shuffled_seq = ''.join(seq_nogg[:34]) + 'GG' + ''.join(seq_nogg[34:])

        for d, seq_context, shuffled_nm in zip([dd, dd_shuffled],
                                               [seq, shuffled_seq],
                                               ['wt', 'shuffled']):
            #
            # Store metadata statistics
            #
            local_cutsite = 30
            grna = seq_context[13:33]
            # NOTE(review): uses the PAM index (idx), not the computed
            # cutsite, for the genomic coordinate -- confirm intended.
            cutsite_coord = start + idx
            unique_id = '%s_%s_hg38_%s_%s_%s' % (gene_kgid, grna, chrom,
                                                 cutsite_coord, orientation)

            d['Sequence Context'].append(seq_context)
            d['Local Cutsite'].append(local_cutsite)
            d['Chromosome'].append(chrom)
            d['Cutsite Location'].append(cutsite_coord)
            d['Orientation'].append(orientation)
            d['Cas9 gRNA'].append(grna)
            d['Gene kgID'].append(gene_kgid)
            d['Unique ID'].append(unique_id)

            # Make predictions
            ans = _predict.predict_all(seq_context, local_cutsite, rate_model,
                                       bp_model, normalizer)
            pred_del_df, pred_all_df, total_phi_score, ins_del_ratio = ans

            # Save predictions
            # del_df_out_fn = df_out_dir + '%s_%s_%s.csv' % (unique_id, 'dels', shuffled_nm)
            # pred_del_df.to_csv(del_df_out_fn)
            # all_df_out_fn = df_out_dir + '%s_%s_%s.csv' % (unique_id, 'all', shuffled_nm)
            # pred_all_df.to_csv(all_df_out_fn)

            ## Translate predictions to indel length frequencies
            indel_len_pred, fs = get_indel_len_pred(pred_all_df)

            #
            # Store prediction statistics
            #
            d['Total Phi Score'].append(total_phi_score)
            d['1ins/del Ratio'].append(ins_del_ratio)

            d['1ins Rate Model'].append(rate_model_nm)
            d['1ins bp Model'].append(bp_model_nm)
            d['1ins normalizer'].append(normalizer_nm)

            d['Frameshift +0'].append(fs['+0'])
            d['Frameshift +1'].append(fs['+1'])
            d['Frameshift +2'].append(fs['+2'])
            d['Frameshift'].append(fs['+1'] + fs['+2'])

            # Precision = 1 - normalized entropy of the outcome distribution.
            crit = (pred_del_df['Genotype Position'] != 'e')
            s = pred_del_df[crit]['Predicted_Frequency']
            s = np.array(s) / sum(s)
            del_gt_precision = 1 - entropy(s) / np.log(len(s))
            d['Precision - Del Genotype'].append(del_gt_precision)

            dls = []
            for del_len in range(1, 60):
                dlkey = -1 * del_len
                dls.append(indel_len_pred[dlkey])
            dls = np.array(dls) / sum(dls)
            del_len_precision = 1 - entropy(dls) / np.log(len(dls))
            d['Precision - Del Length'].append(del_len_precision)

            crit = (pred_all_df['Genotype Position'] != 'e')
            s = pred_all_df[crit]['Predicted_Frequency']
            s = np.array(s) / sum(s)
            all_gt_precision = 1 - entropy(s) / np.log(len(s))
            d['Precision - All Genotype'].append(all_gt_precision)

            # NOTE(review): counting back from the PAM at index 33, index 30
            # would be position -3 and index 29 position -4, i.e. these two
            # look swapped -- left unchanged, verify against consumers.
            negthree_nt = seq_context[local_cutsite - 1]
            negfour_nt = seq_context[local_cutsite]
            d['-4 nt'].append(negfour_nt)
            d['-3 nt'].append(negthree_nt)

            crit = (pred_all_df['Category'] == 'ins')
            highest_ins_rate = max(pred_all_df[crit]['Predicted_Frequency'])
            crit = (pred_all_df['Category']
                    == 'del') & (pred_all_df['Genotype Position'] != 'e')
            highest_del_rate = max(pred_all_df[crit]['Predicted_Frequency'])
            d['Highest Ins Rate'].append(highest_ins_rate)
            d['Highest Del Rate'].append(highest_del_rate)

    return
Beispiel #15
0
def matchmaker(nm, split):
    """Match paired reads to designed targets and align them.

    Read 1 carries the target (reverse complement of its first 61 nt) and
    the ULMI; read 2 carries the gRNA.  Pairs whose mean Phred+33 quality
    over target, ULMI and gRNA bases is at least 28 are matched to
    candidate designed targets, checked against the gRNA, aligned, and
    buffered.  The buffer is flushed about 200 times per file and once at
    the end; progress statistics are appended to the stdout file at each
    periodic flush.

    The flush interval, the progress unit and the mismatch fraction are
    guarded against zero, so short inputs or an interval without any
    quality-passing read no longer raise ZeroDivisionError.

    Args:
        nm: sample name.
        split: split identifier (string) of the input FASTQ pair.
    """
    print(nm, split)
    stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    read1_fn = inp_dir + '%s_R1_%s.fq' % (nm, split)
    read2_fn = inp_dir + '%s_R2_%s.fq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    num_bad_matches = 0
    quality_pass = 0

    tot_lines = util.line_count(read1_fn)
    # int(tot_lines / N) is 0 for files shorter than N lines; clamp to 1 so
    # the modulo and the progress division below cannot divide by zero.
    flush_every = max(1, int(tot_lines / 200))
    progress_unit = max(1, int(tot_lines / 100))

    timer = util.Timer(total=tot_lines)
    with open(read1_fn) as f1, open(read2_fn) as f2:
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            if i % 4 == 0:
                h1 = line1.strip()
                h2 = line2.strip()
            if i % 4 == 1:
                # RC of l1 contains target
                line1 = line1.strip()
                target_read = compbio.reverse_complement(line1[:61])
                ulmi, ulmi_idx = find_ulmi(line1)

                # l2 contains gRNA
                grna_read = line2.strip()

            if i % 4 == 3:

                q1, q2 = line1.strip(), line2.strip()
                # Qualities are reversed to match the reverse-complemented
                # target and ULMI sequences.
                read_q = q1[:61][::-1]
                ulmi_q = q1[ulmi_idx:ulmi_idx + len(ulmi)][::-1]
                grna_q = q2[18:22 + 20]

                qs = [ord(s) - 33 for s in read_q + ulmi_q + grna_q]
                if np.mean(qs) >= 28:
                    quality_pass += 1

                    align_header = '>1_%s_%s' % (ulmi, ulmi_q)

                    # Try to find designed target from LSH
                    cand_idxs = find_best_designed_target(
                        target_read, lsh_dict)
                    if len(cand_idxs) > 0:

                        bad_match = compare_target_to_grna(
                            cand_idxs, grna_read)
                        if bad_match == 'ok':
                            # Run alignment and store in buffer
                            best_idx, align = alignment(target_read, cand_idxs)
                            if align is None:
                                continue
                            store_alignment(alignment_buffer, best_idx,
                                            align_header, align, read_q)
                        else:
                            num_bad_matches += 1
                    else:
                        num_bad_matches += 1

            if i % flush_every == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                if quality_pass:
                    frac_bad = num_bad_matches / quality_pass
                else:
                    frac_bad = float('nan')

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / progress_unit))
                    outf.write('Num. mismatched gRNA/target pairs: %s\n' %
                               (num_bad_matches))
                    outf.write('Frac. mismatched gRNA/target pairs: %s\n' %
                               (frac_bad))

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)

    return
def matchmaker(nm, split):
  """Assign read pairs to peptide pairs, align read 1, and write statistics.

  Loads the library design of the experiment named nm, identifies peptide 1
  from read 2 and peptide 2 from read 1, trims the first 6 nt of read 1,
  orients it to the target, aligns it, and buffers the alignment under
  '<peptide1>-<peptide2>'.  Read pairs with mean Phred+33 quality below 25
  or without an identifiable peptide are counted and skipped.

  The flush interval and progress unit are clamped to at least 1, so inputs
  shorter than 200 lines no longer raise ZeroDivisionError.

  Args:
    nm: experiment name (row of exp_design).
    split: split number of the input FASTQ pair (zero-padded to 3 digits).

  Side effects: sets the module globals lib_design, prefixes, peptide_nms,
  prefix_to_peptide, suffixes and suffix_to_peptide; writes alignments, a
  progress log and stats_<nm>_<split>.csv.
  """
  print(split)
  stdout_fn = _config.SRC_DIR + f'nh_c_{nm}_{split}.out'
  util.exists_empty_fn(stdout_fn)
  out_dir = f'{out_place}{nm}/{split}/'
  util.ensure_dir_exists(out_dir)

  # Parse condition-specific settings
  exp_row = exp_design[exp_design['Name'] == nm].iloc[0]
  parent_fn = exp_row['Parent file']
  lib_nm = exp_row['Library']
  target_nm = exp_row['Target']

  # Library design
  global lib_design
  lib_design = pd.read_csv(_config.DATA_DIR + f'lib_{lib_nm}_design.csv')

  global prefixes
  global peptide_nms
  global prefix_to_peptide
  global suffixes
  global suffix_to_peptide
  prefixes = [s[:prefix_len] for s in lib_design['Sequence']]
  peptide_nms = list(lib_design['Name'])
  prefix_to_peptide = {prefix: nm for prefix, nm in zip(prefixes, peptide_nms)}
  suffixes = [compbio.reverse_complement(s[-suffix_len:]) for s in lib_design['Sequence']]
  suffix_to_peptide = {suffix: nm for suffix, nm in zip(suffixes, peptide_nms)}

  # Target
  target_row = target_design[target_design['Target'] == target_nm].iloc[0]
  target = target_row['Sequence']
  target_strand = target_row['gRNA orientation']

  zf_split = str(split).zfill(3)
  read1_fn = inp_dir + f'{parent_fn}_R1_{zf_split}.fq'
  read2_fn = inp_dir + f'{parent_fn}_R2_{zf_split}.fq'

  count_stats = defaultdict(lambda: 0)
  count_stats['Success'] = 0

  alignment_buffer = init_alignment_buffer()
  prepare_outfns(out_dir, peptide_nms)

  tot_lines = util.line_count(read1_fn)

  # flush_interval = 2000
  flush_interval = 200
  # int(tot_lines / N) is 0 for files shorter than N lines; clamp to 1 so
  # the modulo and the progress division below cannot divide by zero.
  flush_every = max(1, int(tot_lines / flush_interval))
  progress_unit = max(1, int(tot_lines / 100))

  timer = util.Timer(total = tot_lines)
  with open(read1_fn) as f1, open(read2_fn) as f2:
    for i, (line1, line2) in enumerate(zip(f1, f2)):
      if i % 4 == 0:
        h1 = line1.strip()
        h2 = line2.strip()
      if i % 4 == 1:
        read1 = line1.strip()
        read2 = line2.strip()
      if i % 4 == 3:
        q1, q2 = line1.strip(), line2.strip()
        count_stats['Read count'] += 1

        qs = [ord(s)-33 for s in q1 + q2]
        if np.mean(qs) < 25:
          count_stats['1a. Quality fail'] += 1
          continue

        res, msg = find_peptide1_nm(read2)
        if res is None:
          count_stats[f'2{msg}'] += 1
          continue
        p1_nm = res

        res, msg = find_peptide2_nm(read1)
        if res is None:
          count_stats[f'2{msg}'] += 1
          continue
        p2_nm = res

        peptide_nm = f'{p1_nm}-{p2_nm}'

        # Drop the first 6 nt of read 1 and orient it like the target.
        read1 = read1[6:]
        q1 = q1[6:]
        if target_strand == '-':
          read1 = compbio.reverse_complement(read1)
          q1 = q1[::-1]

        # Run alignment and store in buffer
        align_header = f'>1'
        align = alignment(read1, target)
        store_alignment(alignment_buffer, peptide_nm, align_header, align, q1)
        count_stats['Success'] += 1

      if i % flush_every == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write(f'Time: {datetime.datetime.now()}\n')
          outf.write(f'Progress: {i / progress_unit}\n')
          outf.write(f'Line: {i}\n')
          for key in sorted(list(count_stats.keys())):
            outf.write(f'{key}, {count_stats[key]}\n')
        # break

      timer.update()

  # Final flush
  flush_alignments(alignment_buffer, out_dir)

  stats_df = pd.DataFrame(count_stats, index = [0])
  sorted_cols = sorted([s for s in stats_df.columns])
  stats_df = stats_df[sorted_cols]
  stats_df.to_csv(out_dir + f'stats_{nm}_{split}.csv')

  return
def wildtype_repairs(row):
    """Find which deletions at the cutsite restore the reference sequence.

    Every deletion of 1..27 nt spanning the cutsite of the (oriented)
    'Alternative Sequence' is enumerated and compared with the 'Reference
    Sequence'.  The longest microhomology supporting a wild-type-restoring
    deletion is compared with the longest one supporting any other deletion.

    Args:
        row: record with 'gRNA Orientation', 'Cutsite',
            'Alternative Sequence', 'Reference Sequence' and
            'Needed Frameshift'.

    Returns:
        (repair_gts, repair_dls, wt_repairable_flag, fs, fs_repairable_flag,
        longest_mh_wt):
          repair_gts / repair_dls: start offsets and lengths of the
            deletions that reproduce the reference.
          wt_repairable_flag: 'yes', 'no' (no such deletion) or 'iterwt'.
          fs: row['Needed Frameshift'].
          fs_repairable_flag: 'yes', 'no' (fs == 0) or 'iterwt'.
          longest_mh_wt: 'yes' if the longest wild-type microhomology is
            strictly longer than the longest non-wild-type one, else 'no'.
    """
    orient = row['gRNA Orientation']
    cutsite = row['Cutsite']
    seq = row['Alternative Sequence']
    wt_seq = row['Reference Sequence']
    if orient == '-':
        seq = compbio.reverse_complement(seq)
        wt_seq = compbio.reverse_complement(wt_seq)
        cutsite = len(seq) - cutsite

    # Detect wildtypes with iterative cutting - expect 0 at these
    wt_repairable_flag = 'yes'
    fs_repairable_flag = 'yes'
    # 13 nt of the sequence ending 3 nt past the cutsite.
    grna = seq[cutsite - 10:cutsite + 3]
    for wt_seq_s in [wt_seq, compbio.reverse_complement(wt_seq)]:
        if grna in wt_seq_s:
            # NOTE(review): the PAM is looked up in wt_seq, not wt_seq_s.
            # When grna is found only in the reverse complement,
            # wt_seq.index() raises and the except branch sets 'iterwt'
            # without checking a PAM -- confirm this is intended.
            try:
                pam = wt_seq[wt_seq.index(grna) + 14:wt_seq.index(grna) + 16]
            except:
                wt_repairable_flag = 'iterwt'
                fs_repairable_flag = 'iterwt'
                continue
            if pam in ['GG', 'AG', 'GA']:
                wt_repairable_flag = 'iterwt'
                fs_repairable_flag = 'iterwt'

    repair_gts = []
    repair_dls = []
    longest_wt_mh = -1
    longest_nonwt_mh = -1
    for del_len in range(1, 27 + 1):
        for start_pos in range(0, del_len + 1):
            # Delete del_len bases, start_pos of them to the right of the cut.
            repair_gt = seq[:cutsite - del_len + start_pos] + seq[cutsite +
                                                                  start_pos:]
            l = seq[cutsite - del_len:cutsite]
            r = seq[cutsite:cutsite + del_len]
            mhs = find_microhomologies(l, r)
            if repair_gt == wt_seq:
                repair_gts.append(start_pos)
                repair_dls.append(del_len)
                for mh in mhs:
                    if start_pos in mh:
                        # presumably each mh lists the start offsets sharing
                        # one microhomology, hence length = len(mh) - 1;
                        # verify against find_microhomologies.
                        mh_len = len(mh) - 1
                        if mh_len > longest_wt_mh:
                            longest_wt_mh = mh_len
            else:
                for mh in mhs:
                    if start_pos in mh:
                        mh_len = len(mh) - 1
                        if mh_len > longest_nonwt_mh:
                            longest_nonwt_mh = mh_len
    # This overrides an earlier 'iterwt' when no deletion restores wild-type.
    if len(repair_gts) == 0:
        wt_repairable_flag = 'no'

    if longest_wt_mh > longest_nonwt_mh:
        longest_mh_wt = 'yes'
    else:
        longest_mh_wt = 'no'

    fs = row['Needed Frameshift']
    # This overrides an earlier 'iterwt' when no frameshift is needed.
    if fs == 0:
        fs_repairable_flag = 'no'
    return repair_gts, repair_dls, wt_repairable_flag, fs, fs_repairable_flag, longest_mh_wt
Beispiel #18
0
sys.path.append('/home/unix/maxwshen/')
import numpy as np
from collections import defaultdict
from mylib import util, compbio
import pandas as pd

# Default params
# Inputs come from the 'ill_b2_merge_n_paired_reads' directory under OUT_PLACE.
inp_dir = _config.OUT_PLACE + f'ill_b2_merge_n_paired_reads/'
# Outputs go to a directory named after util.get_fn(__file__)
# (presumably this script's base name -- verify in util).
NAME = util.get_fn(__file__)
out_dir = _config.OUT_PLACE + NAME + '/'
util.ensure_dir_exists(out_dir)

# Experiment table, loaded at import time.
exp_design = pd.read_csv(_config.DATA_DIR + f'Badran2015_SraRunTable.csv')
# Wild-type genotype: second line of the reference FASTA, plus its reverse
# complement.
wt_gt = open(_config.DATA_DIR +
             f'SP055-rpoZ-cMyc-Cry1Ac1-d123.fa').readlines()[1].strip()
rc_wt_gt = compbio.reverse_complement(wt_gt)

# Number of per-sample input splits that get merged.
params = {
    'num_splits': 10,
}


##
# Primary
##
def merge_n_paired_reads(nm):
    """Concatenate the per-split read-index tables of sample nm.

    Reads inp_dir + '<nm>_<split>_read_idxs.csv' for every split in
    range(params['num_splits']) and appends them into one DataFrame.

    NOTE(review): as visible here the merged frame is neither returned nor
    written; the function body looks truncated -- confirm against the
    complete source.
    """
    mdf = pd.DataFrame()
    for split in range(params['num_splits']):
        df = pd.read_csv(inp_dir + f'{nm}_{split}_read_idxs.csv', index_col=0)
        mdf = mdf.append(df, ignore_index=True, sort=False)
def matchmaker(nm, split):
  """Match read-2 sequences to designed targets and align them.

  NOTE: this is Python 2 code (print statement, itertools.izip).

  Reads inp_dir + '<nm>_r2_<split>.fq' as 4-line FASTQ records.  Reads with
  mean Phred+33 quality below 28 are counted and skipped.  The remaining
  reads are reverse-complemented, matched to candidate designed targets,
  aligned, and buffered.  The buffer is flushed about 100 times per file and
  once at the end; progress statistics are appended to the stdout file at
  each periodic flush.

  Args:
    nm: sample name.
    split: split identifier (string) of the input FASTQ file.
  """
  print nm, split
  stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
  util.exists_empty_fn(stdout_fn)
  out_dir = out_place + nm + '/' + split + '/'
  util.ensure_dir_exists(out_dir)

  inp_fn = inp_dir + '%s_r2_%s.fq' % (nm, split)

  lsh_dict = build_targets_better_lsh()
  alignment_buffer = init_alignment_buffer()

  prepare_outfns(out_dir)

  # Number of reads rejected by the quality filter.
  qf = 0

  tot_reads = util.line_count(inp_fn)
  timer = util.Timer(total = tot_reads)
  # izip is imported but not used below.
  from itertools import izip
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        pass
      if i % 4 == 1:
        l2 = line.strip()
      if i % 4 == 3:
        # Quality filter
        q2 = line.strip()
        qs = [ord(s)-33 for s in q2]
        if np.mean(qs) < 28:
          qf += 1
          continue

        l2 = compbio.reverse_complement(l2)
        align_header = '>1'

        # Try to find designed target from LSH
        cand_idxs = find_best_designed_target(l2, lsh_dict)
        if len(cand_idxs) == 0:
          continue

        # Run alignment
        best_idx, align = alignment(l2, cand_idxs)

        # Store alignment into buffer
        store_alignment(alignment_buffer, best_idx, align_header, align)

      # NOTE(review): int(tot_reads / 100) is 0 for inputs shorter than 100
      # lines, which makes this modulo raise ZeroDivisionError.
      if i % int(tot_reads / 100) == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write('Time: %s\n' % (datetime.datetime.now()))
          outf.write('Progress: %s\n' % (i / int(tot_reads / 100)) )
          # NOTE(review): under Python 2 both divisions are integer
          # divisions, so this "pct" is truncated to a whole number --
          # confirm intended.
          outf.write('Quality filtered pct: %s\n' % (qf / (i/4)))
      timer.update()

  # Final flush
  flush_alignments(alignment_buffer, out_dir)

  return