# Module-level imports assumed by the functions in this listing (the
# original excerpts omit them). `util`, `_data`, `nts`, `nt_to_idx`,
# `inp_dir`, and `out_dir` are project-specific globals assumed to be
# defined elsewhere in the package.
from collections import defaultdict
import pickle

import numpy as np
import pandas as pd
from scipy.stats import binom


def get_poswise_df(data, nm_to_seq, treat_nm):
    '''
    Build a long-format DataFrame of per-position mismatch counts: one row
    per (target site, position, non-reference nucleotide).
    '''
    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for nm in data:
        pw = data[nm]
        seq = nm_to_seq[nm]

        for jdx in range(len(pw)):
            pos = _data.idx_to_pos(jdx, treat_nm)
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            total = sum(pw[jdx])

            for kdx in range(len(pw[jdx])):
                if kdx == ref_idx:
                    continue

                count = pw[jdx][kdx]

                dd['Count'].append(count)
                dd['Total count'].append(total)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                dd['Position'].append(pos)
                dd['Name'].append(nm)

        timer.update()

    df = pd.DataFrame(dd)
    return df
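
# Aside (illustrative, not from the original source): the
# defaultdict(list) -> pd.DataFrame pattern used throughout this listing
# builds a long-format table column by column, one list entry per row.
def _example_long_format_df():
    dd = defaultdict(list)
    for count, obs_nt in [(3, 'C'), (0, 'G')]:
        dd['Count'].append(count)
        dd['Obs nt'].append(obs_nt)
    return pd.DataFrame(dd)   # two rows, columns 'Count' and 'Obs nt'
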
def gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, decisions):
  '''
    Filter treatment mutations that can be explained by the control
    mutation frequency. In practice, this step is most effective for
    control mutations with relatively high frequency (and therefore
    relatively high variance).

    Considers all events that occur (frequency > 0) in both control
    and treatment data.
  '''
  fpr_threshold_try1 = 0.10   # defined but not used in this function
  for jdx, ref_nt in enumerate(seq):
    c_tot = sum(c[jdx])
    t_tot = sum(t[jdx])
    if c_tot == 0 or t_tot == 0:
      continue   # no coverage at this position; avoid dividing by zero below
    for kdx in range(len(t[jdx])):
      if kdx == nt_to_idx[ref_nt] or t[jdx][kdx] == 0:
        continue

      c_fq = c[jdx][kdx] / c_tot
      t_fq = t[jdx][kdx] / t_tot
      pval = binom.sf(t[jdx][kdx] - 1, t_tot, c_fq)

      if c_fq > 0:
        decisions['obs_nt'].append(nts[kdx])
        decisions['ref_nt'].append(ref_nt)
        decisions['c_fq'].append(c_fq)
        decisions['c_ct'].append(c[jdx][kdx])
        decisions['t_fq'].append(t_fq)
        decisions['t_ct'].append(t[jdx][kdx])
        decisions['c_tot'].append(c_tot)
        decisions['t_tot'].append(t_tot)
        decisions['idx'].append(jdx)
        decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
        decisions['pval'].append(pval)
        decisions['nm'].append(nm)

  return
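
# Illustrative, self-contained example (not from the original source) of the
# test above: binom.sf(k - 1, n, p) computes P(X >= k) for X ~ Binomial(n, p),
# i.e. the chance of observing at least the treatment count if mutations
# arose at the control frequency alone.
def _example_binom_sf_test():
  t_count, t_tot, c_fq = 10, 1000, 0.005   # toy numbers
  pval = binom.sf(t_count - 1, t_tot, c_fq)
  # pval ~= 0.03: under the control frequency, >= 10 mutant reads out of
  # 1000 is unlikely, so this event would not be dismissed as control noise.
  return pval
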
def filter_high_control_muts(t, c, seq, treat_nm, nm, decisions):
  '''
    Filter positions with very high control mutation frequency that has
    significant readcount support.
  '''
  max_control_mut_fq = 0.05

  for jdx, ref_nt in enumerate(seq):
    c_tot = sum(c[jdx])
    t_tot = sum(t[jdx])
    if c_tot == 0 or t_tot == 0:
      continue   # no coverage at this position; avoid dividing by zero below
    
    wipe_col = False
    for kdx in range(len(t[jdx])):
      if kdx == nt_to_idx[ref_nt] or t[jdx][kdx] == 0:
        continue

      c_fq = c[jdx][kdx] / c_tot
      t_fq = t[jdx][kdx] / t_tot

      if c_fq > 0:
        decisions['obs_nt'].append(nts[kdx])
        decisions['ref_nt'].append(ref_nt)
        decisions['c_fq'].append(c_fq)
        decisions['c_tot'].append(c_tot)
        decisions['idx'].append(jdx)
        decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
        decisions['nm'].append(nm)

        if c_fq >= max_control_mut_fq:
          wipe_col = True
          decisions['wiped'].append(True)
        else:
          decisions['wiped'].append(False)

    if wipe_col:
      for kdx in range(len(t[jdx])):
        t[jdx][kdx] = 0
  return t
def gather_stats_illumina_errors(t, c, t_minq, c_minq, seq, treat_nm, nm, decisions):
  '''
    Identify mutations explainable by Illumina sequencing error.
    Reads are filtered at Q30 (error rate 1e-3); most columns have a
    minimum Q of 32 (~6e-4) or 36 (~2e-4).

    Considers all events (frequency > 0) in treatment data.
  '''
  fpr_threshold = 0.05   # defined but not used in this function
  for jdx, ref_nt in enumerate(seq):
    t_tot = np.sum(t[jdx])

    # Convert the minimum Phred quality at this position to an error probability
    t_bin_p = 10**(-t_minq[jdx] / 10)
    c_bin_p = 10**(-c_minq[jdx] / 10)

    for kdx in range(len(t[jdx])):
      if kdx == nt_to_idx[ref_nt]:
        continue

      if t[jdx][kdx] > 0:
        t_fq = t[jdx][kdx] / t_tot
        pval = binom_minus_binom_pval(t[jdx][kdx], t_bin_p, c_bin_p, t_tot)

        decisions['obs_nt'].append(nts[kdx])
        decisions['ref_nt'].append(ref_nt)
        decisions['t_bin_p'].append(t_bin_p)
        decisions['c_bin_p'].append(c_bin_p)
        decisions['t_ct'].append(t[jdx][kdx])
        decisions['t_fq'].append(t_fq)
        decisions['t_tot'].append(t_tot)
        decisions['idx'].append(jdx)
        decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
        decisions['pval'].append(pval)
        decisions['nm'].append(nm)
  return
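
# `binom_minus_binom_pval` is called above but not defined in this listing.
# A minimal Monte Carlo sketch, under the assumption that it estimates
# P(X_t - X_c >= obs_count) with X_t ~ Binomial(n, t_bin_p) modeling
# treatment sequencing errors and X_c ~ Binomial(n, c_bin_p) modeling
# control sequencing errors; the original implementation may differ
# (e.g. an exact convolution of the two binomials).
def binom_minus_binom_pval_sketch(obs_count, t_bin_p, c_bin_p, n, n_sim=100000):
  rng = np.random.default_rng(0)
  x_t = rng.binomial(n, t_bin_p, size=n_sim)
  x_c = rng.binomial(n, c_bin_p, size=n_sim)
  return float(np.mean(x_t - x_c >= obs_count))
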
def adjust_treatment_control(treat_nm):
    '''
    g4 format: data is a dict, keys = target site names
    values = np.array with shape = (target site len, 4)
      entries = int for num. Q30 observations
    '''

    # adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')
    adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))

    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}
    '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
    '''
    print(
        'Gathering statistics on treatment mutations matching background profile by frequency of zeros...'
    )
    dd = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        timer.update()
        pw = adj_d[nm]
        seq = nm_to_seq[nm]
        for jdx in range(len(pw)):
            tot = np.nansum(pw[jdx])
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            for kdx in range(len(pw[jdx])):
                if kdx == ref_idx:
                    continue

                count = pw[jdx][kdx]
                dd['Count'].append(count)
                dd['Total count'].append(tot)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                if tot == 0:
                    dd['Frequency'].append(np.nan)
                else:
                    dd['Frequency'].append(count / tot)
                dd['Position index'].append(jdx)
                dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
                dd['Name'].append(nm)

    df = pd.DataFrame(dd)
    df = df[df['Total count'] >= 100]

    # Form stats_df and find p for binomial, which is typically ~0.99
    dd = defaultdict(list)
    pos_range = sorted(set(df['Position index']))
    timer = util.Timer(total=len(pos_range))
    for pos_idx in pos_range:
        timer.update()
        df_s1 = df[df['Position index'] == pos_idx]
        for ref_nt in nts:
            df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
            for obs_nt in nts:
                if obs_nt == ref_nt:
                    continue

                crit = (df_s2['Obs nt'] == obs_nt)
                dfs = df_s2[crit]
                dfs_freq = dfs['Frequency']

                num_zeros = sum(dfs_freq == 0)
                total = len(dfs_freq)
                if total == 0:
                    continue

                dd['Num target sites with zero'].append(num_zeros)
                dd['Total num target sites'].append(total)
                dd['Frequency of zero in target sites'].append(num_zeros / total)
                dd['Mean activity'].append(np.mean(dfs_freq))
                dd['Position index'].append(pos_idx)
                dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
                dd['Obs nt'].append(obs_nt)
                dd['Ref nt'].append(ref_nt)

    fz_df = pd.DataFrame(dd)

    baseline_pos_range = pos_range[-5:]
    max_mean_activity = 0.025
    min_num_targets = 50

    crit = (fz_df['Position index'].isin(baseline_pos_range)) & \
           (fz_df['Mean activity'] <= max_mean_activity) & \
           (fz_df['Total num target sites'] >= min_num_targets)
    bg_bin_p = np.mean(fz_df[crit]['Frequency of zero in target sites'])
    if np.isnan(bg_bin_p):
        raise ValueError(
            'Could not estimate background binomial p: no positions passed '
            'the baseline criteria')

    pvals = []
    timer = util.Timer(total=len(fz_df))
    for idx, row in fz_df.iterrows():
        total = row['Total num target sites']
        numzero = row['Num target sites with zero']
        pval = binom.cdf(numzero, total, bg_bin_p)
        pvals.append(pval)
        timer.update()
    fz_df['pval'] = pvals

    fz_fdr_threshold = 0.001
    fz_df = ben_hoch_fdr(fz_df, fz_fdr_threshold)
    fz_df.to_csv(out_dir + '%s_fraczero_dec.csv' % (treat_nm))

    print(
        'Filtering treatment mutations matching background profile by frequency of zeros...'
    )
    to_remove = fz_df[fz_df['FDR accept'] == False]
    adj_d = filter_freqzero_background_mutations(to_remove, adj_d, nm_to_seq)

    ##
    # Write
    ##
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
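
# Two helpers used above, `ben_hoch_fdr` and
# `filter_freqzero_background_mutations`, are not defined in this listing.
# Minimal sketches consistent with how they are called here; the original
# implementations may differ.
def ben_hoch_fdr_sketch(df, fdr_threshold):
    # Benjamini-Hochberg: add a boolean 'FDR accept' column given a 'pval' column.
    df = df.sort_values('pval').reset_index(drop=True)
    n = len(df)
    below = df['pval'] <= np.arange(1, n + 1) * fdr_threshold / n
    # Accept every hypothesis up to the largest p-value under the BH line.
    cutoff = below[below].index.max() if below.any() else -1
    df['FDR accept'] = [i <= cutoff for i in range(n)]
    return df


def filter_freqzero_background_mutations_sketch(to_remove, adj_d, nm_to_seq):
    # Hypothetical: zero out, at every target site with a matching reference
    # nucleotide, the (position index, observed nt) cells flagged as background.
    for _, row in to_remove.iterrows():
        jdx, obs_nt, ref_nt = row['Position index'], row['Obs nt'], row['Ref nt']
        kdx = nts.index(obs_nt)   # assumes nts is an indexable sequence like 'ACGT'
        for nm in adj_d:
            if nm_to_seq[nm][jdx] == ref_nt:
                adj_d[nm][jdx][kdx] = 0
    return adj_d
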
def fig_editing_profiles(treat_nm):
  ''' 
    g4 format: data is a dict, keys = target site names
    values = np.array with shape = (target site len, 4)
      entries = int for num. Q30 observations
  '''

  adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))


  lib_design, seq_col = _data.get_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  lib_nm = _data.get_lib_nm(treat_nm)
  ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Forming long df...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()

    if nm not in ontarget_nms:
      continue

    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = np.nansum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue

        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 100]
  n_targetsites_in_condition = len(df)

  # Form stats_df
  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue

        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero for mutation'].append(num_zeros)
        dd['Total num target sites for mutation'].append(total)
        dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
        dd['Num target sites in condition'].append(n_targetsites_in_condition)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  hm_df = pd.DataFrame(dd)
  hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

  # Median normalize: subtract each mutation type's median activity over
  # background positions 25-34, clamping at zero
  background_range = range(25, 34 + 1)

  for ref_nt in nts:
    for obs_nt in nts:
      if obs_nt == ref_nt:
        continue

      crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt) & (~np.isnan(hm_df['Mean activity']))
      medi = np.nanmedian(hm_df[crit & (hm_df['Position'].isin(background_range))]['Mean activity'])
      hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(lambda x: max(0, x - medi))

  hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))

  return
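
# Toy illustration (not from the original source) of the median-background
# adjustment above: subtract each mutation type's median activity over the
# background positions, clamping at zero.
def _example_median_bg_adjust():
  activities = pd.Series([0.002, 0.003, 0.150, 0.004])   # per-position mean activity
  bg_median = np.nanmedian([0.002, 0.003, 0.004])        # median over background positions
  adjusted = activities.apply(lambda x: max(0, x - bg_median))
  # -> approximately [0.0, 0.0, 0.147, 0.001]: only activity above background remains
  return adjusted
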
def calculate_statistics(treat_nm):
  ''' 
    g4 format: data is a dict, keys = target site names
    values = np.array with shape = (target site len, 4)
      entries = int for num. Q30 observations
  '''

  adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')

  lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Gathering statistics...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()
    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = sum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue

        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 50]

  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue

        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero'].append(num_zeros)
        dd['Total num target sites'].append(total)
        dd['Frequency of zero in target sites'].append(num_zeros / total)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  stats_df = pd.DataFrame(dd)

  stats_df.to_csv(out_dir + '%s.csv' % (treat_nm))

  return
def form_data(exp_nm):
    data = _data.load_data(exp_nm, 'ag4_poswise_be_adjust')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    # Get target nt
    editor_type = _data.get_editor_type(exp_nm)
    if editor_type == 'CtoTeditor':
        target_nt = 'C'
    elif editor_type == 'AtoGeditor':
        target_nt = 'A'
    else:
        raise ValueError('Unexpected editor type: %s' % editor_type)

    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    grnas = lib_design['gRNA (20nt)']
    design_cats = lib_design['Design category']
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}
    nm_to_grna = {nm: grna for nm, grna in zip(nms, grnas)}
    nm_to_design_cat = {
        nm: design_cat
        for nm, design_cat in zip(nms, design_cats)
    }

    dd = defaultdict(list)

    timer = util.Timer(total=len(data))
    for nm in data:
        pw = data[nm]
        seq = nm_to_seq[nm]
        grna = nm_to_grna[nm]
        design_cat = nm_to_design_cat[nm]

        # Get category, subcategory, and match count
        match_count = get_match_count(grna, seq)
        if design_cat == 'guideseq':
            category = 'Off-target series'
            subcategory = nm.split('_')[2]  # gene name
        elif design_cat == 'mismatch':
            category = 'Mismatch series'
            subcategory = nm.split('_')[1]  # series number
        elif design_cat == 'chipseq':
            category = 'Chip series'
            # The original listing never set a subcategory in this branch,
            # which could leak the value from a previous iteration; set one
            # explicitly here (the intended value is unknown).
            subcategory = 'Chip series'
        elif design_cat == 'vivo':
            category = 'vivo'
            subcategory = 'vivo'
        else:
            assert match_count == 20, 'Unexpected design category with imperfect gRNA match'
            category = 'On-target'
            subcategory = 'On-target'

        for jdx in range(len(pw)):
            pos = _data.idx_to_pos(jdx, exp_nm)
            if pos not in [6, 7]:
                # Only consider protospacer positions 6 and 7
                continue
            ref_nt = seq[jdx]
            if ref_nt != target_nt:
                continue
            ref_idx = nt_to_idx[ref_nt]
            total = sum(pw[jdx])
            edit_ct = 0
            for kdx in range(len(pw[jdx])):
                if kdx == ref_idx:
                    continue
                edit_ct += pw[jdx][kdx]

            if total > 0:
                dd['Edited fraction'].append(edit_ct / total)
            else:
                dd['Edited fraction'].append(np.nan)
            dd['Edit count'].append(edit_ct)
            dd['Total count'].append(total)
            dd['Position'].append(pos)
            dd['Ref nt'].append(ref_nt)
            dd['Name'].append(nm)

            dd['Match count'].append(int(match_count))
            dd['Category'].append(category)
            dd['Subcategory'].append(subcategory)

        timer.update()

    df = pd.DataFrame(dd)
    df.to_csv(out_dir + '%s.csv' % (exp_nm))

    return
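
# `get_match_count` is used above but not defined in this listing. A
# hypothetical sketch, assuming it counts positionwise matches between the
# 20-nt gRNA and its best-aligned window of the target sequence (the
# original may instead use a fixed designed offset):
def get_match_count_sketch(grna, seq):
    best = 0
    for start in range(len(seq) - len(grna) + 1):
        window = seq[start:start + len(grna)]
        matches = sum(1 for a, b in zip(grna, window) if a == b)
        best = max(best, matches)
    return best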