# Pipeline-stage functions for filtering base editing data (g4 format).
# These stages live in separate scripts in the original pipeline; module-level
# context is assumed to be initialized elsewhere per stage: treat_control_df,
# exp_nm_to_batch, exp_nm_to_editor, inp_dir, out_dir. Note that two different
# stages define a function named adjust_treatment_control.
import os
import sys
import pickle
import subprocess
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import binom

# Project-local modules (import paths assumed)
import _config
import _data
import util

nts = list('ACGT')  # assumed nucleotide column order for the (site len, 4) arrays
nt_to_idx = {nt: idx for idx, nt in enumerate(nts)}


def filter_inprofile_batch_effects():
  df = pd.read_csv(_config.DATA_DIR + 'batch_effects.csv')
  inprofile_batches = set(df['Batch'])

  be_treatments = [s for s in treat_control_df['Treatment'] if 'Cas9' not in s]
  timer = util.Timer(total = len(be_treatments))
  for treat_nm in be_treatments:
    batch = exp_nm_to_batch[treat_nm]

    if batch in inprofile_batches:
      print(treat_nm, batch)
      adj_d = _data.load_data(treat_nm, 'ag4a2_adjust_batch_effects')
      to_remove = df[df['Batch'] == batch]

      lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
      nms = lib_design['Name (unique)']
      seqs = lib_design[seq_col]
      nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

      adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)
      with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)
    else:
      # Batch has no in-profile effects: pass the input through unchanged
      inp_fn = inp_dir + '%s.pkl' % (treat_nm)
      subprocess.check_output('cp %s %s' % (inp_fn, out_dir), shell = True)

    timer.update()
  return
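# `filter_mutations` is called above and in later stages but defined elsewhere
# in the pipeline. A minimal sketch of its presumed behavior, for reference:
# zero out the counts of each flagged (Position, Ref nt, Obs nt) mutation in
# every target site whose reference nucleotide matches. `_data.pos_to_idx` is
# a hypothetical inverse of `_data.idx_to_pos`; the original may resolve
# positions differently.
def filter_mutations_sketch(to_remove, adj_d, nm_to_seq, treat_nm):
  for _, row in to_remove.iterrows():
    jdx = _data.pos_to_idx(row['Position'], treat_nm)  # hypothetical helper
    kdx = nt_to_idx[row['Obs nt']]
    for nm in adj_d:
      # Only filter sites where the library reference nucleotide matches
      if nm_to_seq[nm][jdx] == row['Ref nt']:
        adj_d[nm][jdx][kdx] = 0
  return adj_d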
def load_Y():
  '''
    Combine data together in human-friendly formats
  '''
  all_means = dict()

  print('WARNING: script depends on c_poswise_basic')
  c_fold = _config.OUT_PLACE + 'c_poswise_basic/'
  c_mtimes = [os.path.getmtime(c_fold + fn) for fn in os.listdir(c_fold)]
  ag4_fold = _config.OUT_PLACE + 'ag4_poswise_be_adjust/'
  ag4_mtimes = [os.path.getmtime(ag4_fold + fn) for fn in os.listdir(ag4_fold)]
  if min(c_mtimes) < max(ag4_mtimes):
    prompt = 'The most recent modification to a file in ag4 occurred after the earliest modification in c -- c might not be up to date. Continue? (y) '
    ans = input(prompt)
    if ans != 'y':
      sys.exit(0)

  timer = util.Timer(total = len(treat_control_df))
  for idx, row in treat_control_df.iterrows():
    timer.update()
    treat_nm = row['Treatment']
    if 'Cas9' in treat_nm:
      continue

    adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')
    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    # pw_df = get_poswise_df(adj_d, nm_to_seq, treat_nm)
    pw_df = pd.read_csv(_config.OUT_PLACE + 'c_poswise_basic/%s.csv' % (treat_nm))
    means = get_means(pw_df)
    means.to_csv(out_dir + 'means_%s.csv' % (treat_nm))

    all_means[treat_nm] = means

  with open(out_dir + 'Y.pkl', 'wb') as f:
    pickle.dump(all_means, f)
  return
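# Example of consuming `load_Y`'s output: `Y.pkl` maps each treatment name to
# the per-condition means DataFrame written above. A minimal sketch, assuming
# `load_Y` has already run.
def inspect_Y_sketch():
  with open(out_dir + 'Y.pkl', 'rb') as f:
    all_means = pickle.load(f)
  for treat_nm, means in all_means.items():
    print(treat_nm, means.shape)
  return all_means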
def adjust_treatment_control(treat_nm):
  '''
    g4 format: data is a dict
      keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations
  '''
  # adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')
  adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))

  lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Gathering statistics on treatment mutations matching background profile by frequency of zeros...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()
    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = np.nansum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue
        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 100]

  # Form stats_df and find p for binomial, which is typically ~0.99
  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue
        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero'].append(num_zeros)
        dd['Total num target sites'].append(total)
        dd['Frequency of zero in target sites'].append(num_zeros / total)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  fz_df = pd.DataFrame(dd)

  # Estimate the background binomial p from late positions with low activity
  baseline_pos_range = pos_range[-5:]
  max_mean_activity = 0.025
  min_num_targets = 50

  crit = (fz_df['Position index'].isin(baseline_pos_range)) & \
         (fz_df['Mean activity'] <= max_mean_activity) & \
         (fz_df['Total num target sites'] >= min_num_targets)
  bg_bin_p = np.mean(fz_df[crit]['Frequency of zero in target sites'])
  if np.isnan(bg_bin_p):
    raise ValueError('No mutations passed the background baseline criteria')

  pvals = []
  timer = util.Timer(total = len(fz_df))
  for idx, row in fz_df.iterrows():
    total = row['Total num target sites']
    numzero = row['Num target sites with zero']
    pval = binom.cdf(numzero, total, bg_bin_p)
    pvals.append(pval)
    timer.update()
  fz_df['pval'] = pvals

  fz_fdr_threshold = 0.001
  fz_df = ben_hoch_fdr(fz_df, fz_fdr_threshold)
  fz_df.to_csv(out_dir + '%s_fraczero_dec.csv' % (treat_nm))

  print('Filtering treatment mutations matching background profile by frequency of zeros...')
  to_remove = fz_df[fz_df['FDR accept'] == False]
  adj_d = filter_freqzero_background_mutations(to_remove, adj_d, nm_to_seq)

  ##
  # Write
  ##
  with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
    pickle.dump(adj_d, f)

  return
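# `ben_hoch_fdr` is called above but defined elsewhere. A sketch of a standard
# Benjamini-Hochberg step-up procedure consistent with how it is used here
# (annotating the DataFrame with an 'FDR accept' column at the given
# threshold); the original implementation may differ in detail.
def ben_hoch_fdr_sketch(df, fdr_threshold):
  df = df.sort_values(by = 'pval').reset_index(drop = True)
  m = len(df)
  # Find the largest k with p_(k) <= (k / m) * alpha, then accept 1..k
  max_k = -1
  for idx, pval in enumerate(df['pval']):
    if pval <= ((idx + 1) / m) * fdr_threshold:
      max_k = idx
  df['FDR accept'] = [idx <= max_k for idx in range(m)]
  return df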
def identify_highfreq_batcheffects():
  # Gather statistics
  be_treatments = [s for s in treat_control_df['Treatment'] if 'Cas9' not in s]

  adf = dict()
  for nm in be_treatments:
    df = pd.read_csv(inp_dir + '%s_fraczero_dec.csv' % (nm), index_col = 0)
    crit = (df['Mean activity'] >= 0.01) & (df['FDR accept'] == True)
    df = df[crit]
    df = df[['Position', 'Obs nt', 'Ref nt']]
    adf[nm] = df

  batch_to_nms = defaultdict(list)
  editor_to_nms = defaultdict(list)
  for nm in be_treatments:
    batch = exp_nm_to_batch[nm]
    batch_to_nms[batch].append(nm)
    editor = exp_nm_to_editor[nm]
    editor_to_nms[editor].append(nm)
  batches = set(batch_to_nms.keys())
  editors = set(editor_to_nms.keys())

  comb_batch = get_combined_grid_profiles(adf, batches, batch_to_nms)
  comb_editor = get_combined_grid_profiles(adf, editors, editor_to_nms)

  with open(out_dir + 'comb_batch.pkl', 'wb') as f:
    pickle.dump(comb_batch, f)
  with open(out_dir + 'comb_editor.pkl', 'wb') as f:
    pickle.dump(comb_editor, f)

  '''
    Identify high frequency batch effects.
    Note: This procedure is highly sensitive to parameters

    In batch profiles, remove connected component at A6 and C6
    Identify remaining mutations with batch frequency >50%

    Highly recommended to use ag4e_highfreq_batcheffects-show_filter.ipynb
    to visualize which mutations are being filtered
  '''
  comb_batch = remove_ontarget_component(comb_batch)
  with open(out_dir + 'comb_batch_filtontarget.pkl', 'wb') as f:
    pickle.dump(comb_batch, f)

  # Load pws and remove
  '''
    g4 format: data is a dict
      keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations
  '''
  print('Removing high frequency batch effects in each condition...')
  timer = util.Timer(total = len(be_treatments))
  for treat_nm in be_treatments:
    adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))

    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    batch = exp_nm_to_batch[treat_nm]
    to_remove = comb_batch[batch]
    adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
      pickle.dump(adj_d, f)
    timer.update()
  return
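# `get_combined_grid_profiles` and `remove_ontarget_component` are defined
# elsewhere. A guess at the former's semantics, consistent with the
# "batch frequency >50%" criterion in the docstring above: for each group
# (batch or editor), record the fraction of the group's conditions in which
# each (Position, Ref nt, Obs nt) mutation appears. Not the original code.
def get_combined_grid_profiles_sketch(adf, groups, group_to_nms):
  comb = dict()
  for group in groups:
    nms = group_to_nms[group]
    pooled = pd.concat([adf[nm] for nm in nms], ignore_index = True)
    counts = pooled.groupby(['Position', 'Ref nt', 'Obs nt']).size()
    comb[group] = (counts / len(nms)).reset_index(name = 'Batch frequency')
  return comb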
def calculate_statistics(treat_nm):
  '''
    g4 format: data is a dict
      keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations
  '''
  adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')

  lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Gathering statistics...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()
    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = sum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue
        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 50]

  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue
        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero'].append(num_zeros)
        dd['Total num target sites'].append(total)
        dd['Frequency of zero in target sites'].append(num_zeros / total)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  stats_df = pd.DataFrame(dd)
  stats_df.to_csv(out_dir + '%s.csv' % (treat_nm))
  return
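# Toy illustration of the g4 format described in the docstrings: a dict from
# target site name to a (site length, 4) integer array of Q30 nucleotide
# counts, one column per nt in `nts`. The name and length here are made up.
def make_example_g4_site(site_len = 55):
  return {'example_site_0': np.zeros((site_len, 4), dtype = int)}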
def adjust_batch_effects():
  # Gather statistics
  be_treatments = [s for s in treat_control_df['Treatment'] if 'Cas9' not in s]

  print('Loading stats from each condition...')
  dfs = []
  timer = util.Timer(total = len(be_treatments))
  for treat_nm in be_treatments:
    df = pd.read_csv(inp_dir + '%s.csv' % (treat_nm), index_col = 0)
    df['Treatment'] = treat_nm
    df['Batch'] = exp_nm_to_batch[treat_nm]
    df['Editor'] = exp_nm_to_editor[treat_nm]
    dfs.append(df)
    timer.update()
  # DataFrame.append was removed in pandas 2.x; concatenate once instead
  mdf = pd.concat(dfs, ignore_index = True)

  mdf['Log mean activity'] = np.log10(mdf['Mean activity'])
  cbe_editors = set([e for e in mdf['Editor'] if 'ABE' not in e])
  abe_editors = set([e for e in mdf['Editor'] if 'ABE' in e])

  # ANOVA calculations
  from scipy.stats import f_oneway

  print('Calculating ANOVA on each position+mutation combination to identify batch effects...')
  dd = defaultdict(list)
  set_pos = set(mdf['Position'])
  timer = util.Timer(total = len(set_pos))
  for pos in set_pos:
    timer.update()
    for ref_nt in set(mdf['Ref nt']):
      for obs_nt in set(mdf['Obs nt']):
        crit = (mdf['Position'] == pos) & \
               (mdf['Ref nt'] == ref_nt) & \
               (mdf['Obs nt'] == obs_nt)
        if len(mdf[crit]) == 0:
          continue
        if pos in [22, 23]:
          continue

        # One sample of per-condition mean activities per batch
        args = tuple([
          mdf[crit & (mdf['Batch'] == batch_nm)]['Mean activity']
          for batch_nm in set(mdf[crit]['Batch'])
        ])
        fstat, pval = f_oneway(*args)
        dd['pval'].append(pval)
        dd['Statistic'].append(fstat)
        dd['Position'].append(pos)
        dd['Ref nt'].append(ref_nt)
        dd['Obs nt'].append(obs_nt)

  stats_df = pd.DataFrame(dd)
  stats_df['-log10p'] = -np.log10(stats_df['pval'])

  # Apply Bonferroni p-value cutoff
  print('Finding significant batch effects with a Bonferroni corrected p-value threshold...')
  pval = 0.005
  bonf_threshold = pval / len(stats_df)
  stats_df['bonferroni accept'] = (stats_df['pval'] <= bonf_threshold)
  stats_df.to_csv(out_dir + 'mutation_dec.csv')

  '''
    Identify mutations for removal
    At mutations passing Bonferroni corrected ANOVA test,
    identify batches where mutations are frequent
  '''
  print('Identifying batches to remove mutations from...')
  to_remove = stats_df[stats_df['bonferroni accept'] == True]
  dd = defaultdict(list)
  timer = util.Timer(total = len(to_remove))
  for idx, row in to_remove.iterrows():
    timer.update()
    pos = row['Position']
    ref_nt = row['Ref nt']
    obs_nt = row['Obs nt']

    crit = (mdf['Position'] == pos) & \
           (mdf['Ref nt'] == ref_nt) & \
           (mdf['Obs nt'] == obs_nt)
    means = {
      batch_nm: np.mean(mdf[crit & (mdf['Batch'] == batch_nm)]['Mean activity'])
      for batch_nm in set(mdf[crit]['Batch'])
    }
    mean_vals = list(means.values())
    mean_means = np.mean(mean_vals)
    median_means = np.median(mean_vals)

    crit = (mdf['Position'] == pos) & \
           (mdf['Ref nt'] == ref_nt) & \
           (mdf['Obs nt'] == obs_nt) & \
           (mdf['Editor'].isin(cbe_editors))
    cbe_means = {
      batch_nm: np.mean(mdf[crit & (mdf['Batch'] == batch_nm)]['Mean activity'])
      for batch_nm in set(mdf[crit]['Batch'])
    }
    cbe_mean_means = np.mean(list(cbe_means.values()))
    cbe_median_means = np.median(list(cbe_means.values()))

    crit = (mdf['Position'] == pos) & \
           (mdf['Ref nt'] == ref_nt) & \
           (mdf['Obs nt'] == obs_nt) & \
           (mdf['Editor'].isin(abe_editors))
    abe_means = {
      batch_nm: np.mean(mdf[crit & (mdf['Batch'] == batch_nm)]['Mean activity'])
      for batch_nm in set(mdf[crit]['Batch'])
    }
    abe_mean_means = np.mean(list(abe_means.values()))
    abe_median_means = np.median(list(abe_means.values()))

    # Ignore batch effects with small effect size
    if max(mean_vals) - min(mean_vals) < 0.002:
      continue

    # Batch effect should be enriched over a rare background when
    # controlling for editor type
    bg_threshold = 0.0005
    # if cbe_mean_means > bg_threshold or abe_mean_means > bg_threshold:
    if cbe_median_means > bg_threshold or abe_median_means > bg_threshold:
      continue

    for batch_nm in means:
      if means[batch_nm] >= 0.002:
        dd['Position'].append(pos)
        dd['Ref nt'].append(ref_nt)
        dd['Obs nt'].append(obs_nt)
        dd['Batch'].append(batch_nm)

  batch_muts_to_remove = pd.DataFrame(dd)
  batch_muts_to_remove.to_csv(out_dir + 'removed_batch_effects.csv')

  # Remove mutations
  print('Removing batch effects in each condition...')
  timer = util.Timer(total = len(be_treatments))
  for treat_nm in be_treatments:
    adj_d = _data.load_data(treat_nm, 'ag4_poswise_be_adjust')

    lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    batch = exp_nm_to_batch[treat_nm]
    to_remove = batch_muts_to_remove[batch_muts_to_remove['Batch'] == batch]
    adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
      pickle.dump(adj_d, f)
    timer.update()
  return
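# Toy illustration of the ANOVA used in `adjust_batch_effects`: f_oneway takes
# one sample of per-condition mean activities per batch and returns an F
# statistic and a p-value. The numbers below are made up; a small p-value
# flags a candidate batch effect.
def _anova_demo():
  from scipy.stats import f_oneway
  fstat, pval = f_oneway([0.001, 0.002, 0.001], [0.010, 0.012, 0.011])
  print(fstat, pval)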
def adjust_treatment_control(treat_nm, control_nm):
  '''
    g4 format: data is a dict
      keys = target site names
      values = np.array with shape = (target site len, 4)
        entries = int for num. Q30 observations
  '''
  treat_data = _data.load_data(treat_nm, 'g4_poswise_be')
  control_data = _data.load_data(control_nm, 'g4_poswise_be')

  treat_minq = _data.load_minq(treat_nm, 'g4_poswise_be')
  control_minq = _data.load_minq(control_nm, 'g4_poswise_be')

  lib_design, seq_col = _data.get_g4_lib_design(treat_nm)

  adj_d = dict()
  stats_dd = defaultdict(list)
  hc_decisions = defaultdict(list)
  nm_to_seq = dict()

  '''
    Filter positions with abnormally high control mut freq.
  '''
  print('Filtering positions with high frequency control mutations...')
  timer = util.Timer(total = len(lib_design))
  for idx, row in lib_design.iterrows():
    nm = row['Name (unique)']
    seq = row[seq_col]
    nm_to_seq[nm] = seq
    timer.update()

    stats_dd['Name'].append(nm)

    if nm not in treat_data:
      stats_dd['Status'].append('No treatment')
      continue

    t = treat_data[nm]
    if nm not in control_data:
      stats_dd['Status'].append('No control')
      adj_d[nm] = t
      continue

    stats_dd['Status'].append('Adjusted')
    c = control_data[nm]

    # Adjust
    t = filter_high_control_muts(t, c, seq, treat_nm, nm, hc_decisions)
    adj_d[nm] = t

  stats_df = pd.DataFrame(stats_dd)
  stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))

  hc_df = pd.DataFrame(hc_decisions)
  hc_df = hc_df.sort_values(by = 'c_fq', ascending = False)
  hc_df = hc_df.reset_index(drop = True)
  hc_df.to_csv(out_dir + '%s_hc_dec.csv' % (treat_nm))

  '''
    Filter treatment mutations that can be explained by control freq.
    In practice, this step is most effective for control mutations with
    relatively high frequency => relatively high variance
  '''
  print('Gathering statistics on treatment mutations explained by control mutations...')
  bc_decisions = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    c = control_data[nm]
    seq = nm_to_seq[nm]
    gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, bc_decisions)
    timer.update()

  '''
    Using global statistics, filter mutations
    while controlling false discovery rate
  '''
  bc_fdr_threshold = 0.05
  bc_df = pd.DataFrame(bc_decisions)
  other_distribution = bc_df[bc_df['pval'] > 0.995].copy()
  bc_df = bc_df[bc_df['pval'] <= 0.995]
  bc_df = bc_df.sort_values(by = 'pval')
  bc_df = bc_df.reset_index(drop = True)

  fdr_decs, hit_reject = [], False
  for idx, pval in enumerate(bc_df['pval']):
    if hit_reject:
      dec = False
    else:
      fdr_critical = ((idx + 1) / len(bc_df)) * bc_fdr_threshold
      dec = bool(pval <= fdr_critical)
      # Step-down: once the criterion first fails, all later (larger)
      # p-values are rejected
      if dec is False:
        hit_reject = True
    fdr_decs.append(dec)
  bc_df['FDR accept'] = fdr_decs

  other_distribution['FDR accept'] = False
  bc_df = pd.concat([bc_df, other_distribution], ignore_index = True)
  bc_df.to_csv(out_dir + '%s_bc_dec.csv' % (treat_nm))

  print('Filtering treatment mutations explained by control mutations...')
  to_remove = bc_df[bc_df['FDR accept'] == False]
  adj_d = filter_binom_control_muts(to_remove, adj_d, control_data, nm_to_seq)

  print('Subtracting control from treatment data...')
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    c = control_data[nm]
    seq = nm_to_seq[nm]
    t = subtract_treatment_control(t, c, seq)
    adj_d[nm] = t
    timer.update()

  '''
    Filter treatment mutations that are best explained by
    spontaneous random mutations.
    Tend to be very low frequency with no counterpart in control
  '''
  print('Gathering statistics on treatment mutations explained by Illumina sequencing errors...')
  ie_decisions = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    c = control_data[nm]
    seq = nm_to_seq[nm]
    c_minq = control_minq[nm]
    t_minq = treat_minq[nm]
    gather_stats_illumina_errors(t, c, t_minq, c_minq, seq, treat_nm, nm, ie_decisions)
    timer.update()

  ie_fdr_threshold = 0.05
  ie_df = pd.DataFrame(ie_decisions)
  other_distribution = ie_df[ie_df['pval'] > 0.995].copy()
  ie_df = ie_df[ie_df['pval'] <= 0.995]
  ie_df = ie_df.sort_values(by = 'pval')
  ie_df = ie_df.reset_index(drop = True)

  fdr_decs, hit_reject = [], False
  for idx, pval in enumerate(ie_df['pval']):
    if hit_reject:
      dec = False
    else:
      fdr_critical = ((idx + 1) / len(ie_df)) * ie_fdr_threshold
      dec = bool(pval <= fdr_critical)
      if dec is False:
        hit_reject = True
    fdr_decs.append(dec)
  ie_df['FDR accept'] = fdr_decs

  other_distribution['FDR accept'] = False
  ie_df = pd.concat([ie_df, other_distribution], ignore_index = True)
  ie_df.to_csv(out_dir + '%s_ie_dec.csv' % (treat_nm))

  mut_summary = ie_df[ie_df['FDR accept'] == True].copy()
  mut_summary['Frequency'] = mut_summary['t_fq']
  mut_summary['Count'] = mut_summary['t_ct']
  mut_summary['Total count'] = mut_summary['t_tot']
  mut_summary = mut_summary.drop(columns = ['t_bin_p', 'c_bin_p', 'pval', 'idx', 't_fq', 't_ct', 't_tot', 'FDR accept'])
  mut_summary.to_csv(out_dir + '%s_summary.csv' % (treat_nm))

  print('Filtering treatment mutations explained by Illumina sequencing errors...')
  to_remove = ie_df[ie_df['FDR accept'] == False]
  adj_d = filter_illumina_error_muts(to_remove, adj_d, control_data, nm_to_seq)

  ##
  # Write
  ##
  with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
    pickle.dump(adj_d, f)

  return
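# The step-down FDR loop above appears twice, once for the control-mutation
# test and once for the Illumina-error test. A sketch of the shared logic
# factored into a helper; adopting it would be a refactor of the procedure,
# not a change to it. Assumes a 'pval' column already sorted ascending.
def stepdown_fdr_sketch(df, fdr_threshold):
  fdr_decs, hit_reject = [], False
  for idx, pval in enumerate(df['pval']):
    if hit_reject:
      dec = False
    else:
      fdr_critical = ((idx + 1) / len(df)) * fdr_threshold
      dec = bool(pval <= fdr_critical)
      if dec is False:
        hit_reject = True
    fdr_decs.append(dec)
  df['FDR accept'] = fdr_decs
  return df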