def main():
    print(NAME)

    for idx, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm != '12kChar': continue
        if 'U2OS' in treat_nm: continue

        num_targets = 12000
        num_targets_per_split = 2000

        print(treat_nm)
        mdf = pd.DataFrame()
        data = None
        stats_df = pd.DataFrame()
        for start_idx in range(0, num_targets, num_targets_per_split):
            stats_fn = inp_dir + '%s_%s_%s_stats.csv' % (
                treat_nm, start_idx, start_idx + num_targets_per_split - 1)
            df = pd.read_csv(stats_fn, index_col=0)
            stats_df = pd.concat([stats_df, df], ignore_index=True)

        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    print('Done')
    return
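A note on the pattern above: growing stats_df by repeated row-wise concatenation is quadratic in the number of splits; collecting the per-split frames in a list and concatenating once is the usual idiom. A minimal sketch using the same file naming as the loop above:

import pandas as pd

def concat_split_stats(inp_dir, treat_nm, num_targets, num_targets_per_split):
    # Read each per-split stats CSV and concatenate them in a single pass.
    frames = []
    for start_idx in range(0, num_targets, num_targets_per_split):
        end_idx = start_idx + num_targets_per_split - 1
        frames.append(pd.read_csv(f'{inp_dir}{treat_nm}_{start_idx}_{end_idx}_stats.csv', index_col=0))
    return pd.concat(frames, ignore_index=True)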
Example #2
def remove_batch_effects(treat_nm, start_idx, end_idx):
    batch_nm = exp_nm_to_batch[treat_nm]

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    lib_nm = _data.get_lib_nm(treat_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    adj_d = _data.load_data(treat_nm, 'ah6a1b_subtract')

    batch_muts_to_remove = pd.read_csv(
        inp_dir + 'removed_batch_effects_%s.csv' % (lib_nm), index_col=0)

    if len(batch_muts_to_remove) == 0:
        inp_pkl = _config.OUT_PLACE + f'ah6a1b_subtract/{treat_nm}_{start_idx}_{end_idx}.pkl'
        out_pkl = out_dir + f'{treat_nm}_{start_idx}_{end_idx}.pkl'
        command = f'cp {inp_pkl} {out_pkl}'
        subprocess.check_output(command, shell=True)
        return

    # Remove mutations
    to_remove = batch_muts_to_remove[batch_muts_to_remove['Batch'] == batch_nm]
    to_remove = to_remove[to_remove['Name'].isin(nms)]

    adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

    with open(out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(adj_d, f)

    return
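A usage sketch for the function above, assuming this script follows the same command-line convention as the qsub generators later in this listing (python <script>.py <treatment> <start_idx> <end_idx>):

import sys

if __name__ == '__main__':
    # Hypothetical driver; the argument order mirrors the commands built in gen_qsubs().
    treat_nm = sys.argv[1]
    start_idx, end_idx = int(sys.argv[2]), int(sys.argv[3])
    remove_batch_effects(treat_nm, start_idx, end_idx)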
Example #3
def main():
  print(NAME)

  for idx, row in treat_control_df.iterrows():
    treat_nm = row['Treatment']
    if 'Cas9' in treat_nm:
      continue

    lib_nm = _data.get_lib_nm(treat_nm)
    if lib_nm == 'LibA':
      num_target_sites = 2000
      num_sites_per_split = 200
    else:
      num_target_sites = 12000
      num_sites_per_split = 2000

    print(treat_nm)
    mdf = pd.DataFrame()
    data = None
    for start_idx in range(0, num_target_sites, num_sites_per_split):
      data_fn = inp_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, start_idx + num_sites_per_split - 1)
      with open(data_fn, 'rb') as f:
        temp_d = pickle.load(f)
      if data is None:
        data = temp_d
      else:
        for key in temp_d:
          data[key] = temp_d[key]

    # Data
    with open(inp_dir + '%s.pkl' % (treat_nm), 'wb') as f:
      pickle.dump(data, f)

  print('Done')
  return
def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params
  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]

  data = data[data['Position'].isin(allowed_pos)]

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # # Gather statistics

  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]]
    else:
      d_temp = data[data['Mutation'].isin(mut)].copy()
      d_temp['Mutation'] = mut_nm
      d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
      group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
      d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)



  return
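A usage sketch for the params tuple unpacked above. The exact groupings are illustrative, but the structure follows from how the function filters and aggregates: muts maps a mutation name to a list of 'Ref_Obs' strings (grouped mutations get their frequencies summed), allowed_pos restricts protospacer positions, and feature_radius sets the local-context window. The condition name is a placeholder.

muts = {
    'C_T': ['C_T'],          # single mutation: rows are used as-is
    'C_GA': ['C_G', 'C_A'],  # grouped mutations: frequencies summed per site and position
}
params = (muts, range(3, 8 + 1), 10)
gather_statistics('example_condition', params)  # placeholder condition name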
Example #5
def load_human_data(dataset_id):
  if 'CSNVL' not in dataset_id:
    lib_nm = _data.get_lib_nm(dataset_id)
    lib_design, seq_col = _data.get_lib_design(dataset_id)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
  else:
    # Use any conds to load 12kChar, CtoT, and AtoG libs
    dids = ['190418_mES_12kChar_AID', '190329_HEK293T_AtoG_ABE', '190307_HEK_CtoT_BE4']
    nms, seqs = [], []
    for did in dids:
      lib_design, seq_col = _data.get_lib_design(did)
      nms += list(lib_design['Name (unique)'])
      seqs += list(lib_design[seq_col])

  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}


  Y_dir = _config.OUT_PLACE + 'combin_data_Y_imputewt/'
  with gzip.open(Y_dir + '%s.pkl.gz' % (dataset_id), 'rb') as f:
    Y = pickle.load(f)
  
  NAMES = list(Y.keys())
  Y = list(Y.values())

  # Load X
  if 'CSNVL' not in dataset_id:
    zero_idxs = [_data.pos_to_idx(0, dataset_id)] * len(NAMES)
  else:
    zero_idxs = []
    for nm in NAMES:
      if 'satmut' in nm:
        # 21
        zero_idxs.append(_data.zero_pos['12kChar'])
      else:
        # CtoT = AtoG = 10
        zero_idxs.append(_data.zero_pos['CtoT'])

  X = []
  timer = _util.Timer(total = len(NAMES))
  for nm, y, zero_idx in zip(NAMES, Y, zero_idxs):
    seq = nm_to_seq[nm]
    # seq_30nt = seq[zero_idx - 9 : zero_idx + 20 + 1]
    if zero_idx >= 9 + 10:
      # 12kChar
      pass
    else:
      # CtoT, AtoG libs
      prefix = 'GATGGGTGCGACGCGTCAT'
      seq = prefix + seq
      zero_idx += len(prefix)

    seq_50nt = seq[zero_idx - 9 - 10 : zero_idx + 20 + 10 + 1]
    assert len(seq_50nt) == 50
    X.append(seq_50nt)

  return X, Y, NAMES
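A usage sketch for the loader above. The slice spans 19 nt of upstream context, protospacer position 0, and 30 nt downstream (50 nt total), which is why the 19-nt prefix is prepended for the shorter CtoT/AtoG designs; the dataset id is taken from the list inside the function.

X, Y, NAMES = load_human_data('190418_mES_12kChar_AID')
assert all(len(x) == 50 for x in X)
print(len(NAMES), 'target sites loaded')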
Example #6
def generate_train_test(X, Y, NAMES, dataset_id, train_test_id, valid_frac = 0.10):
  if 'CSNVL' not in dataset_id:
    lib_nm = _data.get_lib_nm(dataset_id)
  else:
    # Use traintest for 12kChar
    lib_nm = '12kChar'

  tt_df = pd.read_csv(_config.OUT_PLACE + 'gen_traintest_idxs/%s_%s.csv' % (lib_nm, train_test_id), index_col = 0)
  nms_train = set(tt_df[tt_df['Category'] == 'Train']['Name'])
  nms_test = set(tt_df[tt_df['Category'] == 'Test']['Name'])

  train_idxs = [NAMES.index(nm) for nm in nms_train if nm in NAMES]
  test_idxs = [NAMES.index(nm) for nm in nms_test if nm in NAMES]

  # Validation set is last % of training set
  num_valid = int(len(train_idxs) * valid_frac)
  # Guard against num_valid == 0, where [-0:] would return the whole list and [:-0] nothing
  valid_idxs = train_idxs[len(train_idxs) - num_valid:] if num_valid > 0 else []
  train_idxs = train_idxs[:len(train_idxs) - num_valid]

  # Optional: subset training set 
  train_idxs = train_idxs[:int(hyperparameters['training_fraction'] * len(train_idxs))]

  print(f'Training set size: {len(train_idxs)}')
  print(f'Validation set size: {len(valid_idxs)}')
  print(f'Test set size: {len(test_idxs)}')
  print(f'Total size: {len(train_idxs) + len(valid_idxs) + len(test_idxs)}')

  X_train = [X[idx] for idx in train_idxs]
  X_valid = [X[idx] for idx in valid_idxs]
  X_test = [X[idx] for idx in test_idxs]

  Y_train = [Y[idx] for idx in train_idxs]
  Y_valid = [Y[idx] for idx in valid_idxs]
  Y_test = [Y[idx] for idx in test_idxs]

  NAMES_train = [NAMES[idx] for idx in train_idxs]
  NAMES_valid = [NAMES[idx] for idx in valid_idxs]
  NAMES_test = [NAMES[idx] for idx in test_idxs]


  datasets = {
    'train': BaseEditing_Dataset(x = X_train, y = Y_train, nms = NAMES_train),
    'valid': BaseEditing_Dataset(x = X_valid, y = Y_valid, nms = NAMES_valid),
    'test': BaseEditing_Dataset(x = X_test, y = Y_test, nms = NAMES_test),
  }
  x_dim = datasets['train'].x_dim
  y_mask_dim = datasets['train'].y_mask_dim

  dataset_sizes = {
    'train': len(X_train),
    'valid': len(X_valid),
    'test': len(X_test),
  }

  return datasets, dataset_sizes, x_dim, y_mask_dim
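A usage sketch chaining the two loaders above; train_test_id '0' is a hypothetical value for a CSV produced by gen_traintest_idxs, and hyperparameters['training_fraction'] is assumed to be a module-level config, as used in the function.

hyperparameters = {'training_fraction': 1.0}  # assumed module-level config
X, Y, NAMES = load_human_data('190418_mES_12kChar_AID')
datasets, dataset_sizes, x_dim, y_mask_dim = generate_train_test(
    X, Y, NAMES, '190418_mES_12kChar_AID', train_test_id='0')
print(dataset_sizes, x_dim, y_mask_dim)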
Example #7
def main():
    print(NAME)

    for idx, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm == 'LibA':
            num_targets = 2000
            num_targets_per_split = 200
        elif lib_nm == 'CtoGA':
            num_targets = 4000
            num_targets_per_split = 500
        else:
            num_targets = 12000
            num_targets_per_split = 2000

        print(treat_nm)
        mdf = pd.DataFrame()
        data = None
        stats_df = pd.DataFrame()
        for start_idx in range(0, num_targets, num_targets_per_split):
            data_fn = inp_dir + '%s_%s_%s.pkl' % (
                treat_nm, start_idx, start_idx + num_targets_per_split - 1)
            with open(data_fn, 'rb') as f:
                temp_d = pickle.load(f)
            if data is None:
                data = temp_d
            else:
                for key in temp_d:
                    data[key] = temp_d[key]

            stats_fn = inp_dir + '%s_%s_%s_stats.csv' % (
                treat_nm, start_idx, start_idx + num_targets_per_split - 1)
            df = pd.read_csv(stats_fn, index_col=0)
            stats_df = pd.concat([stats_df, df], ignore_index=True)

        # Data
        with open(inp_dir + '%s.pkl' % (treat_nm), 'wb') as f:
            pickle.dump(data, f)

        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    print('Done')
    return
Example #8
def gather_statistics(exp_nm):

    # Load data
    data = pd.read_csv(inp_dir +
                       '_batch_adjusted_all_ratios-ps0_1bpcorrect.csv',
                       index_col=0)

    data = data[data['Condition'] == exp_nm]

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name (unique)'].isin(ontarget_sites)]

    # Annotate with local sequence context
    # lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name (unique)']]
        lib_zero_idx = _data.pos_to_idx_safe(0, exp_nm, row['Name (unique)'])
        # local_context = row['gRNA (20nt)']
        local_context = seq[lib_zero_idx - 9:lib_zero_idx + 20 + 1]
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    print(data.shape)
    results = train_models(exp_nm, data,
                           'Log10 batch-adjusted base edit to indel ratio')
    save_results(exp_nm, results)

    return
def main():
    print(NAME)

    for idx, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue

        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm not in ['CtoT', 'AtoG', 'CtoGA']:
            continue

        if lib_nm != 'CtoGA':
            continue

        if lib_nm in ['CtoT', 'AtoG']:
            end_idx = 12000
            jump = 2000
        elif lib_nm == 'CtoGA':
            end_idx = 4000
            jump = 500

        print(treat_nm)
        mdf = pd.DataFrame()
        data = None
        stats_df = pd.DataFrame()
        for start_idx in range(0, end_idx, jump):

            stats_fn = inp_dir + '%s_%s_%s_stats.csv' % (treat_nm, start_idx,
                                                         start_idx + jump - 1)
            df = pd.read_csv(stats_fn, index_col=0)
            stats_df = pd.concat([stats_df, df], ignore_index=True)

        # Data
        stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    print('Done')
    return
def main():
  print(NAME)

  for idx, row in treat_control_df.iterrows():
    treat_nm = row['Treatment']
    if 'Cas9' in treat_nm:
      continue

    lib_nm = _data.get_lib_nm(treat_nm)
    num_targets = 12000
    num_targets_per_split = 2000

    print(treat_nm)
    mdf = pd.DataFrame()
    data = None
    stats_df = pd.DataFrame()
    for start_idx in range(0, num_targets, num_targets_per_split):

      stats_fn = inp_dir + '%s_%s_%s.csv' % (treat_nm, start_idx, start_idx + num_targets_per_split - 1)
      df = pd.read_csv(stats_fn, index_col = 0)
      stats_df = pd.concat([stats_df, df], ignore_index = True)

    # Data
    stats_df.to_csv(inp_dir + '%s.csv' % (treat_nm))

    # Pivot C->T
    crit = (stats_df['Ref nt'] == 'C') & (stats_df['Obs nt'] == 'T')
    stats_df = stats_df[crit]

    pv_df = stats_df.pivot(
      index = 'Target site',
      columns = 'Position',
      values = 'Frequency',
    )
    pv_df.to_csv(inp_dir + 'poswise_editing_%s.csv' % (treat_nm))

  print('Done')
  return
def gather_statistics(celltype, lib_nm, editor_nm):
    print(celltype, lib_nm, editor_nm)
    [rep1, rep2] = _data.get_replicates(celltype, lib_nm, editor_nm)

    df1 = pd.read_csv(inp_dir + '%s.csv' % (rep1), index_col=0)
    df2 = pd.read_csv(inp_dir + '%s.csv' % (rep2), index_col=0)

    lib_nm = _data.get_lib_nm(rep1)
    lib_design, seq_col = _data.get_lib_design(rep1)
    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)

    # Prepare data
    # data = data[data['Total count'] >= 100]
    df1 = df1[df1['Name (unique)'].isin(ontarget_sites)]
    df2 = df2[df2['Name (unique)'].isin(ontarget_sites)]

    id_cols = [
        'Name (unique)',
        'gRNA (20nt)',
        seq_col,
    ]
    mdf = df1.merge(df2, on=id_cols, suffixes=['_r1', '_r2'])

    stat_col = 'Fraction edited'
    mdf['absdiff'] = np.abs(mdf['%s_r1' % (stat_col)] - mdf['%s_r2' %
                                                            (stat_col)])

    mdf['abslfc'] = np.abs(
        np.log2(mdf['%s_r1' % (stat_col)]) - np.log2(mdf['%s_r2' %
                                                         (stat_col)]))

    n_col = 'Total count'
    mdf['Total n'] = mdf['%s_r1' % (n_col)] + mdf['%s_r2' % (n_col)]

    mdf.to_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm))
    return
Example #12
def train_models(exp_nm, data, ml_task, seq_col):
  # Prepare models and data

  evals = {}  # only regression metrics are defined in this script
  if ml_task == 'regress_nonzero':
    evals = {
      'spearmanr': lambda t, p, w: spearmanr(t, p)[0],
      'pearsonr': lambda t, p, w: pearsonr(t, p)[0],
      'pearsonr weighted': lambda t, p, w: weighted_pearsonr(t, p, w),
      'r2_score weighted': lambda t, p, w: sklearn.metrics.r2_score(t, p, sample_weight = w),
      'r2_score unweighted': lambda t, p, w: sklearn.metrics.r2_score(t, p),
    }

  data = data[~np.isnan(data['Y'])]
  data = data.reset_index(drop = True)

  # Prepare additional features
  package = featurize(data, exp_nm, seq_col)
  (X_all, param_nms) = package
  # Debugging breakpoint (disabled): import code; code.interact(local=dict(globals(), **locals()))

  # Train test split
  lib_nm = _data.get_lib_nm(exp_nm)
  package = get_traintest_package(X_all, data, lib_nm)
  (x_train, x_test, y_train, y_test, w_train, w_test, nms_train, nms_test) = package

  # Train models
  ms_dd = defaultdict(list)
  ms_dd['Name'].append(exp_nm)

  model_nm = 'GBTR'

  # Hyperparameter optimization
  '''
    Approx 20 seconds per fit.
    5 * 3 * 6 * 5 * 20 seconds = 2.5 hours
  '''
  from sklearn.model_selection import GridSearchCV
  hyperparameters = {
    'n_estimators': [100, 250, 500],
    'min_samples_leaf': [2, 5],
    'max_depth': [2, 3, 4, 5],
  }
  # hyperparameters = {
  #   'n_estimators': [100, 200],
  #   'min_samples_leaf': [1],
  #   'max_depth': [3, 4],
  # }

  model = GridSearchCV(
    GradientBoostingRegressor(),
    hyperparameters,
    cv = 5,
    verbose = True,
  )

  model.fit(x_train, y_train, sample_weight = w_train)

  gscv_df = pd.DataFrame(model.cv_results_)
  gscv_df.to_csv(out_dir + '%s_hyperparamresults.csv' % (exp_nm))

  with open(out_dir + '%s_bestmodel.pkl' % (exp_nm), 'wb') as f:
    pickle.dump(model.best_estimator_, f)

  pred_train = model.predict(x_train)
  pred_test = model.predict(x_test)

  # Store model performance stats in modelstats_dd
  for ml_eval_nm in evals:
    eval_f = evals[ml_eval_nm]

    try:
      ev = eval_f(y_train, pred_train, w_train)
    except ValueError:
      ev = np.nan
    ms_dd['%s %s train' % (model_nm, ml_eval_nm)].append(ev)

    try:
      ev = eval_f(y_test, pred_test, w_test)
    except ValueError:
      ev = np.nan
    ms_dd['%s %s test' % (model_nm, ml_eval_nm)].append(ev)

  # Record predictions in data
  pred_df = pd.DataFrame({
    'Name (unique)': nms_train + nms_test,
    'y_pred_%s' % (model_nm): list(pred_train) + list(pred_test),
    'TrainTest_%s' % (model_nm): ['train'] * len(nms_train) + ['test'] * len(nms_test)
  })
  data = data.merge(pred_df, on = 'Name (unique)')

  ms_df = pd.DataFrame(ms_dd)
  ms_df = ms_df.reindex(sorted(ms_df.columns), axis = 1)
  return (ms_df, data)
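weighted_pearsonr is referenced in the evals dict above but defined elsewhere in the repository; a minimal sketch of a standard weighted Pearson correlation, offered as an assumption about its behavior:

import numpy as np

def weighted_pearsonr(t, p, w):
    # Weighted Pearson correlation between targets t and predictions p with sample weights w.
    t, p, w = np.asarray(t, float), np.asarray(p, float), np.asarray(w, float)
    w = w / np.sum(w)
    mt, mp = np.sum(w * t), np.sum(w * p)
    cov = np.sum(w * (t - mt) * (p - mp))
    return cov / np.sqrt(np.sum(w * (t - mt) ** 2) * np.sum(w * (p - mp) ** 2))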
Example #13
def fig_editing_profiles(treat_nm):
  ''' 
    g4 format: data is a dict, keys = target site names
    values = np.array with shape = (target site len, 4)
      entries = int for num. Q30 observations
  '''

  adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))


  lib_design, seq_col = _data.get_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  lib_nm = _data.get_lib_nm(treat_nm)
  ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Forming long df...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()

    if nm not in ontarget_nms:
      continue

    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = np.nansum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue

        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 100]
  n_targetsites_in_condition = len(df)

  # Form stats_df
  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue

        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero for mutation'].append(num_zeros)
        dd['Total num target sites for mutation'].append(total)
        dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
        dd['Num target sites in condition'].append(n_targetsites_in_condition)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  hm_df = pd.DataFrame(dd)
  hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

  # Median normalize
  background_range = range(25, 34 + 1)

  for ref_nt in nts:
    for obs_nt in nts:
      if obs_nt == ref_nt:
        continue

      crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt) & (~np.isnan(hm_df['Mean activity']))
      medi = np.nanmedian(hm_df[crit & (hm_df['Position'].isin(background_range))]['Mean activity'])
      hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(lambda x: max(0, x - medi))

  hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))

  return
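The median-normalization step above, illustrated on a toy frame for a single (Ref nt, Obs nt) pair: subtract the median activity in the background position window and clip at zero.

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'Position': [2, 5, 28, 30, 33],
    'Mean activity': [0.20, 0.15, 0.01, 0.02, 0.03],
})
background_range = range(25, 34 + 1)
medi = np.nanmedian(toy[toy['Position'].isin(background_range)]['Mean activity'])
toy['Mean activity'] = (toy['Mean activity'] - medi).clip(lower=0)
print(toy)  # background positions drop to ~0; positions 2 and 5 keep activity above background
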
def gather_statistics(exp_nm):
    feature_radius = 10
    allowed_pos = range(3, 8 + 1)
    # Load data
    data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    # Prepare data
    data = data[data['Total count'] >= 100]
    data['Frequency'] = data['Count'] / data['Total count']

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name'].isin(ontarget_sites)]

    data = data[data['Position'].isin(allowed_pos)]

    data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
    # data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

    # Annotate with local sequence context
    lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name']]
        pidx = row['Position'] + lib_zero_idx
        local_context = seq[pidx -
                            feature_radius:pidx] + seq[pidx + 1:pidx +
                                                       feature_radius + 1]
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    # # Gather statistics

    # for mut_nm in muts:
    #   print(mut_nm)
    #   mut = muts[mut_nm]
    #   if len(mut) == 1:
    #     d_temp = data[data['Mutation'] == mut[0]]
    #   else:
    #     d_temp = data[data['Mutation'].isin(mut)]
    #     d_temp['Mutation'] = mut_nm
    #     d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
    #     group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
    #     d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    print(data.columns)
    print(set(data['Mutation']))

    acc_muts = [
        'C_T',
        'C_G',
        'C_A',
    ]
    data = data[data['Mutation'].isin(acc_muts)]
    data = data.drop(columns=['Count', 'Total count', 'Ref nt', 'Obs nt'])
    data = data.pivot_table(
        index=['Name', 'Position', 'Local context'],
        columns='Mutation',
        values='Frequency',
    ).reset_index()
    data = data.fillna(value=0)

    numerator = data['C_G'] + data['C_A']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_GA_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_T']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_T_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_G']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_G_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_A']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_A_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_G']
    denominator = data['C_A'] + data['C_G']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_G_over_C_GA'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    return
Example #15
def gen_qsubs():
    # Generate qsub shell scripts and commands for easy parallelization
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Generate qsubs only for unfinished jobs
    num_scripts = 0
    for idx, row in treat_control_df.iterrows():
        exp_nm = row['Treatment']
        lib_nm = _data.get_lib_nm(exp_nm)

        if 'Cas9' in exp_nm:
            continue

        if lib_nm == 'LibA':
            num_target_sites = 2000
            num_sites_per_split = 200
        else:
            num_target_sites = 12000
            num_sites_per_split = 2000

        try:
            mb_file_size = os.path.getsize(inp_dir + '%s.pkl' % (exp_nm)) / 1e6
        except FileNotFoundError:
            mb_file_size = 0
        ram_gb = 2
        if mb_file_size > 200:
            ram_gb = 4
        if mb_file_size > 400:
            ram_gb = 8
        if mb_file_size > 1000:
            ram_gb = 16

        for start_idx in range(0, num_target_sites, num_sites_per_split):
            end_idx = start_idx + num_sites_per_split - 1

            # out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (exp_nm, start_idx, end_idx)
            # if os.path.exists(out_pkl_fn):
            #   if os.path.getsize(out_pkl_fn) > 0:
            #     continue

            command = 'python %s.py %s %s %s' % (NAME, exp_nm, start_idx,
                                                 end_idx)
            script_id = NAME.split('_')[0]

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, exp_nm,
                                                   start_idx)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append(
                'qsub -V -l h_rt=16:00:00,h_vmem=%sG -wd %s %s &' %
                (ram_gb, _config.SRC_DIR, sh_fn))

    # Save commands
    commands_fn = qsubs_dir + '_commands.sh'
    with open(commands_fn, 'w') as f:
        f.write('\n'.join(qsub_commands))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
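For reference, the artifacts written by the generator above look like the following (names are illustrative):

# q_<script_id>_<exp_nm>_<start_idx>.sh:
#   #!/bin/bash
#   python <NAME>.py <exp_nm> <start_idx> <end_idx>
#
# _commands.sh (one line per job):
#   qsub -V -l h_rt=16:00:00,h_vmem=<ram_gb>G -wd <SRC_DIR> q_<script_id>_<exp_nm>_<start_idx>.sh &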
Example #16
def form_data(exp_nm, start_idx, end_idx):
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    lib_nm = _data.get_lib_nm(exp_nm)
    disease_nms = _data.get_disease_sites(lib_design, lib_nm)

    # Subset for dumb parallelization, ensure only disease target sites used
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    lib_design = lib_design[lib_design['Name (unique)'].isin(disease_nms)]

    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    stats_dd = defaultdict(list)

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for iter, nm in enumerate(nms_shared):

        df = data[nm]
        seq = nm_to_seq[nm]

        design_row = lib_design[lib_design['Name (unique)'] == nm].iloc[0]
        snp_pos = int(design_row['Position of SNP in gRNA'])
        correct_nt = design_row['Corrected nucleotide (gRNA orientation)']
        path_nt = design_row['Pathogenic nucleotide (gRNA orientation)']

        nt_cols = [
            col for col in df.columns if col != 'Count' and col != 'Frequency'
        ]

        # Impute . as wildtype
        df = impute_dot_as_wildtype(df, nt_cols)
        total_ct = sum(df['Count'])

        # Ensure each row is unique
        df = df.groupby(nt_cols)['Count'].agg('sum').reset_index()

        # Filter unedited columns
        df = subset_edited_rows(df, nt_cols)
        edited_ct = sum(df['Count'])

        df = remove_noisy_edits(df, nt_cols, exp_nm)

        gt_correct_ct = get_precise_gt_correction_count(
            df, nt_cols, snp_pos, correct_nt, path_nt)

        ## Overall statistics
        stats_dd['Name (unique)'].append(nm)

        stats_dd['Obs. correction count'].append(gt_correct_ct)
        stats_dd['Obs. total count'].append(total_ct)
        stats_dd['Obs. edited count'].append(edited_ct)

        stats_dd['Obs. gt correct fraction in all reads'].append(
            gt_correct_ct / total_ct if total_ct > 0 else np.nan)
        stats_dd['Obs. gt correct precision in edited reads'].append(
            gt_correct_ct / edited_ct if edited_ct > 0 else np.nan)
        stats_dd['Obs. editing frequency'].append(
            edited_ct / total_ct if total_ct > 0 else np.nan)

        # Amino acid correction for CtoGA
        if 'AA sequence - reference' in design_row.index and type(
                design_row['AA sequence - reference']) == str:

            orients = list('-+')
            d1 = bool(design_row['Designed orientation w.r.t. genome'] == '+')
            d2 = bool(design_row['AA frame strand'] == '+')
            xor_int = int(d1 == d2)
            aa_strand_relative_to_seq = orients[xor_int]

            aa_stats = {
                'Unedited AA': 0,
                'Edited AA': 0,
                'Goal AA': 0,
            }
            if design_row['AA sequence - pathogenic'] != design_row[
                    'AA sequence - reference']:
                for jdx, edit_row in df.iterrows():
                    seq_30nt = edit_row_to_seq_30nt(design_row, edit_row,
                                                    seq_col)
                    obs_aas = nts_to_aas(seq_30nt,
                                         design_row['AA frame position'],
                                         snp_pos, aa_strand_relative_to_seq)

                    pp0idx = design_row['Protospacer position zero index']
                    seq_30nt_path = design_row[seq_col][pp0idx - 9:pp0idx + 21]
                    aa_path_with_bc = nts_to_aas(
                        seq_30nt_path, design_row['AA frame position'],
                        snp_pos, aa_strand_relative_to_seq)

                    seq_30nt_wt = seq_30nt_path[:9 + snp_pos] + design_row[
                        'Corrected nucleotide (gRNA orientation)'] + seq_30nt_path[
                            9 + snp_pos + 1:]
                    aa_wt_with_bc = nts_to_aas(seq_30nt_wt,
                                               design_row['AA frame position'],
                                               snp_pos,
                                               aa_strand_relative_to_seq)

                    if obs_aas == aa_path_with_bc:
                        aa_stats['Unedited AA'] += edit_row['Count']
                    else:
                        aa_stats['Edited AA'] += edit_row['Count']

                    if obs_aas == aa_wt_with_bc:
                        aa_stats['Goal AA'] += edit_row['Count']

            stats_dd['Obs. aa correct precision among edited gts'].append(
                aa_stats['Goal AA'] / edited_ct if edited_ct > 0 else np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                aa_stats['Goal AA'] /
                aa_stats['Edited AA'] if aa_stats['Edited AA'] > 0 else np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                aa_stats['Goal AA'] / total_ct if total_ct > 0 else np.nan)
            # Debugging check on the per-site values just appended; drop into an interactive
            # console if aa-level precision is below gt-level precision for this site.
            if stats_dd['Obs. aa correct precision among edited gts'][-1] < stats_dd[
                    'Obs. gt correct precision in edited reads'][-1]:
                import code
                code.interact(local=dict(globals(), **locals()))

        else:
            stats_dd['Obs. aa correct precision among edited gts'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                np.nan)

        timer.update()

    # Save
    stats_df_collected = pd.DataFrame(stats_dd)

    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )

    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
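impute_dot_as_wildtype and subset_edited_rows are defined elsewhere in the repository; minimal sketches consistent with how they are used above, assuming the genotype columns are named '<ref nt><position>' so that a column name's first character is the reference nucleotide:

def impute_dot_as_wildtype(df, nt_cols):
    # Sketch: treat '.' (unread) entries as the reference nucleotide of that column.
    df = df.copy()
    for col in nt_cols:
        df[col] = df[col].replace('.', col[0])
    return df

def subset_edited_rows(df, nt_cols):
    # Sketch: keep genotypes carrying at least one non-reference nucleotide.
    is_edited = df.apply(lambda row: any(row[col] != col[0] for col in nt_cols), axis=1)
    return df[is_edited]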
Example #17
def form_data(exp_nm, start_idx, end_idx):
  '''
    Annotate library design with total count, edited count, fraction edited, etc. 
  '''
  data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  lib_nm = _data.get_lib_nm(exp_nm)

  lib_design = lib_design.iloc[start_idx : end_idx + 1]
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  lib_design = lib_design[lib_design['Name (unique)'].isin(ontarget_sites)]

  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  stats_dd = defaultdict(list)
  new_data = dict()

  nms_shared = [nm for nm in nms if nm in data]
  timer = util.Timer(total = len(nms_shared))
  for iter, nm in enumerate(nms_shared):

    df = data[nm]
    seq = nm_to_seq[nm]

    num_mismatches = lambda x, y: sum([bool(n1 != n2) for n1,n2 in zip(x,y)])

    if 'index' in df.columns:
      df = df[[col for col in df.columns if col != 'index']]

    if len(df) == 0: continue


    ## 8/21/19
    '''
      Simulate bystander precision task in 12kChar by using the substrate nucleotide closest to the editor-specific center nt
    '''
    editor = _data.get_editor_nm(exp_nm)
    editor_to_central_pos = {
      'ABE': 6,
      'ABE-CP': 6,
      'AID': 6,
      'BE4': 6,
      'BE4-CP': 8,
      'CDA': 5,
      'eA3A': 6,
      'evoAPOBEC': 5,
    }
    if editor in editor_to_central_pos:
      central_pos = editor_to_central_pos[editor]
    else:
      central_pos = 6

    substrate = 'A' if 'ABE' in editor else 'C'
    nt_cols = [f'{substrate}{pos}' for pos in range(-3, 15) if f'{substrate}{pos}' in df.columns]
    central_col = find_central_col(central_pos, nt_cols, substrate)
    if central_col is None: continue

    mut_cols = [col for col in df.columns if col != 'Count']
    col_to_ref_nt = {col: col[0] for col in mut_cols}
    df_dd = defaultdict(list)
    for idx, row in df.iterrows():
      df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
      df_dd['Simulated precise'].append(is_simulated_precise(row, central_col, col_to_ref_nt))
    for col in df_dd:
      df[col] = df_dd[col]

    numer = sum(df[df['Simulated precise'] == True]['Count'])
    denom = sum(df[df['Num. edits'] > 0]['Count'])
    sim_precision = numer / denom if denom > 0 else np.nan
    stats_dd['Simulated bystander precision at editor-specific central nt'].append(sim_precision)

    stats_dd['Simulated bystander position'].append(int(central_col[1:]))
    stats_dd['Simulated bystander position, distance to center'].append(int(central_col[1:]) - central_pos)

    edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
    stats_dd['Edited count'].append(edited_ct)

    stats_dd['Name (unique)'].append(nm)

    timer.update()


  stats_df_collected = pd.DataFrame(stats_dd)

  stats_df = lib_design.merge(
    stats_df_collected, 
    on = 'Name (unique)', 
    how = 'outer',
  )

  stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' % (exp_nm, start_idx, end_idx))
  return
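find_central_col, get_num_edits, and is_simulated_precise are helpers defined elsewhere in the repository; minimal sketches consistent with their use above, assuming genotype columns are named '<ref nt><position>' (e.g. 'C6'):

def find_central_col(central_pos, nt_cols, substrate):
    # Sketch: pick the substrate column whose position is closest to the editor's central position.
    if len(nt_cols) == 0:
        return None
    return min(nt_cols, key=lambda col: abs(int(col[1:]) - central_pos))

def get_num_edits(row, col_to_ref_nt):
    # Sketch: number of genotype columns whose observed nucleotide differs from the reference.
    return sum(row[col] != ref_nt for col, ref_nt in col_to_ref_nt.items())

def is_simulated_precise(row, central_col, col_to_ref_nt):
    # Sketch: the read is edited at the central column and nowhere else.
    edited_at_center = row[central_col] != col_to_ref_nt[central_col]
    return bool(edited_at_center and get_num_edits(row, col_to_ref_nt) == 1)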
Example #18
def gen_qsubs():
    # Generate qsub shell scripts and commands for easy parallelization
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Generate qsubs only for unfinished jobs
    treat_control_df = pd.read_csv(_config.DATA_DIR +
                                   'treatment_control_design.csv',
                                   index_col=0)

    num_scripts = 0
    for idx, row in treat_control_df.iterrows():
        treat_nm, control_nm = row['Treatment'], row['Control']
        lib_nm = _data.get_lib_nm(treat_nm)

        if lib_nm == 'LibA':
            num_targets = 2000
            num_targets_per_split = 200
        elif lib_nm == 'CtoGA':
            num_targets = 4000
            num_targets_per_split = 500
        else:
            num_targets = 12000
            num_targets_per_split = 2000
        '''
          Empirically determined:
          pickle > 37 MB: needs 4 GB RAM
          pickle > 335 MB: needs 8 GB RAM
        '''
        print(treat_nm)
        mb_file_size = _data.check_file_size(treat_nm, 'ah6a1a_hf_bc')
        ram_gb = 2
        if mb_file_size > 30:
            ram_gb = 4
        if mb_file_size > 300:
            ram_gb = 8
        if mb_file_size > 1000:
            ram_gb = 16
        '''
          Can be very slow - up to 8+ hours for some conditions.

          It could help to split the 3 steps into 3 scripts.
          Statistical tests should be performed globally (for accurate FDR thresholds); luckily, these are the fast parts of the pipeline.

          Subtracting control from treatment involves a lot of dataframe manipulations and is the bottleneck step. Fortunately, this can be parallelized.
        '''

        for start_idx in range(0, num_targets, num_targets_per_split):
            end_idx = start_idx + num_targets_per_split - 1

            out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx,
                                                     end_idx)
            if os.path.exists(out_pkl_fn):
                if os.path.getsize(out_pkl_fn) > 0:
                    continue

            command = 'python %s.py %s %s %s %s' % (NAME, treat_nm, control_nm,
                                                    start_idx, end_idx)
            script_id = NAME.split('_')[0]

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s_%s.sh' % (script_id, treat_nm,
                                                      control_nm, start_idx)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append(
                'qsub -V -P regevlab -l h_rt=16:00:00,h_vmem=%sG -wd %s %s &' %
                (ram_gb, _config.SRC_DIR, sh_fn))

    # Save commands
    commands_fn = qsubs_dir + '_commands.sh'
    with open(commands_fn, 'w') as f:
        f.write('\n'.join(qsub_commands))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
Example #19
def gen_qsubs():
    # Generate qsub shell scripts and commands for easy parallelization
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Generate qsubs only for unfinished jobs
    treat_control_df = pd.read_csv(_config.DATA_DIR +
                                   'treatment_control_design.csv',
                                   index_col=0)

    num_scripts = 0
    for idx, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue
        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm == 'LibA':
            num_targets = 2000
            num_targets_per_split = 200
        elif lib_nm == 'CtoGA':
            num_targets = 4000
            num_targets_per_split = 500
        else:
            num_targets = 12000
            num_targets_per_split = 2000

        for start_idx in range(0, num_targets, num_targets_per_split):
            end_idx = start_idx + num_targets_per_split - 1

            # Skip completed
            out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx,
                                                     end_idx)
            if os.path.isfile(out_pkl_fn):
                if os.path.getsize(out_pkl_fn) > 0:
                    continue

            command = 'python %s.py %s %s %s' % (NAME, treat_nm, start_idx,
                                                 end_idx)
            script_id = NAME.split('_')[0]

            try:
                mb_file_size = _data.check_file_size(treat_nm,
                                                     'ag5a4_profile_subset')
            except FileNotFoundError:
                mb_file_size = 0
            ram_gb = 2
            if mb_file_size > 140:
                ram_gb = 4
            if mb_file_size > 400:
                ram_gb = 8
            if mb_file_size > 1000:
                ram_gb = 16

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, treat_nm,
                                                   start_idx)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append(
                'qsub -V -P regevlab -l h_rt=4:00:00,h_vmem=%sG -wd %s %s &' %
                (ram_gb, _config.SRC_DIR, sh_fn))

    # Save commands
    commands_fn = qsubs_dir + '_commands.sh'
    with open(commands_fn, 'w') as f:
        f.write('\n'.join(qsub_commands))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
Example #20
def adjust_batch_effects(lib_nm):
    print(lib_nm)
    # Gather statistics
    be_treatments = []
    batch_set = set()
    batch_to_exp_nms = defaultdict(list)
    for treat_nm in treat_control_df['Treatment']:
        if 'Cas9' in treat_nm:
            continue
        if _data.get_lib_nm(treat_nm) != lib_nm:
            continue
        batch_nm = exp_nm_to_batch[treat_nm]
        be_treatments.append(treat_nm)
        batch_set.add(batch_nm)
        batch_to_exp_nms[batch_nm].append(treat_nm)

    lib_design, seq_col = _data.get_lib_design(be_treatments[0])
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    md = dict()
    timer = util.Timer(total=len(be_treatments))
    print('Loading stats from each condition...')
    for treat_nm in be_treatments:
        with open(inp_dir + '%s.pkl' % (treat_nm), 'rb') as f:
            d = pickle.load(f)
        md[treat_nm] = d

        # df['Treatment'] = treat_nm
        # df['Batch'] = exp_nm_to_batch[treat_nm]
        # df['Editor'] = exp_nm_to_editor[treat_nm]
        timer.update()

    # ANOVA calculations
    from scipy.stats import f_oneway
    print(
        'Calculating ANOVA on all unique indels in all target sites to identify batch effects...'
    )

    dd = defaultdict(list)
    means_dd = defaultdict(lambda: defaultdict(lambda: dict()))
    timer = util.Timer(total=len(nms))
    for exp_nm in nms:

        mut_dd, all_mut_nms = form_dd(md, exp_nm)

        for mut_nm in all_mut_nms:
            anova_args = defaultdict(list)
            # Note: Ensure we do not implicitly treat a lack of data as an observation of zero
            for exp_nm_2 in mut_dd:
                anova_args[exp_nm_to_batch[exp_nm_2]].append(
                    mut_dd[exp_nm_2][mut_nm])
            '''
              Ensure non-degenerate ANOVA testing.

              If every batch has 0 std, we have identical values. It's likely that these identical values are 0 because of the sparsity of the data when considering unique indels (highly heterogeneous) at 12,000 target sites.

              If every batch with a non-zero value has only one observation, skip.
            '''
            # Only perform ANOVA test on indels where at least one batch has non-zero std (otherwise it was seen only once in any batch, so it's not a batch effect)
            num_non_zero_stds = 0
            mean_d, std_d = dict(), dict()
            for batch in batch_set:
                if batch in anova_args:
                    mean_val = np.mean(anova_args[batch])
                    std_val = np.std(anova_args[batch])
                    if std_val > 0:
                        num_non_zero_stds += 1
                else:
                    mean_val = np.nan
                    std_val = np.nan
                mean_d[batch] = mean_val
                std_d[batch] = std_val

            degenerate_flag = False
            if num_non_zero_stds == 0:
                for batch in batch_set:
                    batch_data = anova_args[batch]
                    if len(batch_data) == 0:
                        continue
                    has_non_zero = bool(batch_data.count(0) != len(batch_data))
                    if has_non_zero and len(batch_data) == 1:
                        degenerate_flag = True
                    # elif has_non_zero and len(batch_data) > 1:
                    # import code; code.interact(local=dict(globals(), **locals()))
            if degenerate_flag:
                continue

            aa = tuple([s for s in anova_args.values() if len(s) != 0])
            if len(aa) < 2:
                continue

            fstat, pval = f_oneway(*aa)
            if np.isnan(pval):
                continue
            dd['Statistic'].append(fstat)
            dd['pval'].append(pval)
            dd['MutName'].append(mut_nm)
            dd['Name'].append(exp_nm)

            for batch in batch_set:
                dd['Mean %s' % (batch)].append(mean_d[batch])
                dd['Std %s' % (batch)].append(std_d[batch])
                means_dd[exp_nm][mut_nm][batch] = mean_d[batch]

        timer.update()

    stats_df = pd.DataFrame(dd)
    if len(stats_df) == 0:
        empty_df = pd.DataFrame()
        empty_df.to_csv(out_dir + 'mutation_dec_%s.csv' % (lib_nm))
        empty_df.to_csv(out_dir + 'removed_batch_effects_%s.csv' % (lib_nm))
        empty_df.to_csv(out_dir + 'removed_stats_%s.csv' % (lib_nm))
        return

    stats_df['-log10p'] = -np.log10(stats_df['pval'])

    # Apply FDR
    print(
        'Finding significant batch effects while controlling false discovery...'
    )

    fdr_threshold = 0.01
    other_distribution = stats_df[stats_df['pval'] > 0.995]
    stats_df = stats_df[stats_df['pval'] <= 0.995]
    stats_df = stats_df.sort_values(by='pval')
    stats_df = stats_df.reset_index(drop=True)

    # Step-down decisions on ascending p-values: accept while each p-value meets its
    # critical value; after the first failure, reject everything that follows.
    fdr_decs, hit_reject = [], False
    for idx, pval in enumerate(stats_df['pval']):
        if hit_reject:
            dec = False
        else:
            fdr_critical = ((idx + 1) / len(stats_df)) * fdr_threshold
            dec = bool(pval <= fdr_critical)
        fdr_decs.append(dec)
        if dec is False:
            hit_reject = True
    stats_df['FDR accept'] = fdr_decs

    other_distribution = other_distribution.copy()
    other_distribution['FDR accept'] = False
    stats_df = pd.concat([stats_df, other_distribution], ignore_index=True)
    stats_df.to_csv(out_dir + 'mutation_dec_%s.csv' % (lib_nm))
    '''
      Identify mutations for removal.
      At mutations passing the FDR-controlled ANOVA test,
      identify batches where the mutations are frequent.
    '''
    print('Identifying batches to remove mutations from...')
    to_remove = stats_df[stats_df['FDR accept'] == True]

    dd = defaultdict(list)
    dd_stats = defaultdict(list)
    timer = util.Timer(total=len(to_remove))
    for idx, row in to_remove.iterrows():
        timer.update()
        exp_nm = row['Name']
        mut_nm = row['MutName']

        means = means_dd[exp_nm][mut_nm]
        mean_vals = list(means.values())
        mean_means = np.mean(mean_vals)

        for batch_nm in means:
            if means[batch_nm] >= mean_means or means[batch_nm] >= 0.005:
                dd['Batch'].append(batch_nm)
                dd['Name'].append(exp_nm)
                dd['MutName'].append(mut_nm)

        for batch_nm in means:
            dd_stats['%s' % (batch_nm)].append(means[batch_nm])
        dd_stats['MutName'].append(mut_nm)
        dd_stats['Name'].append(exp_nm)
    batch_muts_to_remove = pd.DataFrame(dd)
    batch_muts_to_remove.to_csv(out_dir + 'removed_batch_effects_%s.csv' %
                                (lib_nm))

    batch_muts_stats = pd.DataFrame(dd_stats)
    batch_muts_stats.to_csv(out_dir + 'removed_stats_%s.csv' % (lib_nm))

    # Mutations are removed in ah6a3

    return
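The false-discovery control in adjust_batch_effects can be isolated into a small helper; a sketch mirroring the loop above (p-values sorted ascending, accept until the first one misses its critical value):

def fdr_step_down(pvals, fdr_threshold=0.01):
    # pvals must be sorted ascending; returns one accept decision per p-value.
    m = len(pvals)
    decisions, failed = [], False
    for k, pval in enumerate(pvals, start=1):
        dec = (not failed) and (pval <= (k / m) * fdr_threshold)
        if not dec:
            failed = True
        decisions.append(dec)
    return decisions

print(fdr_step_down([1e-6, 1e-5, 0.007, 0.2, 0.9]))  # [True, True, False, False, False]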