# Shared setup assumed by every example below. `inp_dir`/`out_dir`, the
# project helpers `_data` and `util`, and the globals `nts`/`nt_to_idx` come
# from the surrounding repository and are not defined here.
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd


def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params
  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]

  data = data[data['Position'].isin(allowed_pos)]

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # Gather statistics

  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]]
    else:
      # Work on a copy so the Mutation/MutName edits don't hit a view of `data`
      d_temp = data[data['Mutation'].isin(mut)].copy()
      d_temp['Mutation'] = mut_nm
      d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
      group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
      # Collapse the constituent mutations into one summed frequency per site
      d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)

  return
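
train_models and save_results are external helpers not shown in this listing. Below is a minimal sketch of the featurization such a helper would need, assuming a one-hot encoding of the 'Local context' string and the task definitions implied by the names 'classify_zero' (is the frequency exactly zero?) and 'regress_nonzero' (regress on the nonzero subset). The encoding and task split are assumptions for illustration, not the project's actual implementation.

NTS = list('ACGT')

def one_hot_context(context):
    # One 4-bit indicator per position of the local context string.
    vec = np.zeros(len(context) * 4)
    for i, nt in enumerate(context):
        if nt in NTS:
            vec[i * 4 + NTS.index(nt)] = 1
    return vec

def featurize_for_task(df, ml_task):
    # classify_zero: predict whether a site's frequency is exactly zero.
    # regress_nonzero: regress frequency on the nonzero subset only.
    X = np.stack([one_hot_context(c) for c in df['Local context']])
    if ml_task == 'classify_zero':
        y = (df['Frequency'] == 0).astype(int).values
    else:
        nonzero = (df['Frequency'] > 0).values
        X, y = X[nonzero], df['Frequency'].values[nonzero]
    return X, y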
Example #2
def gather_statistics(exp_nm):

    # Load data
    data = pd.read_csv(inp_dir +
                       '_batch_adjusted_all_ratios-ps0_1bpcorrect.csv',
                       index_col=0)

    data = data[data['Condition'] == exp_nm]

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name (unique)'].isin(ontarget_sites)]

    # Annotate with local sequence context
    # lib_zero_idx = _data.pos_to_idx(0, exp_nm)  -- replaced by the per-row
    # pos_to_idx_safe call below, since the index of position 0 can vary by site
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name (unique)']]
        lib_zero_idx = _data.pos_to_idx_safe(0, exp_nm, row['Name (unique)'])
        # local_context = row['gRNA (20nt)']
        # 30-nt window: 9 nt upstream of protospacer position 0 through position 20
        local_context = seq[lib_zero_idx - 9:lib_zero_idx + 20 + 1]
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    print(data.shape)
    results = train_models(exp_nm, data,
                           'Log10 batch-adjusted base edit to indel ratio')
    save_results(exp_nm, results)

    return
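
A quick sanity check of the slice above: with lib_zero_idx pointing at protospacer position 0, seq[lib_zero_idx - 9 : lib_zero_idx + 20 + 1] spans 30 nt, covering positions -9 through +20. A toy verification with a stand-in sequence (the index value is made up for illustration):

seq = 'N' * 50
lib_zero_idx = 15   # pretend protospacer position 0 maps to string index 15
window = seq[lib_zero_idx - 9:lib_zero_idx + 20 + 1]
assert len(window) == 30   # positions -9 .. +20 inclusive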
Example #3
def gather_statistics(celltype, lib_nm, editor_nm):
    print(celltype, lib_nm, editor_nm)
    [rep1, rep2] = _data.get_replicates(celltype, lib_nm, editor_nm)

    df1 = pd.read_csv(inp_dir + '%s.csv' % (rep1), index_col=0)
    df2 = pd.read_csv(inp_dir + '%s.csv' % (rep2), index_col=0)

    # Re-derive lib_nm from the replicate name; this should match the lib_nm argument
    lib_nm = _data.get_lib_nm(rep1)
    lib_design, seq_col = _data.get_lib_design(rep1)
    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)

    # Prepare data
    # data = data[data['Total count'] >= 100]
    df1 = df1[df1['Name (unique)'].isin(ontarget_sites)]
    df2 = df2[df2['Name (unique)'].isin(ontarget_sites)]

    id_cols = [
        'Name (unique)',
        'gRNA (20nt)',
        seq_col,
    ]
    mdf = df1.merge(df2, on=id_cols, suffixes=['_r1', '_r2'])

    stat_col = 'Fraction edited'
    # Absolute difference and absolute log2 fold change between replicates
    mdf['absdiff'] = np.abs(mdf['%s_r1' % (stat_col)] - mdf['%s_r2' % (stat_col)])
    mdf['abslfc'] = np.abs(np.log2(mdf['%s_r1' % (stat_col)]) -
                           np.log2(mdf['%s_r2' % (stat_col)]))

    n_col = 'Total count'
    mdf['Total n'] = mdf['%s_r1' % (n_col)] + mdf['%s_r2' % (n_col)]

    mdf.to_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm))
    return
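
One possible downstream use of the merged replicate table written above: reload it and flag target sites whose replicate editing fractions disagree. A sketch assuming celltype/lib_nm/editor_nm are in scope; the 0.1 threshold is illustrative, not from the source:

mdf = pd.read_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm), index_col=0)
discordant = mdf[mdf['absdiff'] > 0.1]   # illustrative threshold
print('%d of %d sites differ by more than 0.1 between replicates' % (len(discordant), len(mdf)))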
Example #4
def fig_editing_profiles(treat_nm):
  '''
    g4 format: data is a dict, keys = target site names,
    values = np.array with shape (target site len, 4),
    entries = int num. of Q30 observations
  '''

  adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))

  lib_design, seq_col = _data.get_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  lib_nm = _data.get_lib_nm(treat_nm)
  ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Forming long df...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()

    if nm not in ontarget_nms:
      continue

    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = np.nansum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue

        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 100]
  n_targetsites_in_condition = len(set(df['Name']))  # unique sites passing the coverage filter

  # Form stats_df
  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue

        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero for mutation'].append(num_zeros)
        dd['Total num target sites for mutation'].append(total)
        dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
        dd['Num target sites in condition'].append(n_targetsites_in_condition)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  hm_df = pd.DataFrame(dd)
  hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

  # Median normalize
  # Positions 25-34 are treated as editing-free background
  background_range = range(25, 34 + 1)

  for ref_nt in nts:
    for obs_nt in nts:
      if obs_nt == ref_nt:
        continue

      crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt) & (hm_df['Mean activity'].notna())
      medi = np.nanmedian(hm_df[crit & (hm_df['Position'].isin(background_range))]['Mean activity'])
      # Subtract the background median, clipping at zero
      hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(lambda x: max(0, x - medi))

  hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))

  return
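
The median normalization above can be read as a standalone transform: for each (ref nt, obs nt) mutation, subtract the median 'Mean activity' over the background positions and clip at zero. A reusable sketch mirroring that loop (a sketch, not the project's code):

def median_bg_adjust(hm_df, background_range=range(25, 34 + 1)):
    # Per-mutation background subtraction, clipped at 0.
    hm_df = hm_df.copy()
    for (ref_nt, obs_nt), grp in hm_df.groupby(['Ref nt', 'Obs nt']):
        if ref_nt == obs_nt:
            continue
        bg = grp[grp['Position'].isin(background_range)]['Mean activity']
        medi = np.nanmedian(bg)
        crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt)
        hm_df.loc[crit, 'Mean activity'] = (
            hm_df.loc[crit, 'Mean activity'] - medi).clip(lower=0)
    return hm_df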
Example #5
def gather_statistics(exp_nm):
    # Hardcoded context radius and editing-window positions
    feature_radius = 10
    allowed_pos = range(3, 8 + 1)
    # Load data
    data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    # Prepare data
    data = data[data['Total count'] >= 100]
    data['Frequency'] = data['Count'] / data['Total count']

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name'].isin(ontarget_sites)]

    data = data[data['Position'].isin(allowed_pos)]

    data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
    # data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

    # Annotate with local sequence context
    lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name']]
        pidx = row['Position'] + lib_zero_idx
        local_context = (seq[pidx - feature_radius:pidx] +
                         seq[pidx + 1:pidx + feature_radius + 1])
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    # Gather statistics
    # (The per-mutation loop from Example #1 is superseded here by the
    # pivoted C-to-D ratio analysis below.)

    print(data.columns)
    print(set(data['Mutation']))

    acc_muts = [
        'C_T',
        'C_G',
        'C_A',
    ]
    data = data[data['Mutation'].isin(acc_muts)]
    data = data.drop(columns=['Count', 'Total count', 'Ref nt', 'Obs nt'])
    data = data.pivot_table(
        index=['Name', 'Position', 'Local context'],
        columns='Mutation',
        values='Frequency',
    ).reset_index()
    data = data.fillna(value=0)

    # Each ratio shares the same pipeline: compute Frequency, drop NaN rows
    # (0/0 divisions), rebuild MutName, then train both ML tasks.
    ratio_specs = [
        ('C_GA_over_C_D', ['C_G', 'C_A'], ['C_T', 'C_G', 'C_A']),
        ('C_T_over_C_D', ['C_T'], ['C_T', 'C_G', 'C_A']),
        ('C_G_over_C_D', ['C_G'], ['C_T', 'C_G', 'C_A']),
        ('C_A_over_C_D', ['C_A'], ['C_T', 'C_G', 'C_A']),
        ('C_G_over_C_GA', ['C_G'], ['C_A', 'C_G']),
    ]
    for mut_name, num_cols, den_cols in ratio_specs:
        data['Frequency'] = data[num_cols].sum(axis=1) / data[den_cols].sum(axis=1)
        data = data.dropna()

        data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + mut_name
        print(data.shape)

        for ml_task in ['regress_nonzero', 'classify_zero']:
            print(ml_task)
            results = train_models(exp_nm, data, mut_name, ml_task)
            save_results(exp_nm, mut_name, ml_task, results)

    return
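
The pivot-then-ratio pattern above, in miniature on a toy frame (illustrative data only; C_A omitted for brevity). fillna(0) treats an unobserved outcome as zero frequency; in the real pipeline, dropna() after each division then discards sites whose denominator was zero (0/0 gives NaN):

toy = pd.DataFrame({
    'Name': ['s1', 's1', 's2'],
    'Position': [6, 6, 6],
    'Local context': ['AC' * 10] * 3,
    'Mutation': ['C_T', 'C_G', 'C_T'],
    'Frequency': [0.30, 0.05, 0.20],
})
wide = toy.pivot_table(index=['Name', 'Position', 'Local context'],
                       columns='Mutation', values='Frequency').reset_index()
wide = wide.fillna(0)                        # s2 has no observed C_G -> 0
wide['C_G_over_C_D'] = wide['C_G'] / (wide['C_T'] + wide['C_G'])
print(wide[['Name', 'C_G_over_C_D']])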
Example #6
def form_data(exp_nm, start_idx, end_idx):
  '''
    Annotate library design with total count, edited count, fraction edited, etc.
  '''
  data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  lib_nm = _data.get_lib_nm(exp_nm)

  lib_design = lib_design.iloc[start_idx : end_idx + 1]
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  lib_design = lib_design[lib_design['Name (unique)'].isin(ontarget_sites)]

  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  stats_dd = defaultdict(list)
  new_data = dict()

  nms_shared = [nm for nm in nms if nm in data]
  timer = util.Timer(total = len(nms_shared))
  for nm in nms_shared:
    df = data[nm]
    seq = nm_to_seq[nm]

    # Hamming distance between equal-length strings (unused in this snippet)
    num_mismatches = lambda x, y: sum(n1 != n2 for n1, n2 in zip(x, y))

    if 'index' in df.columns:
      df = df[[col for col in df.columns if col != 'index']]

    if len(df) == 0: continue


    # 8/21/19
    '''
      Simulate the bystander precision task in 12kChar by using the substrate
      nucleotide closest to the editor-specific central nt
    '''
    editor = _data.get_editor_nm(exp_nm)
    editor_to_central_pos = {
      'ABE': 6,
      'ABE-CP': 6,
      'AID': 6,
      'BE4': 6,
      'BE4-CP': 8,
      'CDA': 5,
      'eA3A': 6,
      'evoAPOBEC': 5,
    }
    # Default to position 6 for editors not in the table
    central_pos = editor_to_central_pos.get(editor, 6)

    substrate = 'A' if 'ABE' in editor else 'C'
    nt_cols = [f'{substrate}{pos}' for pos in range(-3, 15) if f'{substrate}{pos}' in df.columns]
    central_col = find_central_col(central_pos, nt_cols, substrate)
    if central_col is None: continue

    mut_cols = [col for col in df.columns if col != 'Count']
    col_to_ref_nt = {col: col[0] for col in mut_cols}
    df_dd = defaultdict(list)
    for idx, row in df.iterrows():
      df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
      df_dd['Simulated precise'].append(is_simulated_precise(row, central_col, col_to_ref_nt))
    for col in df_dd:
      df[col] = df_dd[col]

    numer = sum(df[df['Simulated precise'] == True]['Count'])
    denom = sum(df[df['Num. edits'] > 0]['Count'])
    sim_precision = numer / denom if denom > 0 else np.nan
    stats_dd['Simulated bystander precision at editor-specific central nt'].append(sim_precision)

    stats_dd['Simulated bystander position'].append(int(central_col[1:]))
    stats_dd['Simulated bystander position, distance to center'].append(int(central_col[1:]) - central_pos)

    edited_ct = denom  # same quantity as the precision denominator
    stats_dd['Edited count'].append(edited_ct)

    stats_dd['Name (unique)'].append(nm)

    timer.update()

  stats_df_collected = pd.DataFrame(stats_dd)

  stats_df = lib_design.merge(
    stats_df_collected, 
    on = 'Name (unique)', 
    how = 'outer',
  )

  stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' % (exp_nm, start_idx, end_idx))
  return
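
find_central_col, get_num_edits, and is_simulated_precise are called above but not shown in this listing. Below are plausible implementations consistent with how they are called; these are hypothetical sketches (including the assumption that '.' marks an unedited/unread position), and the real helpers may differ.

def find_central_col(central_pos, nt_cols, substrate):
    # Pick the substrate column (e.g. 'C6') closest to the editor's central
    # position; the caller treats None as "no substrate nt in the window".
    if not nt_cols:
        return None
    return min(nt_cols, key=lambda col: abs(int(col[1:]) - central_pos))

def get_num_edits(row, col_to_ref_nt):
    # Count genotype columns whose observed nt differs from the reference nt.
    return sum(1 for col, ref in col_to_ref_nt.items()
               if isinstance(row[col], str) and row[col] not in ('.', ref))

def is_simulated_precise(row, central_col, col_to_ref_nt):
    # "Precise": edited at the central substrate nt and nowhere else.
    edited_at_central = (isinstance(row[central_col], str)
                         and row[central_col] not in ('.', col_to_ref_nt[central_col]))
    return edited_at_central and get_num_edits(row, col_to_ref_nt) == 1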