Code example #1
File: c_ag_res.py Project: maxwshen/be-modeling
def load_human_data(dataset_id):
  if 'CSNVL' not in dataset_id:
    lib_nm = _data.get_lib_nm(dataset_id)
    lib_design, seq_col = _data.get_lib_design(dataset_id)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
  else:
    # Use any conds to load 12kChar, CtoT, and AtoG libs
    dids = ['190418_mES_12kChar_AID', '190329_HEK293T_AtoG_ABE', '190307_HEK_CtoT_BE4']
    nms, seqs = [], []
    for did in dids:
      lib_design, seq_col = _data.get_lib_design(did)
      nms += list(lib_design['Name (unique)'])
      seqs += list(lib_design[seq_col])

  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}


  Y_dir = _config.OUT_PLACE + 'combin_data_Y_imputewt/'
  with gzip.open(Y_dir + '%s.pkl.gz' % (dataset_id), 'rb') as f:
    Y = pickle.load(f)
  
  NAMES = list(Y.keys())
  Y = list(Y.values())

  # Load X
  if 'CSNVL' not in dataset_id:
    zero_idxs = [_data.pos_to_idx(0, dataset_id)] * len(NAMES)
  else:
    zero_idxs = []
    for nm in NAMES:
      if 'satmut' in nm:
        # 21
        zero_idxs.append(_data.zero_pos['12kChar'])
      else:
        # CtoT = AtoG = 10
        zero_idxs.append(_data.zero_pos['CtoT'])

  X = []
  timer = _util.Timer(total = len(NAMES))
  for nm, y, zero_idx in zip(NAMES, Y, zero_idxs):
    seq = nm_to_seq[nm]
    # seq_30nt = seq[zero_idx - 9 : zero_idx + 20 + 1]
    if zero_idx >= 9 + 10:
      # 12kChar
      pass
    else:
      # CtoT, AtoG libs
      prefix = 'GATGGGTGCGACGCGTCAT'
      seq = prefix + seq
      zero_idx += len(prefix)

    seq_50nt = seq[zero_idx - 9 - 10 : zero_idx + 20 + 10 + 1]
    assert len(seq_50nt) == 50
    X.append(seq_50nt)

  return X, Y, NAMES
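
A note on the slice arithmetic above: seq[zero_idx - 9 - 10 : zero_idx + 20 + 10 + 1] takes positions -19 through +30 relative to the library's zero position, and the shorter CtoT/AtoG designs are first padded with a fixed prefix so that window always exists. A minimal self-contained sketch of the same arithmetic (window_50nt is a hypothetical helper, not part of the project):

def window_50nt(seq, zero_idx, prefix='GATGGGTGCGACGCGTCAT'):
  # Pad short designs on the 5' side so positions -19..+30 all exist
  if zero_idx < 9 + 10:
    seq = prefix + seq
    zero_idx += len(prefix)
  window = seq[zero_idx - 9 - 10 : zero_idx + 20 + 10 + 1]
  assert len(window) == 50
  return window

# Toy usage with a dummy 60-nt sequence whose zero position is index 25
print(window_50nt('A' * 60, 25))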
Code example #2
def adjust_treatment_control(treat_nm, control_nm, start_idx, end_idx):
    adj_d = _data.load_data(treat_nm, 'ah6a1a_hf_bc')
    control_data = _data.load_data(control_nm, 'h6_anyindel')

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']
    '''
  '''
    print('Subtracting control from treatment data...')
    shared_nms = [nm for nm in nms if nm in adj_d]
    new_adj_d = dict()
    timer = util.Timer(total=len(shared_nms))
    for nm in shared_nms:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]

        t = subtract_treatment_control(t, c)
        new_adj_d[nm] = t
        timer.update()

    ##
    # Write
    ##
    with open(out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(new_adj_d, f)

    return
Code example #3
def remove_batch_effects(treat_nm, start_idx, end_idx):
    batch_nm = exp_nm_to_batch[treat_nm]

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    lib_nm = _data.get_lib_nm(treat_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    adj_d = _data.load_data(treat_nm, 'ah6a1b_subtract')

    batch_muts_to_remove = pd.read_csv(
        inp_dir + 'removed_batch_effects_%s.csv' % (lib_nm), index_col=0)

    if len(batch_muts_to_remove) == 0:
        inp_pkl = _config.OUT_PLACE + f'ah6a1b_subtract/{treat_nm}_{start_idx}_{end_idx}.pkl'
        out_pkl = out_dir + f'{treat_nm}_{start_idx}_{end_idx}.pkl'
        command = f'cp {inp_pkl} {out_pkl}'
        subprocess.check_output(command, shell=True)
        return

    # Remove mutations
    to_remove = batch_muts_to_remove[batch_muts_to_remove['Batch'] == batch_nm]
    to_remove = to_remove[to_remove['Name'].isin(nms)]

    adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)

    with open(out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(adj_d, f)

    return
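
The no-op branch above shells out to cp via subprocess. A shell-free equivalent, sketched as a hypothetical drop-in using only the standard library:

import shutil

def copy_pickle(inp_pkl, out_pkl):
    # Equivalent of subprocess.check_output(f'cp {inp_pkl} {out_pkl}', shell=True)
    shutil.copyfile(inp_pkl, out_pkl)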
Code example #4
def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params
  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]

  data = data[data['Position'].isin(allowed_pos)]

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # # Gather statistics

  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]]
    else:
      d_temp = data[data['Mutation'].isin(mut)]
      d_temp['Mutation'] = mut_nm
      d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
      group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
      d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)



  return
Code example #5
def indel_anyindel(exp_nm):
  try:
    data = _data.load_data(exp_nm, 'ah6c_reduce_1bp_indel_fq')
  except Exception:
    print('Error : could not load data')
    sys.exit(1)

  lib_design, seq_col = _data.get_lib_design(exp_nm)

  dd = defaultdict(list)
  timer = util.Timer(total = len(data))
  for target_nm in data:
    df = data[target_nm]

    tot_count = sum(df['Count'])
    dd['Total count'].append(tot_count)
    dd['Name'].append(target_nm)

    crit = (df['Category'] != 'wildtype')
    indel_count = sum(df[crit]['Count'])
    dd['Indel count'].append(indel_count)
    if tot_count != 0:
      dd['Indel freq'].append(indel_count / tot_count)
    else:
      dd['Indel freq'].append(np.nan)

    crit = (df['Category'] == 'del')
    del_count = sum(df[crit]['Count'])
    dd['Del count'].append(del_count)
    if tot_count != 0:
      dd['Del freq'].append(del_count / tot_count)
    else:
      dd['Del freq'].append(np.nan)

    crit = (df['Category'] == 'ins')
    ins_count = sum(df[crit]['Count'])
    dd['Ins count'].append(ins_count)
    if tot_count != 0:
      dd['Ins freq'].append(ins_count / tot_count)
    else:
      dd['Ins freq'].append(np.nan)

    crit = (df['Category'] == 'wildtype')
    wt_count = sum(df[crit]['Count'])
    dd['Wildtype count'].append(wt_count)
    if tot_count != 0:
      dd['Wildtype freq'].append(wt_count / tot_count)
    else:
      dd['Wildtype freq'].append(np.nan)

    timer.update()

  df = pd.DataFrame(dd)
  df.to_csv(out_dir + '%s.csv' % (exp_nm))

  return
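
The four per-category blocks in indel_anyindel differ only in the row filter. A hedged refactoring sketch (not the project's code) that produces the same per-target summary with a single loop over categories:

import numpy as np

def category_counts(df, name):
  # df is one per-target table with 'Category' and 'Count' columns
  out = {'Name': name, 'Total count': int(df['Count'].sum())}
  tot = out['Total count']
  filters = {
    'Indel': df['Category'] != 'wildtype',
    'Del': df['Category'] == 'del',
    'Ins': df['Category'] == 'ins',
    'Wildtype': df['Category'] == 'wildtype',
  }
  for label, crit in filters.items():
    ct = int(df.loc[crit, 'Count'].sum())
    out['%s count' % (label)] = ct
    out['%s freq' % (label)] = ct / tot if tot != 0 else np.nan
  return out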
Code example #6
def adjust_batch_effects(exp_nm, start_idx, end_idx):
    '''
    Identify batch effects from position-wise analysis and remove them from combinatorial df
  '''
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']

    batch_muts = pd.read_csv(_config.DATA_DIR + 'batch_effects.csv')
    batch = exp_nm_to_batch[exp_nm]
    df = batch_muts[batch_muts['Batch'] == batch]

    batch_cols, obs_nts = [], []
    for idx, row in df.iterrows():
        batch_col = '%s%s' % (row['Ref nt'], row['Position'])
        batch_cols.append(batch_col)
        obs_nts.append(row['Obs nt'])

    data = pickle.load(open(inp_dir + '%s.pkl' % (exp_nm), 'rb'))
    new_data = dict()

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for target_nm in nms_shared:
        d = data[target_nm]
        d = d[d['Count'] > 0]

        if 'index' in d.columns:
            d = d[[col for col in d.columns if col != 'index']]
        '''
      Note: .loc is SUPER SLOW. Can be 100x slower than using iterrows
    '''
        # for batch_col, obs_nt in zip(batch_cols, obs_nts):
        #   if batch_col in d.columns:
        #     d.loc[d[batch_col] == obs_nt, batch_col] = '.'

        matched_cols, matched_obs_nts = [], []
        for idx, col in enumerate(batch_cols):
            if col in d.columns:
                matched_cols.append(col)
                matched_obs_nts.append(obs_nts[idx])

        # Assigning into the row Series returned by iterrows() does not write
        # back to the dataframe, so persist the change with .at
        for idx, row in d.iterrows():
            for col, obs_nt in zip(matched_cols, matched_obs_nts):
                if row[col] == obs_nt:
                    d.at[idx, col] = '.'

        new_data[target_nm] = d
        timer.update()

    with open(out_dir + '%s_%s_%s.pkl' % (exp_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(new_data, f)

    return
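
The loop above deliberately avoids per-row .loc writes because the in-code comment reports them as slow; whether a column-wise vectorized mask is faster depends on the pandas version and data shape, so treat the following as an alternative to benchmark rather than a recommendation. It replaces, in each flagged column, the batch-effect nucleotide with '.':

import numpy as np

def mask_batch_effects(d, matched_cols, matched_obs_nts):
    # One vectorized pass per flagged column instead of per-row assignment
    for col, obs_nt in zip(matched_cols, matched_obs_nts):
        d[col] = np.where(d[col] == obs_nt, '.', d[col])
    return d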
Code example #7
File: b_group_L2.py Project: maxwshen/lib-analysis
def form_L2_group_ae_newgenotype_Cas9_adjust(group_nm, l1_nms):
    datas = [
        _data.load_data(l1_nm, 'ae_newgenotype_Cas9_adjust')
        for l1_nm in l1_nms
    ]
    datas = [s for s in datas if s is not None]
    lib_design, seq_col = _data.get_lib_design(l1_nms[0])
    ''' 
    g5 format: data is a dict, keys = target site names
    values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column
  '''
    group_ds = dict()

    timer = util.Timer(total=len(lib_design))
    for idx, row in lib_design.iterrows():
        nm = row['Name (unique)']
        timer.update()

        # num_present = sum([bool(nm in data) for data in datas])
        '''
    Combine: two strategies
    1. Normalize readcount, then add (equal contribution)
    2. Directly add (weighted by readcount)
      * using this strategy
    '''
        group_d = None
        for data in datas:

            if nm not in data:
                continue

            d = data[nm]
            d = d.drop(columns=['_Sequence Context', '_Cutsite', '_ExpDir'])
            nt_cols = [s for s in d.columns if s != 'Count']
            d = d.fillna(value='.')

            if group_d is None:
                group_d = d
            else:
                group_d = group_d.append(d, ignore_index=True)
                group_d = group_d.groupby(nt_cols)['Count'].sum()
                group_d = group_d.reset_index()

        if group_d is not None:
            group_d = group_d.sort_values(by='Count', ascending=False)
            group_d = group_d.reset_index()
            group_d = group_d.replace(to_replace='.', value=np.nan)
            group_ds[nm] = group_d

    with open(out_dir + '%s.pkl' % (group_nm), 'wb') as f:
        pickle.dump(group_ds, f)

    return
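
The docstring above describes the g5 format (one dataframe per target site, genotype columns named ref nt + position, plus a 'Count' column) and the "directly add, weighted by readcount" combine strategy. A compact sketch of that strategy, assuming the per-replicate tables have already been normalized with fillna('.') as in the loop:

import pandas as pd

def combine_by_readcount(dfs, nt_cols):
    # Stack replicate genotype tables and sum the counts of identical genotypes
    combined = pd.concat(dfs, ignore_index=True)
    return combined.groupby(nt_cols, as_index=False)['Count'].sum()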
Code example #8
def indel_anyindel_seq(exp_nm):
  '''
    Investigate if 1 nt deletions at abasic site are related to microhomology
    Control for position by focusing only on pos 5
  '''
  df = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  dd = defaultdict(list)
  all_nms = set(df['Name'])

  five_idx = _data.pos_to_idx(4, exp_nm)

  # Optional interactive debugging hook; uncomment to inspect local state here
  # import code; code.interact(local=dict(globals(), **locals()))

  timer = util.Timer(total = len(all_nms))
  for nm in all_nms:
    dfs = df[df['Name'] == nm]

    seq = nm_to_seq[nm]
    accept = bool(seq[five_idx] == 'C') & (seq[five_idx + 1] != 'C')

    for jdx in range(five_idx - 1, -1, -1):
      if seq[jdx] != 'C':
        break
    num_c = abs(five_idx - jdx)


    if not accept:
      continue

    crit = (dfs['Category'] == 'del') & (dfs['Indel length'] == 1) & (dfs['Indel end adj'] == 5.0)
    row = dfs[crit]

    if len(row) == 0:
      dd['Frequency'].append(0)
    else:
      dd['Frequency'].append(sum(row['Frequency']))

    dd['Num C'].append(num_c)
    dd['Name'].append(nm)

    timer.update()

  df = pd.DataFrame(dd)
  df.to_csv(out_dir + '%s.csv' % (exp_nm))

  return
Code example #9
def profile_subset(exp_nm, start_idx, end_idx):
    editor_nm = exp_nm_to_editor[exp_nm]

    muts_dict = get_editor_mutations(editor_nm)
    editor_mut_set = set(muts_dict.keys())
    '''
    Identify batch effects from position-wise analysis and remove them from combinatorial df
  '''
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']

    data = pickle.load(open(inp_dir + '%s.pkl' % (exp_nm), 'rb'))
    new_data = dict()

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for target_nm in nms_shared:
        d = data[target_nm]
        d = d[d['Count'] > 0]

        if 'index' in d.columns:
            d = d[[col for col in d.columns if col != 'index']]

        # Subset to mutation columns
        _mut_cols = [col for col in d.columns if col in editor_mut_set]
        d = d[_mut_cols + ['Count']]

        # Mask observed nucleotides that the editor cannot produce; write with
        # .at since mutating the iterrows() Series would not persist
        for idx, row in d.iterrows():
            for col in _mut_cols:
                ref_nt = col[0]
                obs_nt = row[col]
                if obs_nt != ref_nt and obs_nt not in muts_dict[col]:
                    d.at[idx, col] = '.'

        # Eliminate rows that only contain .
        crit = d.apply(
            lambda row: sum([row[c] == '.'
                             for c in row.index]) != len(row.index) - 1,
            axis='columns')
        d = d[crit]

        d = d.reset_index(drop=True)

        new_data[target_nm] = d
        timer.update()

    with open(out_dir + '%s_%s_%s.pkl' % (exp_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(new_data, f)

    return
Code example #10
def indel_anyindel_seq(exp_nm):
    '''
    Annotate indels with related sequence context (e.g., bases in deletions) 
  '''
    df = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    indel_dd = defaultdict(list)
    all_nms = set(df['Name'])
    timer = util.Timer(total=len(df))
    for idx, row in df.iterrows():
        # dfs = df[df['Name'] == nm]

        nm = row['Name']
        seq = nm_to_seq[nm]
        left_del_nt = np.nan
        right_del_nt = np.nan
        del_nts = np.nan

        if row['Category'] == 'del':
            start_pos = int(row['Indel start adj'])
            start_idx = _data.pos_to_idx(start_pos, exp_nm)

            end_pos = int(row['Indel end adj'])
            end_idx = _data.pos_to_idx(end_pos, exp_nm)

            if start_idx >= 0 and end_idx <= len(seq):
                del_nts = seq[start_idx:end_idx]
                left_del_nt = del_nts[0]
                right_del_nt = del_nts[-1]

        indel_dd['Left del nt'].append(left_del_nt)
        indel_dd['Right del nt'].append(right_del_nt)
        indel_dd['Del nts'].append(del_nts)

        timer.update()

    for col in indel_dd:
        df[col] = indel_dd[col]

    df.to_csv(out_dir + '%s.csv' % (exp_nm))

    return
Code example #11
File: b_group_L2.py Project: maxwshen/lib-analysis
def form_L2_group_ag4_poswise_be_adjust(group_nm, l1_nms):
    datas = [
        _data.load_data(l1_nm, 'ag4_poswise_be_adjust') for l1_nm in l1_nms
    ]
    datas = [s for s in datas if s is not None]
    lib_design, seq_col = _data.get_lib_design(l1_nms[0])
    ''' 
    g4 format: data is a dict, keys = target site names
    values = np.array with shape = (target site len, 4)
      entries = int for num. Q30 observations
  '''
    group_ds = dict()

    timer = util.Timer(total=len(lib_design))
    for idx, row in lib_design.iterrows():
        nm = row['Name (unique)']
        timer.update()

        # num_present = sum([bool(nm in data) for data in datas])
        '''
    Combine: two strategies
    1. Normalize readcount, then add (equal contribution)
    2. Directly add (weighted by readcount)
      * using this strategy
    '''
        group_d = None
        for data in datas:

            if nm not in data:
                continue
            d = data[nm]

            if group_d is None:
                group_d = d
            else:
                group_d += d

        if group_d is not None:
            group_ds[nm] = group_d

    with open(out_dir + '%s.pkl' % (group_nm), 'wb') as f:
        pickle.dump(group_ds, f)

    return
Code example #12
def gather_statistics(exp_nm):

    # Load data
    data = pd.read_csv(inp_dir +
                       '_batch_adjusted_all_ratios-ps0_1bpcorrect.csv',
                       index_col=0)

    data = data[data['Condition'] == exp_nm]

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name (unique)'].isin(ontarget_sites)]

    # Annotate with local sequence context
    # lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name (unique)']]
        lib_zero_idx = _data.pos_to_idx_safe(0, exp_nm, row['Name (unique)'])
        # local_context = row['gRNA (20nt)']
        local_context = seq[lib_zero_idx - 9:lib_zero_idx + 20 + 1]
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    print(data.shape)
    results = train_models(exp_nm, data,
                           'Log10 batch-adjusted base edit to indel ratio')
    save_results(exp_nm, results)

    return
Code example #13
def adjust_treatment_control(treat_nm, control_nm):
    cas_data = _data.load_data(treat_nm, 'e_newgenotype_Cas9')
    lib_data = _data.load_data(control_nm, 'e_newgenotype_Cas9')
    cas_data = cas_data.drop(columns='ULMI count')
    lib_data = lib_data.drop(columns='ULMI count')

    lib_design, seq_col = _data.get_lib_design(treat_nm)

    adj_d = dict()
    stats_dd = defaultdict(list)

    timer = util.Timer(total=len(lib_design))
    for idx, row in lib_design.iterrows():
        nm = row['Name (unique)']
        timer.update()

        cas = cas_data[cas_data['_Experiment'] == nm]
        lib = lib_data[lib_data['_Experiment'] == nm]

        stats_dd['Name'].append(nm)
        if len(cas) == 0:
            stats_dd['Status'].append('No treatment')
            continue
        if len(lib) == 0:
            stats_dd['Status'].append('No control')
            adj_d[nm] = cas  # keep the unadjusted treatment data when no control exists
            continue

        stats_dd['Status'].append('Adjusted')
        new_cas = build_new_cas(lib, cas)
        adj_d[nm] = new_cas

    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    stats_df = pd.DataFrame(stats_dd)
    stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))

    return
Code example #14
def adjust_treatment_control(treat_nm, control_nm, start_idx, end_idx):
    adj_d = _data.load_data(treat_nm, 'ag5a1a_hf_bc')
    control_data = _data.load_data(control_nm, 'g5_combin_be')

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}
    ''' 
    g5 format: data is a dict, keys = target site names
    values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column
  '''
    '''
  '''
    print('Subtracting control from treatment data...')
    shared_nms = [nm for nm in nms if nm in adj_d]
    new_adj_d = dict()
    timer = util.Timer(total=len(shared_nms))
    for nm in shared_nms:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]
        seq = nm_to_seq[nm]

        t = subtract_treatment_control(t, c, seq)
        new_adj_d[nm] = t
        timer.update()

    ##
    # Write
    ##
    with open(out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(new_adj_d, f)

    return
Code example #15
def gather_statistics(celltype, lib_nm, editor_nm):
    print(celltype, lib_nm, editor_nm)
    [rep1, rep2] = _data.get_replicates(celltype, lib_nm, editor_nm)

    df1 = pd.read_csv(inp_dir + '%s.csv' % (rep1), index_col=0)
    df2 = pd.read_csv(inp_dir + '%s.csv' % (rep2), index_col=0)

    lib_nm = _data.get_lib_nm(rep1)
    lib_design, seq_col = _data.get_lib_design(rep1)
    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)

    # Prepare data
    # data = data[data['Total count'] >= 100]
    df1 = df1[df1['Name (unique)'].isin(ontarget_sites)]
    df2 = df2[df2['Name (unique)'].isin(ontarget_sites)]

    id_cols = [
        'Name (unique)',
        'gRNA (20nt)',
        seq_col,
    ]
    mdf = df1.merge(df2, on=id_cols, suffixes=['_r1', '_r2'])

    stat_col = 'Fraction edited'
    mdf['absdiff'] = np.abs(mdf['%s_r1' % (stat_col)] - mdf['%s_r2' % (stat_col)])

    mdf['abslfc'] = np.abs(
        np.log2(mdf['%s_r1' % (stat_col)]) - np.log2(mdf['%s_r2' % (stat_col)]))

    n_col = 'Total count'
    mdf['Total n'] = mdf['%s_r1' % (n_col)] + mdf['%s_r2' % (n_col)]

    mdf.to_csv(out_dir + '%s_%s_%s.csv' % (celltype, lib_nm, editor_nm))
    return
Code example #16
def adjust_treatment_control(treat_nm, control_nm):
    adj_d = _data.load_data(treat_nm, 'ag5a1b_subtract')
    control_data = _data.load_data(control_nm, 'g5_combin_be')

    treat_minq = _data.load_minq(treat_nm, 'g5_combin_be')
    control_minq = _data.load_minq(control_nm, 'g5_combin_be')

    lib_design, seq_col = _data.get_lib_design(treat_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}
    ''' 
    g5 format: data is a dict, keys = target site names
    values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column
  '''
    '''
    Filter treatment mutations that are best explained by 
    spontaneous random mutations.
    Tend to be very low frequency with no counterpart in control
  '''
    print(
        'Gathering statistics on treatment mutations explained by Illumina sequencing errors...'
    )
    ie_decisions = defaultdict(list)
    timer = util.Timer(total=len(adj_d))
    for nm in adj_d:
        t = adj_d[nm]
        if nm not in control_data:
            continue
        c = control_data[nm]
        seq = nm_to_seq[nm]
        c_minq = control_minq[nm]
        t_minq = treat_minq[nm]

        gather_stats_illumina_errors(t, c, t_minq, c_minq, seq, treat_nm, nm,
                                     ie_decisions)
        timer.update()

    ie_fdr_threshold = 0.05
    ie_df = pd.DataFrame(ie_decisions)
    other_distribution = ie_df[ie_df['pval'] > 0.995]
    ie_df = ie_df[ie_df['pval'] <= 0.995]
    ie_df = ie_df.sort_values(by='pval')
    ie_df = ie_df.reset_index(drop=True)

    fdr_decs, hit_reject = [], False
    for idx, pval in enumerate(ie_df['pval']):
        if hit_reject:
            dec = False
        else:
            fdr_critical = ((idx + 1) / len(ie_df)) * ie_fdr_threshold
            dec = bool(pval <= fdr_critical)
        fdr_decs.append(dec)
        if dec is False and hit_reject is True:
            hit_reject = False
    ie_df['FDR accept'] = fdr_decs

    other_distribution['FDR accept'] = False
    ie_df = ie_df.append(other_distribution, ignore_index=True)
    ie_df.to_csv(out_dir + '%s_ie_dec.csv' % (treat_nm))

    print(
        'Filtering treatment mutations explained by Illumina sequencing errors...'
    )
    to_remove = ie_df[ie_df['FDR accept'] == False]
    adj_d = filter_illumina_error_muts(to_remove, adj_d, control_data,
                                       nm_to_seq)

    ##
    # Write
    ##
    with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
        pickle.dump(adj_d, f)

    return
Code example #17
def adjust_treatment_control(treat_nm, control_nm):
  treat_data = _data.load_data(treat_nm, 'h6_anyindel')
  control_data = _data.load_data(control_nm, 'h6_anyindel')

  lib_design, seq_col = _data.get_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  ''' 
    h6 format: data is a dict, keys = target site names
    values = dfs
      'Category', 
      'Indel start', 
      'Indel end', 
      'Indel length', 
      'MH length',
      'Inserted bases',
      'Count',
      'Name',
  '''

  adj_d = dict()
  stats_dd = defaultdict(list)

  '''
    Filter positions with abnormally high control mut freq. 
  '''
  hc_decisions = defaultdict(list)
  print('Filtering positions with high frequency control mutations...')
  timer = util.Timer(total = len(lib_design))
  for idx, row in lib_design.iterrows():
    timer.update()
    nm = row['Name (unique)']
    seq = row[seq_col]

    stats_dd['Name'].append(nm)

    if nm not in treat_data:
      stats_dd['Status'].append('No treatment')
      continue

    t = treat_data[nm]
    if nm not in control_data:
      stats_dd['Status'].append('No control')
      adj_d[nm] = t
      continue

    stats_dd['Status'].append('Adjusted')
    c = control_data[nm]

    # Adjust
    t = filter_high_control_muts(t, c, seq, treat_nm, nm, hc_decisions)
    if t is not None:
      adj_d[nm] = t

  stats_df = pd.DataFrame(stats_dd)
  stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))

  hc_df = pd.DataFrame(hc_decisions)
  hc_df = hc_df.sort_values(by = 'max_hf_freq', ascending = False)
  hc_df = hc_df.reset_index(drop = True)
  hc_df.to_csv(out_dir + '%s_hc_dec.csv' % (treat_nm))

  '''
    Filter treatment mutations that can be explained by control freq.
    In practice, this step is most effective for control mutations
    with relatively high frequency => relatively high variance
  '''
  print('Gathering statistics on treatment mutations explained by control mutations...')
  bc_decisions = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    if nm not in control_data:
      continue
    c = control_data[nm]
    seq = nm_to_seq[nm]

    gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, bc_decisions)
    timer.update()

  '''
    Using global statistics, filter mutations
    while controlling false discovery rate
  '''
  bc_fdr_threshold = 0.05
  bc_df = pd.DataFrame(bc_decisions)
  other_distribution = bc_df[bc_df['pval'] > 0.995]
  bc_df = bc_df[bc_df['pval'] <= 0.995]
  bc_df = bc_df.sort_values(by = 'pval')
  bc_df = bc_df.reset_index(drop = True)

  fdr_decs, hit_reject = [], False
  for idx, pval in enumerate(bc_df['pval']):
    if hit_reject:
      dec = False
    else:
      fdr_critical = ((idx + 1) / len(bc_df)) * bc_fdr_threshold
      dec = bool(pval <= fdr_critical)
    fdr_decs.append(dec)
    if dec is False and hit_reject is True:
      hit_reject = False
  bc_df['FDR accept'] = fdr_decs

  other_distribution['FDR accept'] = False
  bc_df = bc_df.append(other_distribution, ignore_index = True)
  bc_df.to_csv(out_dir + '%s_bc_dec.csv' % (treat_nm))

  print('Filtering treatment mutations explained by control mutations...')
  to_remove = bc_df[bc_df['FDR accept'] == False]
  adj_d = filter_binom_control_muts(to_remove, adj_d, control_data, nm_to_seq)

  ##
  # Write
  ##
  with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
    pickle.dump(adj_d, f)

  return
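
The FDR block above sorts p-values and compares each to its rank-scaled critical value (rank / N x 0.05). For orientation, a textbook Benjamini-Hochberg step-up procedure is sketched below; it is related to, but not a drop-in replacement for, the loop above, which accepts a p-value only if it individually passes its own critical value rather than taking the largest passing rank.

import numpy as np

def benjamini_hochberg(pvals, alpha=0.05):
  # Returns a boolean array: True where the null is rejected at FDR level alpha
  pvals = np.asarray(pvals, dtype=float)
  n = len(pvals)
  order = np.argsort(pvals)
  ranked = pvals[order]
  below = ranked <= (np.arange(1, n + 1) / n) * alpha
  reject = np.zeros(n, dtype=bool)
  if below.any():
    k = np.max(np.nonzero(below)[0])  # largest rank whose p-value passes
    reject[order[:k + 1]] = True
  return reject

# Toy usage
print(benjamini_hochberg([0.001, 0.008, 0.04, 0.2, 0.9]))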
Code example #18
def adjust_treatment_control(treat_nm, control_nm):
  treat_data = _data.load_data(treat_nm, 'g5_combin_be')
  control_data = _data.load_data(control_nm, 'g5_combin_be')

  treat_minq = _data.load_minq(treat_nm, 'g5_combin_be')
  control_minq = _data.load_minq(control_nm, 'g5_combin_be')

  lib_design, seq_col = _data.get_lib_design(treat_nm)

  ''' 
    g5 format: data is a dict, keys = target site names
    values = dfs, with columns as '%s%s' % (nt, position), and 'Count' column
  '''

  adj_d = dict()
  stats_dd = defaultdict(list)
  hc_decisions = defaultdict(list)
  nm_to_seq = dict()


  '''
    Filter positions with abnormally high control mut freq. 
  '''
  print('Filtering positions with high frequency control mutations...')
  timer = util.Timer(total = len(lib_design))
  for idx, row in lib_design.iterrows():
    nm = row['Name (unique)']
    seq = row[seq_col]
    nm_to_seq[nm] = seq
    timer.update()

    stats_dd['Name'].append(nm)

    if nm not in treat_data:
      stats_dd['Status'].append('No treatment')
      continue

    t = treat_data[nm]
    if nm not in control_data:
      stats_dd['Status'].append('No control')
      adj_d[nm] = t
      continue

    stats_dd['Status'].append('Adjusted')
    c = control_data[nm]

    # Adjust
    t = filter_high_control_muts(t, c, seq, treat_nm, nm, hc_decisions)
    adj_d[nm] = t

  stats_df = pd.DataFrame(stats_dd)
  stats_df.to_csv(out_dir + '%s_stats.csv' % (treat_nm))

  hc_df = pd.DataFrame(hc_decisions)
  hc_df = hc_df.sort_values(by = 'c_fq', ascending = False)
  hc_df = hc_df.reset_index(drop = True)
  hc_df.to_csv(out_dir + '%s_hc_dec.csv' % (treat_nm))

  '''
    Filter treatment mutations that can be explained by control freq.
    In practice, this step is most effective for control mutations
    with relatively high frequency => relatively high variance
  '''
  print('Gathering statistics on treatment mutations explained by control mutations...')
  bc_decisions = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    if nm not in control_data:
      continue
    c = control_data[nm]
    seq = nm_to_seq[nm]

    gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, bc_decisions)
    timer.update()

  '''
    Using global statistics, filter mutations
    while controlling false discovery rate
  '''
  bc_fdr_threshold = 0.05
  bc_df = pd.DataFrame(bc_decisions)
  other_distribution = bc_df[bc_df['pval'] > 0.995]
  bc_df = bc_df[bc_df['pval'] <= 0.995]
  bc_df = bc_df.sort_values(by = 'pval')
  bc_df = bc_df.reset_index(drop = True)

  fdr_decs, hit_reject = [], False
  for idx, pval in enumerate(bc_df['pval']):
    if hit_reject:
      dec = False
    else:
      fdr_critical = ((idx + 1) / len(bc_df)) * bc_fdr_threshold
      dec = bool(pval <= fdr_critical)
    fdr_decs.append(dec)
    if dec is False and hit_reject is True:
      hit_reject = False
  bc_df['FDR accept'] = fdr_decs

  other_distribution['FDR accept'] = False
  bc_df = bc_df.append(other_distribution, ignore_index = True)
  bc_df.to_csv(out_dir + '%s_bc_dec.csv' % (treat_nm))

  print('Filtering treatment mutations explained by control mutations...')
  to_remove = bc_df[bc_df['FDR accept'] == False]
  adj_d = filter_binom_control_muts(to_remove, adj_d, control_data, nm_to_seq)


  '''
  '''
  print('Subtracting control from treatment data...')
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    if nm not in control_data:
      continue
    c = control_data[nm]
    seq = nm_to_seq[nm]

    t = subtract_treatment_control(t, c, seq)
    adj_d[nm] = t
    timer.update()


  '''
    Filter treatment mutations that are best explained by 
    spontaneous random mutations.
    Tend to be very low frequency with no counterpart in control
  '''
  print('Gathering statistics on treatment mutations explained by Illumina sequencing errors...')
  ie_decisions = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    t = adj_d[nm]
    if nm not in control_data:
      continue
    c = control_data[nm]
    seq = nm_to_seq[nm]
    c_minq = control_minq[nm]
    t_minq = treat_minq[nm]

    gather_stats_illumina_errors(t, c, t_minq, c_minq, seq, treat_nm, nm, ie_decisions)
    timer.update()

  ie_fdr_threshold = 0.05
  ie_df = pd.DataFrame(ie_decisions)
  other_distribution = ie_df[ie_df['pval'] > 0.995]
  ie_df = ie_df[ie_df['pval'] <= 0.995]
  ie_df = ie_df.sort_values(by = 'pval')
  ie_df = ie_df.reset_index(drop = True)

  fdr_decs, hit_reject = [], False
  for idx, pval in enumerate(ie_df['pval']):
    if hit_reject:
      dec = False
    else:
      fdr_critical = ((idx + 1) / len(ie_df)) * ie_fdr_threshold
      dec = bool(pval <= fdr_critical)
    fdr_decs.append(dec)
    if dec is False and hit_reject is True:
      hit_reject = False
  ie_df['FDR accept'] = fdr_decs

  other_distribution['FDR accept'] = False
  ie_df = ie_df.append(other_distribution, ignore_index = True)
  ie_df.to_csv(out_dir + '%s_ie_dec.csv' % (treat_nm))

  print('Filtering treatment mutations explained by Illumina sequencing errors...')
  to_remove = ie_df[ie_df['FDR accept'] == False]
  adj_d = filter_illumina_error_muts(to_remove, adj_d, control_data, nm_to_seq)

  ##
  # Write
  ##
  with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
    pickle.dump(adj_d, f)

  return
Code example #19
def fig_editing_profiles(treat_nm):
  ''' 
    g4 format: data is a dict, keys = target site names
    values = np.array with shape = (target site len, 4)
      entries = int for num. Q30 observations
  '''

  adj_d = pickle.load(open(inp_dir + '%s.pkl' % (treat_nm), 'rb'))


  lib_design, seq_col = _data.get_lib_design(treat_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  lib_nm = _data.get_lib_nm(treat_nm)
  ontarget_nms = set(_data.get_ontarget_sites(lib_design, lib_nm))

  '''
    Filter treatment mutations that match the unedited background profile
    using the statistic: fraction of target sites with non-zero event frequency
  '''
  print('Forming long df...')
  dd = defaultdict(list)
  timer = util.Timer(total = len(adj_d))
  for nm in adj_d:
    timer.update()

    if nm not in ontarget_nms:
      continue

    pw = adj_d[nm]
    seq = nm_to_seq[nm]
    for jdx in range(len(pw)):
      tot = np.nansum(pw[jdx])
      ref_nt = seq[jdx]
      ref_idx = nt_to_idx[ref_nt]
      for kdx in range(len(pw[jdx])):
        if kdx == ref_idx:
          continue

        count = pw[jdx][kdx]
        dd['Count'].append(count)
        dd['Total count'].append(tot)
        dd['Obs nt'].append(nts[kdx])
        dd['Ref nt'].append(ref_nt)
        if tot == 0:
          dd['Frequency'].append(np.nan)
        else:
          dd['Frequency'].append(count / tot)
        dd['Position index'].append(jdx)
        dd['Position'].append(_data.idx_to_pos(jdx, treat_nm))
        dd['Name'].append(nm)

  df = pd.DataFrame(dd)
  df = df[df['Total count'] >= 100]
  n_targetsites_in_condition = len(df)

  # Form stats_df
  dd = defaultdict(list)
  pos_range = sorted(set(df['Position index']))
  timer = util.Timer(total = len(pos_range))
  for pos_idx in pos_range:
    timer.update()
    df_s1 = df[df['Position index'] == pos_idx]
    for ref_nt in nts:
      df_s2 = df_s1[df_s1['Ref nt'] == ref_nt]
      for obs_nt in nts:
        if obs_nt == ref_nt:
          continue

        crit = (df_s2['Obs nt'] == obs_nt)
        dfs = df_s2[crit]
        dfs_freq = dfs['Frequency']

        num_zeros = sum(dfs_freq == 0)
        total = len(dfs_freq)
        if total == 0:
          continue

        dd['Num target sites with zero for mutation'].append(num_zeros)
        dd['Total num target sites for mutation'].append(total)
        dd['Frequency of zero in target sites for mutation'].append(num_zeros / total)
        dd['Num target sites in condition'].append(n_targetsites_in_condition)
        dd['Mean activity'].append(np.mean(dfs_freq))
        dd['Position index'].append(pos_idx)
        dd['Position'].append(_data.idx_to_pos(pos_idx, treat_nm))
        dd['Obs nt'].append(obs_nt)
        dd['Ref nt'].append(ref_nt)

  hm_df = pd.DataFrame(dd)
  hm_df.to_csv(out_dir + '%s.csv' % (treat_nm))

  # Median normalize
  background_range = range(25, 34 + 1)

  for ref_nt in nts:
    for obs_nt in nts:
      if obs_nt == ref_nt:
        continue

      crit = (hm_df['Ref nt'] == ref_nt) & (hm_df['Obs nt'] == obs_nt) & (~np.isnan(hm_df['Mean activity']))
      medi = np.nanmedian(hm_df[crit & (hm_df['Position'].isin(background_range))]['Mean activity'])
      hm_df.loc[crit, 'Mean activity'] = hm_df.loc[crit, 'Mean activity'].apply(lambda x: max(0, x - medi))

  hm_df.to_csv(out_dir + '%s_median_bg_adj.csv' % (treat_nm))

  return
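
The "Median normalize" step above treats protospacer positions 25-34 as background and, for each (ref nt, obs nt) pair, subtracts the median background activity, clipping at zero. The same operation on a single toy series (illustrative values only):

import numpy as np
import pandas as pd

def median_background_adjust(mean_activity, is_background):
  # Subtract the median over background positions, clipping negatives at zero
  bg_median = np.nanmedian(mean_activity[is_background])
  return (mean_activity - bg_median).clip(lower=0)

# Toy usage: the last three positions are treated as background
activity = pd.Series([0.08, 0.05, 0.02, 0.004, 0.006, 0.005])
background = pd.Series([False, False, False, True, True, True])
print(median_background_adjust(activity, background))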
Code example #20
def gather_statistics(exp_nm):
    feature_radius = 10
    allowed_pos = range(3, 8 + 1)
    # Load data
    data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    # Prepare data
    data = data[data['Total count'] >= 100]
    data['Frequency'] = data['Count'] / data['Total count']

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name'].isin(ontarget_sites)]

    data = data[data['Position'].isin(allowed_pos)]

    data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
    # data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

    # Annotate with local sequence context
    lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name']]
        pidx = row['Position'] + lib_zero_idx
        local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    # # Gather statistics

    # for mut_nm in muts:
    #   print(mut_nm)
    #   mut = muts[mut_nm]
    #   if len(mut) == 1:
    #     d_temp = data[data['Mutation'] == mut[0]]
    #   else:
    #     d_temp = data[data['Mutation'].isin(mut)]
    #     d_temp['Mutation'] = mut_nm
    #     d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
    #     group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
    #     d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    print(data.columns)
    print(set(data['Mutation']))

    acc_muts = [
        'C_T',
        'C_G',
        'C_A',
    ]
    data = data[data['Mutation'].isin(acc_muts)]
    data = data.drop(columns=['Count', 'Total count', 'Ref nt', 'Obs nt'])
    data = data.pivot_table(
        index=['Name', 'Position', 'Local context'],
        columns='Mutation',
        values='Frequency',
    ).reset_index()
    data = data.fillna(value=0)

    numerator = data['C_G'] + data['C_A']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_GA_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_T']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_T_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_G']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_G_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_A']
    denominator = data['C_T'] + data['C_G'] + data['C_A']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_A_over_C_D'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    ##
    numerator = data['C_G']
    denominator = data['C_A'] + data['C_G']
    data['Frequency'] = numerator / denominator
    data = data.dropna()

    mut_name = 'C_G_over_C_GA'
    data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(
        str) + '_' + mut_name
    print(data.shape)

    for ml_task in ['regress_nonzero', 'classify_zero']:
        print(ml_task)
        results = train_models(exp_nm, data, mut_name, ml_task)
        save_results(exp_nm, mut_name, ml_task, results)

    return
Code example #21
def form_data(exp_nm, start_idx, end_idx):
  '''
    Annotate library design with total count, edited count, fraction edited, etc. 
  '''
  data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  lib_nm = _data.get_lib_nm(exp_nm)

  lib_design = lib_design.iloc[start_idx : end_idx + 1]
  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  lib_design = lib_design[lib_design['Name (unique)'].isin(ontarget_sites)]

  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  stats_dd = defaultdict(list)
  new_data = dict()

  nms_shared = [nm for nm in nms if nm in data]
  timer = util.Timer(total = len(nms_shared))
  for iter, nm in enumerate(nms_shared):

    df = data[nm]
    seq = nm_to_seq[nm]

    num_mismatches = lambda x, y: sum([bool(n1 != n2) for n1,n2 in zip(x,y)])

    if 'index' in df.columns:
      df = df[[col for col in df.columns if col != 'index']]

    if len(df) == 0: continue


    ## 8/21/19
    '''
      Simulate bystander precision task in 12kChar by using the substrate nucleotide closest to the editor-specific center nt
    '''
    editor = _data.get_editor_nm(exp_nm)
    editor_to_central_pos = {
      'ABE': 6,
      'ABE-CP': 6,
      'AID': 6,
      'BE4': 6,
      'BE4-CP': 8,
      'CDA': 5,
      'eA3A': 6,
      'evoAPOBEC': 5,
    }
    if editor in editor_to_central_pos:
      central_pos = editor_to_central_pos[editor]
    else:
      central_pos = 6

    substrate = 'A' if 'ABE' in editor else 'C'
    nt_cols = [f'{substrate}{pos}' for pos in range(-3, 15) if f'{substrate}{pos}' in df.columns]
    central_col = find_central_col(central_pos, nt_cols, substrate)
    if central_col is None: continue

    mut_cols = [col for col in df.columns if col != 'Count']
    col_to_ref_nt = {col: col[0] for col in mut_cols}
    df_dd = defaultdict(list)
    for idx, row in df.iterrows():
      df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
      df_dd['Simulated precise'].append(is_simulated_precise(row, central_col, col_to_ref_nt))
    for col in df_dd:
      df[col] = df_dd[col]

    numer = sum(df[df['Simulated precise'] == True]['Count'])
    denom = sum(df[df['Num. edits'] > 0]['Count'])
    sim_precision = numer / denom if denom > 0 else np.nan
    stats_dd['Simulated bystander precision at editor-specific central nt'].append(sim_precision)

    stats_dd['Simulated bystander position'].append(int(central_col[1:]))
    stats_dd['Simulated bystander position, distance to center'].append(int(central_col[1:]) - central_pos)

    edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
    stats_dd['Edited count'].append(edited_ct)

    stats_dd['Name (unique)'].append(nm)

    timer.update()


  stats_df_collected = pd.DataFrame(stats_dd)

  stats_df = lib_design.merge(
    stats_df_collected, 
    on = 'Name (unique)', 
    how = 'outer',
  )

  stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' % (exp_nm, start_idx, end_idx))
  return
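
find_central_col (along with get_num_edits and is_simulated_precise) is defined elsewhere in the project and not shown here. A hypothetical sketch consistent with how it is used above, where column names are the substrate nucleotide plus a protospacer position (e.g. 'A6', 'C5'):

def find_central_col(central_pos, nt_cols, substrate):
  # Hypothetical helper: return the substrate column whose protospacer
  # position is closest to the editor-specific center, or None if none exist
  candidates = [col for col in nt_cols if col.startswith(substrate)]
  if not candidates:
    return None
  return min(candidates, key=lambda col: abs(int(col[1:]) - central_pos))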
Code example #22
def form_data(exp_nm, start_idx, end_idx):
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    lib_nm = _data.get_lib_nm(exp_nm)
    disease_nms = _data.get_disease_sites(lib_design, lib_nm)

    # Subset for dumb parallelization, ensure only disease target sites used
    lib_design = lib_design.iloc[start_idx:end_idx + 1]
    lib_design = lib_design[lib_design['Name (unique)'].isin(disease_nms)]

    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    stats_dd = defaultdict(list)

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for iter, nm in enumerate(nms_shared):

        df = data[nm]
        seq = nm_to_seq[nm]

        design_row = lib_design[lib_design['Name (unique)'] == nm].iloc[0]
        snp_pos = int(design_row['Position of SNP in gRNA'])
        correct_nt = design_row['Corrected nucleotide (gRNA orientation)']
        path_nt = design_row['Pathogenic nucleotide (gRNA orientation)']

        nt_cols = [
            col for col in df.columns if col != 'Count' and col != 'Frequency'
        ]

        # Impute . as wildtype
        df = impute_dot_as_wildtype(df, nt_cols)
        total_ct = sum(df['Count'])

        # Ensure each row is unique
        df = df.groupby(nt_cols)['Count'].agg('sum').reset_index()

        # Filter unedited columns
        df = subset_edited_rows(df, nt_cols)
        edited_ct = sum(df['Count'])

        df = remove_noisy_edits(df, nt_cols, exp_nm)

        gt_correct_ct = get_precise_gt_correction_count(
            df, nt_cols, snp_pos, correct_nt, path_nt)

        ## Overall statistics
        stats_dd['Name (unique)'].append(nm)

        stats_dd['Obs. correction count'].append(gt_correct_ct)
        stats_dd['Obs. total count'].append(total_ct)
        stats_dd['Obs. edited count'].append(edited_ct)

        stats_dd['Obs. gt correct fraction in all reads'].append(
            gt_correct_ct / total_ct if total_ct > 0 else np.nan)
        stats_dd['Obs. gt correct precision in edited reads'].append(
            gt_correct_ct / edited_ct if edited_ct > 0 else np.nan)
        stats_dd['Obs. editing frequency'].append(
            edited_ct / total_ct if total_ct > 0 else np.nan)

        # Amino acid correction for CtoGA
        if 'AA sequence - reference' in design_row.index and type(
                design_row['AA sequence - reference']) == str:

            orients = list('-+')
            d1 = bool(design_row['Designed orientation w.r.t. genome'] == '+')
            d2 = bool(design_row['AA frame strand'] == '+')
            xor_int = int(d1 == d2)
            aa_strand_relative_to_seq = orients[xor_int]

            aa_stats = {
                'Unedited AA': 0,
                'Edited AA': 0,
                'Goal AA': 0,
            }
            if design_row['AA sequence - pathogenic'] != design_row[
                    'AA sequence - reference']:
                for jdx, edit_row in df.iterrows():
                    seq_30nt = edit_row_to_seq_30nt(design_row, edit_row,
                                                    seq_col)
                    obs_aas = nts_to_aas(seq_30nt,
                                         design_row['AA frame position'],
                                         snp_pos, aa_strand_relative_to_seq)

                    pp0idx = design_row['Protospacer position zero index']
                    seq_30nt_path = design_row[seq_col][pp0idx - 9:pp0idx + 21]
                    aa_path_with_bc = nts_to_aas(
                        seq_30nt_path, design_row['AA frame position'],
                        snp_pos, aa_strand_relative_to_seq)

                    seq_30nt_wt = seq_30nt_path[:9 + snp_pos] + design_row[
                        'Corrected nucleotide (gRNA orientation)'] + seq_30nt_path[
                            9 + snp_pos + 1:]
                    aa_wt_with_bc = nts_to_aas(seq_30nt_wt,
                                               design_row['AA frame position'],
                                               snp_pos,
                                               aa_strand_relative_to_seq)

                    if obs_aas == aa_path_with_bc:
                        aa_stats['Unedited AA'] += edit_row['Count']
                    else:
                        aa_stats['Edited AA'] += edit_row['Count']

                    if obs_aas == aa_wt_with_bc:
                        aa_stats['Goal AA'] += edit_row['Count']

            stats_dd['Obs. aa correct precision among edited gts'].append(
                aa_stats['Goal AA'] / edited_ct if edited_ct > 0 else np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                aa_stats['Goal AA'] /
                aa_stats['Edited AA'] if aa_stats['Edited AA'] > 0 else np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                aa_stats['Goal AA'] / total_ct if total_ct > 0 else np.nan)
            # Sanity check on the values just appended for this target site:
            # AA-level precision should not fall below genotype-level precision
            if stats_dd['Obs. aa correct precision among edited gts'][-1] < \
                    stats_dd['Obs. gt correct precision in edited reads'][-1]:
                import code
                code.interact(local=dict(globals(), **locals()))

        else:
            stats_dd['Obs. aa correct precision among edited gts'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among edited aas'].append(
                np.nan)
            stats_dd['Obs. aa correct precision among all reads'].append(
                np.nan)

        timer.update()

    # Save
    stats_df_collected = pd.DataFrame(stats_dd)

    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )

    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
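
impute_dot_as_wildtype and subset_edited_rows are project helpers not shown in this snippet. Hypothetical sketches, assuming the column convention used throughout these examples ('.' marks a position matching the reference, and each genotype column name starts with its reference nucleotide):

def impute_dot_as_wildtype(df, nt_cols):
    # Replace '.' placeholders with the reference nucleotide from the column name
    df = df.copy()
    for col in nt_cols:
        df[col] = df[col].replace('.', col[0])
    return df


def subset_edited_rows(df, nt_cols):
    # Keep rows where at least one genotype column differs from its reference nt
    edited = df.apply(lambda row: any(row[c] != c[0] for c in nt_cols), axis=1)
    return df[edited]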
Code example #23
def adjust_batch_effects(lib_nm):
    print(lib_nm)
    # Gather statistics
    be_treatments = []
    batch_set = set()
    batch_to_exp_nms = defaultdict(list)
    for treat_nm in treat_control_df['Treatment']:
        if 'Cas9' in treat_nm:
            continue
        if _data.get_lib_nm(treat_nm) != lib_nm:
            continue
        batch_nm = exp_nm_to_batch[treat_nm]
        be_treatments.append(treat_nm)
        batch_set.add(batch_nm)
        batch_to_exp_nms[batch_nm].append(treat_nm)

    lib_design, seq_col = _data.get_lib_design(be_treatments[0])
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    md = dict()
    timer = util.Timer(total=len(be_treatments))
    print('Loading stats from each condition...')
    for treat_nm in be_treatments:
        with open(inp_dir + '%s.pkl' % (treat_nm), 'rb') as f:
            d = pickle.load(f)
        md[treat_nm] = d

        # df['Treatment'] = treat_nm
        # df['Batch'] = exp_nm_to_batch[treat_nm]
        # df['Editor'] = exp_nm_to_editor[treat_nm]
        timer.update()

    # ANOVA calculations
    from scipy.stats import f_oneway
    print(
        'Calculating ANOVA on all unique indels in all target sites to identify batch effects...'
    )

    dd = defaultdict(list)
    means_dd = defaultdict(lambda: defaultdict(lambda: dict()))
    timer = util.Timer(total=len(nms))
    for exp_nm in nms:

        mut_dd, all_mut_nms = form_dd(md, exp_nm)

        for mut_nm in all_mut_nms:
            anova_args = defaultdict(list)
            # Note: Ensure we do not implicitly treat a lack of data as an observation of zero
            for exp_nm_2 in mut_dd:
                anova_args[exp_nm_to_batch[exp_nm_2]].append(
                    mut_dd[exp_nm_2][mut_nm])
            '''
        Ensure non-degenerate ANOVA testing.
  
        If every batch has 0 std, we have identical values. It's likely that these identical values are 0 because of the sparsity of the data when considering unique indels (highly heterogeneous) at 12,000 target sites.

        If every batch with a non-zero value has only one observation, skip. 
      '''
            # Only perform ANOVA test on indels where at least one batch has non-zero std (otherwise it was seen only once in any batch, so it's not a batch effect)
            num_non_zero_stds = 0
            mean_d, std_d = dict(), dict()
            for batch in batch_set:
                if batch in anova_args:
                    mean_val = np.mean(anova_args[batch])
                    std_val = np.std(anova_args[batch])
                    if std_val > 0:
                        num_non_zero_stds += 1
                else:
                    mean_val = np.nan
                    std_val = np.nan
                mean_d[batch] = mean_val
                std_d[batch] = std_val

            degenerate_flag = False
            if num_non_zero_stds == 0:
                for batch in batch_set:
                    batch_data = anova_args[batch]
                    if len(batch_data) == 0:
                        continue
                    has_non_zero = bool(batch_data.count(0) != len(batch_data))
                    if has_non_zero and len(batch_data) == 1:
                        degenerate_flag = True
                    # elif has_non_zero and len(batch_data) > 1:
                    # import code; code.interact(local=dict(globals(), **locals()))
            if degenerate_flag:
                continue

            aa = tuple([s for s in anova_args.values() if len(s) != 0])
            if len(aa) < 2:
                continue

            fstat, pval = f_oneway(*aa)
            if np.isnan(pval):
                continue
            dd['Statistic'].append(fstat)
            dd['pval'].append(pval)
            dd['MutName'].append(mut_nm)
            dd['Name'].append(exp_nm)

            for batch in batch_set:
                dd['Mean %s' % (batch)].append(mean_d[batch])
                dd['Std %s' % (batch)].append(std_d[batch])
                means_dd[exp_nm][mut_nm][batch] = mean_d[batch]  # store the per-batch mean

        timer.update()

    stats_df = pd.DataFrame(dd)
    if len(stats_df) == 0:
        empty_df = pd.DataFrame()
        empty_df.to_csv(out_dir + 'mutation_dec_%s.csv' % (lib_nm))
        empty_df.to_csv(out_dir + 'removed_batch_effects_%s.csv' % (lib_nm))
        empty_df.to_csv(out_dir + 'removed_stats_%s.csv' % (lib_nm))
        return

    stats_df['-log10p'] = -np.log10(stats_df['pval'])

    # Apply FDR
    print(
        'Finding significant batch effects while controlling false discovery...'
    )

    fdr_threshold = 0.01
    other_distribution = stats_df[stats_df['pval'] > 0.995]
    stats_df = stats_df[stats_df['pval'] <= 0.995]
    stats_df = stats_df.sort_values(by='pval')
    stats_df = stats_df.reset_index(drop=True)

    fdr_decs, hit_reject = [], False
    for idx, pval in enumerate(stats_df['pval']):
        if hit_reject:
            dec = False
        else:
            fdr_critical = ((idx + 1) / len(stats_df)) * fdr_threshold
            dec = bool(pval <= fdr_critical)
        fdr_decs.append(dec)
        if dec is False and hit_reject is True:
            hit_reject = False
    stats_df['FDR accept'] = fdr_decs

    other_distribution['FDR accept'] = False
    stats_df = stats_df.append(other_distribution, ignore_index=True)
    stats_df.to_csv(out_dir + 'mutation_dec_%s.csv' % (lib_nm))
    '''
    Identify mutations for removal.
    At mutations passing the FDR-controlled ANOVA test,
    identify the batches in which the mutation is frequent.
    '''
    print('Identifying batches to remove mutations from...')
    to_remove = stats_df[stats_df['FDR accept'] == True]

    dd = defaultdict(list)
    dd_stats = defaultdict(list)
    timer = util.Timer(total=len(to_remove))
    for idx, row in to_remove.iterrows():
        timer.update()
        exp_nm = row['Name']
        mut_nm = row['MutName']

        means = means_dd[exp_nm][mut_nm]
        mean_vals = list(means.values())
        mean_means = np.mean(mean_vals)

        for batch_nm in means:
            if means[batch_nm] >= mean_means or means[batch_nm] >= 0.005:
                dd['Batch'].append(batch_nm)
                dd['Name'].append(exp_nm)
                dd['MutName'].append(mut_nm)

        for batch_nm in means:
            dd_stats['%s' % (batch_nm)].append(means[batch_nm])
        dd_stats['MutName'].append(mut_nm)
        dd_stats['Name'].append(exp_nm)
    batch_muts_to_remove = pd.DataFrame(dd)
    batch_muts_to_remove.to_csv(out_dir + 'removed_batch_effects_%s.csv' %
                                (lib_nm))

    batch_muts_stats = pd.DataFrame(dd_stats)
    batch_muts_stats.to_csv(out_dir + 'removed_stats_%s.csv' % (lib_nm))

    # Mutations are removed in ah6a3

    return
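The sketch below is a minimal, self-contained illustration of the per-mutation one-way ANOVA and the step-down FDR acceptance used above. The batch names and frequency values are invented; only scipy.stats.f_oneway (already used in the function) is assumed.

from scipy.stats import f_oneway

# Invented per-batch editing frequencies for one mutation at one target site
toy_batches = {
    'batch_A': [0.00, 0.00, 0.01],
    'batch_B': [0.08, 0.10, 0.09],   # clearly elevated in one batch
    'batch_C': [0.00, 0.01, 0.00],
}

# One-way ANOVA across batches, as in the main loop above
fstat, pval = f_oneway(*toy_batches.values())

# Step-down acceptance against rank-scaled critical values (ascending p-values)
pvals = sorted([pval, 0.2, 0.6])   # pretend two more mutations were tested
fdr_threshold = 0.01
accepts, failed = [], False
for idx, p in enumerate(pvals):
    crit = ((idx + 1) / len(pvals)) * fdr_threshold
    ok = (not failed) and (p <= crit)
    if not ok:
        failed = True
    accepts.append(ok)
print(list(zip(pvals, accepts)))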
Code example #24
0
def indel_anyindel_pos(exp_nm):
    data = None
    if exp_nm in set(treat_control_df['Treatment']):
        data = _data.load_data(exp_nm, 'ah6a3_remove_batch')
        is_control = False
    elif exp_nm in set(treat_control_df['Control']):
        data = _data.load_data(exp_nm, 'h6_anyindel')
        is_control = True

    if data is None:
        print('Error : could not load data')
        import code
        code.interact(local=dict(globals(), **locals()))
        sys.exit(1)

    lib_design, seq_col = _data.get_lib_design(exp_nm)

    # Init
    pos_dd = dict()
    for pos_idx in range(-25, 50):
        pos_dd[pos_idx] = []
    pos_dd['Name'] = []

    # Init
    len_dd = dict()
    for len_val in range(-40, 15 + 1):
        len_dd[len_val] = []
    len_dd['Name'] = []

    # Init
    pos_len_indel = dict()
    for indel_len in range(-40, 15 + 1):
        pos_nt_indel = dict()
        for pos_idx in range(-25, 50):
            pos_nt_indel[pos_idx] = []
        pos_nt_indel['Name'] = []
        pos_len_indel[indel_len] = pos_nt_indel

    mdf = pd.DataFrame()
    timer = util.Timer(total=len(data))
    for target_nm in data:
        df = data[target_nm]

        if 'Frequency' not in df.columns:
            df['Frequency'] = df['Count'] / np.sum(df['Count'])

        tot_count = sum(df['Count'])
        if tot_count < 100:
            continue

        crit = (df['Category'] != 'wildtype')
        dfs = df[crit]

        # Init
        target_pos_vector = defaultdict(lambda: 0)
        target_pos_len_vectors = dict()
        for indel_len in range(-40, 15 + 1):
            target_pos_vector_nt = defaultdict(lambda: 0)
            target_pos_len_vectors[indel_len] = target_pos_vector_nt
        indel_len_vector = defaultdict(lambda: 0)

        # Iterate
        df_annot = defaultdict(list)
        for idx, row in dfs.iterrows():
            indel_start = int(row['Indel start'])
            indel_end = int(row['Indel end'])
            mh_len = row['MH length']
            freq = row['Frequency']
            indel_len = int(row['Indel length'])
            cat = row['Category']

            # Gather indel length frequencies
            if cat == 'del':
                indel_len = indel_len * -1
            elif cat == 'ins':
                # h6_anyindel reports indel_start and indel_end for indexing; insertions
                # should differ by 1 but do not, so fix the end coordinate here
                indel_end = indel_start + 1

            indel_len_vector[indel_len] += freq

            # Adjust start and end
            if not is_control:
                package = adjust_indel_pos(indel_start, indel_end, mh_len)
                (adj_indel_start, adj_indel_end) = package
            else:
                adj_indel_start, adj_indel_end = indel_start, indel_end

            # Gather total frequency by position
            for jdx in range(adj_indel_start, adj_indel_end):
                target_pos_vector[jdx] += freq

            # Gather total frequency by position of specific nt indels
            if indel_len in target_pos_len_vectors:
                target_pos_vector_nt = target_pos_len_vectors[indel_len]
                for jdx in range(adj_indel_start, adj_indel_end):
                    target_pos_vector_nt[jdx] += freq

            df_annot['Indel start adj'].append(adj_indel_start)
            df_annot['Indel end adj'].append(adj_indel_end)

        for col in df_annot:
            dfs[col] = df_annot[col]
        dfs['Name'] = target_nm
        mdf = mdf.append(dfs, ignore_index=True)

        # Gather indel length frequencies
        for col in len_dd:
            if col != 'Name':
                len_dd[col].append(indel_len_vector[col])
            else:
                len_dd[col].append(target_nm)

        # Gather total frequency by position
        for col in pos_dd:
            if col != 'Name':
                pos_dd[col].append(target_pos_vector[col])
            else:
                pos_dd[col].append(target_nm)

        # Gather total frequency by position for each indel length
        for indel_len in pos_len_indel:
            pos_nt_indel = pos_len_indel[indel_len]
            tpvn = target_pos_len_vectors[indel_len]

            for col in pos_nt_indel:
                if col != 'Name':
                    pos_nt_indel[col].append(tpvn[col])
                else:
                    pos_nt_indel[col].append(target_nm)

        timer.update()

    # Save
    pos_df = pd.DataFrame(pos_dd)
    pos_df.to_csv(out_dir + '%s_pos.csv' % (exp_nm))

    pos_df_melt = pd.melt(pos_df,
                          id_vars='Name',
                          var_name='Position',
                          value_name='Frequency')
    pos_df_melt.to_csv(out_dir + '%s_pos_melt.csv' % (exp_nm))

    # Save
    for indel_len in pos_len_indel:
        pos_nt_indel = pos_len_indel[indel_len]

        pos_nt_df = pd.DataFrame(pos_nt_indel)
        pos_nt_df.to_csv(out_dir + '%s_pos_%snt.csv' % (exp_nm, indel_len))

        pos_nt_df_melt = pd.melt(pos_nt_df,
                                 id_vars='Name',
                                 var_name='Position',
                                 value_name='Frequency')
        pos_nt_df_melt.to_csv(out_dir + '%s_pos_melt_%snt.csv' %
                              (exp_nm, indel_len))

    # Save
    len_df = pd.DataFrame(len_dd)
    len_df.to_csv(out_dir + '%s_len.csv' % (exp_nm))

    len_df_melt = pd.melt(len_df,
                          id_vars='Name',
                          var_name='Indel length',
                          value_name='Frequency')
    len_df_melt.to_csv(out_dir + '%s_len_melt.csv' % (exp_nm))

    # merged
    mdf.to_csv(out_dir + '%s.csv' % (exp_nm))
    return
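A minimal sketch of the wide-to-long reshape applied when saving the position tables above; the target names and frequency values are invented.

import pandas as pd

# Wide table: one row per target, one column per position
pos_df = pd.DataFrame({
    'Name': ['target_0001', 'target_0002'],
    -1: [0.00, 0.02],
    0: [0.05, 0.10],
    1: [0.04, 0.08],
})

# Long table: one row per (target, position) pair, as written to the *_pos_melt.csv files
pos_df_melt = pd.melt(pos_df, id_vars='Name',
                      var_name='Position', value_name='Frequency')
print(pos_df_melt)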
Code example #25
0
def calc_indels_global(exp_nm):
  '''
  Across all target sites, measure the frequency of indels by length and starting position.

  Use a low readcount cutoff per target site, but do not normalize data within each target site by total readcount.
  '''
  data = None
  if exp_nm in set(treat_control_df['Treatment']):
    data = _data.load_data(exp_nm, 'ah6a1b_subtract')
  elif exp_nm in set(treat_control_df['Control']):
    data = _data.load_data(exp_nm, 'h6_anyindel')
    pass

  if data is None:
    print('Error : could not load data')
    import code; code.interact(local=dict(globals(), **locals()))
    sys.exit(1)

  lib_design, seq_col = _data.get_lib_design(exp_nm)

  # Init
  pos_len_indel = dict()
  for indel_len in range(-40, 15 + 1):
    pos_nt_indel = dict()
    for pos_idx in range(-25, 50):
      pos_nt_indel[pos_idx] = []
    pos_nt_indel['Name'] = []
    pos_len_indel[indel_len] = pos_nt_indel

  mdf = pd.DataFrame()
  timer = util.Timer(total = len(data))
  for target_nm in data:
    df = data[target_nm]

    if 'Frequency' not in df.columns:
      df['Frequency'] = df['Count'] / np.sum(df['Count'])

    tot_count = sum(df['Count'])
    if tot_count < 100:
      continue

    crit = (df['Category'] != 'wildtype')
    dfs = df[crit]
    
    # Init
    target_pos_len_vectors = dict()
    for indel_len in range(-40, 15 + 1):
      target_pos_vector_nt = defaultdict(lambda: 0)
      target_pos_len_vectors[indel_len] = target_pos_vector_nt
    indel_len_vector = defaultdict(lambda: 0)

    # Iterate
    df_annot = defaultdict(list)
    for idx, row in dfs.iterrows():
      indel_start = int(row['Indel start'])
      indel_end = int(row['Indel end'])
      mh_len = row['MH length']
      count = row['Count']
      indel_len = int(row['Indel length'])
      cat = row['Category']

      # Gather indel length frequencies
      if cat == 'del':
        indel_len = indel_len * -1
      elif cat == 'ins':
        # h6_anyindel reports indel_start and indel_end for indexing; insertions
        # should differ by 1 but do not, so fix the end coordinate here
        indel_end = indel_start + 1

      indel_len_vector[indel_len] += count

      # Adjust start and end
      package = adjust_indel_pos(indel_start, indel_end, mh_len)
      (adj_indel_start, adj_indel_end) = package

      # Add count to adj. start pos at the right length
      if indel_len in target_pos_len_vectors:
        target_pos_vector_nt = target_pos_len_vectors[indel_len]
        target_pos_vector_nt[adj_indel_start] += count

      df_annot['Indel start adj'].append(adj_indel_start)
      df_annot['Indel end adj'].append(adj_indel_end)

    for col in df_annot:
      dfs[col] = df_annot[col]
    dfs['Name'] = target_nm
    mdf = mdf.append(dfs, ignore_index = True)

    # Gather total frequency by position and length of indels
    for indel_len in pos_len_indel:
      pos_nt_indel = pos_len_indel[indel_len]
      tpvn = target_pos_len_vectors[indel_len]

      for col in pos_nt_indel:
        if col != 'Name':
          pos_nt_indel[col].append(tpvn[col])
        else:
          pos_nt_indel[col].append(target_nm)

    timer.update()

  # Save
  for indel_len in pos_len_indel:
    pos_nt_indel = pos_len_indel[indel_len]

    pos_nt_df = pd.DataFrame(pos_nt_indel)
    pos_nt_df.to_csv(out_dir + '%s_pos_%snt.csv' % (exp_nm, indel_len))

    pos_nt_df_melt = pd.melt(pos_nt_df, id_vars = 'Name', var_name = 'Position', value_name = 'Count')
    pos_nt_df_melt.to_csv(out_dir + '%s_pos_melt_%snt.csv' % (exp_nm, indel_len))

  # merged
  mdf.to_csv(out_dir + '%s.csv' % (exp_nm))
  return
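A minimal sketch of the raw-count accumulation by signed indel length and adjusted start position performed in the loop above; the rows are invented and 'Indel start adj' stands in for the coordinates that adjust_indel_pos would produce.

from collections import defaultdict

rows = [
    {'Category': 'del', 'Indel length': 3, 'Indel start adj': 17, 'Count': 12},
    {'Category': 'ins', 'Indel length': 1, 'Indel start adj': 18, 'Count': 5},
    {'Category': 'del', 'Indel length': 3, 'Indel start adj': 17, 'Count': 7},
]

counts = defaultdict(lambda: defaultdict(int))   # signed length -> position -> raw count
for row in rows:
    length = -row['Indel length'] if row['Category'] == 'del' else row['Indel length']
    counts[length][row['Indel start adj']] += row['Count']

print(dict(counts[-3]))   # {17: 19}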
Code example #26
0
def form_data(exp_nm):
    data = _data.load_data(exp_nm, 'ag4_poswise_be_adjust')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    # Get target nt
    editor_type = _data.get_editor_type(exp_nm)
    if editor_type == 'CtoTeditor':
        target_nt = 'C'
    elif editor_type == 'AtoGeditor':
        target_nt = 'A'
    else:
        # Fail fast instead of hitting a NameError on target_nt below
        raise ValueError(f'Unexpected editor type: {editor_type}')

    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    grnas = lib_design['gRNA (20nt)']
    design_cats = lib_design['Design category']
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}
    nm_to_grna = {nm: grna for nm, grna in zip(nms, grnas)}
    nm_to_design_cat = {
        nm: design_cat
        for nm, design_cat in zip(nms, design_cats)
    }

    dd = defaultdict(list)

    timer = util.Timer(total=len(data))
    for nm in data:
        pw = data[nm]
        seq = nm_to_seq[nm]
        grna = nm_to_grna[nm]
        design_cat = nm_to_design_cat[nm]

        # Get category, subcategory, and match count
        match_count = get_match_count(grna, seq)
        if design_cat == 'guideseq':
            category = 'Off-target series'
            subcategory = nm.split('_')[2]  # gene name
        elif design_cat == 'mismatch':
            category = 'Mismatch series'
            subcategory = nm.split('_')[1]  # series number
        elif design_cat == 'chipseq':
            category = 'Chip series'
            subcategory = 'Chip series'  # assumed placeholder; needed when appended below
        elif design_cat == 'vivo':
            category = 'vivo'
            subcategory = 'vivo'
        else:
            assert match_count == 20, 'fail'
            category = 'On-target'
            subcategory = 'On-target'

        for jdx in range(len(pw)):
            pos = _data.idx_to_pos(jdx, exp_nm)
            if pos not in [6, 7]:
                continue
            ref_nt = seq[jdx]
            if ref_nt != target_nt:
                continue
            ref_idx = nt_to_idx[ref_nt]
            total = sum(pw[jdx])
            edit_ct = 0
            for kdx in range(len(pw[jdx])):
                if kdx == ref_idx:
                    continue
                edit_ct += pw[jdx][kdx]

            if total > 0:
                dd['Edited fraction'].append(edit_ct / total)
            else:
                dd['Edited fraction'].append(np.nan)
            dd['Edit count'].append(edit_ct)
            dd['Total count'].append(total)
            dd['Position'].append(pos)
            dd['Ref nt'].append(ref_nt)
            dd['Name'].append(nm)

            dd['Match count'].append(int(match_count))
            dd['Category'].append(category)
            dd['Subcategory'].append(subcategory)

        timer.update()

    df = pd.DataFrame(dd)
    df.to_csv(out_dir + '%s.csv' % (exp_nm))

    return
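A minimal sketch of the edited-fraction calculation at a single position: given a 4-element count vector and the reference base, everything that is not the reference counts as an edit. The nt_to_idx mapping shown is an assumption standing in for the project's own definition.

# Assumed A/C/G/T column order
nt_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

pos_counts = [3, 120, 2, 40]   # invented read counts for A, C, G, T at one position
ref_nt = 'C'

ref_idx = nt_to_idx[ref_nt]
total = sum(pos_counts)
edit_ct = sum(ct for k, ct in enumerate(pos_counts) if k != ref_idx)
edited_fraction = edit_ct / total if total > 0 else float('nan')
print(edit_ct, total, round(edited_fraction, 3))   # 45 165 0.273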
Code example #27
0
def form_data(exp_nm, start_idx, end_idx):
    '''
    Annotate the library design with total count, edited count, fraction edited, etc.
    '''
    data = _data.load_data(exp_nm, 'ag5a4_profile_subset')
    lib_design, seq_col = _data.get_lib_design(exp_nm)

    lib_design = lib_design.iloc[start_idx:end_idx + 1]

    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    stats_dd = defaultdict(list)
    new_data = dict()

    nms_shared = [nm for nm in nms if nm in data]
    timer = util.Timer(total=len(nms_shared))
    for iter, nm in enumerate(nms_shared):

        df = data[nm]
        seq = nm_to_seq[nm]

        num_mismatches = lambda x, y: sum(
            [bool(n1 != n2) for n1, n2 in zip(x, y)])

        if 'index' in df.columns:
            df = df[[col for col in df.columns if col != 'index']]

        if len(df) == 0:
            continue

        ## Row-wise statistics
        mut_cols = [col for col in df.columns if col != 'Count']
        col_to_ref_nt = {col: col[0] for col in mut_cols}
        df_dd = defaultdict(list)
        for idx, row in df.iterrows():
            df_dd['Num. edits'].append(get_num_edits(row, col_to_ref_nt))
            df_dd['Has aberrant CBE edit'].append(
                has_aberrant_cbe_edits(row, col_to_ref_nt))
        for col in df_dd:
            df[col] = df_dd[col]
        new_data[nm] = df

        ## Overall statistics
        stats_dd['Name (unique)'].append(nm)

        edited_ct = sum(df[df['Num. edits'] > 0]['Count'])
        stats_dd['Edited count'].append(edited_ct)

        cbe_aberrant_ct = sum(df[df['Has aberrant CBE edit'] == True]['Count'])
        stats_dd['CBE aberrant count'].append(cbe_aberrant_ct)

        total_ct = sum(df['Count'])
        stats_dd['Total count'].append(total_ct)

        try:
            frac = edited_ct / total_ct
        except ZeroDivisionError:
            frac = np.nan
        stats_dd['Fraction edited'].append(frac)

        try:
            frac = cbe_aberrant_ct / total_ct
        except ZeroDivisionError:
            frac = np.nan
        stats_dd['Fraction CBE aberrant edit'].append(frac)

        timer.update()

    # Save
    with open(out_dir + '%s_%s_%s.pkl' % (exp_nm, start_idx, end_idx),
              'wb') as f:
        pickle.dump(new_data, f)

    stats_df_collected = pd.DataFrame(stats_dd)

    stats_df = lib_design.merge(
        stats_df_collected,
        on='Name (unique)',
        how='outer',
    )

    stats_df.to_csv(out_dir + '%s_%s_%s_stats.csv' %
                    (exp_nm, start_idx, end_idx))
    return
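A minimal sketch of merging per-target statistics back onto the library design on 'Name (unique)' with an outer join, using a vectorized stand-in for the ZeroDivisionError guard above; the frames are toy data.

import numpy as np
import pandas as pd

lib_design = pd.DataFrame({'Name (unique)': ['t1', 't2', 't3']})
stats = pd.DataFrame({
    'Name (unique)': ['t1', 't3'],
    'Edited count': [10, 0],
    'Total count': [100, 0],
})

# Zero totals become NaN fractions instead of raising
stats['Fraction edited'] = stats['Edited count'] / stats['Total count'].replace(0, np.nan)

# Outer join keeps designed targets with no recovered reads (t2) as NaN rows
merged = lib_design.merge(stats, on='Name (unique)', how='outer')
print(merged)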
Code example #28
0
def indel_anyindel(exp_nm):
    try:
        data = _data.load_data(exp_nm, 'ah6a1b_subtract')
    except Exception:
        print('Error : could not load data')
        sys.exit(1)

    lib_design, seq_col = _data.get_lib_design(exp_nm)

    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for target_nm in data:
        df = data[target_nm]

        tot_count = sum(df['Count'])
        dd['Total count'].append(tot_count)
        dd['Name (unique)'].append(target_nm)

        crit = (df['Category'] != 'wildtype')
        indel_count = sum(df[crit]['Count'])
        dd['Indel count'].append(indel_count)
        if tot_count != 0:
            dd['Indel freq'].append(indel_count / tot_count)
        else:
            dd['Indel freq'].append(np.nan)

        crit = (df['Category'] == 'del')
        del_count = sum(df[crit]['Count'])
        dd['Del count'].append(del_count)
        if tot_count != 0:
            dd['Del freq'].append(del_count / tot_count)
        else:
            dd['Del freq'].append(np.nan)

        crit = (df['Category'] == 'ins')
        ins_count = sum(df[crit]['Count'])
        dd['Ins count'].append(ins_count)
        if tot_count != 0:
            dd['Ins freq'].append(ins_count / tot_count)
        else:
            dd['Ins freq'].append(np.nan)

        crit = (df['Category'] == 'wildtype')
        wt_count = sum(df[crit]['Count'])
        dd['Wildtype count'].append(wt_count)
        if tot_count != 0:
            dd['Wildtype freq'].append(wt_count / tot_count)
        else:
            dd['Wildtype freq'].append(np.nan)

        timer.update()

    df = pd.DataFrame(dd)

    data = lib_design.merge(
        df,
        on='Name (unique)',
        how='outer',
    )

    # Annotate csvs
    pam_start_idx = 33
    pam_len = 5
    get_pam = lambda row: row['Sequence context (61nt)'][
        pam_start_idx:pam_start_idx + pam_len]

    data['Designed PAM (5nt)'] = data.apply(get_pam, axis='columns')

    get_true_grna_len = lambda row: 20 if row['gRNA (20nt)'][0] == 'G' else 21
    data['True gRNA length'] = data.apply(get_true_grna_len, axis='columns')

    grna_pos1_idx = 13
    grna_pos0_idx = grna_pos1_idx - 1
    def grna_5primeG_matches_target(row):
        # 20-nt gRNAs read the 5' G at grna_pos1_idx; 21-nt gRNAs at grna_pos0_idx
        idx = grna_pos1_idx if row['True gRNA length'] == 20 else grna_pos0_idx
        return bool(row['Sequence context (61nt)'][idx] == 'G')
    data['gRNA 5primeG matches target'] = data.apply(
        grna_5primeG_matches_target, axis='columns')

    # Prepare data
    # data = data[data['Total count'] >= 100]

    # # Gather statistics

    def grna_5primeg_and_len(row):
        grna_len = row['True gRNA length']
        match = row['gRNA 5primeG matches target']
        if match:
            return f'{grna_len}-nt gRNA, 5primeG matches'
        else:
            return f'{grna_len}-nt gRNA, 5primeG does not match'

    data['gRNA properties'] = data.apply(grna_5primeg_and_len, axis='columns')
    data.to_csv(out_dir + f'{exp_nm}.csv')

    data = data[data['Total count'] >= 100]

    pv_df = data.pivot(index='Name (unique)',
                       columns='gRNA properties',
                       values='Indel freq')
    pv_df.to_csv(out_dir + f'5primeG_{exp_nm}.csv')

    return
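A minimal sketch of the closing pivot: one row per target, one column per gRNA-property group, values holding the indel frequency; the values are invented.

import pandas as pd

data = pd.DataFrame({
    'Name (unique)': ['t1', 't2', 't3'],
    'gRNA properties': ['20-nt gRNA, 5primeG matches',
                        '21-nt gRNA, 5primeG does not match',
                        '20-nt gRNA, 5primeG matches'],
    'Indel freq': [0.02, 0.11, 0.04],
})

pv_df = data.pivot(index='Name (unique)',
                   columns='gRNA properties',
                   values='Indel freq')
print(pv_df)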