def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params
  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]

  data = data[data['Position'].isin(allowed_pos)]

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # Gather statistics

  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]]
    else:
      # Pool multiple substitution types under one mutation name and
      # sum their frequencies within each group
      d_temp = data[data['Mutation'].isin(mut)].copy()
      d_temp['Mutation'] = mut_nm
      d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
      group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
      d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)

  return
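
# For reference, a minimal sketch of how gather_statistics might be invoked.
# The actual `muts` dictionary is not shown in this snippet; the structure
# below (mutation name -> list of 'Ref_Obs' strings) is inferred from how
# `mut` is used above, and all names and values here are illustrative.
example_muts = {
  'C_T': ['C_T'],            # a single substitution type
  'C_GA': ['C_G', 'C_A'],    # pooled substitution types, frequencies summed
}
example_params = (example_muts, range(3, 8 + 1), 10)
# gather_statistics('190307_HEK_CtoT_BE4', example_params)
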
def indel_anyindel_seq(exp_nm):
    '''
    Annotate indels with related sequence context (e.g., the bases in deletions)
    '''
    df = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    indel_dd = defaultdict(list)
    all_nms = set(df['Name'])
    timer = util.Timer(total=len(df))
    for idx, row in df.iterrows():
        nm = row['Name']
        seq = nm_to_seq[nm]
        left_del_nt = np.nan
        right_del_nt = np.nan
        del_nts = np.nan

        if row['Category'] == 'del':
            start_pos = int(row['Indel start adj'])
            start_idx = _data.pos_to_idx(start_pos, exp_nm)

            end_pos = int(row['Indel end adj'])
            end_idx = _data.pos_to_idx(end_pos, exp_nm)

            # Require a non-empty, in-bounds slice before indexing into it
            if 0 <= start_idx < end_idx <= len(seq):
                del_nts = seq[start_idx:end_idx]
                left_del_nt = del_nts[0]
                right_del_nt = del_nts[-1]

        indel_dd['Left del nt'].append(left_del_nt)
        indel_dd['Right del nt'].append(right_del_nt)
        indel_dd['Del nts'].append(del_nts)

        timer.update()

    for col in indel_dd:
        df[col] = indel_dd[col]

    df.to_csv(out_dir + '%s.csv' % (exp_nm))

    return
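
# The index arithmetic above relies on _data.pos_to_idx, which maps a
# protospacer-relative position to an index into the library sequence.
# A minimal stand-in, assuming a fixed per-library offset (the offsets below
# come from the zero_pos comments in load_human_data further down); the real
# function takes an experiment name and resolves the library itself.
_EXAMPLE_ZERO_POS = {'12kChar': 21, 'CtoT': 10, 'AtoG': 10}

def example_pos_to_idx(pos, lib_nm):
    # Position 0 maps to index _EXAMPLE_ZERO_POS[lib_nm]
    return pos + _EXAMPLE_ZERO_POS[lib_nm]

assert example_pos_to_idx(0, '12kChar') == 21
assert example_pos_to_idx(-19, '12kChar') == 2
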
def load_human_data(dataset_id):
  if 'CSNVL' not in dataset_id:
    lib_nm = _data.get_lib_nm(dataset_id)
    lib_design, seq_col = _data.get_lib_design(dataset_id)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
  else:
    # Use any conds to load 12kChar, CtoT, and AtoG libs
    dids = ['190418_mES_12kChar_AID', '190329_HEK293T_AtoG_ABE', '190307_HEK_CtoT_BE4']
    nms, seqs = [], []
    for did in dids:
      lib_design, seq_col = _data.get_lib_design(did)
      nms += list(lib_design['Name (unique)'])
      seqs += list(lib_design[seq_col])

  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}


  Y_dir = _config.OUT_PLACE + 'combin_data_Y_imputewt/'
  with gzip.open(Y_dir + '%s.pkl.gz' % (dataset_id), 'rb') as f:
    Y = pickle.load(f)
  
  NAMES = list(Y.keys())
  Y = list(Y.values())

  # Load X
  if 'CSNVL' not in dataset_id:
    zero_idxs = [_data.pos_to_idx(0, dataset_id)] * len(NAMES)
  else:
    zero_idxs = []
    for nm in NAMES:
      if 'satmut' in nm:
        # 12kChar zero position (index 21)
        zero_idxs.append(_data.zero_pos['12kChar'])
      else:
        # CtoT and AtoG libraries share the same zero position (index 10)
        zero_idxs.append(_data.zero_pos['CtoT'])

  X = []
  timer = _util.Timer(total = len(NAMES))
  for nm, y, zero_idx in zip(NAMES, Y, zero_idxs):
    seq = nm_to_seq[nm]
    # The 30-nt window would be seq[zero_idx - 9 : zero_idx + 20 + 1];
    # extend it by 10 nt on each side to get 50 nt.
    if zero_idx < 9 + 10:
      # CtoT and AtoG libs lack enough upstream sequence; pad with a fixed
      # prefix. (12kChar already has enough upstream sequence.)
      prefix = 'GATGGGTGCGACGCGTCAT'
      seq = prefix + seq
      zero_idx += len(prefix)

    seq_50nt = seq[zero_idx - 9 - 10 : zero_idx + 20 + 10 + 1]
    assert len(seq_50nt) == 50
    X.append(seq_50nt)

  return X, Y, NAMES
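
# Quick self-contained check of the 50-nt window arithmetic used above:
# 19 nt upstream of the zero index, the zero base itself, and 30 nt
# downstream, for 50 nt total. The zero index value is illustrative.
zero_idx_example = 21   # e.g., the 12kChar zero position
window_start = zero_idx_example - 9 - 10
window_end = zero_idx_example + 20 + 10 + 1
assert window_end - window_start == 50
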
def indel_anyindel_seq(exp_nm):
  '''
  Investigate whether 1-nt deletions at the abasic site are related to
  microhomology. Control for position by focusing only on position 5.
  '''
  df = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  dd = defaultdict(list)
  all_nms = set(df['Name'])

  # Index of the focal base; 'five_idx' is named for the 1-nt deletion
  # end coordinate ('Indel end adj' == 5.0) used below
  five_idx = _data.pos_to_idx(4, exp_nm)

  timer = util.Timer(total = len(all_nms))
  for nm in all_nms:
    dfs = df[df['Name'] == nm]

    seq = nm_to_seq[nm]
    # Accept sites with a C at the focal index that is not followed by another C
    accept = (seq[five_idx] == 'C') and (seq[five_idx + 1] != 'C')

    # Count the run of consecutive C's ending at the focal index,
    # scanning upstream until the first non-C
    for jdx in range(five_idx - 1, -1, -1):
      if seq[jdx] != 'C':
        break
    num_c = abs(five_idx - jdx)

    if not accept:
      continue

    crit = (dfs['Category'] == 'del') & (dfs['Indel length'] == 1) & (dfs['Indel end adj'] == 5.0)
    row = dfs[crit]

    if len(row) == 0:
      dd['Frequency'].append(0)
    else:
      dd['Frequency'].append(sum(row['Frequency']))

    dd['Num C'].append(num_c)
    dd['Name'].append(nm)

    timer.update()

  df = pd.DataFrame(dd)
  df.to_csv(out_dir + '%s.csv' % (exp_nm))

  return
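
# Toy check of the upstream C-run count used above. With a run of three C's
# ending at the focal index, the loop breaks at the first non-C and num_c
# counts the full run. The sequence here is made up.
toy_seq = 'TTTCCC' + 'A' * 10
toy_five_idx = 5
for jdx in range(toy_five_idx - 1, -1, -1):
  if toy_seq[jdx] != 'C':
    break
assert abs(toy_five_idx - jdx) == 3   # C's at indices 3, 4, 5
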
def filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm):
    # Zero out the (position, observed nt) entry in every target whose
    # reference base matches each flagged mutation in to_remove
    # timer = util.Timer(total = len(to_remove))
    for idx, row in to_remove.iterrows():
        pos_idx = _data.pos_to_idx(row['Position'], treat_nm)
        kdx = nt_to_idx[row['Obs nt']]
        ref_nt = row['Ref nt']

        for nm in adj_d:
            seq = nm_to_seq[nm]
            if seq[pos_idx] == ref_nt:
                t = adj_d[nm]
                t[pos_idx][kdx] = 0
                adj_d[nm] = t
        # timer.update()

    return adj_d
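
# A minimal sketch of the data structures filter_mutations operates on, under
# the assumption (consistent with the indexing above) that adj_d maps each
# target name to a position x nucleotide count array and nt_to_idx maps a
# base to its column. All names and values here are illustrative.
import numpy as np

example_nt_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
example_seq = 'ACGTACGTAC'
example_adj_d = {'site_1': np.zeros((len(example_seq), 4))}
example_adj_d['site_1'][5][example_nt_to_idx['G']] = 12  # e.g., 12 G reads at index 5
# Zeroing an entry, as filter_mutations does for a flagged mutation:
example_adj_d['site_1'][5][example_nt_to_idx['G']] = 0
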
def filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm):
    # Variant of filter_mutations above: marks filtered entries as np.nan
    # instead of 0, and tolerates out-of-range positions
    # timer = util.Timer(total = len(to_remove))
    for idx, row in to_remove.iterrows():
        pos_idx = _data.pos_to_idx(row['Position'], treat_nm)
        kdx = nt_to_idx[row['Obs nt']]
        ref_nt = row['Ref nt']

        for nm in adj_d:
            seq = nm_to_seq[nm]

            try:
                if seq[pos_idx] == ref_nt:
                    t = adj_d[nm]
                    t[pos_idx][kdx] = np.nan
                    adj_d[nm] = t
            except IndexError:
                # 8/14/19: Unclear why this happens -- an out-of-range pos_idx
                # should not have been eligible for batch-effect identification
                # in this treat_nm in the first place. Hacky fix :/
                print(treat_nm, nm, pos_idx, len(seq))
                pass
        # timer.update()

    return adj_d
def gather_statistics(exp_nm):
    feature_radius = 10
    allowed_pos = range(3, 8 + 1)
    # Load data
    data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col=0)

    # Set up library info
    lib_nm = _data.get_lib_nm(exp_nm)
    lib_design, seq_col = _data.get_lib_design(exp_nm)
    nms = lib_design['Name (unique)']
    seqs = lib_design[seq_col]
    nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

    # Prepare data
    data = data[data['Total count'] >= 100]
    data['Frequency'] = data['Count'] / data['Total count']

    ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
    data = data[data['Name'].isin(ontarget_sites)]

    data = data[data['Position'].isin(allowed_pos)]

    data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']

    # Annotate with local sequence context
    lib_zero_idx = _data.pos_to_idx(0, exp_nm)
    dd = defaultdict(list)
    print('Annotating data with local sequence contexts...')
    timer = util.Timer(total=len(data))
    for idx, row in data.iterrows():
        seq = nm_to_seq[row['Name']]
        pidx = row['Position'] + lib_zero_idx
        local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
        dd['Local context'].append(local_context)
        timer.update()
    for col in dd:
        data[col] = dd[col]

    # Gather statistics

    print(data.columns)
    print(set(data['Mutation']))

    acc_muts = [
        'C_T',
        'C_G',
        'C_A',
    ]
    data = data[data['Mutation'].isin(acc_muts)]
    data = data.drop(columns=['Count', 'Total count', 'Ref nt', 'Obs nt'])
    data = data.pivot_table(
        index=['Name', 'Position', 'Local context'],
        columns='Mutation',
        values='Frequency',
    ).reset_index()
    data = data.fillna(value=0)

    # Train models on each ratio statistic: rewrite 'Frequency' as
    # numerator / denominator, drop rows where the ratio is undefined,
    # and rename MutName accordingly.
    ratio_specs = [
        ('C_GA_over_C_D', ['C_G', 'C_A'], ['C_T', 'C_G', 'C_A']),
        ('C_T_over_C_D', ['C_T'], ['C_T', 'C_G', 'C_A']),
        ('C_G_over_C_D', ['C_G'], ['C_T', 'C_G', 'C_A']),
        ('C_A_over_C_D', ['C_A'], ['C_T', 'C_G', 'C_A']),
        ('C_G_over_C_GA', ['C_G'], ['C_A', 'C_G']),
    ]
    for mut_name, num_cols, denom_cols in ratio_specs:
        numerator = data[num_cols].sum(axis=1)
        denominator = data[denom_cols].sum(axis=1)
        data['Frequency'] = numerator / denominator
        data = data.dropna()

        data['MutName'] = (data['Name'].astype(str) + '_' +
                           data['Position'].astype(str) + '_' + mut_name)
        print(data.shape)

        for ml_task in ['regress_nonzero', 'classify_zero']:
            print(ml_task)
            results = train_models(exp_nm, data, mut_name, ml_task)
            save_results(exp_nm, mut_name, ml_task, results)

    return
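
# Toy illustration of the pivot + ratio step above, assuming only the columns
# used in gather_statistics. The frequencies are made up.
def _example_ratio():
    toy = pd.DataFrame({
        'Name': ['s1'] * 3,
        'Position': [5] * 3,
        'Local context': ['ACGT'] * 3,
        'Mutation': ['C_T', 'C_G', 'C_A'],
        'Frequency': [0.30, 0.05, 0.01],
    })
    wide = toy.pivot_table(
        index=['Name', 'Position', 'Local context'],
        columns='Mutation',
        values='Frequency',
    ).reset_index().fillna(0)
    # C_GA_over_C_D: transversion fraction among all edits at the C
    ratio = (wide['C_G'] + wide['C_A']) / (wide['C_T'] + wide['C_G'] + wide['C_A'])
    return float(ratio.iloc[0])   # 0.06 / 0.36 = 0.1667
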