Example #1
def calc_statistics(df, exp, alldf_dict):
    # Compare observed vs. predicted MH deletion genotype frequencies
    # (length >= 5) where Genotype Position == Length, saving to alldf_dict

    df = _lib.mh_del_subset(df)
    df = _lib.indels_without_mismatches_subset(df)
    df = df[df['Length'] >= 5]
    if sum(df['Count']) <= 1000:
        return
    df['Frequency'] = _lib.normalize_frequency(df)

    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(df)
    pred_df = _predict2.predict_mhdel(seq, cutsite)
    pred_df = pred_df[pred_df['Length'] >= 5]
    pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

    join_cols = ['Category', 'Genotype Position', 'Length']
    mdf = df.merge(pred_df, how='outer', on=join_cols)
    mdf['Frequency'].fillna(value=0, inplace=True)
    mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
    obs = mdf['Frequency']
    pred = mdf['Predicted_Frequency']

    ns_criteria = (mdf['Length'] - mdf['Genotype Position'] == 0)
    s = mdf[ns_criteria]

    alldf_dict['Predicted Ngt'] += list(s['Predicted_Frequency'])
    alldf_dict['Observed Ngt'] += list(s['Frequency'])
    alldf_dict['_Experiment'] += [exp] * len(s['Frequency'])

    return alldf_dict
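
These calc_statistics functions are meant to be driven by an outer loop that accumulates alldf_dict across experiments and then tabulates it. A minimal hypothetical driver; exps and load_exp_df are stand-ins for the real experiment list and per-experiment loader, which are not part of the source:

import pandas as pd
from collections import defaultdict

# Hypothetical driver; exps and load_exp_df are illustrative only.
alldf_dict = defaultdict(list)
for exp in exps:
    calc_statistics(load_exp_df(exp), exp, alldf_dict)
alldf = pd.DataFrame(alldf_dict)  # one row per recorded statistic
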
Example #2
def calc_statistics(orig_df, exp, alldf_dict):
    # Compare observed vs. predicted MH deletion genotype frequencies for
    # Cpf1 data; record the Pearson r per experiment in alldf_dict

    df = _lib.mh_del_subset(orig_df)
    df = _lib.indels_without_mismatches_subset(df)
    df = df[df['Length'] >= 5]
    if sum(df['Count']) <= 500:
        return
    df['Frequency'] = _lib.normalize_frequency(df)

    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(df)
    pred_df = _predict2.predict_mhdel_cpf1(seq, cutsite)

    join_cols = ['Category', 'Genotype Position', 'Length']
    mdf = df.merge(pred_df, how='outer', on=join_cols)
    mdf['Frequency'].fillna(value=0, inplace=True)
    mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
    obs = mdf['Frequency']
    pred = mdf['Predicted_Frequency']
    r = pearsonr(obs, pred)[0]
    alldf_dict['gt_r'].append(r)

    alldf_dict['_Experiment'].append(exp)

    return alldf_dict
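
The outer merge plus zero-fill is what makes the correlation meaningful: genotypes observed but not predicted (and vice versa) survive the merge and count as zeros on the missing side. A minimal self-contained sketch of the pattern, with toy numbers:

import pandas as pd
from scipy.stats import pearsonr

# Toy version of the merge-then-correlate pattern above: genotypes seen on
# only one side survive the outer merge as zero-frequency on the other.
obs = pd.DataFrame({'Genotype Position': [1, 2], 'Frequency': [0.7, 0.3]})
pred = pd.DataFrame({'Genotype Position': [2, 3], 'Predicted_Frequency': [0.6, 0.4]})
mdf = obs.merge(pred, how='outer', on='Genotype Position').fillna(0)
print(pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0])
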
Example #3
def predict(inp_fn):
    # Run inDelphi deletion predictions for each record in a two-line FASTA
    # file, buffering results and flushing the buffer every 100,000 rows

    _predict2.init_model(run_iter='aay', param_iter='aae')
    df_buffer = init_df_buffer()
    df_buffer_nm = ''

    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                header = line.strip()
                if df_buffer_nm == '':
                    df_buffer_nm = header

            if i % 2 == 1:
                sequence = line.strip()
                if len(sequence) < 60:
                    continue
                df_buffer = add_del_profiles(header, sequence, df_buffer)

                print(len(df_buffer))
                if len(df_buffer) > 100000:
                    flush_df_buffer(df_buffer, df_buffer_nm)
                    df_buffer_nm = ''
                    df_buffer = init_df_buffer()
            timer.update()
    return
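
predict() assumes a strict two-line FASTA layout: headers on even lines, sequences on odd lines, with sequences under 60 nt skipped. A toy input, with illustrative names and sequences:

# Toy input for predict(); the file name and sequences are illustrative.
with open('toy.fa', 'w') as f:
    f.write('>target_1\n')
    f.write('ACGT' * 20 + '\n')  # 80 nt: passes the >= 60 nt filter
    f.write('>target_2\n')
    f.write('ACGT' * 10 + '\n')  # 40 nt: silently skipped

predict('toy.fa')  # runs within the module defining predict() above
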
Example #4
def calc_statistics(df, exp, alldf_dict):
    # Record 1 bp insertion vs. deletion ratios and sequence features
    # around the cutsite, saving to alldf_dict

    # Require at least 1000 edited reads
    if sum(_lib.crispr_subset(df)['Count']) <= 1000:
        return

    ins_criteria = ((df['Category'] == 'ins') & (df['Length'] == 1) &
                    (df['Indel with Mismatches'] != 'yes'))
    ins_count = sum(df[ins_criteria]['Count'])

    del_criteria = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes')
    del_count = sum(df[del_criteria]['Count'])
    if del_count == 0:
        return
    alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

    mhdel_crit = ((df['Category'] == 'del') &
                  (df['Indel with Mismatches'] != 'yes') &
                  (df['Microhomology-Based'] == 'yes'))
    mhdel_count = sum(df[mhdel_crit]['Count'])
    try:
        alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count /
                                                (mhdel_count + ins_count))
    except ZeroDivisionError:
        alldf_dict['Ins1bp/MHDel Ratio'].append(0)

    ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
    alldf_dict['Ins1bp Ratio'].append(ins_ratio)

    seq, cutsite = _lib.get_sequence_cutsite(df)
    alldf_dict['Sequence Context'].append(seq[-55:-30] + 'NNNN' + seq[-26:])

    alldf_dict['Fourbp'].append(seq[cutsite - 2:cutsite + 2])

    alldf_dict['Base1'].append(seq[cutsite - 2])
    alldf_dict['Base2'].append(seq[cutsite - 1])
    alldf_dict['Base3'].append(seq[cutsite])
    alldf_dict['Base4'].append(seq[cutsite + 1])

    _predict2.init_model()
    del_score = _predict2.total_deletion_score(seq, cutsite)
    alldf_dict['Del Score'].append(del_score)

    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    from scipy.stats import entropy
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    alldf_dict['Entropy'].append(norm_entropy)

    alldf_dict['_Experiment'].append(exp)

    return alldf_dict
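
The Entropy statistic divides by np.log(len(dlpred)) so the value lands in [0, 1] regardless of how many deletion lengths the model emits. A quick self-contained check:

import numpy as np
from scipy.stats import entropy

# Normalized entropy as used above: 1.0 for a uniform deletion length
# distribution, 0.0 for a point mass on a single length.
n = 28
uniform = np.ones(n) / n
point = np.zeros(n)
point[0] = 1.0
print(entropy(uniform) / np.log(n))  # -> 1.0
print(entropy(point) / np.log(n))    # -> 0.0
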
Example #5
def calc_statistics(orig_df, exp, alldf_dict):
  # Compare entropies of observed vs. predicted deletion genotype and
  # deletion-length distributions, saving to alldf_dict

  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()

  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']

  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs gt entropy'].append(obs_entropy)
  alldf_dict['pred gt entropy'].append(pred_entropy)

  df = orig_df[orig_df['Category'] == 'del']
  df = df[df['Length'] <= 28]
  df['Frequency'] = _lib.normalize_frequency(df)
  obs_dl = []
  for del_len in range(1, 28+1):
    freq = sum(df[df['Length'] == del_len]['Frequency'])
    obs_dl.append(freq)
  pred_dl = _predict2.deletion_length_distribution(seq, cutsite)

  obs_entropy = entropy(obs_dl) / np.log(len(obs_dl))
  pred_entropy = entropy(pred_dl) / np.log(len(pred_dl))
  alldf_dict['obs dl entropy'].append(obs_entropy)
  alldf_dict['pred dl entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)

  return alldf_dict
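
For reference, the per-length loop that builds obs_dl here is equivalent to a groupby plus reindex; a toy sketch:

import pandas as pd

# Equivalent to the obs_dl loop above: total Frequency per deletion
# length, with lengths that never occur filled in as 0.
df = pd.DataFrame({'Length': [1, 1, 3], 'Frequency': [0.25, 0.25, 0.5]})
obs_dl = (df.groupby('Length')['Frequency'].sum()
            .reindex(range(1, 28 + 1), fill_value=0.0))
print(obs_dl.loc[[1, 2, 3]].tolist())  # -> [0.5, 0.0, 0.5]
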
Example #6
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs, data_nm):
  # Merge observed indels with full inDelphi predictions, record the
  # fraction of edited reads explained, and write the merged table to CSV

  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  
  obs_d = defaultdict(list)

  df = orig_df
  # Grab observed deletions, MH and MH-less
  for del_len in range(1, 59+1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    s = df[crit]

    mh_s = s[s['Microhomology-Based'] == 'yes']
    for idx, row in mh_s.iterrows():
      obs_d['Count'].append(row['Count'])
      obs_d['Genotype Position'].append(row['Genotype Position'])
      obs_d['Length'].append(row['Length'])
      obs_d['Category'].append('del')

    mhless_s = s[s['Microhomology-Based'] != 'yes']
    obs_d['Length'].append(del_len)
    obs_d['Count'].append(sum(mhless_s['Count']))
    obs_d['Genotype Position'].append('e')
    obs_d['Category'].append('del')

  obs_df = pd.DataFrame(obs_d) 

  # Grab observed 1 bp insertions
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1) & (orig_df['Indel with Mismatches'] != 'yes')
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  obs_df = pd.concat([obs_df, ins_df], ignore_index=True)

  obs_df['Frequency'] = _lib.normalize_frequency(obs_df)

  crispr_subset = _lib.crispr_subset(orig_df)
  frac_explained = sum(obs_df['Count']) / sum(crispr_subset['Count'])
  # Save this for aggregate plotting
  alldf_dict['Fraction Explained'].append(frac_explained)

  # Predict MH dels and MH-less dels
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_indels(seq, cutsite, rate_model, bp_model)

  join_cols = ['Category', 'Genotype Position', 'Inserted Bases', 'Length']
  mdf = obs_df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
  r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]

  # Save the merged observed/predicted table to CSV, named by its
  # Pearson r, for downstream plotting

  data_nm_out_dir = out_dir + data_nm + '/'
  util.ensure_dir_exists(data_nm_out_dir)
  exp_out_dir = data_nm_out_dir + exp + '/'
  util.ensure_dir_exists(exp_out_dir)
  out_fn = exp_out_dir + '%.3f.csv' % (r)
  mdf.to_csv(out_fn)

  # Store in alldf_dict
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)

  return alldf_dict
Example #7
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
    # Compare observed vs. predicted genotype frequencies, including
    # predicted 1 bp insertions; record the Pearson r in alldf_dict

    df = _lib.mh_del_subset(orig_df)
    df = _lib.indels_without_mismatches_subset(df)
    if sum(df['Count']) <= 1000:
        return

    ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1)
    ins_df = orig_df[ins_crit]
    truncated_ins_d = defaultdict(list)
    for ins_base in list('ACGT'):
        crit = (ins_df['Inserted Bases'] == ins_base)
        tot_count = sum(ins_df[crit]['Count'])
        truncated_ins_d['Count'].append(tot_count)
        truncated_ins_d['Inserted Bases'].append(ins_base)
        truncated_ins_d['Category'].append('ins')
        truncated_ins_d['Length'].append(1)
    ins_df = pd.DataFrame(truncated_ins_d)
    df = pd.concat([df, ins_df], ignore_index=True)
    df['Frequency'] = _lib.normalize_frequency(df)

    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(orig_df)
    pred_df = _predict2.predict_mhdel(seq, cutsite)

    # Predict rate of 1 bp insertions
    # Featurize first
    del_score = _predict2.total_deletion_score(seq, cutsite)
    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    ohmapper = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    fivebase = seq[cutsite - 1]
    onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
    onebp_features = np.array(onebp_features).reshape(1, -1)
    rate_1bpins = float(rate_model.predict(onebp_features))

    # Predict 1 bp genotype frequencies
    pred_1bpins_d = defaultdict(list)
    for ins_base in bp_model[fivebase]:
        freq = bp_model[fivebase][ins_base]
        freq *= rate_1bpins / (1 - rate_1bpins)

        pred_1bpins_d['Category'].append('ins')
        pred_1bpins_d['Length'].append(1)
        pred_1bpins_d['Inserted Bases'].append(ins_base)
        pred_1bpins_d['Predicted_Frequency'].append(freq)

    pred_1bpins_df = pd.DataFrame(pred_1bpins_d)
    pred_df = pd.concat([pred_df, pred_1bpins_df], ignore_index=True)
    pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

    join_cols = ['Category', 'Genotype Position', 'Length', 'Inserted Bases']
    mdf = df.merge(pred_df, how='outer', on=join_cols)
    mdf['Frequency'].fillna(value=0, inplace=True)
    mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
    obs = mdf['Frequency']
    pred = mdf['Predicted_Frequency']
    r = pearsonr(obs, pred)[0]
    alldf_dict['gt_r'].append(r)

    obs_entropy = entropy(obs) / np.log(len(obs))
    pred_entropy = entropy(pred) / np.log(len(pred))
    alldf_dict['obs entropy'].append(obs_entropy)
    alldf_dict['pred entropy'].append(pred_entropy)

    alldf_dict['_Experiment'].append(exp)
    alldf_dict['rs'].append(rs)

    return alldf_dict
Example #8
def calc_statistics(df, exp, alldf_dict):
    # Record editing rate, 1 bp insertion/deletion ratios, and cutsite
    # sequence features per experiment in alldf_dict

    # Require at least 1000 edited reads
    if sum(_lib.crispr_subset(df)['Count']) <= 1000:
        return

    editing_rate = (sum(_lib.crispr_subset(df)['Count']) /
                    sum(_lib.notnoise_subset(df)['Count']))
    alldf_dict['Editing Rate'].append(editing_rate)

    ins_criteria = ((df['Category'] == 'ins') & (df['Length'] == 1) &
                    (df['Indel with Mismatches'] != 'yes'))
    ins_count = sum(df[ins_criteria]['Count'])

    del_criteria = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes')
    del_count = sum(df[del_criteria]['Count'])
    if del_count == 0:
        return
    alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

    mhdel_crit = ((df['Category'] == 'del') &
                  (df['Indel with Mismatches'] != 'yes') &
                  (df['Microhomology-Based'] == 'yes'))
    mhdel_count = sum(df[mhdel_crit]['Count'])
    try:
        alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count /
                                                (mhdel_count + ins_count))
    except ZeroDivisionError:
        alldf_dict['Ins1bp/MHDel Ratio'].append(0)

    ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
    alldf_dict['Ins1bp Ratio'].append(ins_ratio)

    seq, cutsite = _lib.get_sequence_cutsite(df)
    fivebase = seq[cutsite - 1]
    alldf_dict['Fivebase'].append(fivebase)

    _predict2.init_model()
    del_score = _predict2.total_deletion_score(seq, cutsite)
    alldf_dict['Del Score'].append(del_score)

    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    from scipy.stats import entropy
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    alldf_dict['Entropy'].append(norm_entropy)

    local_seq = seq[cutsite - 4:cutsite + 4]
    gc = (local_seq.count('C') + local_seq.count('G')) / len(local_seq)
    alldf_dict['GC'].append(gc)

    ohmapper = {
        'A': np.array([1, 0, 0, 0]),
        'C': np.array([0, 1, 0, 0]),
        'G': np.array([0, 0, 1, 0]),
        'T': np.array([0, 0, 0, 1])
    }
    alldf_dict['Fivebase_OH'].append(ohmapper[fivebase])

    threebase = seq[cutsite]
    alldf_dict['Threebase'].append(threebase)
    alldf_dict['Threebase_OH'].append(ohmapper[threebase])

    alldf_dict['_Experiment'].append(exp)

    return alldf_dict
Example #9
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
    # Compare observed frameshift frequencies (+0/+1/+2) against inDelphi
    # and the Bae baseline predictions, saving to alldf_dict

    df = _lib.mh_del_subset(orig_df)
    df = _lib.indels_without_mismatches_subset(df)
    if sum(df['Count']) <= 1000:
        return

    df = orig_df

    # Get observed frameshift rates
    obs_fs = {'+0': 0, '+1': 0, '+2': 0}
    all_ins_lens = set(df[df['Category'].isin(['ins', 'ins_notatcut'])]['Length'])
    for ins_len in all_ins_lens:
        crit = df['Category'].isin(['ins', 'ins_notatcut']) & (df['Length'] == ins_len)
        fs = ins_len % 3
        count = sum(df[crit]['Count'])
        key = '+%s' % (int(fs))
        obs_fs[key] += count

    all_del_lens = set(df[df['Category'].isin(['del', 'del_notatcut'])]['Length'])
    for del_len in all_del_lens:
        crit = df['Category'].isin(['del', 'del_notatcut']) & (df['Length'] == del_len)
        fs = (-1 * del_len) % 3
        count = sum(df[crit]['Count'])
        key = '+%s' % (int(fs))
        obs_fs[key] += count

    tot = sum(obs_fs.values())
    for key in obs_fs:
        obs_fs[key] /= tot

    # Predict
    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(orig_df)

    # Predict rate of 1 bp insertions
    # Featurize first
    del_score = _predict2.total_deletion_score(seq, cutsite)
    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    ohmapper = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    fivebase = seq[cutsite - 1]
    onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
    onebp_features = np.array(onebp_features).reshape(1, -1)
    rate_1bpins = float(rate_model.predict(onebp_features))

    # Predict 1 bp frequency
    freq = rate_1bpins / (1 - rate_1bpins)
    pred = list(dlpred)
    pred.insert(0, freq)
    pred = np.array(pred) / sum(pred)

    pred_fs = {'+0': 0, '+1': 0, '+2': 0}
    pred_fs['+1'] += pred[0]
    for idx in range(1, len(pred)):
        del_len = idx
        fs = (-1 * del_len) % 3
        key = '+%s' % (int(fs))
        pred_fs[key] += pred[idx]

    # Bae predict
    bae_fs = {'+0': 0, '+1': 0, '+2': 0}
    bae_dlpred = bae_prediction(seq, cutsite)
    for idx in range(len(bae_dlpred)):
        del_len = idx + 1
        fs = (-1 * del_len) % 3
        key = '+%s' % (int(fs))
        bae_fs[key] += bae_dlpred[idx]

    for fs in ['+0', '+1', '+2']:
        alldf_dict['Frame'].append(fs)
        alldf_dict['Bae'].append(bae_fs[fs])
        alldf_dict['inDelphi'].append(pred_fs[fs])
        alldf_dict['Obs'].append(obs_fs[fs])

        alldf_dict['_Experiment'].append(exp)
        alldf_dict['rs'].append(rs)

    return alldf_dict
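
The frame bookkeeping rests on modular arithmetic: a deletion of length L shifts the reading frame by (-L) % 3, and a 1 bp insertion by +1, which is why pred[0] is added to the '+1' bin. A one-line check:

# Frame arithmetic sanity check: deletions of length 1..6 shift the
# reading frame by 2, 1, 0, 2, 1, 0; a 1 bp insertion shifts it by +1.
print([(-1 * del_len) % 3 for del_len in range(1, 7)])  # -> [2, 1, 0, 2, 1, 0]
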
Example #10
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
    # Compare the observed indel length distribution (1 bp insertion plus
    # 1-28 bp deletions) against inDelphi's; record Pearson r and entropies

    df = _lib.mh_del_subset(orig_df)
    df = _lib.indels_without_mismatches_subset(df)
    if sum(df['Count']) <= 1000:
        return

    df = orig_df

    obs_dellens = []
    # 1 bp insertion
    crit = ((df['Category'] == 'ins') &
            (df['Indel with Mismatches'] != 'yes') & (df['Length'] == 1))
    obs_dellens.append(sum(df[crit]['Count']))

    # 1 - 28 bp deletions
    for del_len in range(1, 28 + 1):
        crit = ((df['Category'] == 'del') &
                (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len))
        obs_dellens.append(sum(df[crit]['Count']))
    obs_dellens = np.array(obs_dellens) / sum(obs_dellens)
    obs = obs_dellens

    # Predict
    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(orig_df)

    # Predict rate of 1 bp insertions
    # Featurize first
    del_score = _predict2.total_deletion_score(seq, cutsite)
    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    ohmapper = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    fivebase = seq[cutsite - 1]
    onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
    onebp_features = np.array(onebp_features).reshape(1, -1)
    rate_1bpins = float(rate_model.predict(onebp_features))

    # Predict 1 bp frequency
    freq = rate_1bpins / (1 - rate_1bpins)
    pred = list(dlpred)
    pred.insert(0, freq)
    pred = np.array(pred) / sum(pred)

    r = pearsonr(obs, pred)[0]
    alldf_dict['dl_r'].append(r)

    obs_entropy = entropy(obs) / np.log(len(obs))
    pred_entropy = entropy(pred) / np.log(len(pred))
    alldf_dict['obs entropy'].append(obs_entropy)
    alldf_dict['pred entropy'].append(pred_entropy)

    alldf_dict['_Experiment'].append(exp)
    alldf_dict['rs'].append(rs)

    return alldf_dict
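
The prepend-and-renormalize step above is exact: if dlpred sums to 1 and the prepended entry is p / (1 - p), renormalization assigns mass p to the 1 bp insertion entry. A numeric check, with an arbitrary illustrative rate p:

import numpy as np

# With dlpred summing to 1, prepending p / (1 - p) and renormalizing puts
# exactly p of the probability mass on the 1 bp insertion entry.
p = 0.2
dlpred = np.ones(28) / 28
pred = np.concatenate([[p / (1 - p)], dlpred])
pred /= pred.sum()
print(pred[0])  # -> 0.2 (up to floating point)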