Code example #1
0
def calc_statistics(df, exp, alldf_dict):
    """Accumulate observed vs. predicted MH-deletion 'N-gt' frequencies.

    Restricts `df` to microhomology deletions (length >= 5) without
    mismatches, skips low-coverage experiments, predicts per-genotype
    deletion frequencies with `_predict2`, and appends the observed and
    predicted frequencies of genotypes whose deletion spans exactly to the
    cutsite (Genotype Position == Length) into `alldf_dict`.

    Args:
        df: Per-experiment indel DataFrame (project schema; expects
            'Count', 'Length', 'Category', 'Genotype Position' columns).
        exp: Experiment identifier recorded alongside each statistic.
        alldf_dict: dict of lists accumulating results across experiments.

    Returns:
        The mutated `alldf_dict`, or None when the experiment is skipped
        for insufficient read count.
    """
    # Deletion positions: keep only MH deletions without mismatches.
    df = _lib.mh_del_subset(df)
    df = _lib.indels_without_mismatches_subset(df)
    df = df[df['Length'] >= 5]
    # Skip low-coverage experiments; frequencies would be too noisy.
    if sum(df['Count']) <= 1000:
        return
    df['Frequency'] = _lib.normalize_frequency(df)

    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(df)
    pred_df = _predict2.predict_mhdel(seq, cutsite)
    pred_df = pred_df[pred_df['Length'] >= 5]
    # Renormalize predictions over the retained (length >= 5) subset.
    pred_df['Predicted_Frequency'] = pred_df['Predicted_Frequency'] / sum(
        pred_df['Predicted_Frequency'])

    join_cols = ['Category', 'Genotype Position', 'Length']
    mdf = df.merge(pred_df, how='outer', on=join_cols)
    # Outer merge leaves NaN where one side lacks a genotype; treat as 0.
    # Direct assignment instead of chained `inplace=True` fillna avoids the
    # pandas chained-assignment pitfall (and its FutureWarning).
    mdf['Frequency'] = mdf['Frequency'].fillna(0)
    mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)

    # 'N-gt' genotypes: the deletion spans exactly up to the cutsite,
    # i.e. Genotype Position equals deletion Length.
    ns_criteria = (mdf['Length'] - mdf['Genotype Position'] == 0)
    s = mdf[ns_criteria]

    alldf_dict['Predicted Ngt'] += list(s['Predicted_Frequency'])
    alldf_dict['Observed Ngt'] += list(s['Frequency'])
    alldf_dict['_Experiment'] += [exp] * len(s['Frequency'])

    return alldf_dict
Code example #2
0
def add_del_profiles(header, sequence, df_buffer):
    """Append predicted deletion profiles for every PAM site in `sequence`.

    Scans `sequence` for PAM dinucleotides: 'CC' (target on the reverse
    strand) and 'GG' (target on the forward strand). For each hit, extracts
    the 60-bp window centered on the inferred cutsite, predicts the
    MH-deletion profile, and also predicts on a shuffled control of the same
    window (both flanks shuffled, central 'GG' preserved).

    Args:
        header: Identifier recorded on every appended prediction row.
        sequence: DNA sequence string to scan.
        df_buffer: DataFrame accumulating prediction rows.

    Returns:
        `df_buffer` with the new prediction rows appended.
    """

    def _annotate_and_append(pred_df, pam, cutsite, shuffled, buf):
        # Attach provenance columns shared by real and shuffled predictions.
        pred_df['header'] = header
        pred_df['seq'] = sequence
        pred_df['pam'] = pam
        pred_df['cutsite'] = cutsite
        pred_df['shuffled'] = shuffled
        return buf.append(pred_df, ignore_index=True)

    for idx in range(len(sequence)):
        seq = ''
        if sequence[idx:idx + 2] == 'CC':
            # PAM on the reverse strand: cutsite is 6 nt downstream; take the
            # reverse complement so the window is in protospacer orientation.
            cutsite = idx + 6
            seq = sequence[cutsite - 30:cutsite + 30]
            seq = compbio.reverse_complement(seq)
        if sequence[idx:idx + 2] == 'GG':
            # PAM on the forward strand: cutsite is 4 nt upstream of the PAM.
            cutsite = idx - 4
            seq = sequence[cutsite - 30:cutsite + 30]

        if seq != '':
            # Windows truncated at the sequence ends are skipped.
            if len(seq) != 60:
                continue
            local_cutsite = 30
            pred_df = _predict2.predict_mhdel(seq, local_cutsite)
            df_buffer = _annotate_and_append(
                pred_df, sequence[idx:idx + 2], cutsite, 'no', df_buffer)

            # Shuffled control: shuffle both flanks, keep the PAM 'GG'
            # (window positions 34:36) fixed.
            pre, post = list(seq[:34]), list(seq[36:])
            random.shuffle(pre)
            random.shuffle(post)
            shuffled_seq = ''.join(pre) + 'GG' + ''.join(post)
            # BUG FIX: the original predicted on `seq` again, so the
            # 'shuffled' rows duplicated the real prediction and
            # `shuffled_seq` was never used. Predict on the shuffled window.
            shuffled_pred_df = _predict2.predict_mhdel(
                shuffled_seq, local_cutsite)
            df_buffer = _annotate_and_append(
                shuffled_pred_df, sequence[idx:idx + 2], cutsite, 'yes',
                df_buffer)
    return df_buffer
Code example #3
0
def calc_statistics(orig_df, exp, alldf_dict):
  """Record normalized entropies of observed vs. predicted distributions.

  Computes two pairs of statistics for one experiment: the normalized
  Shannon entropy of the MH-deletion genotype distribution (observed and
  predicted), and the normalized entropy of the deletion-length
  distribution over lengths 1..28.

  Args:
      orig_df: Per-experiment indel DataFrame (project schema; expects
          'Count', 'Length', 'Category', 'Genotype Position' columns).
      exp: Experiment identifier recorded alongside each statistic.
      alldf_dict: dict of lists accumulating results across experiments.

  Returns:
      The mutated `alldf_dict`, or None when the experiment is skipped for
      insufficient read count.
  """
  # Deletion positions: keep only MH deletions without mismatches.
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  # Skip low-coverage experiments; frequencies would be too noisy.
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()

  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how = 'outer', on = join_cols)
  # Outer merge leaves NaN where one side lacks a genotype; treat as 0.
  # Direct assignment instead of chained `inplace=True` fillna avoids the
  # pandas chained-assignment pitfall (and its FutureWarning).
  mdf['Frequency'] = mdf['Frequency'].fillna(0)
  mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)
  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']

  # Normalized (0..1) entropy of the genotype distributions; dividing by
  # log(n) scales against the uniform-distribution maximum.
  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs gt entropy'].append(obs_entropy)
  alldf_dict['pred gt entropy'].append(pred_entropy)

  # Deletion-length distribution over all deletions of length 1..28.
  # BUG FIX: .copy() — the original assigned a new 'Frequency' column onto a
  # boolean-mask slice of orig_df (SettingWithCopy hazard: the write can be
  # silently lost or warn, and may alias the caller's frame).
  df = orig_df[orig_df['Category'] == 'del'].copy()
  df = df[df['Length'] <= 28]
  df['Frequency'] = _lib.normalize_frequency(df)
  obs_dl = []
  for del_len in range(1, 28+1):
    freq = sum(df[df['Length'] == del_len]['Frequency'])
    obs_dl.append(freq)
  pred_dl = _predict2.deletion_length_distribution(seq, cutsite)

  obs_entropy = entropy(obs_dl) / np.log(len(obs_dl))
  pred_entropy = entropy(pred_dl) / np.log(len(pred_dl))
  alldf_dict['obs dl entropy'].append(obs_entropy)
  alldf_dict['pred dl entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)

  return alldf_dict
Code example #4
0
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
    """Record Pearson r and normalized entropies of observed vs. predicted
    indel genotype frequencies (MH deletions plus 1-bp insertions) for one
    experiment, appending results to alldf_dict.

    Args:
        orig_df: Per-experiment indel DataFrame (project schema; expects
            'Count', 'Length', 'Category', 'Inserted Bases' columns).
        exp: Experiment identifier recorded alongside each statistic.
        rate_model: Fitted regressor with .predict(); maps the 6-dim
            feature vector built below to a 1-bp insertion rate.
        bp_model: Nested mapping fivebase -> inserted base -> frequency
            for 1-bp insertions.
        alldf_dict: dict of lists accumulating results across experiments.
        rs: Run/seed tag recorded with the results.

    Returns:
        The mutated alldf_dict, or None when the experiment is skipped
        for insufficient read count.
    """
    # Deletion positions: keep only MH deletions without mismatches.

    df = _lib.mh_del_subset(orig_df)
    df = _lib.indels_without_mismatches_subset(df)
    # Skip low-coverage experiments; frequencies would be too noisy.
    if sum(df['Count']) <= 1000:
        return

    # Collapse observed 1-bp insertions into one row per inserted base
    # (A/C/G/T), then append them to the deletion table so observed and
    # predicted tables share a schema.
    ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1)
    ins_df = orig_df[ins_crit]
    truncated_ins_d = defaultdict(list)
    for ins_base in list('ACGT'):
        crit = (ins_df['Inserted Bases'] == ins_base)
        tot_count = sum(ins_df[crit]['Count'])
        truncated_ins_d['Count'].append(tot_count)
        truncated_ins_d['Inserted Bases'].append(ins_base)
        truncated_ins_d['Category'].append('ins')
        truncated_ins_d['Length'].append(1)
    ins_df = pd.DataFrame(truncated_ins_d)
    df = df.append(ins_df, ignore_index=True)
    df['Frequency'] = _lib.normalize_frequency(df)

    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(orig_df)
    pred_df = _predict2.predict_mhdel(seq, cutsite)

    # Predict rate of 1 bp insertions
    # Featurize first: one-hot of the base 5' of the cut, plus the
    # normalized entropy of the predicted deletion-length distribution
    # and the total deletion score.
    del_score = _predict2.total_deletion_score(seq, cutsite)
    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    ohmapper = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    # Base immediately 5' of the cutsite drives 1-bp insertion identity.
    fivebase = seq[cutsite - 1]
    onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
    onebp_features = np.array(onebp_features).reshape(1, -1)
    rate_1bpins = float(rate_model.predict(onebp_features))

    # Predict 1 bp genotype frequencies: per-base frequencies from
    # bp_model, scaled by the odds ratio of the predicted insertion rate
    # so they are comparable to the deletion predictions before the joint
    # renormalization below.
    pred_1bpins_d = defaultdict(list)
    for ins_base in bp_model[fivebase]:
        freq = bp_model[fivebase][ins_base]
        freq *= rate_1bpins / (1 - rate_1bpins)

        pred_1bpins_d['Category'].append('ins')
        pred_1bpins_d['Length'].append(1)
        pred_1bpins_d['Inserted Bases'].append(ins_base)
        pred_1bpins_d['Predicted_Frequency'].append(freq)

    pred_1bpins_df = pd.DataFrame(pred_1bpins_d)
    pred_df = pred_df.append(pred_1bpins_df, ignore_index=True)
    # Renormalize so deletion + insertion predictions sum to 1.
    pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

    join_cols = ['Category', 'Genotype Position', 'Length', 'Inserted Bases']
    mdf = df.merge(pred_df, how='outer', on=join_cols)
    # Outer merge leaves NaN where one side lacks a genotype; treat as 0.
    mdf['Frequency'].fillna(value=0, inplace=True)
    mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
    obs = mdf['Frequency']
    pred = mdf['Predicted_Frequency']
    # Pearson correlation of observed vs. predicted genotype frequencies.
    r = pearsonr(obs, pred)[0]
    alldf_dict['gt_r'].append(r)

    # Normalized (0..1) entropy of each genotype distribution.
    obs_entropy = entropy(obs) / np.log(len(obs))
    pred_entropy = entropy(pred) / np.log(len(pred))
    alldf_dict['obs entropy'].append(obs_entropy)
    alldf_dict['pred entropy'].append(pred_entropy)

    alldf_dict['_Experiment'].append(exp)
    alldf_dict['rs'].append(rs)

    return alldf_dict