def main():
    print(NAME)

    import glob
    mdf = pd.DataFrame()
    fns = glob.glob(inp_dir + '*bootstrap*')
    timer = util.Timer(total=len(fns))
    for fn in fns:
        cond = fn.split('/')[-1].replace('_bootstrap.csv', '')
        df = pd.read_csv(fn, index_col=0)
        df['Condition'] = cond
        mdf = mdf.append(df, ignore_index=True)
        timer.update()

    mdf.to_csv(out_dir + '_combined_gmean_bootstrap.csv')

    # Not bootstrap
    mdf = pd.DataFrame()
    fns = [fn for fn in os.listdir(inp_dir) if 'bootstrap' not in fn]
    timer = util.Timer(total=len(fns))
    for fn in fns:
        df = pd.read_csv(inp_dir + fn)
        cond = fn.replace('.csv', '')
        df['Condition'] = cond
        n = len(df)
        df['Regression weight'] = 1 / n
        mdf = mdf.append(df, ignore_index=True)
        timer.update()

    mdf.to_csv(out_dir + '_all_ratios.csv')

    return
def combine(modelexp_nm):
    # aggstats
    mdf = pd.DataFrame()
    timer = util.Timer(total=params['num_splits'])
    for split in range(params['num_splits']):
        inp_fn = inp_dir + f'{modelexp_nm}/aggstats_{split}.csv'
        if os.path.isfile(inp_fn):
            df = pd.read_csv(inp_fn, index_col=0)
            mdf = mdf.append(df, ignore_index=True, sort=False)
        timer.update()
    mdf.to_csv(out_dir + f'{modelexp_nm}-aggstats.csv')

    # evals
    me_df = pd.read_csv(_config.DATA_DIR + f'{modelexp_nm}.csv')
    mdf = pd.DataFrame()
    timer = util.Timer(total=len(me_df))
    for idx in range(len(me_df)):
        inp_fn = inp_dir + f'{modelexp_nm}/evals_{idx}.csv'
        if os.path.isfile(inp_fn):
            df = pd.read_csv(inp_fn, index_col=0)
            mdf = mdf.append(df, ignore_index=True, sort=False)
        timer.update()
    mdf.to_csv(out_dir + f'{modelexp_nm}-evals.csv')

    return
def gather(editor):
    print(editor)
    conds = exp_design_df[exp_design_df['Editor'] == editor]['Name']

    # mdf
    mdf = pd.DataFrame()
    timer = util.Timer(total=len(conds))
    for cond in conds:
        try:
            df = pd.read_csv(inp_dir + f'{cond}.csv', index_col=0)
            mdf = mdf.append(df, ignore_index=True, sort=False)
        except:
            continue
        timer.update()
    mdf.to_csv(out_dir + f'{editor}.csv')

    # len melt
    mdf = pd.DataFrame()
    timer = util.Timer(total=len(conds))
    for cond in conds:
        try:
            df = pd.read_csv(inp_dir + f'{cond}_len_melt.csv', index_col=0)
            df['Condition'] = cond
            mdf = mdf.append(df, ignore_index=True, sort=False)
        except:
            continue
        timer.update()
    mdf.to_csv(out_dir + f'{editor}_len_melt.csv')

    # pos_melt by nt
    # Reduce to mean
    indel_lens = range(-20, 15 + 1)
    timer = util.Timer(total=len(indel_lens))
    for indel_len in indel_lens:
        mdf = pd.DataFrame()
        for cond in conds:
            try:
                df = pd.read_csv(inp_dir + f'{cond}_pos_{indel_len}nt.csv',
                                 index_col=0)
                df['Condition'] = cond
                df['Indel length'] = indel_len
                mdf = mdf.append(df, ignore_index=True, sort=False)
            except:
                continue
        mdf.to_csv(out_dir + f'{editor}_pos_{indel_len}nt.csv')
        timer.update()

    return
def get_library_stats(inp_dir_lib, data_type):
    import glob
    fns = glob.glob(inp_dir_lib + f'*len_melt.csv')

    dd = defaultdict(list)
    timer = util.Timer(total=len(fns))
    for fn in fns:
        cond_nm = fn.split('/')[-1].replace('_len_melt.csv', '')
        df = pd.read_csv(fn, index_col=0)

        dd['Name'].append(cond_nm)
        dd['Data type'].append(data_type)
        dd['Editor'].append(
            exp_design_df[exp_design_df['Name'] == cond_nm]['Editor'].iloc[0])

        fq_1nt_indels = sum(df[df['Indel length'].isin(
            [-1, 1])]['Frequency']) / sum(df['Frequency'])
        dd['Fraction of 1-bp indels in indels'].append(fq_1nt_indels)

        fq_1nt_dels = sum(df[df['Indel length'].isin(
            [-1])]['Frequency']) / sum(df['Frequency'])
        dd['Fraction of 1-bp dels in indels'].append(fq_1nt_dels)

        fq_1nt_ins = sum(df[df['Indel length'].isin([1])]['Frequency']) / sum(
            df['Frequency'])
        dd['Fraction of 1-bp ins in indels'].append(fq_1nt_ins)

        timer.update()

    return pd.DataFrame(dd)
def get_statistics(cond):

    df1 = pd.read_csv(inp_dir + f'{cond}_pos_1nt.csv', index_col=0)
    df2 = pd.read_csv(inp_dir + f'{cond}_pos_-1nt.csv', index_col=0)
    mdf = df1.append(df2, ignore_index=True)

    bs_dd = defaultdict(list)

    positions = [col for col in mdf.columns if col != 'Name']
    timer = util.Timer(total=len(positions))
    for pos in positions:

        dfs = mdf[pos]

        means = []
        for bs_idx in range(1000):
            bs_data = np.random.choice(dfs, size=len(dfs), replace=True)
            means.append(np.mean(bs_data))

        bs_dd['Position'].append(pos)
        bs_dd['Mean'].append(np.mean(dfs))
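        # 16th/84th percentiles of the bootstrap means approximate mean -/+ 1 standard error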
        bs_dd['Mean - stderr'].append(np.percentile(means, 50 - 34))
        bs_dd['Mean + stderr'].append(np.percentile(means, 50 + 34))
        bs_dd['2.5th percentile'].append(np.percentile(means, 2.5))
        bs_dd['97.5th percentile'].append(np.percentile(means, 97.5))
        timer.update()

    bs_df = pd.DataFrame(bs_dd)
    bs_df = bs_df.sort_values(by='Position').reset_index()

    bs_df.to_csv(out_dir + f'{cond}.csv')

    return
def individualize(inp_dir, out_dir):
    # a_gather produces large dataframes of 2000 experiments concatenated together.
    # Extracting the dataframe for each individual experiment from these is slow; it is faster
    # to read in an individual csv per experiment. (This function produces those individual csv's.)

    for inp_fn in os.listdir(inp_dir):
        if not fnmatch.fnmatch(inp_fn, '*csv'):
            continue

        # if inp_fn not in ['PRL-Lib1-mES.csv', 'PRL-DisLib-mES.csv', 'Lib1-mES.csv']:
        # continue

        inp_nm = inp_fn.replace('.csv', '')
        out_fold = out_dir + inp_nm + '/'
        util.ensure_dir_exists(out_fold)

        df = pd.read_csv(inp_dir + inp_fn)
        exps = set(df['Experiment'])
        print(inp_nm)
        timer = util.Timer(total=len(exps))
        for exp in exps:
            out_fn = out_fold + '%s.csv' % (exp)
            d = df[df['Experiment'] == exp]
            d.to_csv(out_fn)
            timer.update()

    return
def main(nm='', start='', end=''):
    print(NAME)
    print(nm)

    start, end = int(start), int(end)
    out_dir = out_place + nm + '/'
    util.ensure_dir_exists(out_dir)

    print('Preparing alignment output directories...')
    nms = all_names[start:end + 1]
    prepare_align_outdirs(out_dir, nms)
    print('Done')

    global expected_cutsite
    expected_cutsite = len('GATGGGTGCGACGCGTCAT') + 28

    inp_dir = inp_place + nm + '/'

    timer = util.Timer(total=len(nms))
    for target_nm in nms:
        data = defaultdict(list)
        for split in os.listdir(inp_dir):
            if split == 'aligns':
                continue
            inp_fn = inp_dir + '%s/%s.txt' % (split, target_nm)
            remaster_aligns(inp_fn, data)
        save_alignments(data, out_dir, target_nm)
        timer.update()

    return
def main(inp_dir, out_dir, nm='none', start='none', end='none'):
    print(NAME)
    util.ensure_dir_exists(out_dir)

    if nm == 'none' and start == 'none' and end == 'none':
        gen_qsubs()
        return

    if nm != 'none' and start == 'none' and end == 'none':
        # Run single
        print(nm)
        res, context = set_master_expected_cutsite(nm)
        if res is False:
            return
        genotype_data(inp_dir, out_dir, nm, context)
        return

    # Run many
    start, end = int(start), int(end)
    timer = util.Timer(total=end - start + 1)
    for idnum in range(start, end + 1):
        srr_id = 'SRR%s' % (idnum)
        # print srr_id
        res, context = set_master_expected_cutsite(srr_id)
        if res is False:
            continue
        genotype_data(inp_dir, out_dir, srr_id, context)
        timer.update()

    return out_dir
def add_negative_controls(aret):
    # ensure gfp is not included
    gfpgrnas = []
    with open(_config.DATA_DIR + 'egfp_NGG_NNG_seq.patman_format.txt') as f:
        for i, line in enumerate(f):
            if line[0] != '>':
                gfpgrnas.append(line.strip())

    headers, sqs = aret
    sqs += gfpgrnas
    sqs = [s[-15:-3] for s in sqs]  # use 12-mer seed region
    sqs = set(sqs)

    new_h, reads = [], []
    timer = util.Timer(total=125)
    for i in range(125):
        while True:
            skip = False
            cand = ''.join(np.random.choice(['A', 'C', 'G', 'T'], 12))
            for s in sqs:
                if mismatch(cand, s) < 3:
                    skip = True
                    break
            if skip:
                continue
            # candidate is >= 3 mismatches from every seed; keep it
            reads.append('ATATATCTTGTGGAAAGGACGAAACACC' +
                         ''.join(np.random.choice(['A', 'C', 'G', 'T'], 8)) +
                         cand + 'GTTTAAGAGCTATGCTGGAAACAGCATAGC')
            new_h.append('neg_control_' + str(i))
            break
        timer.update()

    return new_h, reads
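# The mismatch() helper called above is defined elsewhere in this codebase; a
# minimal sketch consistent with how it is called here (counting differing
# positions between two equal-length 12-mers, i.e. a Hamming distance) could be:
#
#   def mismatch(a, b):
#       return sum(1 for x, y in zip(a, b) if x != y)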
def build_vo_data(out_dir, exp):
    print(exp)
    inp_dir = '/cluster/mshen/prj/vanoverbeek/out/e10_control_adjustment/'

    srrids = get_srr_ids(exp.replace('VO_', ''))
    data = defaultdict(list)

    # Build data
    timer = util.Timer(total=len(srrids))
    for srr_id in srrids:
        csv_fn = inp_dir + '%s.csv' % (srr_id)
        if os.path.isfile(csv_fn):
            d = pd.read_csv(csv_fn)
            if len(d) > 0:
                individual_piechart(d, data)
        timer.update()

    # Pickle, convert defaultdict to regular dict
    picklable_data = dict()
    for key in data:
        picklable_data[key] = data[key]

    with open(out_dir + '%s.pkl' % (exp), 'wb') as f:
        pickle.dump(picklable_data, f)

    return data
def load_transversion_data(nm_to_conds):
    nms = list(nm_to_conds.keys())

    df = pd.read_csv(inp_dir_mutant + f'mmdf_12kChar.csv')
    combined_conds = [f'C_GA_{cond}' for cond in sorted(nms)]
    id_cols = ['Name', 'Position']
    dfs = df[id_cols + combined_conds]

    dfs.to_csv(out_dir + 'transversion_purity.csv')

    # Bootstrap
    bs_dd = dict()
    timer = util.Timer(total=len(nms))
    for nm in nms:
        col = f'C_GA_{nm}'
        data = dfs[col].dropna()

        bs_means = []
        for bs_idx in range(5000):
            bs_data = np.random.choice(data, size=len(data))
            bs_means.append(np.mean(bs_data))
        bs_dd[nm] = bs_means
        timer.update()

    bs_df = pd.DataFrame(bs_dd)
    bs_df.to_csv(out_dir + 'transversion_purity-bootstrap.csv')

    return
def get_poswise_df(data, nm_to_seq, treat_nm):
    dd = defaultdict(list)
    timer = util.Timer(total=len(data))
    for nm in data:
        pw = data[nm]
        seq = nm_to_seq[nm]

        for jdx in range(len(pw)):
            pos = _data.idx_to_pos(jdx, treat_nm)
            ref_nt = seq[jdx]
            ref_idx = nt_to_idx[ref_nt]
            total = sum(pw[jdx])

            for kdx in range(len(pw[jdx])):
                if kdx == ref_idx:
                    continue

                count = pw[jdx][kdx]

                dd['Count'].append(count)
                dd['Total count'].append(total)
                dd['Obs nt'].append(nts[kdx])
                dd['Ref nt'].append(ref_nt)
                dd['Position'].append(pos)
                dd['Name'].append(nm)

        timer.update()

    df = pd.DataFrame(dd)
    return df
def filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm):
    timer = util.Timer(total=len(to_remove))
    for idx, row in to_remove.iterrows():
        exp_nm = row['Name']
        mut_nm = row['MutName']

        if exp_nm not in adj_d:
            continue

        t = adj_d[exp_nm]
        cat, ids, ide, idl, mhl, ib = mut_nm.split('_')

        t_cat_set = set(t['Category'])
        if len(t_cat_set) == 1 and 'wildtype' in t_cat_set:
            continue

        crit = (
            (t['Category'] == cat) &
            (t['Indel start'] == float(ids)) &
            (t['Indel end'] == float(ide)) &
            (t['Indel length'] == float(idl)) &
            (t['MH length'] == float(mhl)) &
            (t['Inserted bases'] == ib)
        )
        t.loc[crit, 'Count'] = 0
        t = t[t['Count'] > 0]
        t['Frequency'] = t['Count'] / sum(t['Count'])

        adj_d[exp_nm] = t
        timer.update()

    return adj_d
def build_nm_to_idxs(df):
    '''
    Exploits ordered structure
    '''
    print(f'Building index ...')
    d = dict()
    curr_nm = ''
    start_idx = -1
    timer = util.Timer(total=len(df))
    for idx, row in df.iterrows():
        nm = row['Read name']
        if nm != curr_nm:
            if curr_nm != '':
                d[curr_nm] = {
                    'start_idx': start_idx,
                    'end_idx': idx,
                }
            start_idx = idx
            curr_nm = nm
        timer.update()

    # Last group: record an exclusive end index (one past the final row),
    # matching the convention used inside the loop above
    d[curr_nm] = {
        'start_idx': start_idx,
        'end_idx': idx + 1,
    }

    return d
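# Usage sketch (not part of the original source): exercises build_nm_to_idxs on
# a tiny synthetic dataframe with a default RangeIndex; read names are made up.
def _demo_build_nm_to_idxs():
    demo_df = pd.DataFrame({'Read name': ['r1', 'r1', 'r2', 'r2', 'r2']})
    nm_to_idxs = build_nm_to_idxs(demo_df)
    span = nm_to_idxs['r2']
    # With exclusive end indices, this recovers every row for read 'r2'
    return demo_df.iloc[span['start_idx']:span['end_idx']]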
def get_statistics(editor):

    df = pd.read_csv(inp_dir + f'{editor}_len_melt.csv', index_col=0)

    bs_dd = defaultdict(list)

    indel_lens = set(df['Indel length'])
    timer = util.Timer(total=len(indel_lens))
    for indel_len in indel_lens:
        dfs = df[df['Indel length'] == indel_len]

        means = []
        for bs_idx in range(1000):
            bs_data = np.random.choice(dfs['Frequency'],
                                       size=len(dfs),
                                       replace=True)
            means.append(np.mean(bs_data))

        bs_dd['Indel length'].append(indel_len)
        bs_dd['Mean'].append(np.mean(dfs['Frequency']))
        bs_dd['Mean - stderr'].append(np.percentile(means, 50 - 34))
        bs_dd['Mean + stderr'].append(np.percentile(means, 50 + 34))
        bs_dd['2.5th percentile'].append(np.percentile(means, 2.5))
        bs_dd['97.5th percentile'].append(np.percentile(means, 97.5))
        timer.update()

    bs_df = pd.DataFrame(bs_dd)
    bs_df = bs_df.sort_values(by='Indel length').reset_index()

    bs_df.to_csv(out_dir + f'{editor}.csv')

    return
def main(inp_dir, out_dir, srr_id='', start='none', end='none'):
    print(NAME)
    util.ensure_dir_exists(out_dir)

    # Function calls
    if srr_id == '' and start == 'none' and end == 'none':
        gen_qsubs()
        return

    if srr_id != '' and start == 'none' and end == 'none':
        if is_control(srr_id):
            print('is control')
            return
        control_adjustment(inp_dir, out_dir, srr_id)
        return

    start, end = int(start), int(end)
    timer = util.Timer(total=end - start + 1)
    for idnum in range(start, end + 1):
        srr_id = 'SRR%s' % (idnum)
        ans = is_control(srr_id)
        if ans is False:
            control_adjustment(inp_dir, out_dir, srr_id)
        timer.update()

    return out_dir
def filter_inprofile_batch_effects():
    df = pd.read_csv(_config.DATA_DIR + 'batch_effects.csv')
    inprofile_batches = set(df['Batch'])

    be_treatments = [
        s for s in treat_control_df['Treatment'] if 'Cas9' not in s
    ]
    timer = util.Timer(total=len(be_treatments))
    for treat_nm in be_treatments:
        batch = exp_nm_to_batch[treat_nm]

        if batch in inprofile_batches:
            print(treat_nm, batch)
            adj_d = _data.load_data(treat_nm, 'ag4a2_adjust_batch_effects')
            to_remove = df[df['Batch'] == batch]

            lib_design, seq_col = _data.get_g4_lib_design(treat_nm)
            nms = lib_design['Name (unique)']
            seqs = lib_design[seq_col]
            nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

            adj_d = filter_mutations(to_remove, adj_d, nm_to_seq, treat_nm)
            with open(out_dir + '%s.pkl' % (treat_nm), 'wb') as f:
                pickle.dump(adj_d, f)

        else:
            inp_fn = inp_dir + '%s.pkl' % (treat_nm)
            subprocess.check_output('cp %s %s' % (inp_fn, out_dir), shell=True)

        timer.update()

    return
def get_cigars(inp_dir, out_dir, spacers):
  print('\tGetting cigars...')
  timer = util.Timer(total = len(spacers))
  for spc in spacers.values():
    for i in range(len(spc['runs'])):
      cigars = {}
      run = spc['runs'][i]
      foldnm = _lib.exp_fold_name(run)
      fn = _config.DATA_DIR + foldnm + '/' + run + '.sam'
      num_aligns, num_kept = 0.0, 0.0
      with open(fn) as f:
        for _, line in enumerate(f):
          if not line.startswith('@'):
            num_aligns += 1
            chro, start = line.split()[2], int(line.split()[3])
            cigar = line.split()[5]

            if chro == 'chr' + spc['chr']:
              if spc['start']-300 <= start <= spc['start']+300:
                if cigar not in cigars:
                  cigars[cigar] = 0
                cigars[cigar] += 1
                num_kept += 1

      frac_kept = num_kept / num_aligns
      if frac_kept < 0.80 and num_aligns > 1000:
        print('\tWARNING: Kept:', frac_kept, 'of', num_aligns, 'for spacer', spc['num'], ':', run, spc['libnms'][i])

      out_fn = out_dir + spc['libnms'][i] + '.txt'
      with open(out_fn, 'w') as f:
        for cigar in cigars:
          f.write('>' + str(cigars[cigar]) + '\n' + cigar + '\n')
    timer.update()

  return
def build_library_data(out_dir, exp):
    print(exp)
    # if exp in ['2k-mES-Cas9-Tol2']:
    # inp_dir = '/cluster/mshen/prj/mmej_manda2/out/2017-10-27/e_newgenotype/'
    if exp in [
            'Lib1-mES', 'Lib1-HCT116', 'Lib1-HEK293T', 'DisLib-U2OS',
            'DisLib-mES', 'DisLib-HEK293T', 'DisLib-U2OS-HEK-Mixture',
            'PRL-Lib1-mES', 'PRL-DisLib-mES'
    ]:
        inp_dir = '/cluster/mshen/prj/mmej_figures/out/b_individualize/'
        exp_dir = inp_dir + exp + '/'

    data = defaultdict(list)

    timer = util.Timer(total=len(os.listdir(exp_dir)))
    for fn in os.listdir(exp_dir):
        if not fnmatch.fnmatch(fn, '*csv'):
            continue

        csv_fn = exp_dir + fn
        d = pd.read_csv(csv_fn)
        individual_piechart(d, data)
        timer.update()

    # Pickle, convert defaultdict to regular dict
    picklable_data = dict()
    for key in data:
        picklable_data[key] = data[key]

    with open(out_dir + '%s.pkl' % (exp), 'wb') as f:
        pickle.dump(picklable_data, f)
    return data
def build_vo_data(out_dir, exp, wildtype = False):
  print(exp)
  inp_dir = '/cluster/mshen/prj/vanoverbeek/out/b_polish/'

  if wildtype:
    castype = 'WT'
  else:
    castype = '48h'

  srrids = get_srr_ids(exp.replace('VO_', ''), castype)
  data = defaultdict(list)

  # Build data
  timer = util.Timer(total = len(srrids))
  for srr_id in srrids:
    get_mismatches(inp_dir + srr_id + '/', data, srr_id = srr_id)
    timer.update()

  # Pickle, convert defaultdict to regular dict
  picklable_data = dict()
  for key in data:
    if key not in picklable_data:
      picklable_data[key] = data[key]

  with open(out_dir + '%s.pkl' % (exp), 'wb') as f:
    pickle.dump(picklable_data, f)

  return data
def merge_lowpances(df):
    num_samples = len(aligned_lowpance['1'])

    new_df = pd.DataFrame()
    timer = util.Timer(total=num_samples)
    for idx in range(num_samples):
        new_samplenm = f'Fq {1 + idx}'

        samples = [
            f'Fq {aligned_lowpance[pance][idx]}' for pance in aligned_lowpance
        ]
        dfs = df[df['Sample name'].isin(samples)]

        pv_df = dfs.pivot(index='Full genotype',
                          columns='Sample name',
                          values='Frequency')
        pv_df = pv_df.fillna(value=0)
        pv_df['Mean fq'] = pv_df.apply(np.mean, axis='columns')
        pv_df['Mean fq'] /= sum(pv_df['Mean fq'])
        pv_df['Full genotype'] = pv_df.index
        pv_df = pv_df[['Full genotype', 'Mean fq']]

        dfm = pv_df.melt(id_vars=['Full genotype'], value_name='Frequency')
        dfm['Sample name'] = new_samplenm
        dfm['Sample'] = 1 + idx

        new_df = new_df.append(dfm, ignore_index=True)
        timer.update()

    return new_df
def count_reads(exp, inp_dir, lib_design):

    dd = defaultdict(list)
    timer = util.Timer(total=len(lib_design['Name (unique)']))
    for nm in lib_design['Name (unique)']:
        ctd = get_counts_subfold(inp_dir + nm + '/')

        dd['Name'].append(nm)

        try:
            dd['Total count'].append(ctd['Total count'])
        except:
            import code
            code.interact(local=dict(globals(), **locals()))
        dd['Total ULMI count'].append(ctd['Total ULMI count'])

        dd['WT count'].append(ctd['WT count'])
        dd['WT ULMI count'].append(ctd['WT ULMI count'])

        dd['Indel count'].append(ctd['Indel count'])
        dd['Indel ULMI count'].append(ctd['Indel ULMI count'])

        timer.update()

    df = pd.DataFrame(dd)
    df.to_csv(out_dir + '%s.csv' % (exp))
    return
def split(inp_fn, out_nm):
    inp_fn_numlines = util.line_count(inp_fn)

    num_splits = 60
    split_size = int(inp_fn_numlines / num_splits)
    if num_splits * split_size < inp_fn_numlines:
        split_size += 1
    while split_size % 4 != 0:
        split_size += 1
    # print 'Using split size %s' % (split_size)
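    # Worked example with hypothetical numbers: a 1,000,006-line FASTQ with
    # num_splits = 60 gives split_size = 16666 -> 16667 (to cover the remainder)
    # -> 16668 (rounded up to a multiple of 4 so every chunk holds whole
    # 4-line FASTQ records).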

    split_num = 0
    timer = util.Timer(total=num_splits)
    for idx in range(1, inp_fn_numlines, split_size):
        start = idx
        end = start + split_size
        out_fn = out_dir + out_nm + '_%s.fq' % (split_num)

        skip = False
        if os.path.isfile(out_fn):
            size_mb = os.path.getsize(out_fn) / 1e6
            if size_mb > 0:
                skip = True

        if not skip:
            command = 'tail -n +%s %s | head -n %s > %s' % (
                start, inp_fn, end - start, out_fn)
            subprocess.check_output(command, shell=True)

        split_num += 1
        # print(command)
        timer.update()

    return
def get_trajectory(major_threshold):

    mdf = pd.DataFrame()
    timer = util.Timer(total=len(design_df))
    for nm in design_df['Short name']:
        df = pd.read_csv(inp_dir + f'{nm}_t{major_threshold}.csv', index_col=0)

        # Filter
        df = df[df['Count'] >= 5]

        fq_col = f'Fq {nm}'
        df[fq_col] = df['Count'] / sum(df['Count'])
        df = df[['Full genotype', fq_col]]

        if len(mdf) == 0:
            mdf = df
        else:
            mdf = mdf.merge(df, on='Full genotype', how='outer')

        timer.update()

    mdf = mdf.fillna(value=0)
    mdf.to_csv(out_dir + f'pv_trajectory_t{major_threshold}.csv')

    dfm = mdf.melt(id_vars='Full genotype',
                   var_name='Sample name',
                   value_name='Frequency')
    dfm['Sample'] = [int(s.split()[-1]) for s in dfm['Sample name']]
    dfm.to_csv(out_dir + f'mel_trajectory_t{major_threshold}.csv')

    return
def main(argv):
  print(NAME)

  modelexp_nm = argv[0]
  print(modelexp_nm)

  exp_design = pd.read_csv(_config.DATA_DIR + f'{modelexp_nm}.csv')
  hyperparam_cols = [col for col in exp_design.columns if col != 'Name']

  new_out_dir = out_dir + f'{modelexp_nm}/'
  util.ensure_dir_exists(new_out_dir)

  print(f'Collating experiments...')

  model_out_dir = _config.OUT_PLACE + f'_fitness_from_reads_pt_multi/{modelexp_nm}/'
  num_fails = 0
  timer = util.Timer(total = len(exp_design))
  for idx, row in exp_design.iterrows():
    int_nm = row['Name']
    real_nm = row['dataset']

    try:
      command = f'cp {model_out_dir}/model_{int_nm}/_final_fitness.csv {new_out_dir}/fitness_{int_nm}.csv'
      subprocess.check_output(command, shell = True)

      command = f'cp {model_out_dir}/model_{int_nm}/_final_genotype_matrix.csv {new_out_dir}/genotype_matrix_{int_nm}.csv'
      subprocess.check_output(command, shell = True)
    except:
      num_fails += 1

    timer.update()

  print(f'Collated {len(exp_design)} experiments with {num_fails} failures')

  return
def prepare_statistics(data_nm):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for plotting but ideally enabling multiple plots
  # Calculate statistics associated with each experiment by name

  alldf_dict = defaultdict(list)

  dataset = None
  if 'Lib1' in data_nm or 'VO' in data_nm:
    dataset = _data.load_dataset(data_nm, exp_subset = 'vo_spacers', exp_subset_col = 'Designed Name')
  if 'DisLib' in data_nm:
    dataset = _data.load_dataset(data_nm, exp_subset = 'clin', exp_subset_col = 'Designed Name')
    # Remove data with iterated editing
    dlwt = _config.d.DISLIB_WT
    for idx, row in dlwt.iterrows():
      if row['wt_repairable'] == 'iterwt':
        del dataset[row['name']]
  if dataset is None:
    return

  timer = util.Timer(total = len(dataset))
  # for exp in dataset.keys()[:100]:
  for exp in dataset.keys():
    df = dataset[exp]
    calc_statistics(df, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_statistics(data_nm1, data_nm2):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for plotting but ideally enabling multiple plots
  # In this case: Distribution of frequencies of indels for each position in 20 bp window around cutsite. Can plot mean, median, etc, difference, etc.
  # Calculate statistics associated with each experiment by name

  alldf_dict = defaultdict(list)

  # If Library, subset VO spacers
  dataset1 = _data.load_dataset(data_nm1)
  dataset2 = _data.load_dataset(data_nm2)
  if dataset1 is None or dataset2 is None:
    return

  # Find shared exps and iterate through them, passing both shared exps together to calc_statistics
  shared_exps = set(dataset1.keys()) & set(dataset2.keys())
  if len(shared_exps) == 0:
    print('ERROR: No shared exps')

  timer = util.Timer(total = len(shared_exps))
  for exp in shared_exps:
    d1 = dataset1[exp]
    d2 = dataset2[exp]
    calc_statistics(d1, d2, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_statistics(data_nm):
  # Input: Dataset
  # Output: Uniformly processed dataset, requiring minimal processing for plotting but ideally enabling multiple plots
  # Calculate statistics associated with each experiment by name

  alldf_dict = defaultdict(list)

  dataset = _data.load_dataset(data_nm, exp_subset = 'vo_spacers', exp_subset_col = 'Designed Name')
  if dataset is None:
    return

  e_dir = '/cluster/mshen/prj/mmej_figures/out/e_ins_modeling/'
  timer = util.Timer(total = 100)
  for rs in range(100):
  # for rs in range(1):
    prefix = e_dir + 'len_%s_%s' % (data_nm, rs)
    test_exps = pickle.load(open(prefix + '_testexps.pkl', 'rb'))
    rate_model = pickle.load(open(prefix + '_model.pkl', 'rb'))
    bp_model = pickle.load(open(prefix + '_bp.pkl', 'rb'))

    for exp in test_exps:
      df = dataset[exp]
      calc_statistics(df, exp, rate_model, bp_model, alldf_dict, rs, data_nm)

    timer.update()

  # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_statistics(data_nm):
    # Input: Dataset
    # Output: Uniformly processed dataset, requiring minimal processing for plotting but ideally enabling multiple plots
    # In this case: Distribution of frequencies of indels for each position in 20 bp window around cutsite. Can plot mean, median, etc, difference, etc.
    # Calculate statistics associated with each experiment by name

    alldf_dict = defaultdict(list)

    dataset = _data.load_dataset(data_nm)
    if dataset is None:
        return

    timer = util.Timer(total=len(dataset))
    for exp in dataset:
        df = dataset[exp]
        calc_statistics(df, exp, alldf_dict)
        timer.update()

    # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
    alldf = pd.DataFrame(alldf_dict)
    col_order = [
        '_Experiment', 'Editing Rate', '0gt Frequency', 'Ngt Frequency', '-10',
        '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3',
        '4', '5', '6', '7', '8', '9', '10'
    ]
    if len(col_order) != len(alldf.columns):
        print('ERROR: Will drop columns')
    alldf = alldf[col_order]
    return alldf
def gather_statistics(exp_nm, params):
  (muts, allowed_pos, feature_radius) = params
  # Load data
  data = pd.read_csv(inp_dir + '%s.csv' % (exp_nm), index_col = 0)

  # Set up library info
  lib_nm = _data.get_lib_nm(exp_nm)
  lib_design, seq_col = _data.get_lib_design(exp_nm)
  nms = lib_design['Name (unique)']
  seqs = lib_design[seq_col]
  nm_to_seq = {nm: seq for nm, seq in zip(nms, seqs)}

  # Prepare data
  data = data[data['Total count'] >= 100]
  data['Frequency'] = data['Count'] / data['Total count']

  ontarget_sites = _data.get_ontarget_sites(lib_design, lib_nm)
  data = data[data['Name'].isin(ontarget_sites)]

  data = data[data['Position'].isin(allowed_pos)]

  data['Mutation'] = data['Ref nt'] + '_' + data['Obs nt']
  data['MutName'] = data['Name'].astype(str) + '_' + data['Position'].astype(str) + '_' + data['Mutation']

  # Annotate with local sequence context
  lib_zero_idx = _data.pos_to_idx(0, exp_nm)
  dd = defaultdict(list)
  print('Annotating data with local sequence contexts...')
  timer = util.Timer(total = len(data))
  for idx, row in data.iterrows():
    seq = nm_to_seq[row['Name']]
    pidx = row['Position'] + lib_zero_idx
    local_context = seq[pidx - feature_radius : pidx] + seq[pidx + 1 : pidx + feature_radius + 1]
    dd['Local context'].append(local_context)
    timer.update()
  for col in dd:
    data[col] = dd[col]

  # # Gather statistics

  for mut_nm in muts:
    print(mut_nm)
    mut = muts[mut_nm]
    if len(mut) == 1:
      d_temp = data[data['Mutation'] == mut[0]]
    else:
      # Copy so the reassignments below modify a new frame, not a view of `data`
      d_temp = data[data['Mutation'].isin(mut)].copy()
      d_temp['Mutation'] = mut_nm
      d_temp['MutName'] = d_temp['Name'].astype(str) + '_' + d_temp['Position'].astype(str) + '_' + d_temp['Mutation']
      group_cols = [s for s in d_temp.columns if s not in ['Frequency', 'Obs nt', 'Ref nt', 'Count']]
      d_temp = d_temp.groupby(group_cols)['Frequency'].agg('sum').reset_index()

    for ml_task in ['classify_zero', 'regress_nonzero']:
      print(ml_task)
      results = train_models(exp_nm, d_temp, mut_nm, ml_task)
      save_results(exp_nm, mut_nm, ml_task, results)



  return