# Shared imports for all examples below. _data, _config, and util are
# internal modules from the original project.
from collections import defaultdict
import pickle

import pandas as pd

import _data
import _config
import util


Example #1
def prepare_dataset_try3():
    dataset_nm = 'dataset_try3'
    print('Preparing %s' % dataset_nm)

    featurized_data = init_featurized_data()

    # Component 1: DisLib long-duplication series, high-replicate experiments only
    dataset = _data.load_dataset('DisLib-mES-controladj',
                                 exp_subset='longdup_series',
                                 exp_subset_col='Designed Name')
    # Iterate over a copy of the keys since entries are deleted in the loop
    for exp in list(dataset.keys()):
        if exp not in _config.d.HIGHREP_DISLIB_EXPS_NMS:
            del dataset[exp]
    prepare_library_dataset(dataset, featurized_data)

    dataset = _data.load_dataset('Lib1-mES-controladj')

    # Remove VO spacers from lib 1
    for vo_spacer_idx in range(1872, 1961 + 1):
        vo_spacer_exp = str(vo_spacer_idx)
        del dataset[vo_spacer_exp]

    # Remove low rep spacers from lib1
    for exp in list(dataset.keys()):
        if int(exp) not in _config.d.HIGHREP_LIB1_EXPS:
            del dataset[exp]

    print(len(dataset))
    prepare_library_dataset(dataset, featurized_data)

    pickle_featurized_data(featurized_data, dataset_nm)
    return
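
The prepare_dataset_try* examples rely on helpers that are not included in
these excerpts. Below is a minimal sketch of plausible implementations,
assuming the featurized data is a dict of parallel lists and that pickles
are written to an out/ directory; the container layout, field names, and
path are assumptions, not the project's actual code.

import os

def init_featurized_data():
    # Assumed container: parallel lists, appended to once per experiment
    return {'exps': [], 'features': [], 'frequencies': []}

def prepare_library_dataset(dataset, featurized_data):
    # Assumed behavior: featurize each experiment and append the results
    for exp in dataset:
        featurized_data['exps'].append(exp)
        # ... featurization of dataset[exp] would go here ...

def pickle_featurized_data(featurized_data, dataset_nm):
    out_dir = 'out/'  # assumed output location
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, dataset_nm + '.pkl'), 'wb') as f:
        pickle.dump(featurized_data, f)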
Example #2
def prepare_statistics(data_nm1, data_nm2):
  # Input: two dataset names (e.g., two replicates of the same library)
  # Output: a uniformly processed dataset that needs minimal further
  # processing for plotting, ideally supporting multiple plot types.
  # In this case: the distribution of indel frequencies at each position in
  # a 20-bp window around the cutsite, from which one can plot the mean,
  # median, differences, and so on.
  # Calculate statistics associated with each experiment, keyed by name

  alldf_dict = defaultdict(list)

  dataset1 = _data.load_dataset(data_nm1)
  dataset2 = _data.load_dataset(data_nm2)
  if dataset1 is None or dataset2 is None:
    return

  # Find experiments shared by both datasets and pass each shared pair to calc_statistics
  shared_exps = set(dataset1.keys()) & set(dataset2.keys())
  if len(shared_exps) == 0:
    print('ERROR: No shared exps')
    return
  timer = util.Timer(total = len(shared_exps))
  for exp in shared_exps:
    d1 = dataset1[exp]
    d2 = dataset2[exp]
    calc_statistics(d1, d2, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
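
A hypothetical usage of the two-dataset variant above, pairing the two
DisLib biological replicates named in Example #8; the CSV output is
illustrative only.

alldf = prepare_statistics('1207-mESC-Dislib-Cas9-Tol2-Biorep1-r1-controladj',
                           '1207-mESC-Dislib-Cas9-Tol2-Biorep1-r2-controladj')
if alldf is not None:
  alldf.to_csv('replicate_statistics.csv')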
Example #3
def prepare_statistics(data_nm):
  # Input: dataset name
  # Output: a uniformly processed dataset that needs minimal further
  # processing for plotting, ideally supporting multiple plot types.
  # Calculate statistics associated with each experiment, keyed by name

  alldf_dict = defaultdict(list)

  # Default to None so unrecognized dataset names hit the guard below
  # instead of raising NameError
  dataset = None
  if 'Lib1' in data_nm or 'VO' in data_nm:
    dataset = _data.load_dataset(data_nm, exp_subset = 'vo_spacers', exp_subset_col = 'Designed Name')
  if 'DisLib' in data_nm:
    dataset = _data.load_dataset(data_nm, exp_subset = 'clin', exp_subset_col = 'Designed Name')
    # Remove data with iterated editing
    dlwt = _config.d.DISLIB_WT
    for idx, row in dlwt.iterrows():
      if row['wt_repairable'] == 'iterwt':
        del dataset[row['name']]
  if dataset is None:
    return

  timer = util.Timer(total = len(dataset))
  # for exp in list(dataset.keys())[:100]:  # debug: first 100 experiments only
  for exp in dataset:
    df = dataset[exp]
    calc_statistics(df, exp, alldf_dict)
    timer.update()

  # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
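
A hypothetical call to the variant above; 'Lib1' in the name routes it to
the vo_spacers subset (the dataset name is taken from the prepare_dataset
examples).

alldf = prepare_statistics('Lib1-mES-controladj')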
Example #4
def prepare_dataset_try4():
    dataset_nm = 'dataset_try4'
    print('Preparing %s' % dataset_nm)

    featurized_data = init_featurized_data()

    # Load dislib, longdups
    dataset = _data.load_dataset('DisLib-mES-controladj',
                                 exp_subset='longdup_series',
                                 exp_subset_col='Designed Name')
    for exp in list(dataset.keys()):
        if exp not in _config.d.HIGHREP_DISLIB_EXPS_NMS:
            del dataset[exp]
    prepare_library_dataset(dataset, featurized_data)

    # Load dislib, clin data
    dataset = _data.load_dataset('DisLib-mES-controladj',
                                 exp_subset='clin',
                                 exp_subset_col='Designed Name')

    # Remove data with iterated editing
    dlwt = _config.d.DISLIB_WT
    for idx, row in dlwt.iterrows():
        if row['wt_repairable'] == 'iterwt':
            del dataset[row['name']]
    for exp in list(dataset.keys()):
        if exp not in _config.d.HIGHREP_DISLIB_EXPS_NMS:
            del dataset[exp]
    print(len(dataset))
    prepare_library_dataset(dataset, featurized_data)

    # Load Lib1 data
    dataset = _data.load_dataset('Lib1-mES-controladj')

    # Remove VO spacers from lib 1
    for vo_spacer_idx in range(1872, 1961 + 1):
        vo_spacer_exp = str(vo_spacer_idx)
        del dataset[vo_spacer_exp]
    # Remove low rep spacers from lib1
    for exp in list(dataset.keys()):
        if int(exp) not in _config.d.HIGHREP_LIB1_EXPS:
            del dataset[exp]

    print(len(dataset))
    prepare_library_dataset(dataset, featurized_data)

    pickle_featurized_data(featurized_data, dataset_nm)
    return
Example #5
def prepare_statistics(data_nm):
    # Input: dataset name
    # Output: a uniformly processed dataset that needs minimal further
    # processing for plotting, ideally supporting multiple plot types.
    # In this case: the distribution of indel frequencies at each position in
    # a 20-bp window around the cutsite, from which one can plot the mean,
    # median, differences, and so on.
    # Calculate statistics associated with each experiment, keyed by name

    alldf_dict = defaultdict(list)

    dataset = _data.load_dataset(data_nm)
    if dataset is None:
        return

    timer = util.Timer(total=len(dataset))
    for exp in dataset:
        df = dataset[exp]
        calc_statistics(df, exp, alldf_dict)
        timer.update()

    # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
    alldf = pd.DataFrame(alldf_dict)
    col_order = [
        '_Experiment', 'Editing Rate', '0gt Frequency', 'Ngt Frequency', '-10',
        '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3',
        '4', '5', '6', '7', '8', '9', '10'
    ]
    if len(col_order) != len(alldf.columns):
        print('ERROR: Will drop columns')
    alldf = alldf[col_order]
    return alldf
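
The explicit column list above could also be built programmatically; this
expression is equivalent to the literal list (positions -10..10, skipping 0):

col_order = (['_Experiment', 'Editing Rate', '0gt Frequency', 'Ngt Frequency']
             + [str(p) for p in range(-10, 11) if p != 0])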
Example #6
def prepare_statistics(data_nm):
  # Input: dataset name
  # Output: a uniformly processed dataset that needs minimal further
  # processing for plotting, ideally supporting multiple plot types.
  # Calculate statistics associated with each experiment, keyed by name

  alldf_dict = defaultdict(list)

  dataset = _data.load_dataset(data_nm, exp_subset = 'vo_spacers', exp_subset_col = 'Designed Name')
  if dataset is None:
    return

  e_dir = '/cluster/mshen/prj/mmej_figures/out/e_ins_modeling/'
  timer = util.Timer(total = 100)
  # for rs in range(1):  # debug: single random seed only
  for rs in range(100):
    prefix = e_dir + 'len_%s_%s' % (data_nm, rs)
    # Pickle files must be opened in binary mode
    test_exps = pickle.load(open(prefix + '_testexps.pkl', 'rb'))
    rate_model = pickle.load(open(prefix + '_model.pkl', 'rb'))
    bp_model = pickle.load(open(prefix + '_bp.pkl', 'rb'))

    for exp in test_exps:
      df = dataset[exp]
      calc_statistics(df, exp, rate_model, bp_model, alldf_dict, rs, data_nm)

    timer.update()

  # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
  alldf = pd.DataFrame(alldf_dict)
  return alldf
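
A hypothetical driver for the cross-validated variant above, aggregating
statistics over all 100 train/test splits; the dataset name is illustrative
and assumes the corresponding model pickles exist under e_dir.

alldf = prepare_statistics('Lib1-mES-controladj')
if alldf is not None:
  print(alldf.describe())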
Example #7
def prepare_dataset_try1():
    dataset_nm = 'dataset_try1'
    print('Preparing %s' % dataset_nm)

    featurized_data = init_featurized_data()

    # Component 1: DisLib long-duplication series
    dataset = _data.load_dataset('DisLib-mES-controladj',
                                 exp_subset='longdup_series',
                                 exp_subset_col='Designed Name')
    prepare_library_dataset(dataset, featurized_data)

    dataset = _data.load_dataset('Lib1-mES-controladj')

    # Remove VO spacers from lib 1
    for vo_spacer_idx in range(1872, 1961 + 1):
        vo_spacer_exp = str(vo_spacer_idx)
        del dataset[vo_spacer_exp]
    print(len(dataset))
    prepare_library_dataset(dataset, featurized_data)

    pickle_featurized_data(featurized_data, dataset_nm)
    return
Example #8
def prepare_statistics():
    # Input: none (the two replicate dataset names are hard-coded below)
    # Output: a uniformly processed dataset that needs minimal further
    # processing for plotting, ideally supporting multiple plot types.
    # Calculate statistics associated with each experiment, keyed by name

    alldf_dict = defaultdict(list)

    data_nm = '1207-mESC-Dislib-Cas9-Tol2-Biorep1-r1-controladj'
    # data_nm = '0105-mESC-Lib1-Cas9-Tol2-BioRep2-r1-controladj'
    dataset1 = _data.load_dataset(data_nm)

    data_nm = '1207-mESC-Dislib-Cas9-Tol2-Biorep1-r2-controladj'
    # data_nm = '0105-mESC-Lib1-Cas9-Tol2-BioRep3-r1-controladj'
    dataset2 = _data.load_dataset(data_nm)

    # Assumes both replicates contain the same experiment names
    for exp in dataset1:
        df1 = dataset1[exp]
        df2 = dataset2[exp]
        calc_statistics(df1, df2, exp, alldf_dict)

    # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
    alldf = pd.DataFrame(alldf_dict)
    return alldf
Example #9
def prepare_statistics(data_nm):
    # Input: dataset name
    # Output: a uniformly processed dataset that needs minimal further
    # processing for plotting, ideally supporting multiple plot types.
    # Calculate statistics associated with each experiment, keyed by name

    alldf_dict = defaultdict(list)

    dataset = _data.load_dataset(data_nm)
    if dataset is None:
        return

    timer = util.Timer(total=len(dataset))
    # for exp in list(dataset.keys())[:100]:  # debug: first 100 experiments only
    for exp in dataset:
        df = dataset[exp]
        calc_statistics(df, exp, alldf_dict)
        timer.update()

    # Return a dataframe where columns are positions and rows are experiment names, values are frequencies
    alldf = pd.DataFrame(alldf_dict)
    return alldf
Example #10
def data_preload(nm):
    # Load the dataset once up front, presumably to warm _data's cache;
    # returning it also lets callers reuse the loaded data
    l2_data = _data.load_dataset(nm)
    return l2_data
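
calc_statistics is referenced throughout these examples but not included in
the excerpts. The stub below sketches only its apparent calling convention:
it mutates alldf_dict in place, appending exactly one value per column per
experiment so that pd.DataFrame(alldf_dict) yields one row per experiment.
The placeholder values and the position columns (mirroring Example #5's
col_order) are assumptions.

def calc_statistics(df, exp, alldf_dict):
    alldf_dict['_Experiment'].append(exp)
    for pos in [str(p) for p in range(-10, 11) if p != 0]:
        # Placeholder: the real implementation computes the indel
        # frequency at this position from df
        alldf_dict[pos].append(0.0)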