def main():
    """Command-line entry point for one entropy job.

    Expects exactly three arguments after the script name:

    * ``select_range`` -- two integers joined by an underscore
      (``START_END``); only the first two fields are used.
    * ``cc_type`` -- forwarded unchanged to the helpers.
    * ``antisense`` -- the string ``true`` (any casing) selects the
      antisense inputs; anything else selects the sense inputs.

    Loads the cross-correlation table for the chosen strand, runs the
    summary-measure computation and finally reports completion through
    ``child_done``.
    """
    _, range_arg, cc_type, antisense_arg = sys.argv

    antisense = antisense_arg.lower() == 'true'
    bounds = range_arg.split('_')
    select_range = (int(bounds[0]), int(bounds[1]))

    # Pick the strand label and the matching inputs in one place.
    if antisense:
        strand = 'antisense'
        cross_corr_path = cross_corr_antisense_path
        orfs = antisense_orfs
    else:
        strand = 'sense'
        cross_corr_path = cross_corr_sense_path
        orfs = paper_orfs

    child_name = get_name(select_range, cc_type)
    range_start, range_end = select_range
    print_fl("Running entropy for %d-%d, %s, %s" %
             (range_start, range_end, cc_type, strand))

    name = 'entropy_%s' % strand
    timer = Timer()

    print_fl("Reading Cross correlation...", end='')
    cross_correlation = pd.read_hdf(cross_corr_path, 'cross_correlation')
    print_fl("Done.")
    timer.print_time()
    print_fl()

    print_fl("Computing entropy")
    calculate_cc_summary_measure_range_type(
        orfs, cross_correlation, cc_type, select_range, strand, timer)

    child_done(name, WATCH_TMP_DIR, child_name)
def collect_mnase(mnase_seq, window, pos_chr_df, pos_key='position',
                  chrom_key='chromosome', strand=None, set_index=False,
                  log=False):
    """Gather MNase-seq fragments around each position in ``pos_chr_df``.

    For chromosomes 1 through 16, every row of ``pos_chr_df`` on that
    chromosome selects the fragments within ``window / 2`` of the row's
    position (via ``filter_mnase``) and re-expresses their ``mid`` column
    relative to that position.

    Args:
        mnase_seq: fragment table accepted by ``filter_mnase``; the
            selected rows must provide ``chr``, ``length``, ``mid`` and
            ``time`` columns.
        window: total window width; half of it is taken on each side.
        pos_chr_df: DataFrame with one row per position of interest.
        pos_key: column of ``pos_chr_df`` holding the centre position.
        chrom_key: column of ``pos_chr_df`` holding the chromosome number.
        strand: optional column name in ``pos_chr_df``. When given, rows
            whose value is not ``'+'`` have their coordinates flipped
            (``center - mid``) so that all windows share an orientation.
        set_index: when true, add a ``parent`` column holding the index
            label of the originating ``pos_chr_df`` row.
        log: when true, print progress every 200 entries and the elapsed
            time at the end.

    Returns:
        A DataFrame with columns ``chr``, ``length``, ``mid``, ``time``
        (plus ``parent`` if ``set_index``), keeping the original fragment
        index labels. An empty DataFrame if no entry was processed.
    """
    win_2 = window/2
    timer = Timer()

    if log:
        print_fl("Collecting MNase-seq fragments for %d entries" % len(pos_chr_df))
        print_fl("around a %d window" % window)

    # The output columns do not depend on the entry, so build them once.
    select_columns = ['chr', 'length', 'mid', 'time']
    if set_index:
        select_columns.append('parent')

    # Accumulate per-entry frames and concatenate once at the end:
    # growing a DataFrame row-block by row-block recopies everything
    # collected so far on each step, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    collected = []

    i = 0
    for chrom in range(1, 17):

        # entries and MNase-seq fragments specific to this chromosome
        chrom_entries = pos_chr_df[pos_chr_df[chrom_key] == chrom]
        if len(chrom_entries) == 0:
            continue

        chrom_mnase = filter_mnase(mnase_seq, chrom=chrom)

        for idx, entry in chrom_entries.iterrows():

            # fragments inside the window around this entry's position
            center = entry[pos_key]
            nuc_mnase = filter_mnase(chrom_mnase, start=center-win_2,
                                     end=center+win_2)

            # Work on an independent copy so that re-centring never
            # writes through to chrom_mnase / the caller's data.
            # NOTE(review): filter_mnase may already return a copy -- the
            # explicit copy is defensive; confirm and drop if redundant.
            nuc_mnase = nuc_mnase.copy()

            # orient left to right (upstream to downstream), 0-centred
            if strand is None or entry[strand] == '+':
                nuc_mnase.loc[:, 'mid'] = nuc_mnase.mid - center
            # opposite strand: flip around the centre
            else:
                nuc_mnase.loc[:, 'mid'] = center - nuc_mnase.mid

            if set_index:
                nuc_mnase['parent'] = idx

            collected.append(nuc_mnase[select_columns])

            # progress report
            if log and i % 200 == 0:
                print_fl("%d/%d - %s" % (i, len(pos_chr_df),
                                         timer.get_time()))

            i += 1

    if log:
        timer.print_time()

    # pd.concat refuses an empty list; keep returning an empty frame.
    if not collected:
        return pd.DataFrame()

    return pd.concat(collected)
def main():
    """Command-line entry point for one per-chromosome cross-correlation job.

    Expects exactly two arguments after the script name: the chromosome
    number and an antisense flag (the string ``true``, any casing, turns
    it on). Builds the kernels, loads the MNase-seq table, selects the
    output directory and ORF set for the requested strand, runs
    ``calculate_cross_correlation_chr`` and finally reports completion
    through ``child_done``.
    """
    from src.kernel_fitter import compute_triple_kernel

    _, chrom_arg, antisense_arg = sys.argv
    antisense = antisense_arg.lower() == 'true'
    chrom = int(chrom_arg)

    print_fl("Running cross correlation on chromosome %d, antisense: %s" %
             (chrom, antisense))

    name = task_name(antisense)
    timer = Timer()

    # Kernels: two loaded from disk, the third derived from the first.
    nuc_kernel = MNaseSeqDensityKernel(filepath=nuc_kernel_path)
    sm_kernel = MNaseSeqDensityKernel(filepath=sm_kernel_path)
    triple_kernel = compute_triple_kernel(nuc_kernel)

    print_fl("Reading MNase-seq...", end='')
    all_mnase_data = pd.read_hdf(mnase_seq_path, 'mnase_data')
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # Output directory and ORF set depend on the strand being processed.
    if antisense:
        cc_dir = cc_antisense_chrom_dir
        cc_orfs = antisense_orfs
    else:
        cc_dir = cc_sense_chrom_dir
        cc_orfs = read_orfs_data("%s/orfs_cd_paper_dataset.csv" % OUTPUT_DIR)

    calculate_cross_correlation_chr(
        all_mnase_data, cc_orfs, chrom, antisense,
        nuc_kernel, sm_kernel, triple_kernel,
        cc_dir, log=True, timer=timer)

    child_done(name, WATCH_TMP_DIR, chrom)
def collect_small_peaks():
    """Call small peaks for every ORF and return them in one table.

    For chromosomes 1 through 16 this reads the per-chromosome sense
    cross-correlation file, negates its ``'diff'`` entry and passes it,
    together with each ORF on that chromosome, to
    ``call_orf_small_peaks``. ORFs for which that call raises ``KeyError``
    are skipped.

    Returns:
        A DataFrame of all collected peaks, indexed by ``name``, a string
        built as ``<orf>_<time>_<chr>_<original_mid>``.

    Raises:
        KeyError: if no peaks were collected at all, because the columns
            used to build ``name`` are then missing.
    """
    from src.small_peak_calling import call_orf_small_peaks
    from src.timer import Timer

    orfs = all_orfs_TSS_PAS()
    timer = Timer()

    # Accumulate per-ORF results and concatenate once at the end:
    # appending to a growing DataFrame recopies it on every iteration,
    # and DataFrame.append no longer exists in pandas >= 2.0.
    peak_frames = []

    for chrom in range(1, 17):
        print("Chromosome %d" % chrom)

        chr_orfs = orfs[orfs.chr == chrom]

        # load the cross correlation computed for this chromosome
        chrom_cross_correlation = pd.read_hdf(
            '%s/cross_correlation_chr%d.h5.z' % (cc_sense_chrom_dir, chrom))

        # sign-flipped 'diff' entry is what the peak caller works on
        small_cc = -1 * chrom_cross_correlation.loc['diff']

        for _, orf in chr_orfs.iterrows():
            try:
                peaks = call_orf_small_peaks(small_cc, orf)
            except KeyError:
                # presumably the ORF has no cross-correlation entry;
                # skipped deliberately -- verify against the peak caller
                continue

            peak_frames.append(peaks)

        timer.print_time()

    # pd.concat refuses an empty list; with nothing collected, continue
    # with an empty frame (the column lookups below then raise KeyError).
    all_peaks = pd.concat(peak_frames) if peak_frames else pd.DataFrame()

    all_peaks = all_peaks.reset_index(drop=True)
    all_peaks['name'] = all_peaks['orf'] + '_' + all_peaks['time'].astype(str) + '_' + \
        all_peaks['chr'].astype(str) + '_' + all_peaks['original_mid'].astype(str)
    all_peaks = all_peaks.set_index('name')

    return all_peaks
def run_model(name, save_dir, predict_TPM=True):
    """Fit the named model with 10-fold cross-validation and save its results.

    Args:
        name: key into the mapping returned by ``get_model_funs()``; also
            used in the output file names.
        save_dir: directory that receives the two CSV files.
        predict_TPM: only echoed in the log output by this function.

    Writes ``<save_dir>/<name>_results.csv`` (true values joined with
    predictions, suffixed ``_true`` / ``_predicted``) and
    ``<save_dir>/res_<name>.csv`` (the model's ``r2`` and ``mse``).
    """
    timer = Timer()

    print_fl("Loading %s model" % name)
    print_fl("Predicting TPM: %s" % predict_TPM)

    # Look up the model factory and build the model without subsampling.
    build_model = get_model_funs()[name]
    model = build_model(sample_N=None)

    folds = 10
    print_fl("Fitting %d folds.." % folds)
    model.fit_cv(log=True, k=folds)

    # true vs. predicted values, side by side
    predictions = model.Y.join(model.Y_predict,
                               lsuffix='_true', rsuffix='_predicted')
    predictions.to_csv('%s/%s_results.csv' % (save_dir, name))

    # fit-quality metrics
    metrics = pd.DataFrame({'r2': model.r2, 'mse': model.mse})
    metrics.to_csv('%s/res_%s.csv' % (save_dir, name))

    timer.print_time()