def get_p1_mnase_by_TSS(mnase_data, p1_shift, orfs, time):
    timer = Timer()

    p1_shift = p1_shift.loc[p1_shift.index.isin(orfs.index.values)]

    # sort by chromosome and start, for MNase-seq caching speedup
    p1_shift_sorted_idx = p1_shift[[]].join(
        orfs[['chr', 'start']]).sort_values(['chr', 'start']).index.values

    all_pos = np.arange(-500, 501)
    orf_nuc_mid_counts = p1_shift[[]].copy()
    orf_nuc_start_counts = p1_shift[[]].copy()
    orf_nuc_stop_counts = p1_shift[[]].copy()

    for pos in all_pos:
        orf_nuc_mid_counts[pos] = 0
        orf_nuc_start_counts[pos] = 0
        orf_nuc_stop_counts[pos] = 0

    mnase_data = mnase_data[mnase_data.time == time]

    i = 0
    # for each +1 nucleosome (in chromosome/start order), collect the
    # MNase-seq fragments around its ORF's TSS
    for orf_name, row in p1_shift.loc[p1_shift_sorted_idx].iterrows():
        orf = orfs.loc[orf_name]
        span = orf.TSS - 500, orf.TSS + 500
        chrom = orf.chr

        # get nucleosome-sized MNase-seq fragments at this ORF's TSS
        orf_nuc_mnase = filter_mnase(
            mnase_data, start=span[0], end=span[1], chrom=chrom,
            translate_origin=orf.TSS, flip=(orf.strand == '-'),
            length_select=(144, 174), sample=time)

        # get the counts of the starts, stops, and mids of the
        # nucleosome-sized fragments
        mid_counts = get_binned_counts(orf_nuc_mnase, 'mid')
        start_counts = get_binned_counts(orf_nuc_mnase, 'start')
        stop_counts = get_binned_counts(orf_nuc_mnase, 'stop')

        n = len(mid_counts)
        orf_nuc_mid_counts.loc[orf_name, :] = mid_counts.values.reshape(n)
        orf_nuc_start_counts.loc[orf_name, :] = start_counts.values.reshape(n)
        orf_nuc_stop_counts.loc[orf_name, :] = stop_counts.values.reshape(n)

        if i % 400 == 0:
            print("%d/%d - %s" % (i, len(p1_shift), timer.get_time()))
        i += 1

    return (orf_nuc_mid_counts, orf_nuc_start_counts, orf_nuc_stop_counts)
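# A minimal usage sketch for get_p1_mnase_by_TSS (variable names are
# illustrative; the input DataFrames are assumed to be loaded elsewhere
# in the pipeline):
#
#   mids, starts, stops = get_p1_mnase_by_TSS(mnase_data, p1_shift,
#                                             orfs, time=0.0)
#   # each result is indexed by ORF, with columns -500..500 relative to TSS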
def fold_cross_validation(X, Y, k=3, times=[0, 7.5, 15, 30, 60, 120],
                          l_scale=1., l_bounds=(1, 10), time=False,
                          log=False):
    np.random.seed(1)

    original_orfs = X.index.values
    shuffled_orfs_idx = X.index.values.copy()
    np.random.shuffle(shuffled_orfs_idx)

    Y_predict = pd.DataFrame(index=shuffled_orfs_idx)
    for t in times:
        Y_predict[t] = 0.

    N = len(X)
    fold_size = N // k
    timer = Timer()
    last_fold_models = {}

    # note: the loop variable below shadows the `time` keyword argument
    for time in times:
        for fold in range(k):
            if log:
                print_fl("%d/%d" % ((fold + 1), k))

            X_train, Y_train, X_test, Y_test = get_fold_slice(
                X, Y, k, fold, time)
            test_orfs = X_test.index

            model = fit_gp(X_train.values, Y_train.values, l_scale, l_bounds)
            Y_pred = model.predict(X_test.values)
            r2 = r2_score(Y_test.values, Y_pred)

            Y_predict.loc[test_orfs, time] = Y_pred

            if log:
                print_fl("\t%s - %s - r2 = %.3f" %
                         (str(time), timer.get_time(), r2))

        last_fold_models[time] = model
        if log:
            print_fl('')

    mse = MSE(Y.loc[shuffled_orfs_idx], Y_predict.loc[shuffled_orfs_idx])
    r2 = mse[[]].copy()
    for time in times:
        r2.loc[time] = r2_score(Y.loc[shuffled_orfs_idx][time],
                                Y_predict.loc[shuffled_orfs_idx][time])

    return last_fold_models, mse, r2, Y_predict.loc[original_orfs]
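# A minimal usage sketch, assuming X (features) and Y (one response column
# per time point) share the same ORF index (names are illustrative):
#
#   models, mse, r2, Y_pred = fold_cross_validation(X, Y, k=3, log=True)
#   print(r2)  # cross-validated R^2 for each time point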
def collect_mnase(mnase_seq, window, pos_chr_df,
                  pos_key='position', chrom_key='chromosome',
                  strand=None, set_index=False, log=False):

    collected_mnase_seq = pd.DataFrame()
    win_2 = window // 2
    timer = Timer()

    if log:
        print_fl("Collecting MNase-seq fragments for %d entries" %
                 len(pos_chr_df))
        print_fl("around a %d window" % window)

    i = 0
    for chrom in range(1, 17):
        # get chromosome-specific entries and MNase-seq
        chrom_entries = pos_chr_df[pos_chr_df[chrom_key] == chrom]
        if len(chrom_entries) == 0:
            continue
        chrom_mnase = filter_mnase(mnase_seq, chrom=chrom)

        # for each element in the dataset
        for idx, entry in chrom_entries.iterrows():
            # get MNase-seq fragments in the window around this entry's
            # position, re-centered to 0
            center = entry[pos_key]
            nuc_mnase = filter_mnase(chrom_mnase, start=center-win_2,
                                     end=center+win_2)

            # orient properly left to right (upstream to downstream)
            if strand is None or entry[strand] == '+':
                nuc_mnase.loc[:, 'mid'] = nuc_mnase.mid - center
            # Crick strand, flip
            else:
                nuc_mnase.loc[:, 'mid'] = center - nuc_mnase.mid

            select_columns = ['chr', 'length', 'mid', 'time']
            if set_index:
                nuc_mnase['parent'] = idx
                select_columns.append('parent')

            # append to the collected MNase-seq
            collected_mnase_seq = collected_mnase_seq.append(
                nuc_mnase[select_columns])

            # print progress
            if log and i % 200 == 0:
                print_fl("%d/%d - %s" % (i, len(pos_chr_df),
                                         timer.get_time()))
            i += 1

    if log:
        timer.print_time()

    return collected_mnase_seq
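# A minimal usage sketch (names are illustrative): collect fragments in a
# 1 kb window around each entry, oriented by strand:
#
#   collected = collect_mnase(mnase_seq, window=1000, pos_chr_df=entries,
#                             pos_key='position', chrom_key='chromosome',
#                             strand='strand', log=True)
#   # collected.mid is now relative to each entry's position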
def collect_motifs(self):
    fimo = self.fimo
    timer = Timer()
    all_motifs = pd.DataFrame()

    # filter out peaks that fall outside of ORF promoters
    promoters = load_calculated_promoters()
    search_peaks = self.collected_peaks.reset_index(drop=True).copy()

    print("Filtering peaks outside of promoters")
    print(len(search_peaks))
    for orf_name, row in promoters.iterrows():
        cur_peaks = search_peaks[search_peaks.orf == orf_name]
        if len(cur_peaks) > 0:
            # remove peaks outside of this promoter
            remove_peaks = cur_peaks[
                (cur_peaks.original_mid > row.promoter_stop) |
                (cur_peaks.original_mid < row.promoter_start)]
            search_peaks = search_peaks.drop(remove_peaks.index)
    self.prom_peaks = search_peaks
    print(len(search_peaks))

    for idx, peak in search_peaks.reset_index().iterrows():
        search_window = (peak.original_mid - 50, peak.original_mid + 50)
        try:
            motifs = find_motif(fimo, None, peak.chr, search_window)
        except KeyError:
            continue

        motifs['orf'] = peak.orf
        motifs['peak'] = peak['name']
        motifs = motifs[['orf', 'tf', 'score', 'p-value', 'q-value',
                         'motif_mid', 'strand', 'peak']]
        all_motifs = all_motifs.append(motifs)

        if idx % 100 == 0:
            print("%d/%d - %s" % (idx, len(search_peaks), timer.get_time()))

    all_motifs = all_motifs.reset_index(drop=True)
    self.all_motifs = all_motifs
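# A hedged usage sketch (`caller` is an illustrative instance of this
# class): collect_motifs reads self.collected_peaks, so it is expected to
# run after link_peaks (defined below):
#
#   caller.link_peaks()
#   caller.collect_motifs()
#   print(caller.all_motifs.head())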
def link_peaks(self):
    all_peaks = self.all_peaks

    # select the highest 10% of peaks
    q = np.quantile(all_peaks.cross_correlation, 0.9)
    print("Peak cutoff %.1f" % q)

    # remove duplicate peaks across times
    test_peaks = all_peaks[all_peaks.cross_correlation > q]
    timer = Timer()
    collect_peaks = pd.DataFrame()
    test_peaks = test_peaks.sort_values('cross_correlation', ascending=False)
    window_2 = self.window_2

    while len(test_peaks) > 0:
        # take the highest remaining peak and drop every peak within half
        # a window of it on the same chromosome
        highest = test_peaks.reset_index().loc[0]
        selected_near = test_peaks[
            (test_peaks.chr == highest.chr) &
            (test_peaks.original_mid < highest.original_mid + window_2) &
            (test_peaks.original_mid > highest.original_mid - window_2)]
        test_peaks = test_peaks.drop(selected_near.index)
        collect_peaks = collect_peaks.append(highest)

        if len(collect_peaks) % 100 == 0:
            print("%d, (-%d) - %s" % (len(collect_peaks), len(test_peaks),
                                      timer.get_time()))

    self.collected_peaks = collect_peaks

    timer = Timer()
    test_peaks = collect_peaks.set_index('name')
    linked_peaks = test_peaks[[]].copy()
    for time in times:
        linked_peaks[time] = 0.0

    i = 0
    for chrom in range(1, 17):
        chrom_peaks = test_peaks[test_peaks.chr == chrom]
        if len(chrom_peaks) == 0:
            continue
        chrom_cross_correlation = pd.read_hdf(
            '%s/cross_correlation_chr%d.h5.z' % (cc_sense_chrom_dir, chrom))

        for idx, peak in chrom_peaks.iterrows():
            cols = np.arange(peak.mid - window_2, peak.mid + window_2)
            try:
                peak_cc = chrom_cross_correlation.loc['small'] \
                    .loc[peak.orf][cols].mean(axis=1)
            except KeyError:
                continue
            linked_peaks.loc[idx] = peak_cc

            if i % 100 == 0:
                print("%d/%d - %s" % (i, len(test_peaks), timer.get_time()))
            i += 1

    self.linked_peaks = linked_peaks

    # normalize linked peaks to the t=0 mean
    linked_peaks_normalized = linked_peaks.copy()
    value_0 = linked_peaks[0.0].copy()
    for time in times[1:]:
        values = linked_peaks[time] + (value_0.mean() -
                                       linked_peaks[time].mean())
        linked_peaks_normalized[time] = values
    self.linked_peaks_normalized = linked_peaks_normalized
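# A hedged note on the normalization above: each later time point is
# shifted by a constant so its mean matches t=0. For example
# (`caller` is illustrative, with self.all_peaks and self.window_2
# populated by an earlier peak-calling step):
#
#   caller.link_peaks()
#   lp = caller.linked_peaks_normalized
#   # lp[t].mean() is approximately lp[0.0].mean() for every t in times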