Example #1

import numpy as np
import pandas as pd

# NOTE: Timer, filter_mnase, and get_binned_counts are project helpers
# assumed importable from the surrounding codebase.
def get_p1_mnase_by_TSS(mnase_data, p1_shift, orfs, time):
    timer = Timer()

    p1_shift = p1_shift.loc[p1_shift.index.isin(orfs.index.values)]

    # sort by chromosome and start, for MNase-seq caching speedup
    p1_shift_sorted_idx = p1_shift[[]].join(
        orfs[['chr', 'start']]).sort_values(['chr', 'start']).index.values

    all_pos = np.arange(-500, 501)

    orf_nuc_mid_counts = p1_shift[[]].copy()
    orf_nuc_start_counts = p1_shift[[]].copy()
    orf_nuc_stop_counts = p1_shift[[]].copy()

    for pos in all_pos:
        orf_nuc_mid_counts[pos] = 0
        orf_nuc_start_counts[pos] = 0
        orf_nuc_stop_counts[pos] = 0

    mnase_data = mnase_data[mnase_data.time == time]

    i = 0
    # iterate over +1 nucleosomes in chromosome/start order
    # (sorted above for the MNase-seq caching speedup)
    for orf_name, row in p1_shift.loc[p1_shift_sorted_idx].iterrows():

        orf = orfs.loc[orf_name]

        span = orf.TSS - 500, orf.TSS + 500
        chrom = orf.chr

        # get MNase-seq at this ORF's TSS: nucleosome-sized fragments
        # (144-174 bp), translated to the TSS origin and strand-flipped
        orf_nuc_mnase = filter_mnase(mnase_data,
                                     start=span[0],
                                     end=span[1],
                                     chrom=chrom,
                                     translate_origin=orf.TSS,
                                     flip=(orf.strand == '-'),
                                     length_select=(144, 174),
                                     sample=time)

        # count the mids, starts, and stops of these fragments per position
        mid_counts = get_binned_counts(orf_nuc_mnase, 'mid')
        start_counts = get_binned_counts(orf_nuc_mnase, 'start')
        stop_counts = get_binned_counts(orf_nuc_mnase, 'stop')

        n = len(mid_counts)
        orf_nuc_mid_counts.loc[orf_name, :] = mid_counts.values.reshape(n)
        orf_nuc_start_counts.loc[orf_name, :] = start_counts.values.reshape(n)
        orf_nuc_stop_counts.loc[orf_name, :] = stop_counts.values.reshape(n)

        if i % 400 == 0:
            print("%d/%d - %s" % (i, len(p1_shift), timer.get_time()))

        i += 1

    return (orf_nuc_mid_counts, orf_nuc_start_counts, orf_nuc_stop_counts)
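
A minimal invocation sketch (hedged: the loader names below are placeholders, not the project's actual API; the column expectations are inferred from the function body above):

# Hypothetical usage sketch.
mnase_data = load_mnase_data()   # placeholder loader; fragment table with a `time` column
orfs = load_orfs()               # placeholder loader; indexed by ORF name with chr, start, TSS, strand
p1_shift = load_p1_shift()       # placeholder loader; indexed by ORF name

mids, starts, stops = get_p1_mnase_by_TSS(mnase_data, p1_shift, orfs, time=0)
# each returned frame has one row per ORF and one column per position in -500..500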
Example #2

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score  # assumed source; the call matches sklearn's signature

# NOTE: Timer, print_fl, fit_gp, get_fold_slice, MSE, filter_mnase,
# load_calculated_promoters, find_motif, and the module globals `times` and
# `cc_sense_chrom_dir` are project-specific and assumed importable.
def fold_cross_validation(X,
                          Y,
                          k=3,
                          times=[0, 7.5, 15, 30, 60, 120],
                          l_scale=1.,
                          l_bounds=(1, 10),
                          time=False,
                          log=False):

    np.random.seed(1)
    original_orfs = X.index.values
    shuffled_orfs_idx = X.index.values.copy()
    np.random.shuffle(shuffled_orfs_idx)

    Y_predict = pd.DataFrame(index=shuffled_orfs_idx)
    for t in times:
        Y_predict[t] = 0.

    N = len(X)
    fold_size = N // k  # integer division (N / k yields a float in Python 3)

    timer = Timer()
    last_fold_models = {}

    for time in times:  # shadows the unused `time` keyword argument
        for fold in range(k):

            if log: print_fl("%d/%d" % ((fold + 1), k))

            X_train, Y_train, X_test, Y_test = get_fold_slice(
                X, Y, k, fold, time)
            test_orfs = X_test.index

            model = fit_gp(X_train.values, Y_train.values, l_scale, l_bounds)
            Y_pred = model.predict(X_test.values)

            r2 = r2_score(Y_test.values, Y_pred)

            Y_predict.loc[test_orfs, time] = Y_pred

            if log:
                print_fl(("\t%s - %s - r2 = %.3f" %
                          (str(time), timer.get_time(), r2)))

            last_fold_models[time] = model

            if log:
                print_fl('')

    mse = MSE(Y.loc[shuffled_orfs_idx], Y_predict.loc[shuffled_orfs_idx])

    r2 = mse[[]].copy()
    for time in times:
        r2.loc[time] = r2_score(Y.loc[shuffled_orfs_idx][time],
                                Y_predict.loc[shuffled_orfs_idx][time])

    return last_fold_models, mse, r2, Y_predict.loc[original_orfs]
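
For context, a hedged usage sketch with synthetic inputs: X holds one feature row per ORF and Y one response column per time point. The shapes and ORF names are illustrative only, and the project helpers (fit_gp, get_fold_slice, MSE) must be importable for the call to run.

# Hypothetical usage sketch with synthetic data.
import numpy as np
import pandas as pd

orf_names = ['YAL001C', 'YAL002W', 'YAL003W']              # illustrative index
X = pd.DataFrame(np.random.rand(3, 4), index=orf_names)    # features per ORF
Y = pd.DataFrame(np.random.rand(3, 6), index=orf_names,
                 columns=[0, 7.5, 15, 30, 60, 120])        # response per time point

models, mse, r2, Y_pred = fold_cross_validation(X, Y, k=3, log=True)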
def collect_mnase(mnase_seq, window, pos_chr_df, 
                  pos_key='position', chrom_key='chromosome',
                  strand=None, set_index=False, log=False):

    collected_mnase = []  # accumulate per-entry frames; concatenated once at the end
    win_2 = window // 2

    timer = Timer()

    if log:
        print_fl("Collecting MNase-seq fragments for %d entries" % len(pos_chr_df))
        print_fl("around a %d window" % window)

    i = 0
    for chrom in range(1, 17):
        
        # get chromosome-specific nucleosomes and MNase-seq
        chrom_entries = pos_chr_df[pos_chr_df[chrom_key] == chrom]    
        if len(chrom_entries) == 0: continue
        chrom_mnase = filter_mnase(mnase_seq, chrom=chrom)
        
        # for each element in the dataset
        for idx, entry in chrom_entries.iterrows():
            
            # get MNase-seq fragments at pos_chr_df
            # and 0 center 
            center = entry[pos_key]
            nuc_mnase = filter_mnase(chrom_mnase, start=center-win_2, end=center+win_2)

            # orient properly left to right (upstream to downstream)
            if strand is None or entry[strand] == '+':
                nuc_mnase.loc[:, 'mid'] = nuc_mnase.mid - center
            # crick strand, flip 
            else:
                nuc_mnase.loc[:, 'mid'] = center - nuc_mnase.mid

            select_columns = ['chr', 'length', 'mid', 'time']
            if set_index:
                nuc_mnase['parent'] = idx
                select_columns.append('parent')

            # collect this entry's fragments
            collected_mnase.append(nuc_mnase[select_columns])

            # print progress
            if log and i % 200 == 0:
                print_fl("%d/%d - %s" % (i, len(pos_chr_df), timer.get_time()))
            i += 1

    if log: timer.print_time()

    return pd.concat(collected_mnase) if collected_mnase else pd.DataFrame()
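
A hedged invocation sketch: `mnase_seq` is assumed to be a fragment table carrying the columns selected above (`chr`, `length`, `mid`, `time`), and the positions frame uses the default column names.

# Hypothetical usage sketch: gather fragments in a 1 kb window around two
# positions of interest; `mnase_seq` is assumed loaded elsewhere.
positions = pd.DataFrame({'position': [12000, 45000],
                          'chromosome': [1, 2]},
                         index=['site_a', 'site_b'])
fragments = collect_mnase(mnase_seq, window=1000, pos_chr_df=positions,
                          set_index=True, log=True)
# with set_index=True, the `parent` column maps fragments back to site_a/site_b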
    def collect_motifs(self):
        
        fimo = self.fimo
        timer = Timer()

        all_motifs = []  # accumulate per-peak motif frames

        # filter out peaks that fall outside of ORF promoters
        promoters = load_calculated_promoters()

        search_peaks = self.collected_peaks.reset_index(drop=True).copy()
            
        # drop peaks whose midpoint falls outside the ORF's promoter
        print("Filtering peaks outside of promoters")
        print("Peaks before filter: %d" % len(search_peaks))
        for orf_name, row in promoters.iterrows():
            
            cur_peaks = search_peaks[search_peaks.orf == orf_name]
            
            if len(cur_peaks) > 0:
                
                # remove if outside of promoter   
                remove_peaks = cur_peaks[(cur_peaks.original_mid > row.promoter_stop) | 
                                         (cur_peaks.original_mid < row.promoter_start)]
                search_peaks = search_peaks.drop(remove_peaks.index)

        self.prom_peaks = search_peaks
        print("Peaks after filter: %d" % len(search_peaks))

        for idx, peak in search_peaks.reset_index().iterrows():
            search_window = (peak.original_mid-50, peak.original_mid+50)

            try:
                motifs = find_motif(fimo, None, peak.chr, search_window)
            except KeyError:
                continue

            motifs['orf'] = peak.orf
            motifs['peak'] = peak['name']
            motifs = motifs[['orf', 'tf', 'score', 'p-value', 'q-value', 'motif_mid', 
                             'strand', 'peak']]
            all_motifs.append(motifs)

            if idx % 100 == 0:
                print("%d/%d - %s" % (idx, len(search_peaks), timer.get_time()))
        all_motifs = (pd.concat(all_motifs).reset_index(drop=True)
                      if all_motifs else pd.DataFrame())
        self.all_motifs = all_motifs
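
The two methods in this example are coupled through instance state; a hypothetical call-order sketch follows (the class name and constructor are placeholders, inferred only from the attributes each method reads and writes):

# Hypothetical call-order sketch -- `PeakAnalysis` is a placeholder name.
# link_peaks() (below) populates self.collected_peaks, which
# collect_motifs() then filters to promoters and scans for motifs.
analysis = PeakAnalysis(...)   # placeholder constructor
analysis.link_peaks()          # reads self.all_peaks and self.window_2
analysis.collect_motifs()      # reads self.fimo and self.collected_peaks
print(analysis.all_motifs.head())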
    def link_peaks(self):

        all_peaks = self.all_peaks
        # select highest 10% of peaks
        q = np.quantile(all_peaks.cross_correlation, 0.9)
        print("Peak cutoff %.1f" % q)

        # keep only peaks above the cutoff; duplicates across times are
        # merged by the greedy pass below
        test_peaks = all_peaks[all_peaks.cross_correlation > q]

        timer = Timer()

        collected_rows = []  # greedy selection: highest peak first, suppress neighbors

        test_peaks = test_peaks.sort_values('cross_correlation', ascending=False)

        window_2 = self.window_2
        while len(test_peaks) > 0:
            
            highest = test_peaks.reset_index().loc[0]
            selected_near = test_peaks[(test_peaks.chr == highest.chr) & 
                                       (test_peaks.original_mid < highest.original_mid + window_2) & 
                                       (test_peaks.original_mid > highest.original_mid - window_2)]
            test_peaks = test_peaks.drop(selected_near.index)
            collected_rows.append(highest)

            if len(collected_rows) % 100 == 0:
                print("%d, (-%d) - %s" % (len(collected_rows), len(test_peaks), timer.get_time()))

        collect_peaks = pd.DataFrame(collected_rows)
        self.collected_peaks = collect_peaks
        timer = Timer()

        test_peaks = collect_peaks.set_index('name')
        linked_peaks = test_peaks[[]].copy()

        for time in times:
            linked_peaks[time] = 0.0

        i = 0
        for chrom in range(1, 17):    
            
            chrom_peaks = test_peaks[test_peaks.chr == chrom]
            
            if len(chrom_peaks) == 0: continue

            chrom_cross_correlation = pd.read_hdf(
                    '%s/cross_correlation_chr%d.h5.z' % 
                    (cc_sense_chrom_dir, chrom))

            for idx, peak in chrom_peaks.iterrows():
                cols = np.arange(peak.mid-window_2, peak.mid+window_2)
                
                try:
                    peak_cc = (chrom_cross_correlation.loc['small']
                               .loc[peak.orf][cols].mean(axis=1))
                except KeyError:
                    continue

                linked_peaks.loc[idx] = peak_cc

                if i % 100 == 0:
                    print("%d/%d - %s" % (i, len(test_peaks), timer.get_time()))
                i += 1

        self.linked_peaks = linked_peaks

        # normalize linked_peaks: shift each later time so its mean matches t=0
        linked_peaks_normalized = linked_peaks.copy()
        value_0 = linked_peaks[0.0].copy()
        for time in times[1:]:
            # add a constant offset so this time's mean equals t=0's mean
            values = linked_peaks[time] + (value_0.mean() - linked_peaks[time].mean())
            linked_peaks_normalized[time] = values
        self.linked_peaks_normalized = linked_peaks_normalized
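
The closing normalization shifts each later time column by a constant so its mean matches the t=0 mean; a minimal self-contained illustration:

# Mean-shift normalization in isolation (same arithmetic as above).
import pandas as pd

lp = pd.DataFrame({0.0: [1.0, 3.0], 7.5: [10.0, 12.0]})
shift = lp[0.0].mean() - lp[7.5].mean()   # 2.0 - 11.0 = -9.0
lp[7.5] = lp[7.5] + shift                 # both columns now have mean 2.0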