Exemple #1
0
def main():
    """Command-line entry point: compute the entropy summary for one range.

    Reads exactly three arguments after the script name from ``sys.argv``:

    1. a range written as ``<start>_<end>`` (both parsed with ``int``),
    2. the cross-correlation type, passed through unchanged,
    3. an antisense flag; any casing of ``true`` selects the antisense
       inputs, every other value selects the sense inputs.

    When finished, reports completion through ``child_done``.
    """
    _, range_arg, cc_type, antisense_arg = sys.argv

    # Decode the command-line arguments.
    is_antisense = antisense_arg.lower() == 'true'
    range_parts = range_arg.split('_')
    select_range = (int(range_parts[0]), int(range_parts[1]))
    strand = 'antisense' if is_antisense else 'sense'

    # Choose the cross-correlation file and ORF table for this strand.
    if is_antisense:
        cross_corr_path, orfs = cross_corr_antisense_path, antisense_orfs
    else:
        cross_corr_path, orfs = cross_corr_sense_path, paper_orfs

    child_name = get_name(select_range, cc_type)

    lower, upper = select_range
    print_fl("Running entropy for %d-%d, %s, %s" %
             (lower, upper, cc_type, strand))

    name = 'entropy_%s' % strand
    timer = Timer()

    print_fl("Reading Cross correlation...", end='')
    cross_correlation = pd.read_hdf(cross_corr_path, 'cross_correlation')
    print_fl("Done.")
    timer.print_time()
    print_fl()

    print_fl("Computing entropy")
    calculate_cc_summary_measure_range_type(
        orfs, cross_correlation, cc_type, select_range, strand, timer)

    # Report completion of this child task.
    child_done(name, WATCH_TMP_DIR, child_name)
def collect_mnase(mnase_seq, window, pos_chr_df,
                  pos_key='position', chrom_key='chromosome',
                  strand=None, set_index=False, log=False):
    """Collect MNase-seq fragments around each position, centred on it.

    For every row of ``pos_chr_df`` on chromosomes 1 through 16, the
    fragments returned by ``filter_mnase`` for the window
    ``[center - window/2, center + window/2]`` are gathered and their
    ``mid`` column is rewritten relative to the row's position.

    Parameters
    ----------
    mnase_seq : DataFrame
        MNase-seq fragments; passed to ``filter_mnase``. The selected
        fragments must expose ``chr``, ``length``, ``mid`` and ``time``.
    window : number
        Total window width; half of it is taken on each side of the center.
    pos_chr_df : DataFrame
        One row per site to collect around.
    pos_key, chrom_key : str
        Columns of ``pos_chr_df`` holding the position and the chromosome.
    strand : str or None
        Name of the column of ``pos_chr_df`` holding the strand. When given,
        rows whose value is not ``'+'`` get mirrored coordinates
        (``center - mid``). When ``None`` every row is treated as ``'+'``.
    set_index : bool
        When true, add a ``parent`` column holding the index label of the
        originating ``pos_chr_df`` row.
    log : bool
        When true, print progress every 200 entries and the elapsed time.

    Returns
    -------
    DataFrame
        All collected fragments stacked together (original fragment index
        labels are kept), or an empty ``DataFrame`` if nothing was collected.
    """
    win_2 = window/2
    timer = Timer()
    n_entries = len(pos_chr_df)

    if log:
        print_fl("Collecting MNase-seq fragments for %d entries" % n_entries)
        print_fl("around a %d window" % window)

    # Pieces are gathered in a list and concatenated once at the end:
    # DataFrame.append copied the whole accumulated frame on every call
    # (quadratic) and no longer exists in pandas >= 2.0.
    collected = []

    i = 0
    for chrom in range(1, 17):

        # restrict both the sites and the MNase-seq data to this chromosome
        chrom_entries = pos_chr_df[pos_chr_df[chrom_key] == chrom]
        if len(chrom_entries) == 0:
            continue
        chrom_mnase = filter_mnase(mnase_seq, chrom=chrom)

        for idx, entry in chrom_entries.iterrows():

            # fragments inside the window around this site
            center = entry[pos_key]
            nuc_mnase = filter_mnase(chrom_mnase, start=center-win_2,
                                     end=center+win_2)

            # orient left to right (upstream to downstream), 0-centered
            if strand is None or entry[strand] == '+':
                nuc_mnase.loc[:, 'mid'] = nuc_mnase.mid - center
            # any other strand value: flip around the center
            else:
                nuc_mnase.loc[:, 'mid'] = center - nuc_mnase.mid

            select_columns = ['chr', 'length', 'mid', 'time']
            if set_index:
                nuc_mnase['parent'] = idx
                select_columns.append('parent')

            collected.append(nuc_mnase[select_columns])

            # progress is counted across all chromosomes
            if log and i % 200 == 0:
                print_fl("%d/%d - %s" % (i, n_entries, timer.get_time()))
            i += 1

    # pd.concat raises on an empty list, so keep the empty-frame result
    # the original produced when no site matched.
    if collected:
        collected_mnase_eq = pd.concat(collected)
    else:
        collected_mnase_eq = pd.DataFrame()

    if log:
        timer.print_time()

    return collected_mnase_eq
def main():
    """Command-line entry point: cross correlation for one chromosome.

    Reads exactly two arguments after the script name from ``sys.argv``:
    the chromosome number (parsed with ``int``) and an antisense flag, where
    any casing of ``true`` selects the antisense ORFs and output directory.

    Loads the kernels and the MNase-seq data, runs
    ``calculate_cross_correlation_chr`` and finally reports completion
    through ``child_done``.
    """
    from src.kernel_fitter import compute_triple_kernel

    _, chrom_arg, antisense_arg = sys.argv
    antisense = antisense_arg.lower() == 'true'
    chrom = int(chrom_arg)

    print_fl("Running cross correlation on chromosome %d, antisense: %s" %
             (chrom, str(antisense)))

    name = task_name(antisense)
    timer = Timer()

    # Kernels used by the cross-correlation step.
    nuc_kernel = MNaseSeqDensityKernel(filepath=nuc_kernel_path)
    sm_kernel = MNaseSeqDensityKernel(filepath=sm_kernel_path)
    triple_kernel = compute_triple_kernel(nuc_kernel)

    print_fl("Reading MNase-seq...", end='')
    all_mnase_data = pd.read_hdf(mnase_seq_path, 'mnase_data')
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # Output directory and ORF set depend on the requested strand.
    if antisense:
        cc_dir = cc_antisense_chrom_dir
        cc_orfs = antisense_orfs
    else:
        cc_dir = cc_sense_chrom_dir
        cc_orfs = read_orfs_data("%s/orfs_cd_paper_dataset.csv" % OUTPUT_DIR)

    calculate_cross_correlation_chr(
        all_mnase_data, cc_orfs, chrom, antisense,
        nuc_kernel, sm_kernel, triple_kernel, cc_dir,
        log=True, timer=timer)

    # Report completion of this chromosome's task.
    child_done(name, WATCH_TMP_DIR, chrom)
def collect_small_peaks():
    """Call small-fragment peaks for every ORF on chromosomes 1 through 16.

    For each chromosome, the sense cross-correlation file
    ``cross_correlation_chr<N>.h5.z`` is read from ``cc_sense_chrom_dir``,
    its ``'diff'`` rows are negated, and ``call_orf_small_peaks`` is run for
    each ORF on that chromosome. ORFs for which the call raises ``KeyError``
    are skipped.

    Returns
    -------
    DataFrame
        All peaks, indexed by a ``name`` built as
        ``<orf>_<time>_<chr>_<original_mid>``.

    Raises
    ------
    KeyError
        If no peaks were collected at all, because the empty result has no
        ``orf`` column to build the names from.
    """
    from src.small_peak_calling import call_orf_small_peaks
    from src.timer import Timer

    orfs = all_orfs_TSS_PAS()

    timer = Timer()

    # Per-ORF results are gathered in a list and concatenated once:
    # DataFrame.append copied the accumulated frame on every call and was
    # removed in pandas 2.0.
    # NOTE(review): assumes call_orf_small_peaks returns a DataFrame - confirm.
    peak_frames = []

    for chrom in range(1, 17):

        print("Chromosome %d" % chrom)
        chr_orfs = orfs[orfs.chr == chrom]

        # load relevant cross correlations
        chrom_cross_correlation = pd.read_hdf(
            '%s/cross_correlation_chr%d.h5.z' %
            (cc_sense_chrom_dir, chrom))
        small_cc = -1 * chrom_cross_correlation.loc['diff']

        for idx, orf in chr_orfs.iterrows():

            # best effort: an ORF the peak caller cannot look up is skipped
            try:
                peaks = call_orf_small_peaks(small_cc, orf)
            except KeyError:
                continue

            peak_frames.append(peaks)

        timer.print_time()

    # pd.concat raises on an empty list; fall back to the empty frame the
    # original started from so the no-peaks behaviour is unchanged.
    all_peaks = pd.concat(peak_frames) if peak_frames else pd.DataFrame()

    all_peaks = all_peaks.reset_index(drop=True)
    all_peaks['name'] = all_peaks['orf'] + '_' + all_peaks['time'].astype(str) + '_' + \
        all_peaks['chr'].astype(str) + '_' + all_peaks['original_mid'].astype(str)
    all_peaks = all_peaks.set_index('name')

    return all_peaks
Exemple #5
0
def run_model(name, save_dir, predict_TPM=True):
    """Fit the named model with 10-fold cross validation and save results.

    Parameters
    ----------
    name : str
        Key into the mapping returned by ``get_model_funs``; also used in
        the output file names.
    save_dir : str
        Directory receiving ``<name>_results.csv`` (true and predicted
        values joined side by side) and ``res_<name>.csv`` (``r2`` and
        ``mse``).
    predict_TPM : bool
        Only echoed in the log output by this function.
    """
    timer = Timer()

    print_fl("Loading %s model" % name)
    print_fl("Predicting TPM: %s" % predict_TPM)

    # Look up the model constructor; no sample size is specified.
    build_model = get_model_funs()[name]
    model = build_model(sample_N=None)

    n_folds = 10
    print_fl("Fitting %d folds.." % n_folds)
    model.fit_cv(log=True, k=n_folds)

    # true vs. predicted values
    predictions = model.Y.join(model.Y_predict,
                               lsuffix='_true', rsuffix='_predicted')
    predictions.to_csv('%s/%s_results.csv' % (save_dir, name))

    # fit metrics
    metrics = pd.DataFrame({'r2': model.r2, 'mse': model.mse})
    metrics.to_csv('%s/res_%s.csv' % (save_dir, name))

    timer.print_time()