Example #1
def filterpeaks(peak_data, name, filtering=True):
    """Filter peaks of a 'Sample vs Control' comparison and write the result to disk."""
    import os
    df = peak_data
    df['chr'] = df['chr'].astype('str')
    sample = name.split(' ')  # e.g. 'SampleA vs ControlB ...' -> sample[0]=SampleA, sample[2]=ControlB
    colnames = df.columns.values.tolist()
    if filtering:
        indices1 = [i for i, s in enumerate(colnames) if sample[0] in s]
        indices2 = [i for i, s in enumerate(colnames) if sample[2] in s]
        # Pick the first raw (non-normalized) 'Tag count' column for sample and control;
        # both 'RA' and non-'RA' columns are accepted.
        condition = control = None
        for i in indices1:
            if 'norm' not in colnames[i] and 'Tag count' in colnames[i]:
                condition = colnames[i]
                break
        for i in indices2:
            if 'norm' not in colnames[i] and 'Tag count' in colnames[i]:
                control = colnames[i]
                break
        if condition is None or control is None:
            raise ValueError("Filtering: sample name differs from column name.")
        print('Sample lane: ' + condition)
        print('Control lane: ' + control)
        # Broad histone marks only get the simple enrichment filter, not the width filter.
        exclude_from_filtering = ['H3K36me3', 'H3K27me3', 'H3K4me1']

        # Condition for simple filtering of peaks
        if any(s in condition for s in exclude_from_filtering):
            print('Sample name in simple filtering list')
            final = df[df[condition] >= 2 * df[control]]
        else:
            print('Using default filtering....')
            # Require at least 2x enrichment over control...
            df1 = df[df[condition] >= 2 * df[control]]
            # ...and a peak-width / tag-count ratio of at most 15.
            final = df1[((df1['stop'] - df1['start']) / df1[condition]) <= 15]
        print('Default peak count:', df.shape)
        print('Filtered peak count:', final.shape)
    else:
        final = df
        print('Dataframe is not filtered:', final.shape)
    # basepath and paths are module-level globals of the surrounding project.
    with open(basepath + '/further_analysis/filtered/filteredPeaksCount.txt', 'a') as fh:
        fh.write(name + '\t' + str(len(df)) + '\t' + str(len(final)) + '\n')
    dir_path = basepath + '/further_analysis/filtered/' + name
    paths.ensure_path(dir_path)
    samPath = os.path.join(dir_path, name + '.tsv')
    final.to_csv(samPath, sep="\t", header=True)
    final.index = range(len(final))
    return final, dir_path
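
A minimal usage sketch, assuming the module-level `basepath` and `paths` globals are configured; the column and sample names are hypothetical, and the name argument must follow the 'Sample vs Control' pattern the function splits on:

import pandas as pd

# Hypothetical peak table; real tables come from the peak-calling/annotation step.
peaks = pd.DataFrame({
    'chr': ['1', '2'], 'start': [100, 500], 'stop': [400, 900],
    'SampleA Tag count': [50, 8], 'IgG_A Tag count': [5, 6],
})
filtered, outdir = filterpeaks(peaks, 'SampleA vs IgG_A', filtering=True)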
Example #2
def OverlappingPeaks(dict_peaksdf, name, name1):
    """
    Compute overlapping peaks between two samples.
    :param dict_peaksdf: dict mapping sample name -> PeakAnalysis object (each with a .peaks dataframe)
    :param name: name of the first sample
    :param name1: name of the second sample
    :return: dataframe of overlapping regions
    """
    import timeit
    print('Check point: Overlapping analysis')
    print('\n', name, 'vs', name1)
    df1 = dict_peaksdf[name].peaks.sort_values(by='chr', ascending=True)
    df2 = dict_peaksdf[name1].peaks.sort_values(by='chr', ascending=True)
    # Time the overlap computation; fall back to the concise method if columns are missing.
    start1 = timeit.default_timer()
    try:
        overlap_list = PeakOverlaps(df1, df2)
    except Exception as e:
        print('\nWarning: Dataframe does not contain all the columns required for overlap, '
              'switching to minimal column requirement.')
        print(e)
        overlap_list = PeakOverlaps_concise(df1, df2)
    stop1 = timeit.default_timer()
    print("Time consumed by method PeakOverlaps:", stop1 - start1, 'sec')
    ddf = pd.DataFrame(overlap_list)
    dirPath = os.path.join(basepath, 'further_analysis', 'overlap', name+'_vs_'+name1)
    commons.ensure_path(dirPath)
    u_df1, u_df2 = get_unique_peaks(df1, df2, name, name1, ddf, dirPath)
    ddf.to_csv(os.path.join(dirPath, name+'_vs_'+name1+'.tsv'), sep="\t", encoding='utf-8', index=False)
    overlap_dict = {name: u_df1, 'overlap': ddf, name1: u_df2}
    stacke_plot_multiple([name, 'overlap', name1], overlap_dict, dirPath, overlap=True)
    peakTSSbinning('overlap', overlap_dict, dirPath, overlap=True)
    venn4overlap(len(df1), len(df2), ddf, dirPath, [name, name1])
    return ddf
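
A brief sketch of the expected input, using a hypothetical stand-in for the project's PeakAnalysis class (only the .peaks attribute is used above):

import pandas as pd

class PeakStub:  # hypothetical stand-in for the project's PeakAnalysis class
    def __init__(self, peaks):
        self.peaks = peaks  # dataframe with at least 'chr', 'start' and 'stop'

df_a = pd.DataFrame({'chr': ['1'], 'start': [100], 'stop': [400]})
df_b = pd.DataFrame({'chr': ['1'], 'start': [150], 'stop': [450]})
dict_peaksdf = {'SampleA': PeakStub(df_a), 'SampleB': PeakStub(df_b)}
ddf = OverlappingPeaks(dict_peaksdf, 'SampleA', 'SampleB')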
Example #3
def make_dir(bam_order, region='All'):
    import os
    path = os.path.join(basepath, 'further_analysis/overlapping_plots', bam_order, region)
    print('Path created: ' + path)
    # Create subfolders for raw and normalized plots.
    commons.ensure_path(os.path.join(path, 'raw'))
    commons.ensure_path(os.path.join(path, 'norm'))
    return path
Example #4
    def index_genome_star(self, ram_atdisposal=30, genome_dir=None):
        """
        Index the downloaded genome for the STAR aligner
        :return:
        """
        cmd = []
        star = os.path.join(tools_folder, 'aligners', 'STAR',
                            'bin/Linux_x86_64/STAR')
        cmd.extend([star])
        cmd.extend([
            '--runMode', 'genomeGenerate', '--runThreadN',
            str(multiprocessing.cpu_count() - 2)  # subprocess arguments must be strings
        ])
        cmd.extend(['--genomeFastaFiles', self.whole_genome])
        cmd.extend(['--sjdbGTFfile', self.gtf])

        # If you do not have enough memory, cap RAM and use a sparse suffix array
        if ram_atdisposal < 60:
            cmd.extend([
                '--limitGenomeGenerateRAM', '24000000000', '--genomeSAsparseD',
                '2'
            ])

        # Output path for the STAR index
        if genome_dir is None:
            genome_dir = commons.ensure_path(
                os.path.join(self.DownloadGenome.release_path, 'Sequence',
                             'STARIndex'))
        cmd.extend(['--genomeDir', genome_dir])
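
The method assembles the STAR argument list but the snippet ends before launching it; a minimal sketch of how such a command list is typically executed, assuming the method is extended accordingly:

import subprocess

# Hypothetical continuation: launch STAR and raise CalledProcessError on a non-zero exit.
subprocess.run(cmd, check=True)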
Example #5
    def __init__(self, genome, alignedlanes, outpath=None, bampaths=None):
        self.genome = genome
        if outpath is None:
            self.outpath = commons.ensure_path(
                os.path.join(basepath, 'results', 'RnaSeq'))
        else:
            self.outpath = outpath
        self.alignedlanes = alignedlanes
        self.bampaths = bampaths  # list of bam paths
Example #6
    def run_analysis(self):
        if self.method == 'meme':
            self.path2folder = os.path.join(basepath + path_to_seq, self.name,
                                            'meme')
            commons.ensure_path(self.path2folder)
            self.peak2seq(self.name)
            if self.background is not None:
                self.peak2seq('background')
            motif_db = [
                "JASPAR_CORE_2016_vertebrates.meme", "HOCOMOCOv9.meme",
                "SwissRegulon_human_and_mouse.meme"
            ]
            self.meme_motif(motif_db)

        elif self.method == 'homer':
            self.path2folder = os.path.join(basepath + path_to_seq, self.name,
                                            'homer')
            commons.ensure_path(self.path2folder)
            self.motif_analysis_homer()
Example #7
def permutation_test4peakdensity(peak_df, name, comparisions, sname=None, n=None, niter=100, outdir=None):
    '''
    Permutation test for the factor-binding difference between two conditions.
    :return:
    '''
    import matplotlib.pyplot as plt
    import seaborn as sns
    if n is None or len(peak_df) < n:
        raise ValueError('Please provide the number of peaks to select; n must not exceed the total peak count')
    print('Permutation test is randomly selecting ' + str(n) + ' peaks for ' + str(niter) + ' iterations')
    print(outdir)
    commons.ensure_path(outdir)
    outpath = os.path.join(outdir, 'permutation_test', sname)
    commons.ensure_path(outpath)

    peak_df = peak_df.rename(columns={'Next Gene name': 'Next transcript gene name'})
    filtered_peak = {'loaded_sample': peak_df}
    try:
        print('reading count data from old file')
        diffbindDF = pd.read_csv(os.path.join(outpath, 'count_data.txt'), sep='\t', header=0)
    except (IOError, OSError):
        # No cached count data yet: compute it and cache it for the next run.
        highest = False
        diffbind = differential_binding.Overlaps(name, filtered_peak)
        diffbindDF = diffbind.diffBinding('loaded_sample', highest=highest)
        diffbindDF.to_csv(os.path.join(outpath, 'count_data.txt'), sep='\t', header=True, index=None)

    def plot_permutation(iterDF, mediandiff, pval, outpath, niter):
        sns.set('talk')
        plt.figure(figsize=(8, 6))
        sns.distplot(iterDF['median_diff'], rug=True, hist=False, color='r')
        # Mark the observed median difference on top of the permutation distribution.
        plt.bar(mediandiff, 5, width=0.01)
        low = min(min(iterDF['median_diff']), mediandiff)
        high = max(max(iterDF['median_diff']), mediandiff)
        print(low + (low / 8), high + (high / 8), mediandiff)
        # Pad the x-limits so the observed value is not flush with the plot edge.
        if low < 0:
            xlow = low + (low / 8.)
        else:
            xlow = low - (low / 8.)
        plt.xlim(xlow, high + (abs(high) / 8.))
        plt.ylabel('Freq. of difference')
        plt.xlabel('median diff. is ' + str(iterDF['median_diff'].median()))
        plt.title('p-val of difference:' + str(pval) + ' ;trial:' + str(niter))
        # 'samples' is taken from the enclosing loop scope below.
        plt.savefig(os.path.join(outpath, '_'.join(samples) + '.png'))
        plt.clf()
        plt.close()

    def test_significance_of_difference(iterDF, mediandiff, trial):
        # One-sided empirical p-value with a +1 correction for the observed value.
        count = 0
        if mediandiff > iterDF['median_diff'].median():
            count = len(iterDF[iterDF['median_diff'] >= mediandiff])
        if mediandiff < iterDF['median_diff'].median():
            count = len(iterDF[iterDF['median_diff'] <= mediandiff])
        print(count, mediandiff, trial)
        pval = (count + 1.) / trial
        print(pval)
        return pval

    for mediandiff, samples in comparisions.items():
        # Note: the '*_diff' columns hold ratios (sample0 / sample1), not subtractions.
        iterDF = pd.DataFrame(0, columns=[samples[0] + '_mean', samples[1] + '_mean', samples[0] + '_median',
                                          samples[1] + '_median', 'mean_diff', 'median_diff'], index=range(niter))
        print(samples)
        for i in range(niter):
            peakdf = differential_binding.random_sampleing_df(diffbindDF, n)
            iterDF.iloc[i, 0] = peakdf[samples[0]].mean()
            iterDF.iloc[i, 1] = peakdf[samples[1]].mean()
            iterDF.iloc[i, 2] = peakdf[samples[0]].median()
            iterDF.iloc[i, 3] = peakdf[samples[1]].median()
            iterDF.iloc[i, 4] = peakdf[samples[0]].mean() / peakdf[samples[1]].mean()
            iterDF.iloc[i, 5] = peakdf[samples[0]].median() / peakdf[samples[1]].median()
        iterDF.to_csv(os.path.join(outpath, '_'.join(samples) + '.txt'), sep='\t', header=True, index=None)
        pval = test_significance_of_difference(iterDF, mediandiff, niter)
        plot_permutation(iterDF, mediandiff, pval, outpath, niter)
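
A hedged usage sketch with hypothetical sample names: keys of `comparisions` are the observed median ratios, values are the pairs of count columns compared by the loop above (sample0 / sample1):

# Hypothetical observed ratio and column pair from diffbindDF.
comparisions = {1.8: ('SampleA_RA', 'SampleA')}
permutation_test4peakdensity(peak_df, 'SampleA', comparisions,
                             sname='SampleA_RA_vs_untreated', n=500,
                             niter=100, outdir='/tmp/peak_density')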
Example #8
# NOTE: the original function header is truncated in this snippet; the
# signature below is a hypothetical reconstruction from the body.
def sort_and_index_bams(meta_df_bam, db_path):
    '''
    Sort and index the bam files listed in the metadata table.
    :return:
    '''
    for ind, row in meta_df_bam.iterrows():
        sample_path = os.path.join(db_path, row['File accession'] + '.bam')
        if not os.path.exists(sample_path + '.bai'):
            print('Sorting & indexing bam:', sample_path)
            # Older pysam API (pre-0.9): sort(in_bam, out_prefix).
            pysam.sort(sample_path, sample_path)
            pysam.index(sample_path)


if __name__ == '__main__':

    start = timeit.default_timer()
    db_path = '/ps/imt/e/Encode_data_all/ENCODE_bam'
    out_dir = '/ps/imt/e/20141009_AG_Bauer_peeyush_re_analysis/further_analysis/H3R2me2a_analysis/ENCODE_heatmaps_H3R2me2_+RA-RA'
    paths.ensure_path(out_dir)
    #/ps/imt/e/20141009_AG_Bauer_peeyush_re_analysis/further_analysis/H3R2me2a_analysis/H3R2ame2_E9,H3R2me2a_B6.2,H3R2me2a_E9_RA,H3R2me2a_B6.2_RA,H3K4me3_E9,H3K4me3_B6.2,H3K4me3_E9_RA,H3K4me3_B6.2_RA,H3K27ac_E9,H3K27ac_B6.2,H3K27ac_E9_RA,H3K27ac_B6_RA/all6519_H3R2me2a_E9_RA vs IgG_E9_RA filtered_unique/norm/tagcountDF_all_norm.txt
    peak_df = read_csv(
        '/ps/imt/e/20141009_AG_Bauer_peeyush_re_analysis/further_analysis/filtered/H3R2ame2_E9 vs IgG_E.9 filtered/H3R2ame2_E9 vs IgG_E.9 filtered.txt',
        header=0,
        sep='\t')
    peak_df['chr'] = peak_df['chr'].astype('str')
    # Keep only standard chromosomes (short names such as '1'..'22', 'X'); drops scaffolds.
    peak_df = peak_df[peak_df['chr'].str.len() < 4]
    #peak_df = peak_df[peak_df['cluster'].isin([0,2,3,4,5,6,8])]
    peak_df.index = range(0, len(peak_df))

    meta_df_bam = read_csv(os.path.join(db_path, 'metadata.tsv'),
                           sep='\t',
                           header=0)
    # ENCODE targets look like 'CTCF-human'; keep only the factor name.
    meta_df_bam['Experiment target'] = meta_df_bam['Experiment target'].map(
        lambda x: x.split('-')[0].strip())