def _first_raw_tag_count_column(colnames, token):
    """Return the first column holding *token* and 'Tag count' but not 'norm'.

    :param colnames: list of column names to search, in order
    :param token: substring identifying the lane (sample or control name)
    :raises ValueError: if no column matches
    :return: the matching column name
    """
    for column in colnames:
        if token in column and 'Tag count' in column and 'norm' not in column:
            return column
    raise ValueError("Filtering Sample name differs from column name.")


def filterpeaks(peak_data, name, filtering=True):
    """
    Filter called peaks against their control lane and write the result to disk.

    The 'chr' column of ``peak_data`` is cast to ``str`` in place. With
    ``filtering`` enabled, ``name`` is split on single spaces; token 0 is taken
    as the sample and token 2 as the control (i.e. a name shaped like
    ``'<sample> vs <control> ...'``). For each of them the first column that
    contains the token and 'Tag count' but not 'norm' is used.

    Rows are kept when the sample tag count is at least twice the control tag
    count. Unless the sample column name contains one of 'H3K36me3',
    'H3K27me3' or 'H3K4me1', rows must additionally satisfy
    ``(stop - start) / sample_count <= 15``.

    Side effects: appends ``name``, the unfiltered and the filtered row count
    to ``filteredPeaksCount.txt``, writes ``<name>.tsv`` into
    ``<basepath>/further_analysis/filtered/<name>`` and finally resets the
    index of the returned frame (with ``filtering=False`` that frame is
    ``peak_data`` itself).

    :param peak_data: DataFrame with 'chr' and, when filtering, 'start', 'stop'
        and the tag-count columns
    :param name: sample description, also used for the output folder/file name
    :param filtering: when False the data is written out unfiltered
    :raises ValueError: if no tag-count column matches the sample or the control
    :return: tuple ``(filtered DataFrame, output directory path)``
    """
    df = peak_data
    df['chr'] = df['chr'].astype('str')
    sample = name.split(' ')
    colnames = df.columns.values.tolist()

    if filtering:
        # Both lookups share one helper so that a missing sample column fails
        # with the same ValueError as a missing control column.
        condition = _first_raw_tag_count_column(colnames, sample[0])
        control = _first_raw_tag_count_column(colnames, sample[2])
        print('Sample lane:' + condition)
        print('Control lane:' + control)

        # Marks for which only the enrichment-over-control filter is applied.
        exclude_from_filtering = ['H3K36me3', 'H3K27me3', 'H3K4me1']
        enriched = df[df[condition] >= 2 * df[control]]
        if any(s in condition for s in exclude_from_filtering):
            print('Sample name in simple filtering list')
            final = enriched
        else:
            print('Using default filtering....')
            # Peak length per tag: drop long peaks supported by few tags.
            length_per_tag = (enriched['stop'] - enriched['start']) / enriched[condition]
            final = enriched[length_per_tag <= 15]
        print('Default peak count:', df.shape)
        print('Filtered peak count:', final.shape)
    else:
        final = df
        print('Dataframe is not filtered:', final.shape)

    with open(basepath + '/further_analysis/filtered/filteredPeaksCount.txt', 'a') as handle:
        handle.write(name + '\t' + str(len(df)) + '\t' + str(len(final)) + '\n')

    dir_path = basepath + '/further_analysis/filtered/' + name
    paths.ensure_path(dir_path)
    samPath = os.path.join(dir_path, name + '.tsv')
    # The file is written before the index reset, so it keeps the original row labels.
    final.to_csv(samPath, sep="\t", header=True)
    final.index = range(len(final))
    return final, dir_path
def OverlappingPeaks(dict_peaksdf, name, name1):
    """
    Overlap the peaks of two samples, store the result and draw summary plots.

    :param dict_peaksdf: mapping from sample name to an object whose ``peaks``
        attribute is a DataFrame with a 'chr' column
    :param name: key of the first sample in ``dict_peaksdf``
    :param name1: key of the second sample in ``dict_peaksdf``
    :return: DataFrame built from the overlap records
    """
    from timeit import default_timer

    print('Check point: Overlapping analysis')
    print('\n', name, 'vs', name1)

    first_peaks = dict_peaksdf[name].peaks.sort_values(by='chr', ascending=True)
    second_peaks = dict_peaksdf[name1].peaks.sort_values(by='chr', ascending=True)

    # Time the overlap computation, including the fallback if it is needed.
    started = default_timer()
    try:
        records = PeakOverlaps(first_peaks, second_peaks)
    except Exception as err:
        # Fall back to the variant that needs fewer columns.
        print('\nWarning: Dataframe does not contain all the columns required for overlap, '
              'switching to minimal column requirement.')
        print(err)
        records = PeakOverlaps_concise(first_peaks, second_peaks)
    finished = default_timer()
    print("Time consumed by method PeakOverlaps:", finished - started, 'sec')

    overlap_df = pd.DataFrame(records)
    pair_label = name + '_vs_' + name1
    out_dir = os.path.join(basepath, 'further_analysis', 'overlap', pair_label)
    commons.ensure_path(out_dir)

    unique_first, unique_second = get_unique_peaks(
        first_peaks, second_peaks, name, name1, overlap_df, out_dir)
    overlap_df.to_csv(os.path.join(out_dir, pair_label + '.tsv'),
                      sep="\t", encoding='utf-8', index=False)

    # Unique peaks of each sample plus the shared ones, keyed for the plots.
    groups = {name: unique_first, 'overlap': overlap_df, name1: unique_second}
    stacke_plot_multiple([name, 'overlap', name1], groups, out_dir, overlap=True)
    peakTSSbinning('overlap', groups, out_dir, overlap=True)
    venn4overlap(len(first_peaks), len(second_peaks), overlap_df, out_dir, [name, name1])
    return overlap_df
def make_dir(bam_order, region='All'):
    """
    Prepare the result folder for overlapping plots.

    Ensures ``raw`` and ``norm`` sub-folders below
    ``<basepath>/further_analysis/overlapping_plots/<bam_order>/<region>``.

    :param bam_order: folder name component placed below ``overlapping_plots``
    :param region: folder name component placed below ``bam_order``
    :return: path of the region folder (parent of ``raw`` and ``norm``)
    """
    import os

    plot_root = os.path.join(
        basepath, 'further_analysis/overlapping_plots', bam_order, region)
    print('Path created:' + plot_root)
    for subfolder in ('raw', 'norm'):
        commons.ensure_path(os.path.join(plot_root, subfolder))
    return plot_root
def index_genome_star(self, ram_atdisposal=30, genome_dir=None):
    """
    Assemble the STAR ``genomeGenerate`` command for indexing the downloaded genome.

    :param ram_atdisposal: available RAM; below 60 the memory-limiting STAR
        options are added (unit is presumably GB -- TODO confirm)
    :param genome_dir: output folder for the index; when None it is
        ``<release_path>/Sequence/STARIndex`` as returned by ``commons.ensure_path``
    :return: None
    """
    # NOTE(review): in the code visible here the command is only assembled; it
    # is neither executed nor returned -- confirm against the full file.
    star = os.path.join(tools_folder, 'aligners', 'STAR', 'bin/Linux_x86_64/STAR')

    # Leave two cores free but never ask for fewer than one thread; every
    # element is a string so the list can be handed to subprocess as-is.
    threads = max(1, multiprocessing.cpu_count() - 2)
    cmd = [
        star,
        '--runMode', 'genomeGenerate',
        '--runThreadN', str(threads),
        # STAR options take exactly two leading dashes.
        '--genomeFastaFiles', self.whole_genome,
        '--sjdbGTFfile', self.gtf,
    ]

    # If you do not have enough memory
    if ram_atdisposal < 60:
        cmd.extend([
            '--limitGenomeGenerateRAM', '24000000000',
            '--genomeSAsparseD', '2',
        ])

    # Outpath to STAR index
    if genome_dir is None:
        genome_dir = commons.ensure_path(
            os.path.join(self.DownloadGenome.release_path, 'Sequence', 'STARIndex'))
    cmd.extend(['--genomeDir', genome_dir])
def __init__(self, genome, alignedlanes, outpath=None, bampaths=None):
    """
    Store the inputs of the analysis on the instance.

    :param genome: stored as ``self.genome``
    :param alignedlanes: stored as ``self.alignedlanes``
    :param outpath: result folder; when None, ``<basepath>/results/RnaSeq`` as
        returned by ``commons.ensure_path`` is used
    :param bampaths: list of bam paths, stored as ``self.bampaths``
    """
    self.genome = genome
    self.outpath = (
        outpath
        if outpath is not None
        else commons.ensure_path(os.path.join(basepath, 'results', 'RnaSeq'))
    )
    self.alignedlanes = alignedlanes
    self.bampaths = bampaths  # list of bam paths
def run_analysis(self):
    """
    Run the motif analysis selected by ``self.method``.

    'meme': prepares ``<basepath + path_to_seq>/<name>/meme``, extracts the
    sequences for ``self.name`` (and for 'background' when ``self.background``
    is set) and calls ``meme_motif`` with the three motif databases.
    'homer': prepares ``<basepath + path_to_seq>/<name>/homer`` and calls
    ``motif_analysis_homer``. Any other method does nothing.

    :return: None
    """
    if self.method == 'meme':
        meme_folder = os.path.join(basepath + path_to_seq, self.name, 'meme')
        self.path2folder = meme_folder
        commons.ensure_path(meme_folder)

        self.peak2seq(self.name)
        if self.background is not None:
            self.peak2seq('background')

        databases = [
            "JASPAR_CORE_2016_vertebrates.meme",
            "HOCOMOCOv9.meme",
            "SwissRegulon_human_and_mouse.meme",
        ]
        self.meme_motif(databases)

    if self.method == 'homer':
        homer_folder = os.path.join(basepath + path_to_seq, self.name, 'homer')
        self.path2folder = homer_folder
        commons.ensure_path(homer_folder)
        self.motif_analysis_homer()
def permutation_test4peakdensity(peak_df, name, comparisions, sname=None, n=None, niter=100, outdir=None):
    """
    Permutation test for the factor binding difference between two conditions.

    For every entry of ``comparisions`` the count table is randomly
    sub-sampled ``niter`` times with ``n`` peaks per draw. Per draw the mean
    and median of both samples and their ratios (columns 'mean_diff' and
    'median_diff') are recorded. The table is written to
    ``<outdir>/permutation_test/<sname>/<samples joined by '_'>.txt`` and the
    distribution of 'median_diff' is plotted next to the observed value.

    The count table is read from ``count_data.txt`` in the output folder when
    that file is readable; otherwise it is computed through
    ``differential_binding.Overlaps`` and cached there.

    :param peak_df: DataFrame of peaks
    :param name: forwarded to ``differential_binding.Overlaps``
    :param comparisions: dict mapping the observed median difference to the
        pair of count-table column names being compared
    :param sname: sub-folder name below ``permutation_test``; must be given
        because it is passed to ``os.path.join``
    :param n: number of peaks drawn per iteration
    :param niter: number of random draws
    :param outdir: base output folder
    :raises ValueError: if ``n`` is None or larger than ``len(peak_df)``
    :return: None
    """
    # `or` short-circuits, so a missing n is reported as ValueError instead of
    # failing on the comparison with None.
    if n is None or len(peak_df) < n:
        raise ValueError('Please provide no of peaks for selection or n is greater than total peaks')

    # Plotting libraries are only needed once the arguments are known to be usable.
    import matplotlib.pyplot as plt
    import seaborn as sns

    print('Permutation test is randomly selecting '+str(n)+' peaks for '+str(niter)+' iterations')
    print(outdir)
    commons.ensure_path(outdir)
    outpath = os.path.join(outdir, 'permutation_test', sname)
    commons.ensure_path(outpath)
    peak_df = peak_df.rename(columns={'Next Gene name': 'Next transcript gene name'})
    filtered_peak = {'loaded_sample': peak_df}

    count_file = os.path.join(outpath, 'count_data.txt')
    try:
        print('reading count data from old file')
        diffbindDF = pd.read_csv(count_file, sep='\t', header=0)
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or empty/unparsable content
        # (pandas parser errors derive from ValueError): recompute and cache.
        diffbind = differential_binding.Overlaps(name, filtered_peak)
        diffbindDF = diffbind.diffBinding('loaded_sample', highest=False)
        diffbindDF.to_csv(count_file, sep='\t', header=True, index=None)

    def plot_permuation(iterDF, mediandiff, pval, outpath, niter, samples):
        """Plot the permuted 'median_diff' distribution with the observed value as a bar."""
        sns.set('talk')
        plt.figure(figsize=(8, 6))
        sns.distplot(iterDF['median_diff'], rug=True, hist=False, color='r')
        plt.bar(mediandiff, 5, width=0.01)
        low = min(min(iterDF['median_diff']), mediandiff)
        high = max(max(iterDF['median_diff']), mediandiff)
        print(low+(low/8), high+(high/8), mediandiff)
        # Widen the x-range by 1/8 on both sides, whatever the sign of the limits.
        if low < 0:
            xlow = low+(low/8.)
        else:
            xlow = low-(low/8.)
        plt.xlim(xlow, high+(abs(high)/8.))
        plt.ylabel('Freq. of difference')
        plt.xlabel('median diff. is '+str(iterDF['median_diff'].median()))
        plt.title('p-val of difference:'+str(pval)+' ;trial:'+str(niter))
        plt.savefig(os.path.join(outpath, '_'.join(samples)+'.png'))
        plt.clf()
        plt.close()

    def test_significance_of_difference(iterDF, mediandiff, trial):
        """Return (count + 1) / trial, counting draws at least as extreme as the observed value."""
        permuted_median = iterDF['median_diff'].median()
        count = 0
        if mediandiff > permuted_median:
            count = len(iterDF[iterDF['median_diff'] >= mediandiff])
        elif mediandiff < permuted_median:
            count = len(iterDF[iterDF['median_diff'] <= mediandiff])
        # NOTE(review): an observed value equal to the permuted median keeps
        # count == 0 and therefore yields the smallest p-value -- confirm intent.
        print(count, mediandiff, trial)
        pval = (count+1.)/trial
        print(pval)
        return pval

    for mediandiff, samples in comparisions.items():
        first, second = samples[0], samples[1]
        columns = [first+'_mean', second+'_mean', first+'_median', second+'_median',
                   'mean_diff', 'median_diff']
        print(samples)
        rows = []
        for _ in range(niter):
            peakdf = differential_binding.random_sampleing_df(diffbindDF, n)
            mean_first, mean_second = peakdf[first].mean(), peakdf[second].mean()
            median_first, median_second = peakdf[first].median(), peakdf[second].median()
            rows.append([mean_first, mean_second, median_first, median_second,
                         mean_first / mean_second, median_first / median_second])
        # Building the frame from the collected rows gives float columns directly
        # instead of writing floats into an integer-initialised frame.
        iterDF = pd.DataFrame(rows, columns=columns, index=range(niter))
        iterDF.to_csv(os.path.join(outpath, '_'.join(samples)+'.txt'), sep='\t', header=True, index=None)
        pval = test_significance_of_difference(iterDF, mediandiff, niter)
        plot_permuation(iterDF, mediandiff, pval, outpath, niter, samples)
:return: ''' for ind, row in meta_df_bam.iterrows(): sample_path = os.path.join(db_path, row['File accession'] + '.bam') if not os.path.exists(sample_path + '.bai'): print('Sorting & indexing bam:', sample_path) pysam.sort(sample_path, sample_path) pysam.index(sample_path) if __name__ == '__main__': start = timeit.default_timer() db_path = '/ps/imt/e/Encode_data_all/ENCODE_bam' out_dir = '/ps/imt/e/20141009_AG_Bauer_peeyush_re_analysis/further_analysis/H3R2me2a_analysis/ENCODE_heatmaps_H3R2me2_+RA-RA' paths.ensure_path(out_dir) #/ps/imt/e/20141009_AG_Bauer_peeyush_re_analysis/further_analysis/H3R2me2a_analysis/H3R2ame2_E9,H3R2me2a_B6.2,H3R2me2a_E9_RA,H3R2me2a_B6.2_RA,H3K4me3_E9,H3K4me3_B6.2,H3K4me3_E9_RA,H3K4me3_B6.2_RA,H3K27ac_E9,H3K27ac_B6.2,H3K27ac_E9_RA,H3K27ac_B6_RA/all6519_H3R2me2a_E9_RA vs IgG_E9_RA filtered_unique/norm/tagcountDF_all_norm.txt peak_df = read_csv( '/ps/imt/e/20141009_AG_Bauer_peeyush_re_analysis/further_analysis/filtered/H3R2ame2_E9 vs IgG_E.9 filtered/H3R2ame2_E9 vs IgG_E.9 filtered.txt', header=0, sep='\t') peak_df['chr'] = peak_df['chr'].astype('str') peak_df = peak_df[peak_df['chr'].str.len() < 4] #peak_df = peak_df[peak_df['cluster'].isin([0,2,3,4,5,6,8])] peak_df.index = range(0, len(peak_df)) meta_df_bam = read_csv(os.path.join(db_path, 'metadata.tsv'), sep='\t', header=0) meta_df_bam['Experiment target'] = meta_df_bam['Experiment target'].map( lambda x: x.split('-')[0].strip())