def get_bampaths_4_sample(self):
    '''
    Resolve the file system path of every sample bam file.
    Entries in self.bam_name may be plain bam names (str) or composite
    entries (e.g. (label, [bam names]) tuples) whose second element lists
    bam names; each name is looked up via differential_binding.getBam().
    :return: dict mapping bam name -> bam path
    '''
    bam_paths = {}
    # Check that all the bam files exist
    for bam in self.bam_name:
        if isinstance(bam, str):
            try:
                bam_path = differential_binding.getBam(bam)
                bam_paths[bam] = bam_path
                print('Found bam path:', bam_path)
            except ValueError:
                raise ValueError('Bam file not found in the default locations: ' + bam)
        else:
            for ba in bam[1]:
                try:
                    bam_path = differential_binding.getBam(ba)
                    bam_paths[ba] = bam_path
                    print('Found bam path:', bam_path)
                except ValueError:
                    raise ValueError('Bam file not found in the default locations: ' + ba)
    return bam_paths
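# Usage sketch (hypothetical object and paths, for illustration only): with a
# mixed `bam_name` list, one call resolves every entry or raises ValueError on
# the first bam that cannot be found.
#
#     analysis.bam_name = ['PRMT6_2_seq6', ('pooled', ['H3K4me3_seq2', 'Sample_K9me3'])]
#     bam_paths = analysis.get_bampaths_4_sample()
#     # bam_paths -> {'PRMT6_2_seq6': '/path/to/PRMT6_2_seq6.bam', ...}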
def create_heatmap(self):
    '''
    Count reads in 100 bp windows within +-self.distance of the nearest TSS
    of every filtered peak, normalize the counts to 5 million mapped reads
    and write the resulting profile matrix to a tab-separated file.
    '''
    column = ['chr', 'start', 'stop',
              'GenomicPosition TSS=1250 bp, upstream=5000 bp',
              'Next transcript gene name', 'Next transcript strand',
              'Next Transcript tss distance', 'summit',
              'Next Transcript stable_id']
    peakdf = self.filter_peaks()
    peakdf.loc[:, 'chr'] = peakdf.loc[:, 'chr'].astype(str)
    bam_id = self.sample_id
    bam_name = self.name
    bam_path = differential_binding.getBam(bam_id, path='/ps/imt/e/Encode_data_all/ENCODE_HL60')
    sample_bam = pysam.Samfile(bam_path, "rb")
    total_mapped = sample_bam.mapped
    print(bam_name + '\t' + str(total_mapped) + '\n')

    # Check whether the bam is ENSEMBL-style ('9') or UCSC-style ('chr9')
    # aligned and convert the peak coordinates accordingly.
    try:
        sample_bam.count('9', 99181564, 99181974)
    except ValueError:
        print('Bam file is UCSC aligned, converting coordinates accordingly...')
        peakdf.loc[:, 'chr'] = 'chr' + peakdf.loc[:, 'chr']

    profile_rows = []
    for ind, row in peakdf.iterrows():  # reading peaks
        sys.stdout.write("\r%d" % ind)  # crude progress indicator (row index)
        sys.stdout.flush()
        Chr = str(row['chr'])
        start_pos = row['start']
        next_tss_distance = row['Next Transcript tss distance']
        interval = 100
        list_sample_norm = []
        start = (start_pos + next_tss_distance) - self.distance
        stop = start + interval
        if start > 0:
            # slide an `interval` bp window across +-self.distance around the TSS
            for i in range(0, int((2 * self.distance) / interval)):
                seqcount = sample_bam.count(Chr, start, stop)
                # count normalized to 5 million mapped reads
                list_sample_norm.append(seqcount * (5. * 10**6) / total_mapped)
                start = stop
                stop = start + interval
            profile_rows.append(pd.Series(list_sample_norm))
            # NOTE: strand-specific reversal of the profile for '-' strand
            # genes (list_sample_norm[::-1]) is currently disabled.
        else:
            print('Skipping peak: profile window extends beyond the chromosome start:',
                  row['chr'], '-', row['start'])
    sample_bam.close()  # closing bam file

    distribution_df_norm = pd.DataFrame(profile_rows).reset_index(drop=True)
    print(distribution_df_norm.head())
    distribution_df_norm = pd.concat([peakdf[column], distribution_df_norm], axis=1)
    distribution_df_norm.to_csv(
        os.path.join(self.outpath, 'HmapWRTTss_' + bam_name + '_norm.tsv'),
        header=True, index=True, sep='\t')
    return distribution_df_norm
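# Minimal standalone sketch (illustration only; the name, signature and default
# values below are hypothetical and not used by the pipeline): the same windowed
# counting and 5-million-read normalization that create_heatmap() applies per peak.
def _example_window_counts(bam_path, chrom, center, distance=2000, interval=100):
    bam = pysam.AlignmentFile(bam_path, 'rb')  # AlignmentFile is the modern alias of Samfile
    total_mapped = bam.mapped
    start = center - distance
    counts = []
    for _ in range(int((2 * distance) / interval)):
        n = bam.count(chrom, start, start + interval)   # reads overlapping the window
        counts.append(n * (5. * 10**6) / total_mapped)  # per 5 million mapped reads
        start += interval
    bam.close()
    return counts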
def totaltagCountinPeak(peakscorDF, sampleBam):
    '''
    Insert a 'tagcount' column (reads overlapping each peak) into the
    dataframe, counted from the first bam file in the list.
    :param peakscorDF: dataframe with 'chr', 'start' and 'stop' columns
    :param sampleBam: list of bam names; only the first one is used
    :return: peakscorDF with the added 'tagcount' column
    '''
    bam_path = differential_binding.getBam(sampleBam[0])
    sample_bam = pysam.Samfile(bam_path, "rb")
    countList = []
    for ind, row in peakscorDF.iterrows():
        chr = str(row['chr'])
        if 'chr' in chr:
            chr = chr[3:]  # strip the UCSC 'chr' prefix
        start = row['start']
        stop = row['stop']
        seqcount = sample_bam.count(chr, start, stop)
        countList.append(seqcount)
    sample_bam.close()
    peakscorDF.insert(5, 'tagcount', pd.Series(countList))
    return peakscorDF
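# Usage sketch (hypothetical dataframe; illustration only): the 'tagcount'
# column is computed from the FIRST bam in the list only and is inserted at
# position 5, so the peak dataframe must already have at least five columns.
#
#     peaks = filtered_peak_df  # columns incl. 'chr', 'start', 'stop', ...
#     peaks = totaltagCountinPeak(peaks, ['PRMT6_2_seq6', 'H3K4me3_seq2'])
#     print(peaks['tagcount'].head())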
def GR_heatmaps_DF_for_peaks(bam_name_list, peak_df, region=None, sort=False,
                             sort_column=None, scale_df=True, sample_name=None,
                             strength_divide=False, normFact={}):
    '''
    Suggestion: please do not use more than 3 samples at a time.
    Takes a list of bam file names and creates a heatmap of the genomic
    region for the provided peak dataset.
    :param bam_name_list: list containing the names of the bam files to be
        visualized, e.g. ['PRMT6_2_seq6', 'H3K4me3_seq2', 'Sample_K9me3']
    :param peak_df: dataframe containing peak information
    :param region: which region to plot ('tss', 'intron', 'exon',
        'intergenic' or 'all'; None is treated as 'all')
    :param sort: True if the dataframe should be sorted
    :param sort_column: if sort=True, the column name to sort on
    :return: a dataframe; columns: additive length of genomic regions for all
        bam files, rows: peaks defined in the peak dataframe
    '''
    region = 'all' if region is None else region.strip()
    if region != 'all':
        peak_df = peak_df[peak_df['GenomicPosition TSS=1250 bp, upstream=5000 bp'] == region]
    if region == 'tss':
        # Reduce peaks based on their distance from the TSS
        print('Selecting peaks only within +-300bp')
        peak_df = peak_df[peak_df['Next Transcript tss distance'] < 300]
        peak_df = peak_df[peak_df['Next Transcript tss distance'] > -300]
    if len(peak_df) == 0:
        raise ValueError('Selected region does not contain any peaks')
    print(region + ' found in dataframe: ', len(peak_df))
    peak_df.index = range(0, len(peak_df))

    if sort:
        print('Dataframe is being sorted...')
        colnames = peak_df.columns.tolist()
        indices1 = [i for i, s in enumerate(colnames) if sort_column in s]
        for i in indices1:
            if ('RA' not in colnames[i] and 'RA' not in sort_column
                    and 'norm' not in colnames[i]):
                condition = colnames[i]
                print('Sorted on column: ' + condition)
                peak_df = peak_df.sort_values(condition, ascending=False)  # DataFrame.sort() was removed from pandas
                sort_column = condition
                break
            elif ('RA' in colnames[i] and 'RA' in sort_column
                    and 'norm' not in colnames[i]):
                condition = colnames[i]
                print('Sorted on column: ' + condition)
                peak_df = peak_df.sort_values(condition, ascending=False)
                sort_column = condition
                break

    bam_order = ','.join(bam_name_list)
    if sample_name is not None:
        path = make_dir(bam_order, region + str(len(peak_df)) + '_' + sample_name)
    else:
        path = make_dir(bam_order, region + str(len(peak_df)))

    big_df = pd.DataFrame()
    big_df_raw = pd.DataFrame()
    # Check up front that every bam name can be resolved to a path
    get_bampaths_4_sample(bam_name_list)
    for v in bam_name_list:
        print('Sample:' + v)
        bam_path = differential_binding.getBam(v)
        df, df1 = overlapping_peaks_distribution(v, bam_path, peak_df, path)
        if v in normFact.keys():
            print('Multiplying with external norm factor:', v, normFact.get(v))
            df1 = df1.multiply(normFact.get(v))
        if scale_df:
            df = scale_dataframe(df)  # scaling of dataframe
            print('scaled df')
        big_df = pd.concat([big_df, df], axis=1)
        big_df_raw = pd.concat([big_df_raw, df1], axis=1)
    big_df.columns = range(0, big_df.shape[1])
    big_df_raw.columns = range(0, big_df_raw.shape[1])

    # Plot all samples in one line plot
    plot_all_peaks_4_multiple_samples(big_df, bam_order, path, 'norm')
    plot_all_peaks_4_multiple_samples(big_df_raw, bam_order, path, 'raw')

    if strength_divide:
        # Plot peaks after dividing them by peak strength
        DividePeaksInStrength(big_df_raw, bam_order, path).divide_peaks_in_strength()
        DividePeaksInStrength(big_df, bam_order, path).divide_peaks_in_strength()
    else:
        # Plot peaks based on k-means clustering
        # NOTE: clustering may fail with a scipy error on degenerate dataframes
        big_df = kmeans_clustering(big_df, 9, 1000)  # performing k-means clustering
        big_df_raw = kmeans_clustering(big_df_raw, 9, 1000)
        # divide the dfs into smaller dfs based on cluster membership
        dict_of_df = differential_binding.group_DF(big_df, 'cluster')
        dict_of_df_raw = differential_binding.group_DF(big_df_raw, 'cluster')
        print(len(dict_of_df))
        # plotting individual clusters
        line_plot_peak_distribution(dict_of_df, bam_order, path, 'norm')
        line_plot_peak_distribution(dict_of_df_raw, bam_order, path, 'raw')
        print('No. of samples to plot:', len(bam_name_list))
        # plotting clusters for the different bams in one overlapping plot
        plot_clustered_peaks_4_multiple_samples(dict_of_df, bam_order, path, 'norm')
        plot_clustered_peaks_4_multiple_samples(dict_of_df_raw, bam_order, path, 'raw')

    # adding annotation columns to the heatmap dfs
    try:
        colList = commons.peakdf_columns()[::-1]
        for col in colList:
            big_df.insert(0, col, peak_df[col])
            big_df_raw.insert(0, col, peak_df[col])
        if sort:
            big_df.insert(0, sort_column, peak_df[sort_column])
            big_df_raw.insert(0, sort_column, peak_df[sort_column])
    except KeyError:
        raise ValueError('Needed columns for peak profile are missing')

    # adding a tagcount column for the first bam file
    big_df = totaltagCountinPeak(big_df, bam_name_list)
    big_df_raw = totaltagCountinPeak(big_df_raw, bam_name_list)
    big_df.to_csv(os.path.join(path, 'norm', 'tagcountDF_' + region + '_norm.tsv'),
                  sep='\t', encoding='utf-8')
    big_df_raw.to_csv(os.path.join(path, 'raw', 'tagcountDF_' + region + '_raw.tsv'),
                      sep='\t', encoding='utf-8')
    gc.collect()
    return big_df  # the normalized profile dataframe promised in the docstring
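# Usage sketch (hypothetical file and column names; illustration only): profile
# three samples around TSS-proximal peaks, sorted on a tagcount-like column,
# with an external normalization factor applied to one sample's raw profile.
#
#     peaks = pd.read_csv('PRMT6_peaks_annotated.tsv', sep='\t', header=0)
#     GR_heatmaps_DF_for_peaks(['PRMT6_2_seq6', 'H3K4me3_seq2', 'Sample_K9me3'],
#                              peaks, region='tss', sort=True,
#                              sort_column='tagcount', scale_df=True,
#                              normFact={'H3K4me3_seq2': 0.82})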