Example #1
 def get_bampaths_4_sample(self):
     '''
     Resolve and validate the file-system path of every BAM file listed in
     self.bam_name.
     :return: dict mapping each bam name to its resolved path
     '''
     bam_paths = {}
     # Check that all the bam files exist
     for bam in self.bam_name:
         if isinstance(bam, str):
             try:
                 bam_path = differential_binding.getBam(bam)
                 bam_paths[bam] = bam_path
                 print('Found bam path:', bam_path)
             except ValueError:
                 raise ValueError(
                     'Bam file not found in the default locations: ' + bam)
         else:
             for ba in bam[1]:
                 try:
                     bam_path = differential_binding.getBam(ba)
                     bam_paths[ba] = bam_path
                     print('Found bam path:', bam_path)
                 except ValueError:
                     raise ValueError(
                         'Bam file not found in the default locations: ' + ba)
     return bam_paths
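differential_binding.getBam is repo-specific; it evidently searches a set of default locations and raises ValueError when a BAM cannot be found. A minimal standalone sketch of that lookup, assuming a hypothetical list of search directories and a <name>.bam naming convention:

import os

def find_bam(bam_name, search_dirs=('/data/bams', '/data/encode')):
    # search_dirs is a hypothetical default; point it at your own BAM folders
    for d in search_dirs:
        candidate = os.path.join(d, bam_name + '.bam')
        if os.path.exists(candidate):
            return candidate
    raise ValueError('Bam file not found in the default locations: ' + bam_name)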
Example #2
    def create_heatmap(self):
        column = ['chr', 'start', 'stop', 'GenomicPosition TSS=1250 bp, upstream=5000 bp', 'Next transcript gene name',
                  'Next transcript strand', 'Next Transcript tss distance', 'summit', 'Next Transcript stable_id']
        peakdf = self.filter_peaks()
        peakdf.loc[:, 'chr'] = peakdf.loc[:, 'chr'].astype(str)
        bam_id = self.sample_id
        bam_name = self.name
        bam_path = differential_binding.getBam(bam_id, path='/ps/imt/e/Encode_data_all/ENCODE_HL60')
        sample_bam = pysam.AlignmentFile(bam_path, "rb")
        total_mapped = sample_bam.mapped
        print(bam_name+'\t'+str(total_mapped)+'\n')
        distribution_df_norm = pd.DataFrame()
        #print(peakdf.head())

        # Detect whether the bam was aligned to ENSEMBL or UCSC chromosome names
        try:
            sample_bam.count('9', 99181564, 99181974)
        except ValueError:
            print('Bam file is UCSC aligned, converting chromosome names accordingly...')
            peakdf.loc[:, 'chr'] = 'chr' + peakdf.loc[:, 'chr']

        for ind, row in peakdf.iterrows():  # iterate over peaks
            sys.stdout.write("\r%d%%" % (ind * 100 // max(len(peakdf), 1)))
            sys.stdout.flush()
            strand = row['Next transcript strand']
            Chr = str(row['chr'])
            start_pos = row['start']
            next_tss_distance = row['Next Transcript tss distance']
            interval = 100
            list_sample_norm = []

            start = (start_pos + next_tss_distance) - self.distance
            stop = start + interval

            if start > 0:
                # number of 100-bp windows spanning +/- self.distance around the TSS
                for i in range(0, int((2 * self.distance) / interval)):
                    seqcount = sample_bam.count(Chr, start, stop)
                    # normalized count per 5 million mapped reads
                    list_sample_norm.append(seqcount * (5. * 10 ** 6) / total_mapped)
                    start = stop
                    stop = start + interval  # advance to the next 100-bp window

                distribution_df_norm = distribution_df_norm.append(pd.Series(list_sample_norm), ignore_index=True)
            else:
                print('Skipping peak: window extends beyond chromosome start:', row['chr'], '-', row['start'])
            # Strand-aware orientation is currently disabled; minus-strand profiles
            # would be appended reversed, i.e. pd.Series(list_sample_norm[::-1]).
        sample_bam.close()  # closing bam file
        print(distribution_df_norm.head())

        distribution_df_norm = pd.concat([peakdf[column], distribution_df_norm], axis=1)
        distribution_df_norm.to_csv(os.path.join(self.outpath, 'HmapWRTTss_'+bam_name+'_norm.tsv'), header=True, index=True, sep='\t')
        return distribution_df_norm
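The core of create_heatmap is windowed read counting around each TSS, scaled per 5 million mapped reads. A self-contained sketch of that step with pysam (the function name and defaults are illustrative, not part of the original API):

import pysam

def windowed_counts(bam_path, chrom, center, flank=2000, interval=100):
    # Count reads in consecutive `interval`-bp windows across center +/- flank,
    # normalized per 5 million mapped reads, matching the scaling used above.
    bam = pysam.AlignmentFile(bam_path, 'rb')
    total_mapped = bam.mapped
    counts = []
    start = center - flank
    for _ in range(int(2 * flank / interval)):
        counts.append(bam.count(chrom, start, start + interval) * 5e6 / total_mapped)
        start += interval
    bam.close()
    return counts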
Example #3
def totaltagCountinPeak(peakscorDF, sampleBam):
    '''
    Insert a tagcount column into the peak dataframe, counted from the first
    bam file in the list.
    :param peakscorDF: peak dataframe with 'chr', 'start' and 'stop' columns
    :param sampleBam: list of bam names; only the first is used
    :return: the dataframe with a 'tagcount' column inserted
    '''
    bam_path = differential_binding.getBam(sampleBam[0])
    sample_bam = pysam.AlignmentFile(bam_path, "rb")
    countList = []
    #print(peakscorDF.head())
    for ind, row in peakscorDF.iterrows():
        chrom = str(row['chr'])
        if chrom.startswith('chr'):
            chrom = chrom[3:]  # strip UCSC prefix; bam is assumed ENSEMBL-named
        start = row['start']
        stop = row['stop']
        seqcount = sample_bam.count(chrom, start, stop)
        countList.append(seqcount)
    peakscorDF.insert(5, 'tagcount', pd.Series(countList))
    return peakscorDF
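A hypothetical call, assuming a peak table with chr/start/stop columns and a bam name that differential_binding.getBam can resolve; the coordinates and sample name below are illustrative:

import pandas as pd

peaks = pd.DataFrame({'chr': ['chr1', '9'],
                      'start': [1000, 99181564],
                      'stop': [1500, 99181974],
                      'summit': [250, 200],
                      'name': ['peak1', 'peak2']})
peaks = totaltagCountinPeak(peaks, ['H3K4me3_seq2'])  # bam name is illustrative
print(peaks[['chr', 'start', 'stop', 'tagcount']])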
Example #4
def GR_heatmaps_DF_for_peaks(bam_name_list,
                             peak_df,
                             region=None,
                             sort=False,
                             sort_column=None,
                             scale_df=True,
                             sample_name=None,
                             strength_divide=False,
                             normFact={}):
    '''
    Suggestion: please do not use more than 3 samples at a time.
    This function takes a list of bam names and creates a heatmap of the genomic region for the provided peak dataset.
    :param bam_name_list: A list containing names of bam files to be visualized, e.g. bam_name_list = ['PRMT6_2_seq6', 'H3K4me3_seq2', 'Sample_K9me3']
    :param peak_df: A dataframe containing peak information
    :param region: Which region to plot (e.g. tss, intron, exon, intergenic or all; None defaults to all)
    :param sort: True if the dataframe should be sorted
    :param sort_column: If sort=True, the column name to sort on
    :return: None; the normalized and raw tagcount matrices (columns: concatenated window profiles for all bam files, rows: peaks in the peak dataframe) are written as TSV files to the output path
    '''
    region = 'all' if region is None else region.strip()
    if region != 'all':
        peak_df = peak_df[
            peak_df['GenomicPosition TSS=1250 bp, upstream=5000 bp'] == region]
    if region == 'tss':  # Reduce peaks based on their distance from TSS
        print('Selecting peaks only within +-300bp')
        peak_df = peak_df[peak_df['Next Transcript tss distance'] < 300]
        peak_df = peak_df[peak_df['Next Transcript tss distance'] > -300]
        if len(peak_df) == 0:
            raise ValueError('selected region does not contain any peaks')
    print(region + ' found in dataframe: ', len(peak_df))
    # print peak_df.head()
    peak_df.index = range(0, len(peak_df))
    if sort:
        print('Dataframe is being sorted...')
        colnames = peak_df.columns.tolist()
        indices1 = [i for i, s in enumerate(colnames) if sort_column in s]
        #print peak_df.head()
        for i in indices1:
            if "RA" not in colnames[i] and "RA" not in sort_column and "norm" not in colnames[i]:
                condition = colnames[i]
                print('Sorted on column: ' + condition)
                peak_df = peak_df.sort_values(condition, ascending=False)
                sort_column = condition
                break
            elif "RA" in colnames[i] and "RA" in sort_column and "norm" not in colnames[i]:
                condition = colnames[i]
                print('Sorted on column: ' + condition)
                peak_df = peak_df.sort_values(condition, ascending=False)
                sort_column = condition
                break
        #print peak_df.head()

    bam_order = ','.join(bam_name_list)
    if sample_name is not None:
        path = make_dir(bam_order,
                        region + str(len(peak_df)) + '_' + sample_name)
    else:
        path = make_dir(bam_order, region + str(len(peak_df)))

    # print peak_df.head()
    big_df = pd.DataFrame()
    big_df_raw = pd.DataFrame()
    # Check that every bam name can be resolved to a file path
    get_bampaths_4_sample(bam_name_list)
    for v in bam_name_list:
        print('Sample: ' + v)
        bam_path = differential_binding.getBam(v)
        df, df1 = overlapping_peaks_distribution(v, bam_path, peak_df, path)
        if v in normFact:
            print('Multiplying with external normalization factor:', v, normFact.get(v))
            df1 = df1.multiply(normFact.get(v))
        if scale_df:
            df = scale_dataframe(df)  # scaling of dataframe
            print('scaled df')
        big_df = pd.concat([big_df, df], axis=1)
        big_df_raw = pd.concat([big_df_raw, df1], axis=1)
    big_df.columns = range(0, big_df.shape[1])
    big_df_raw.columns = range(0, big_df_raw.shape[1])
    #print(big_df.head())

    # Plot all sample in one line plot
    plot_all_peaks_4_multiple_samples(big_df, bam_order, path, 'norm')
    plot_all_peaks_4_multiple_samples(big_df_raw, bam_order, path, 'raw')

    # Plot peaks after dividing them into strength basis
    if strength_divide:
        DividePeaksInStrength(big_df_raw, bam_order,
                              path).divide_peaks_in_strength()
        DividePeaksInStrength(big_df, bam_order,
                              path).divide_peaks_in_strength()

    # Plot peaks based on K-means clustering
    else:
        big_df = kmeans_clustering(big_df, 9, 1000)  # performing k-means clustering
        big_df_raw = kmeans_clustering(big_df_raw, 9, 1000)
        dict_of_df = differential_binding.group_DF(big_df, 'cluster')  # split df into per-cluster dfs
        dict_of_df_raw = differential_binding.group_DF(big_df_raw, 'cluster')
        print(len(dict_of_df))
        line_plot_peak_distribution(dict_of_df, bam_order, path, 'norm')  # plotting individual clusters
        line_plot_peak_distribution(dict_of_df_raw, bam_order, path, 'raw')
        print('No. of sample to plot:', len(bam_name_list))
        plot_clustered_peaks_4_multiple_samples(dict_of_df, bam_order, path, 'norm')  # overlay clusters across bam files
        plot_clustered_peaks_4_multiple_samples(dict_of_df_raw, bam_order, path, 'raw')
    # Add peak annotation columns to the heatmap dataframes
    try:
        colList = commons.peakdf_columns()[::-1]
        for col in colList:
            #print(col, peak_df[col])
            big_df.insert(0, col, peak_df[col])
            big_df_raw.insert(0, col, peak_df[col])
        if sort:
            big_df.insert(0, sort_column, peak_df[sort_column])
            big_df_raw.insert(0, sort_column, peak_df[sort_column])
    except KeyError:
        raise ValueError('Needed columns for peak profile are missing')
    #print (big_df.head())
    #print (big_df_raw.head())

    # adding tagcount column for first bam file
    big_df = totaltagCountinPeak(big_df, bam_name_list)
    big_df_raw = totaltagCountinPeak(big_df_raw, bam_name_list)

    big_df.to_csv(os.path.join(path, 'norm', 'tagcountDF_' + region + '_norm.tsv'),
                  sep="\t", encoding='utf-8')
    big_df_raw.to_csv(os.path.join(path, 'raw', 'tagcountDF_' + region + '_raw.tsv'),
                      sep="\t", encoding='utf-8')
    gc.collect()
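For orientation, a hypothetical invocation; the peak-table path, bam names, sort column, sample name, and normalization factor below are illustrative, and the bam names must be resolvable by differential_binding.getBam:

import pandas as pd

peaks = pd.read_csv('peaks_annotated.tsv', sep='\t')  # hypothetical annotated peak table
GR_heatmaps_DF_for_peaks(['PRMT6_2_seq6', 'H3K4me3_seq2'], peaks,
                         region='tss', sort=True, sort_column='PRMT6_2_seq6',
                         scale_df=True, sample_name='PRMT6_vs_K4me3',
                         normFact={'H3K4me3_seq2': 0.8})

Passing strength_divide=True would instead bin the peaks by signal strength and skip the k-means clustering step.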