def makeProfiles(matx=[], folder='', matnames=[], title='',
                 name='temp/peaksat.pdf', refpoint="TSS", scale=None,
                 sort=False, withDeeptools=True, cluster=1, vmax=None,
                 vmin=None, overlap=False, legendLoc=None):
    """plots deepTools profiles from two computeMatrix outputs

    Relabels the two matrices, rbinds them into a single matrix and runs
    plotProfile on the result.

    Args:
    ----
        matx (list[str]): paths to exactly two computeMatrix .mat.gz files
        matnames (list[str]): the two group labels to give them
        title (str): plot title
        name (str): output file name (the merged matrix is saved next to it)
        refpoint (str): label of the reference point (e.g. TSS)
        cluster (int): if >1, runs kmeans clustering with that many clusters
        vmax, vmin (float): y-axis limits passed to plotProfile
        legendLoc (str): legend location passed to plotProfile
    """
    if withDeeptools:
        if not (len(matnames) == 2 and len(matx) == 2):
            raise ValueError('you need two mat.gz files and two names')
        h.createFoldersFor(name)
        cmd = 'computeMatrixOperations relabel -m '
        cmd += matx[0] + ' -o ' + matx[0] + ' --groupLabels ' + matnames[0]
        cmd += ' && computeMatrixOperations relabel -m '
        cmd += matx[1] + ' -o ' + matx[1] + ' --groupLabels ' + matnames[1]
        cmd += ' && computeMatrixOperations rbind -m '
        cmd += matx[0] + ' ' + matx[1] + " -o " + \
            '.'.join(name.split('.')[:-1]) + ".gz"
        cmd += ' && plotProfile'
        cmd += " --matrixFile " + '.'.join(name.split('.')[:-1]) + ".gz"
        cmd += " --outFileName " + name
        cmd += " --refPointLabel " + refpoint
        # plotProfile takes --yMax/--yMin (-max/-min are plotHeatmap flags)
        if vmax is not None:
            cmd += " --yMax " + str(vmax)
        if vmin is not None:
            cmd += " --yMin " + str(vmin)
        if cluster > 1:
            cmd += " --perGroup --kmeans " + str(cluster)
        if legendLoc:
            cmd += " --legendLocation " + legendLoc
        if title:
            # quote the title so titles with spaces survive the shell
            cmd += " --plotTitle '" + title + "'"
        data = subprocess.run(cmd, shell=True, capture_output=True)
        print(data)
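
# A minimal usage sketch for makeProfiles (hypothetical file names; assumes
# the two matrices were produced beforehand by deepTools' computeMatrix):
#
#   makeProfiles(matx=["results/rep1_matrix.gz", "results/rep2_matrix.gz"],
#                matnames=["rep1", "rep2"],
#                name="temp/profiles/rep1_vs_rep2.pdf",
#                refpoint="TSS", cluster=2, legendLoc="upper-left",
#                title="rep1 vs rep2")
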
def saveConfigs(workspace, filepath):
    """saves all the method configurations of a Terra workspace to a csv and a json file

    Args:
    ----
        workspace (str): workspace name, typically "namespace/workspace" from
            the Terra URL (namespace is the project the workspace belongs to)
        filepath (str): path prefix under which to save the files
    """
    wm = dm.WorkspaceManager(workspace)
    h.createFoldersFor(filepath)
    conf = wm.get_configs()
    conf.to_csv(filepath + '.csv')
    params = {}
    params['GENERAL'] = wm.get_workspace_metadata()
    for k, val in conf.iterrows():
        params[k] = wm.get_config(val['name'])
    h.dictToFile(params, filepath + '.json')
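
# Usage sketch (hypothetical workspace and path prefix):
#
#   saveConfigs("my-namespace/my-workspace", "backup/my-workspace_configs")
#   # -> writes backup/my-workspace_configs.csv and
#   #    backup/my-workspace_configs.json
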
def saveWorkspace(workspace, folderpath):
    """saves everything about a Terra workspace (configs, WDLs, inputs/outputs) into csv and json files

    Args:
    ----
        workspace (str): workspace name, typically "namespace/workspace" from
            the Terra URL (namespace is the project the workspace belongs to)
        folderpath (str): path to the folder where the files are saved
    """
    wm = dm.WorkspaceManager(workspace)
    h.createFoldersFor(folderpath)
    conf = wm.get_configs()
    for k, val in conf.iterrows():
        with open(folderpath + val['name'] + ".wdl", "w") as f:
            if val.sourceRepo == 'dockstore':
                name = "dockstore.org/" + \
                    '/'.join(val['methodPath'].split('/')[2:4]) + \
                    '/' + val['methodVersion']
            else:
                name = '/'.join(val[['methodNamespace', 'methodName',
                                     'methodVersion']].astype(str).tolist())
            try:
                f.write(dm.get_wdl(name))
            except MethodNotFound:
                print(name + " could not be found")
    conf.to_csv(folderpath + 'workflow_list.csv')
    params = {}
    params['GENERAL'] = wm.get_workspace_metadata()
    for k, val in conf.iterrows():
        params[k] = wm.get_config(val['name'])
        h.dictToFile(params[k]['inputs'],
                     folderpath + "inputs_" + val['name'] + '.json')
        h.dictToFile(params[k], folderpath + "conf_" + val['name'] + '.json')
        h.dictToFile(params[k]['outputs'],
                     folderpath + "outputs_" + val['name'] + '.json')
    h.dictToFile(params, folderpath + 'all_configs.json')
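
# Usage sketch (hypothetical workspace; note folderpath is used as a string
# prefix, so it should end with '/'):
#
#   saveWorkspace("my-namespace/my-workspace", "backup/my-workspace/")
#   # -> one <config>.wdl, inputs_<config>.json, conf_<config>.json and
#   #    outputs_<config>.json per configuration, plus workflow_list.csv
#   #    and all_configs.json
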
def mergeReplicatePeaks(peaks, bigwigfolder, markedasbad=None, window=100,
                        sampling=3000, mincov=4, doPlot=True, cov={}, minKL=8,
                        use='max', MINOVERLAP=0.3, lookeverywhere=True,
                        only='', saveloc=''):
    """merges the peak sets of replicates of the same TF into one consensus set

    Warning: should only be passed peaks with at least one good replicate per TF.

    For each TF's peak sets:
    1. find the replicate that has the most peaks
    2. correlate peaks and process the other replicates in descending order of
       correlation with the replicate found in 1.
    3. find the overlap of both and get the size of the second replicate
    4. if smaller -> only use it to increase statistics
        1. if a lot of uncalled peaks in replicate 2 at replicate 1's peaks,
           flag the pair for bam merging
    5. if similar size -> keep only the intersection
        2. add to the intersection any peak uncalled in one replicate but
           called in the other
    6. repeat for all replicates
    -------------------------
    if one peak replicate fully overlaps the other, only use the overlapped
    one to increase confidence in the peaks
    if >80% average non-overlap, print a warning and the percentage of overlap
    if <20% average non-overlap, take the overlap and increase confidence and
    average logfold
    if one is <20%:
        if the other is <40% average non-overlap, take the overlap and
        increase confidence and average logfold
        else take the max coverage at the genomic window and, if above some
        threshold, accept the peak

    extend peaks by X bp if no TSS; remove TSS from peaks

    creates a new dataframe containing the merged peak size, the reassembled
    peak data (p-value etc.) and the value for the presence of each TF listed
    in the previous df
    ------------------------------------

    Args:
    ----
        peaks (df[bed-like]): all the peaks, with a column 'name' (the sample
            id), 'replicate' (the replicate number of this sample) and 'tf'
            (the TF ChIPed here)
        bigwigfolder (str): name of the folder containing the coverage tracks
        markedasbad (list[str]): sample names considered bad replicates
        window (int): window width used when merging peaks
        sampling (int): number of peaks sampled when estimating coverage
        mincov (int): minimum coverage to rescue an uncalled peak
        doPlot (bool): whether to produce the diagnostic plots
        cov (dict(filename: int)): average coverage for each bam filename
        minKL (float): minimum KL divergence used by findAdditionalPeaks
        use (str): 'max' to accept a peak when its max coverage in the window
            is above the threshold
        MINOVERLAP (float): minimum fraction of overlapping peaks between two
            replicates
        lookeverywhere (bool): whether to look for additional peaks
            genome-wide rather than only at the other replicate's peaks
        only (str): if set, only process this TF
        saveloc (str): folder where to save the plots and the results.txt log

    Returns:
    ------
        mergedpeak (df-peakslike): the merged consensus peaks
        tomergebam ([[bam1, bam2]]): pairs of bams flagged for merging
        remove (list[str]): TFs flagged as unreliable
        ratiosofunique (dict): per TF, the ratio of peaks unique to a replicate
    """
    def col_nan_scatter(x, y, **kwargs):
        df = pd.DataFrame({'x': x[:], 'y': y[:]})
        df = df[df.sum(0) != 0]
        x = df['x']
        y = df['y']
        plt.gca()
        plt.scatter(x, y)

    def col_nan_kde_histo(x, **kwargs):
        df = pd.DataFrame({'x': x[:]})
        df = df[df['x'] != 0]
        x = df['x']
        plt.gca()
        sns.kdeplot(x)

    print("/!\\ should only be passed peaks with at least one good replicate")
    # for a df containing a set of peaks in bed format and an additional column of different TF
    tfs = list(set(peaks['tf']))
    totpeaknumber = 0
    mergedpeaksdict = {}
    remove = []
    tomergebam = []
    ratiosofunique = {}
    h.createFoldersFor(saveloc)
    f = open(saveloc + 'results.txt', 'w')
    warnings.simplefilter("ignore")
    for tf in tfs:
        if only and tf != only:
            continue
        cpeaks = peaks[peaks.tf == tf]
        print('_____________________________________________________')
        f.write('_____________________________________________________' + '\n')
        if len(set(cpeaks['replicate'])) == 1:
            if cpeaks.name.tolist()[0] in markedasbad:
                print('the only replicate is considered bad!')
                f.write('the only replicate is considered bad!' + "\n")
                print('wrong TF: ' + tf)
                f.write('wrong TF: ' + tf + "\n")
                mergedpeaksdict.update({tf: cpeaks})
                remove.append(tf)
                continue
            print("we only have one replicate for " + tf + " .. pass")
            f.write("we only have one replicate for " + tf + " .. pass" + "\n")
            mergedpeaksdict.update({tf: cpeaks})
            continue
        print("merging " + tf + " peaks")
        f.write("merging " + tf + " peaks" + "\n")
        merged = simpleMergePeaks(cpeaks, window=window, maxp=False)
        merged_bed = merged[merged.columns[8:]]
        finalpeaks = merged[merged.columns[:8]]
        print('--> finish first overlaps lookup')
        f.write('--> finish first overlaps lookup' + "\n")
        # flag when biggest is <1000 peaks
        if len(finalpeaks) < 1000:
            print('!TF has less than 1000 PEAKS!')
            f.write('!TF has less than 1000 PEAKS!' + "\n")
        # for each TF (replicates), compute number of peaks
        peakmatrix = merged_bed.values.astype(bool)
        presence = []
        for peakpres in peakmatrix.T:  # https://github.com/tctianchi/pyvenn
            presence.append(
                set([i for i, val in enumerate(peakpres) if val == 1]))
        # compute overlap matrix (venn?)
        if peakmatrix.shape[1] < 7 and doPlot:
            plot.venn(presence, [
                i + '_BAD' if i.split('-')[0] in markedasbad else i
                for i in merged_bed.columns
            ], title=tf + "_before_venn", folder=saveloc)
            plt.show()
        else:
            print('too many replicates for Venn: ' + str(peakmatrix.shape[1]))
            f.write('too many replicates for Venn: ' +
                    str(peakmatrix.shape[1]) + "\n")
        if doPlot:
            fig = sns.pairplot(merged_bed, corner=True, diag_kind="kde",
                               kind="reg",
                               plot_kws={"scatter_kws": {"alpha": .05}})
            # fig = fig.map_upper(col_nan_scatter)
            # fig = fig.map_upper(col_nan_kde_histo)
            plt.suptitle("correlation of peaks in each replicate", y=1.08)
            if saveloc:
                fig.savefig(saveloc + tf + "_before_pairplot.pdf")
            plt.show()
            for i, val in enumerate(merged_bed):
                unique_inval = np.logical_and(
                    np.delete(peakmatrix, i, axis=1).sum(1).astype(bool) == 0,
                    peakmatrix[:, i])
                sns.kdeplot(merged_bed[val][unique_inval],
                            legend=True).set(xlim=(0, None))
            plt.title("distribution of unique peaks in each replicate")
            if saveloc:
                plt.savefig(saveloc + tf + "_before_unique_kdeplot.pdf")
            plt.show()
        bigwigs = os.listdir(bigwigfolder)
        foundgood = False
        sort = findBestPeak(presence)
        for ib, sb in enumerate(sort):
            if merged_bed.columns[sb].split('-')[0] not in markedasbad:
                foundgood = True
                break
        if not foundgood:
            print('no peaks were good enough quality')
            f.write('no peaks were good enough quality' + "\n")
            print('bad TF: ' + tf)
            f.write('bad TF: ' + tf + "\n")
            remove.append(tf)
            ib = 0
        # distplot
        # correlation plot
        biggest_ind = sort[ib]
        peakmatrix = peakmatrix.T
        biggest = merged_bed.columns[biggest_ind]
        print('-> main rep is: ' + str(biggest))
        f.write('-> main rep is: ' + str(biggest) + '\n')
        tot = peakmatrix[biggest_ind].copy().astype(int)
        # starts with highest similarity and goes descending
        j = 0
        recovered = 0
        additionalpeaksinbig = np.array([])
        for i, val in enumerate(sort):
            if i == ib:
                continue
            j += 1
            # if avg non-overlap > 60%, and first, and none small, flag TF as unreliable
            overlap = len(presence[biggest_ind] & presence[val]) / len(
                presence[biggest_ind])
            peakname = merged_bed.columns[val]
            print('- ' + peakname)
            f.write('- ' + peakname + '\n')
            print('  overlap: ' + str(overlap * 100) + "%")
            f.write('  overlap: ' + str(overlap * 100) + "%" + '\n')
            if overlap < MINOVERLAP:
                smallsupport = len(presence[biggest_ind] &
                                   presence[val]) / len(presence[val])
                print('  --> not enough overlap')
                f.write('  --> not enough overlap' + '\n')
                if smallsupport < MINOVERLAP:
                    # if the secondary does not have itself the required support
                    if j == 1 and merged_bed.columns[val].split(
                            '-')[0] not in markedasbad:
                        print("  Wrong TF: " + tf)
                        f.write("  Wrong TF: " + tf + '\n')
                        remove.append(tf)
                        break
                    # if not first, throw the other replicate and continue
                    print("  not using this replicate from the peakmatrix")
                    f.write("  not using this replicate from the peakmatrix" +
                            '\n')
                    continue
            if lookeverywhere:
                tolookfor = peakmatrix[val] == 0
            else:
                tolookfor = np.logical_and(peakmatrix[biggest_ind],
                                           peakmatrix[val] == 0)
            # ones that we have in the primary but not in the secondary
            additionalpeaksinsec = findAdditionalPeaks(
                finalpeaks, tolookfor,
                bigwigfolder + findpeakpath(bigwigfolder, peakname),
                sampling=sampling, mincov=mincov, window=window,
                minKL=minKL, use=use)
            if len(additionalpeaksinsec[additionalpeaksinsec > 0]) > 0:
                sns.kdeplot(additionalpeaksinsec[additionalpeaksinsec > 0],
                            label=peakname, legend=True).set(xlim=(0, None))
                print('  min,max from newly found peaks: ' + str(
                    (additionalpeaksinsec[additionalpeaksinsec > 0].min(),
                     additionalpeaksinsec[additionalpeaksinsec > 0].max())))
                f.write('  min,max from newly found peaks: ' + str(
                    (additionalpeaksinsec[additionalpeaksinsec > 0].min(),
                     additionalpeaksinsec[additionalpeaksinsec > 0].max())) +
                    '\n')
                # for testing purposes mainly
                finalpeaks[additionalpeaksinsec.astype(bool)].to_csv(
                    'additionalpeaksinsec_mp' + merged_bed.columns[val] +
                    '.bed', sep='\t', index=None, header=False)
            peakmatrix[val] = np.logical_or(peakmatrix[val],
                                            additionalpeaksinsec.astype(bool))
            overlap = np.sum(
                np.logical_and(peakmatrix[val],
                               peakmatrix[biggest_ind])) / np.sum(
                                   peakmatrix[biggest_ind])
            if overlap < MINOVERLAP:
                newsmalloverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[val])
                print("  we did not have enough initial overlap.")
                f.write("  we did not have enough initial overlap." + '\n')
                if newsmalloverlap < MINOVERLAP:
                    if merged_bed.columns[val].split('-')[0] in markedasbad:
                        print('  replicate ' + merged_bed.columns[val] +
                              ' was too bad and did not have enough overlap')
                        f.write('  replicate ' + merged_bed.columns[val] +
                                ' was too bad and did not have enough overlap'
                                + '\n')
                        continue
                    # `smallsupport` replaces the previously undefined
                    # `smalloverlap` in this message
                    elif h.askif(
                            "we have two good quality peaks that don't merge "
                            "well at all: " + merged_bed.columns[val] +
                            " and " + merged_bed.columns[biggest_ind] +
                            " can the first one be removed?:\n  overlap: " +
                            str(overlap * 100) + "%\n  smalloverlap: " +
                            str(smallsupport * 100) +
                            "%\n  new smalloverlap: " +
                            str(newsmalloverlap * 100) + "%"):
                        continue
                    else:
                        print("  enough from small overlaps")
                        f.write("  enough from small overlaps" + '\n')
            print('  --> enough overlap')
            f.write('  --> enough overlap' + '\n')
            recovered += np.sum(additionalpeaksinsec.astype(bool))
            if merged_bed.columns[val].split('-')[0] not in markedasbad:
                tot += peakmatrix[val].astype(int)
            # ones that we have in the secondary but not in the primary
            if not lookeverywhere or len(additionalpeaksinbig) == 0:
                tolookfor = peakmatrix[
                    biggest_ind] == 0 if lookeverywhere else np.logical_and(
                        peakmatrix[biggest_ind] == 0, peakmatrix[val])
                additionalpeaksinbig = findAdditionalPeaks(
                    finalpeaks, tolookfor,
                    bigwigfolder + findpeakpath(bigwigfolder, biggest),
                    sampling=sampling, mincov=mincov, window=window,
                    minKL=minKL, use=use)
                if len(additionalpeaksinbig[additionalpeaksinbig > 0]) > 0:
                    sns.kdeplot(additionalpeaksinbig[additionalpeaksinbig > 0],
                                label=biggest,
                                legend=True).set(xlim=(0, None))
                    print('  min,max from newly found peaks: ' + str(
                        (additionalpeaksinbig[additionalpeaksinbig > 0].min(),
                         additionalpeaksinbig[additionalpeaksinbig > 0].max())))
                    f.write('  min,max from newly found peaks: ' + str(
                        (additionalpeaksinbig[additionalpeaksinbig > 0].min(),
                         additionalpeaksinbig[additionalpeaksinbig > 0].max()))
                        + '\n')
                peakmatrix[biggest_ind] = np.logical_or(
                    peakmatrix[biggest_ind], additionalpeaksinbig)
                tot += additionalpeaksinbig.astype(bool).astype(int)
                recovered += np.sum(additionalpeaksinbig.astype(bool))
            print('  we have recovered ' + str(recovered) +
                  ' peaks, equal to ' +
                  str(100 * recovered / np.sum(peakmatrix[biggest_ind])) +
                  '% of the peaks in main replicate')
            f.write('  we have recovered ' + str(recovered) +
                    ' peaks, equal to ' +
                    str(100 * recovered / np.sum(peakmatrix[biggest_ind])) +
                    '% of the peaks in main replicate' + '\n')
            if overlap < (MINOVERLAP + 0.2) / 1.2:
                # we recompute to see if the overlap changed
                newoverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[biggest_ind])
                smalloverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[val])
                if newoverlap < (MINOVERLAP + 0.2) / 1.2:
                    if smalloverlap < (2 + MINOVERLAP) / 3:
                        print("  not enough overlap to advise merging the "
                              "bams.\n  old overlap: " + str(overlap * 100) +
                              "%\n  new overlap: " + str(newoverlap * 100) +
                              "%")
                        f.write("  not enough overlap to advise merging the "
                                "bams.\n  old overlap: " + str(overlap * 100) +
                                "%\n  new overlap: " + str(newoverlap * 100) +
                                "%" + '\n')
                        continue
                    else:
                        print('  enough from small overlap to advise merging '
                              'the peaks')
                        f.write('  enough from small overlap to advise '
                                'merging the peaks' + '\n')
            tomergebam.append([biggest, peakname])
            # the quality is good enough in the end, so we can pop the TF
            # from the remove list if it is there
            if tf in remove:
                remove.remove(tf)
        plt.title('distribution of new found peaks')
        if saveloc:
            plt.savefig(saveloc + tf + "_new_found_peaks_kdeplot.pdf")
        plt.show()
        # new distplot
        # new correlation plot
        ratiosofunique[tf] = len(
            np.argwhere(peakmatrix.sum(0) == 1)) / peakmatrix.shape[1]
        if doPlot:
            sns.pairplot(merged_bed, corner=True, diag_kind="kde", kind="reg",
                         plot_kws={"scatter_kws": {"alpha": .05}})
            # fig = fig.map_upper(col_nan_scatter)
            # fig = fig.map_upper(col_nan_kde_histo)
            plt.suptitle("correlation and distribution of peaks after recovery",
                         y=1.08)
            if saveloc:
                plt.savefig(saveloc + tf + "_after_pairplot.pdf")
            plt.show()
            for i, val in enumerate(merged_bed):
                unique_inval = np.logical_and(
                    np.delete(peakmatrix, i, axis=0).sum(0).astype(bool) == 0,
                    peakmatrix[i])
                sns.kdeplot(merged_bed[val][unique_inval],
                            legend=True).set(xlim=(0, None))
            plt.title(
                "distribution of unique peaks in each replicate after recovery")
            if saveloc:
                plt.savefig(saveloc + tf + "_after_unique_kdeplot.pdf")
            plt.show()
        if len(peakmatrix.shape) > 1 and doPlot:
            if peakmatrix.shape[0] < 7:
                presence = []
                for peakpres in peakmatrix:  # https://github.com/tctianchi/pyvenn
                    presence.append(
                        set([i for i, val in enumerate(peakpres) if val == 1]))
                title = tf + '_recovered (TOREMOVE)' if tf in remove else tf + '_recovered'
                plot.venn(presence, [
                    i + '_BAD' if i.split('-')[0] in markedasbad else i
                    for i in merged_bed.columns
                ], title=title, folder=saveloc)
                plt.show()
            else:
                print('too many replicates for Venn')
                f.write('(too many replicates for Venn)' + '\n')
        finalpeaks = finalpeaks[np.logical_or(tot > 1,
                                              peakmatrix[biggest_ind])]
        finalpeaks['name'] = biggest
        finalpeaks['tf'] = tf
        mergedpeaksdict.update({tf: finalpeaks})
        print(str((tf, len(finalpeaks))))
        f.write(str((tf, len(finalpeaks))) + '\n')
    mergedpeak = pd.concat(
        [v for _, v in mergedpeaksdict.items()]).reset_index(drop=True)
    if doPlot:
        df = pd.DataFrame(data=ratiosofunique, index=['percentage of unique'])
        df['proteins'] = df.index
        fig = sns.barplot(data=df)
        plt.xticks(rotation=60, ha='right')
        plt.title("ratios of unique in replicates across experiments")
        if saveloc:
            plt.savefig(saveloc + "All_ratios_unique.pdf")
        plt.show()
    f.close()
    mergedpeak['name'] = mergedpeak.tf
    return mergedpeak, tomergebam, remove, ratiosofunique
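
# Usage sketch (hypothetical inputs): `peaks` is a bed-like DataFrame of all
# replicates' peaks with added 'name', 'replicate' and 'tf' columns, and
# bigwigfolder contains one coverage track per sample name:
#
#   mergedpeak, tomergebam, removed, ratios = mergeReplicatePeaks(
#       peaks, bigwigfolder="data/bigwigs/",
#       markedasbad=["SAMPLE12"], saveloc="results/merging/")
#   # `tomergebam` lists [primary, secondary] pairs whose peaks overlapped
#   # well enough to advise merging the bams; `removed` lists TFs flagged
#   # as unreliable.
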
def filterRNAfromQC(rnaqc, folder='tempRNAQCplot/', plot=True, qant1=0.07,
                    qant3=0.93, thresholds={}, num_cols=10, figsize=(10, 0.2)):
    """flags samples failing RNAseq QC thresholds and plots the QC metrics

    Args:
    ----
        rnaqc (pd.DataFrame): QC metrics, one row per metric and one column
            per sample (the function transposes it before thresholding)
        folder (str): where to save the plots and the csv of failed samples
        plot (bool): whether to plot the heatmap of failures
        qant1, qant3 (float): quantiles used to flag outliers on the violin plots
        thresholds (dict): overrides for the default thresholds below
        num_cols (int): number of columns in the violin-plot grid
        figsize (tuple): (width, per-row height) of the failure heatmap

    Returns:
    ------
        pd.DataFrame: one row per failing sample, one boolean column per
            violated QC criterion
    """
    thresh = {'minmapping': 0.8,  # Mapping Rate
              'minendmapping': 0.75,
              'minefficiency': 0.6,  # Expression Profiling Efficiency
              'maxendmismatch': 0.025,  # Base Mismatch end wise
              'maxmismatch': 0.02,  # Base Mismatch
              'minhighqual': 0.6,  # High Quality Rate
              'minexon': 0.6,  # Exonic Rate
              "maxambiguous": 0.2,  # Ambiguous Alignment Rate
              "maxsplits": 0.1,  # Avg. Splits per Read
              "maxalt": 0.65,  # Alternative Alignments rate
              "maxchim": 0.3,  # Chimeric Alignment Rate
              "minreads": 20000000,
              "minlength": 80,  # Read Length
              "maxgenes": 35000,
              "mingenes": 10000,
              }
    thresh.update(thresholds)
    qcs = rnaqc.T
    # one (label, boolean failure mask) pair per QC criterion
    tests = [
        ("Mapping Rate", qcs["Mapping Rate"] < thresh['minmapping']),
        ("Base Mismatch", qcs["Base Mismatch"] > thresh['maxmismatch']),
        ("End 1 Mapping Rate",
         qcs["End 1 Mapping Rate"] < thresh['minendmapping']),
        ("End 2 Mapping Rate",
         qcs["End 2 Mapping Rate"] < thresh['minendmapping']),
        ("End 1 Mismatch Rate",
         qcs["End 1 Mismatch Rate"] > thresh['maxendmismatch']),
        ("End 2 Mismatch Rate",
         qcs["End 2 Mismatch Rate"] > thresh['maxendmismatch']),
        ("Expression Profiling Efficiency",
         qcs["Expression Profiling Efficiency"] < thresh['minefficiency']),
        ("High Quality Rate", qcs["High Quality Rate"] < thresh['minhighqual']),
        ("Exonic Rate", qcs["Exonic Rate"] < thresh['minexon']),
        ("Ambiguous Alignment Rate",
         qcs["Ambiguous Alignment Rate"] > thresh['maxambiguous']),
        # maxsplits is an upper bound, so fail on '>' (was erroneously '<')
        ("Avg. Splits per Read",
         qcs["Avg. Splits per Read"] > thresh['maxsplits']),
        ("Alternative Alignments",
         qcs["Alternative Alignments"] > thresh['maxalt'] * qcs["Total Reads"]),
        ("Chimeric Alignment Rate",
         qcs["Chimeric Alignment Rate"] > thresh['maxchim']),
        ("Total Reads", qcs["Total Reads"] < thresh['minreads']),
        ("Read Length", qcs["Read Length"] < thresh['minlength']),
        ("Max Genes Detected", qcs["Genes Detected"] > thresh['maxgenes']),
        ("Min Genes Detected", qcs["Genes Detected"] < thresh['mingenes']),
    ]
    # samples failing at least one criterion
    a = qcs[np.logical_or.reduce([mask.values for _, mask in tests])
            ].index.tolist()
    tot = [[1 if i in qcs[mask].index.tolist() else 0 for i in a]
           for _, mask in tests]
    res = pd.DataFrame(index=a, columns=[label for label, _ in tests],
                       data=np.array(tot).astype(bool).T)
    print(a)
    if len(res) > 0:
        h.createFoldersFor(folder)
        res.to_csv(folder + '_qc_results.csv')
        if plot:
            _, ax = plt.subplots(
                figsize=(figsize[0], math.ceil(len(res) * figsize[1])))
            heatmap = sns.heatmap(res, xticklabels=True, yticklabels=True,
                                  cbar=False)
            plt.yticks(rotation=0)
            # save before show(): show() clears the current figure
            heatmap.get_figure().savefig(folder + 'failed_qc.pdf')
            plt.show()
    num_rows = math.ceil(len(rnaqc) / num_cols)
    _, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 2))
    for val_idx, val in enumerate(rnaqc.index):
        ax = axes.flatten()[val_idx]
        qc = rnaqc.loc[val]
        sns.violinplot(y=qc, ax=ax)
        q1 = qc.quantile(qant1)
        q3 = qc.quantile(qant3)
        outlier_top_lim = q3 + 1.5 * (q3 - q1)
        outlier_bottom_lim = q1 - 1.5 * (q3 - q1)
        # .items() replaces the removed Series.iteritems()
        for k, v in qc[(qc < outlier_bottom_lim) |
                       (qc > outlier_top_lim)].items():
            ax.text(0.05, v, k, ha='left', va='center',
                    color='red' if k in a else 'black')
    plt.tight_layout()
    # save before show() for the same reason as above
    plt.savefig('{}/qc_metrics.pdf'.format(folder), bbox_inches='tight')
    plt.show()
    return res
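
# Usage sketch: `rnaqc` is expected to hold RNAseqQC-style metrics with
# metrics as rows and samples as columns (the function transposes it).
# Hypothetical call overriding one threshold:
#
#   failed = filterRNAfromQC(rnaqc, folder="results/rnaqc/",
#                            thresholds={"minreads": 10000000})
#   # `failed` has one row per failing sample and one boolean column per
#   # violated QC criterion.
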
def getPeaksAt(peaks, bigwigs, folder='', bigwignames=[], peaknames=[],
               window=1000, title='', numpeaks=4000, numthreads=8,
               width=5, length=10, torecompute=False, name='temp/peaksat.pdf',
               refpoint="TSS", scale=None, sort=False, withDeeptools=True,
               onlyProfile=False, cluster=1, vmax=None, vmin=None,
               overlap=False, legendLoc=None):
    """plots the coverage of a set of bigwigs at a set of peaks

    If withDeeptools, builds and runs a computeMatrix + plotHeatmap
    (or plotProfile) command. Otherwise, uses pyBigWig to extract the
    coverage in a window of +-`window` bp around each peak center (or summit
    when a 'relative_summit_pos' column is present) and draws one seaborn
    heatmap per bigwig.
    """
    if withDeeptools:
        if isinstance(peaks, pd.DataFrame):
            # write the bed file before replacing `peaks` with its path
            peaks.to_csv('peaks.bed', sep='\t', index=False, header=False)
            peaks = 'peaks.bed '
        elif type(peaks) == list:
            pe = ''
            i = 0
            for n, p in enumerate(peaks):
                # skip peak files with fewer than 20 regions
                if 20 < int(os.popen('wc -l ' + p).read().split(' ')[0]):
                    pe += p + ' '
                elif len(peaknames) > 0:
                    peaknames.pop(n - i)
                    i += 1
            peaks = pe
        elif type(peaks) == str:
            peaks += ' '
        else:
            raise ValueError("we don't know this filetype")
        if type(bigwigs) is list:
            pe = ''
            for val in bigwigs:
                pe += folder + val + ' '
            bigwigs = pe
        else:
            bigwigs = folder + bigwigs + ' '
        h.createFoldersFor(name)
        cmd = ''
        if not os.path.exists('.'.join(name.split('.')[:-1]) +
                              ".gz") or torecompute:
            cmd += "computeMatrix reference-point -S "
            cmd += bigwigs
            cmd += " --referencePoint " + refpoint
            cmd += " --regionsFileName " + peaks
            cmd += " --missingDataAsZero"
            cmd += " --outFileName " + '.'.join(name.split('.')[:-1]) + ".gz"
            cmd += " --upstream " + str(window) + " --downstream " + str(window)
            cmd += " --numberOfProcessors " + str(numthreads) + ' && '
        cmd += "plotHeatmap" if not onlyProfile else 'plotProfile'
        if type(name) is list:
            if not onlyProfile:
                raise ValueError(
                    "needs to be set to True, can't average heatmaps")
            cmd += " --matrixFile " + '.gz '.join(name) + ".gz"
            # (was gated on an undefined `average` variable)
            cmd += " --averageType mean"
        else:
            cmd += " --matrixFile " + '.'.join(name.split('.')[:-1]) + ".gz"
        # (assumption) when `name` is a list, save the plot under the first name
        cmd += " --outFileName " + (name[0] if type(name) is list else name)
        cmd += " --refPointLabel " + refpoint
        # plotProfile takes --yMax/--yMin; plotHeatmap accepts -max/-min
        if vmax is not None:
            cmd += (" --yMax " if onlyProfile else " -max ") + str(vmax)
        if vmin is not None:
            cmd += (" --yMin " if onlyProfile else " -min ") + str(vmin)
        if cluster > 1:
            cmd += " --perGroup --kmeans " + str(cluster)
        if overlap:
            if onlyProfile:
                cmd += " --plotType overlapped_lines"
            else:
                raise ValueError("overlap only works when onlyProfile is set")
        if legendLoc:
            cmd += " --legendLocation " + legendLoc
        if len(peaknames) > 0:
            pe = ''
            for i in peaknames:
                pe += ' ' + i
            cmd += " --regionsLabel" + pe
        # `bigwigs` was joined into a string above, so test the labels directly
        # (the previous `type(bigwigs) is list` guard was always False here)
        if len(bigwignames) > 0:
            pe = ''
            for i in bigwignames:
                pe += ' "' + i + '"'
            cmd += " --samplesLabel" + pe
        if title:
            cmd += " --plotTitle '" + title + "'"
        data = subprocess.run(cmd, shell=True, capture_output=True)
        print(data)
    else:
        if 'relative_summit_pos' in peaks.columns:
            center = [int((val['start'] + val['relative_summit_pos']))
                      for k, val in peaks.iterrows()]
        else:
            center = [int((val['start'] + val['end']) / 2)
                      for k, val in peaks.iterrows()]
        pd.set_option('mode.chained_assignment', None)
        peaks['start'] = [c - window for c in center]
        peaks['end'] = [c + window for c in center]
        # plt.subplots has no `title` kwarg; use suptitle instead
        fig, ax = plt.subplots(1, len(bigwigs), figsize=[width, length])
        fig.suptitle(title if title else 'Chip Heatmap')
        if sort:
            peaks = peaks.sort_values(by=["foldchange"], ascending=False)
        if numpeaks > len(peaks):
            numpeaks = len(peaks) - 1
        cov = {}
        maxs = []
        scales = {}
        for num, bigwig in enumerate(bigwigs):
            bw = pyBigWig.open(folder + bigwig)
            co = np.zeros((numpeaks, window * 2), dtype=int)
            # was `scale is dict`, which is always False; also keep one scale
            # per bigwig instead of overwriting `scale`
            scales[bigwig] = scale[bigwig] if isinstance(scale, dict) else 1
            for i, (k, val) in enumerate(peaks.iloc[:numpeaks].iterrows()):
                try:
                    co[i] = np.nan_to_num(
                        bw.values(str(val.chrom), val.start, val.end), 0)
                except RuntimeError as e:
                    print(str(val.chrom), val.start, val.end)
                    pass
            cov[bigwig] = co
            maxs.append(co.max())
        for num, bigwig in enumerate(bigwigs):
            sns.heatmap(cov[bigwig] * scales[bigwig], ax=ax[num],
                        vmax=max(maxs), yticklabels=[],
                        cmap=cmaps[num],  # cmaps: module-level list of colormaps
                        cbar=True)
            ax[num].set_title(bigwig.split('.')[0])
        fig.subplots_adjust(wspace=0.1)
        fig.show()
        fig.savefig(name)
        return cov, fig
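
# Usage sketch (hypothetical files) for the deepTools branch:
#
#   getPeaksAt("results/myc_peaks.bed",
#              bigwigs=["myc_rep1.bw", "myc_rep2.bw"],
#              folder="data/bigwigs/", bigwignames=["rep1", "rep2"],
#              window=2000, name="temp/myc_heatmap.pdf", refpoint="TSS")
#
# and for the pyBigWig branch (peaks as a bed-like DataFrame with
# chrom/start/end, plus a 'foldchange' column when sort=True):
#
#   cov, fig = getPeaksAt(peaks_df, ["myc_rep1.bw"], folder="data/bigwigs/",
#                         window=1000, withDeeptools=False)
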
def runChromHMM(outdir, data, numstates=15, datatype='bed', folderPath=".",
                chromHMMFolderpath="~/ChromHMM/", assembly="hg38",
                control_bam_dir=None):
    """runs the ChromHMM algorithm

    Args:
    ----
        outdir (str): an existing dir where the results should be saved
        data (df[cellname, markname, markbed|bam|bigwig, ?controlbed|bam|bigwig]):
            the cell/mark file table
        numstates (int): number of states to learn
        datatype (str): one of bed, bigwig, bam
        folderPath (str): folder containing the input files
        chromHMMFolderpath (str): folderpath to the ChromHMM installation
        assembly (str): one of hg38, hg19, mm10, ... (must have a CHROMSIZES
            file in the ChromHMM folder)
        control_bam_dir (str): directory where the controls are stored
            (if not given in the data df)

    Returns:
    ------
        dict: for each cell name, a bed-like dataframe containing the regions
            of the different states
    """
    print("you need to have ChromHMM")
    chromHMM = "java -mx8000M -jar " + chromHMMFolderpath + "ChromHMM.jar "
    h.createFoldersFor(outdir + 'binarized/')
    data.to_csv(outdir + "input_data.tsv", sep='\t', index=None, header=None)
    cmd = chromHMM
    if datatype == "bed":
        cmd += "BinarizeBed "
    elif datatype == "bigwig":
        cmd += "BinarizeSignal "
    elif datatype == "bam":
        cmd += "BinarizeBam "
    else:
        raise ValueError('you need to provide one of bam, bigwig, bed')
    cmd += chromHMMFolderpath + "CHROMSIZES/" + assembly + ".txt " + \
        folderPath + " " + outdir + "input_data.tsv " + outdir + "binarized"
    if control_bam_dir:
        cmd += " -c " + control_bam_dir
    res1 = subprocess.run(cmd, capture_output=True, shell=True)
    print(res1)
    if res1.returncode != 0:
        raise ValueError(str(res1.stderr))
    cmd = chromHMM + "LearnModel -printposterior -noautoopen "
    if len(data) < 10:
        cmd += '-init load -m ' + chromHMMFolderpath + 'model_15_coreMarks.txt '
    cmd += outdir + "binarized " + outdir + " " + str(numstates) + " " + assembly
    res2 = subprocess.run(cmd, capture_output=True, shell=True)
    print(res2)
    if res2.returncode != 0:
        raise ValueError(res2.stderr)
    ret = {}
    for v in set(data[0]):
        ret[v] = pd.read_csv(
            outdir + v + '_' + str(numstates) + '_dense.bed',
            sep='\t', header=None, skiprows=1).drop(
                columns=[4, 5, 6, 7]).rename(columns={
                    0: 'chrom', 1: 'start', 2: 'end', 3: 'state', 8: "color"})
    return ret
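
# Usage sketch (hypothetical paths): `data` is a headerless table of
# [cell, mark, file] rows matching ChromHMM's cellmarkfiletable format, and
# `outdir` should end with '/' since it is used as a string prefix:
#
#   data = pd.DataFrame([["lineA", "H3K27ac", "lineA_H3K27ac.bed"],
#                        ["lineA", "H3K4me3", "lineA_H3K4me3.bed"]])
#   states = runChromHMM("results/chromhmm/", data, numstates=15,
#                        datatype="bed", folderPath="data/beds/",
#                        chromHMMFolderpath="~/ChromHMM/", assembly="hg38")
#   # states["lineA"] is a bed-like DataFrame with a 'state' column.
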