def annotation(exp):
    from requests.exceptions import RetryError
    from time import sleep

    out_dir = make_folder(f'{exp.scratch}/Annotated/')

    condition_list = exp.IPs['Condition'].unique().tolist()

    for condition in condition_list:
        if 'peaktype' not in list(exp.sample_files[condition].keys()):
            peakset = 'overlap_peak' if exp.sample_files[condition]['idr_optimal_peak'] == 'none' else 'idr_optimal_peak'
            exp.sample_files[condition]['peaktype'] = peakset

    peakfiles = {condition: read_pd(exp.sample_files[condition][exp.sample_files[condition]['peaktype']])
                 for condition in condition_list
                 if exp.sample_files[condition][exp.sample_files[condition]['peaktype']] != 'none'
                 }

    for condition, file in peakfiles.items():
        genome = exp.IPs.loc[exp.IPs.Condition == condition, 'Genome'].unique().tolist()[0]
        cond_dir = make_folder(f'{out_dir}{condition}/')
        anno_results = annotate_peaks({condition: file}, cond_dir, genome, db='UCSC', check=False,
                                      log_file=exp.log_file, run_main=exp.run_main)[f'{condition}_annotated']
        anno_list = anno_results.SYMBOL.unique().tolist()

        try:
            sleep(1)
            enrichr(anno_list, f'enrichr_{condition}', cond_dir, scan=None,
                    max_terms=10, figsize=(12, 6), run_main=exp.run_main)
        except RetryError:
            output(f'No stable enrichr connection. Skipping enrichr for {condition}.',
                   log_file=exp.log_file, run_main=exp.run_main)

        # Store the annotated peaks for this condition.
        exp.anno_results = {**exp.anno_results, f'{condition}_annotated': anno_results}

    exp.tasks_complete.append('Annotations')

    return exp
def principal_component_analysis(exp):

    out_dir = make_folder(f'{exp.scratch}PCA/')

    bigwigs = {sample: exp.sample_files[sample]['bw']
               for sample in exp.samples
               if len(exp.sample_files[sample]['bw']) != 0
               }

    multibw_command = f"multiBigwigSummary bins -b {' '.join(list(bigwigs.values()))} -l {' '.join(list(bigwigs.keys()))} -p 4 --chromosomesToSkip chrM,chrX,chrY -o {out_dir}{exp.name}_bwsummary.npz"

    correlation_command = f'plotCorrelation --corData {out_dir}{exp.name}_bwsummary.npz --corMethod pearson --whatToPlot heatmap --skipZeros --plotTitle "{exp.name} Binned Pearson Correlation Heatmap" --plotFileFormat png --outFileCorMatrix {out_dir}{exp.name}_CorMatrix.tab --colorMap Purples -o {out_dir}{exp.name}_CorHeatmap.png'

    pca_command = f'plotPCA --corData {out_dir}{exp.name}_bwsummary.npz --plotTitle "{exp.name} PCA Plot" --plotFileFormat png --outFileNameData {out_dir}{exp.name}_PCA_data.tab --log2 -o {out_dir}{exp.name}_PCA_Plot.png'

    command_list = [submission_prepend(), multibw_command, correlation_command, pca_command]

    exp.job_id.append(send_job(command_list=command_list,
                               job_name=f'{exp.name}_Cor_PCA',
                               job_log_folder=exp.job_folder,
                               q='general',
                               mem=4000,
                               log_file=exp.log_file,
                               project=exp.project,
                               cores=5,
                               run_main=exp.run_main))

    exp.tasks_complete.append('PCA')

    return exp
def preseq(exp):

    output('\nRunning QC plots: library complexity extrapolation, signal correlation and pca plots.',
           log_file=exp.log_file, run_main=exp.run_main)

    for sample in exp.samples:
        out_dir = make_folder(f'{exp.scratch}QC/preseq/{sample}/')

        command_list = [submission_prepend(f'preseq lc_extrap -bam -output {out_dir}{sample}_preseq.txt {exp.sample_files[sample]["bam"]}')]

        exp.job_id.append(send_job(command_list=command_list,
                                   job_name=f'{sample}_preseq',
                                   job_log_folder=exp.job_folder,
                                   q='general',
                                   mem=5000,
                                   log_file=exp.log_file,
                                   project=exp.project,
                                   cores=1,
                                   run_main=exp.run_main))

    exp.tasks_complete.append('preseq')

    return exp
def plot_venn3_set(dict_of_sets, overlap_name, folder):
    '''
    Makes 3 way venn from 3 sets.
    Saves to file.

    Inputs
    ------
    dict_of_sets: dictionary of sets to overlap
    overlap_name: string with name of overlap
    folder: output folder

    Returns
    -------
    None

    '''
    folder = make_folder(f'{val_folder(folder)}venn_plot')

    plt.clf()
    plt.figure(figsize=(7, 7))

    font = {'family': 'sans-serif',
            'weight': 'normal',
            'size': 16,
            }

    plt.rc('font', **font)

    set_list = []
    set_names = []
    for name, setlist in dict_of_sets.items():
        set_list.append(setlist)
        set_names.append(name.replace('_', ' '))

    # make venn
    venn_plot = venn3(subsets=set_list, set_labels=set_names)
    patch = ['100', '110', '101', '010', '011', '001', '111']
    for p in patch:
        if venn_plot.get_patch_by_id(p):
            venn_plot.get_patch_by_id(p).set_color('none')
            venn_plot.get_patch_by_id(p).set_alpha(.4)
            venn_plot.get_patch_by_id(p).set_edgecolor('none')

    # make circles
    c = venn3_circles(subsets=set_list)
    colors_list = ['green', 'blue', 'grey']
    for circle, color in zip(c, colors_list):
        circle.set_edgecolor(color)
        circle.set_alpha(0.8)
        circle.set_linewidth(4)

    plt.title(f"{overlap_name.replace('_', ' ')} Overlaps")
    plt.tight_layout()
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.svg")
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.png", dpi=300)
    plt.close()
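
# Illustrative usage sketch (hypothetical gene sets; the dictionary keys become
# the venn labels, with underscores rendered as spaces):
#
#   sets = {'WT_genes': {'TP53', 'MYC', 'GATA1'},
#           'KO_genes': {'MYC', 'CDKN1A'},
#           'Rescue_genes': {'TP53', 'MYC'}}
#   plot_venn3_set(sets, 'WT_KO_Rescue_genes', 'overlaps/')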
def fastq_screen(exp):
    '''
    Checks fastq files for contamination with alternative genomes using Bowtie2
    '''

    output(f'Screening for contamination during sequencing: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    cwd = val_folder(os.getcwd())
    os.chdir(exp.data_folder)

    samples = [file for file in exp.sample_df.Scratch_File1.tolist() if is_fastq(file)]

    # Submit fastq_screen jobs for each sample
    for sample in samples:
        command_list = [submission_prepend(f'fastq_screen --threads 4 --aligner bowtie2 {sample}')]

        exp.job_id.append(send_job(command_list=command_list,
                                   job_name=f'{sample.split("/")[-1]}_fastq_screen',
                                   job_log_folder=exp.job_folder,
                                   q='general',
                                   mem=3000,
                                   log_file=exp.log_file,
                                   project=exp.project,
                                   cores=2,
                                   run_main=exp.run_main))
        time.sleep(1)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # move results to qc folder
    fastqs_files = glob.glob(f'{exp.data_folder}*screen*')
    for f in fastqs_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    # change back to the original working directory
    os.chdir(cwd)

    exp.tasks_complete.append('Fastq_screen')
    output(f'Screening complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    return exp
def enrichr(gene_list, description, out_dir, scan=None, max_terms=10, figsize=(12, 6), run_main=False):
    '''
    Performs GO Molecular Function, GO Biological Process and KEGG enrichment on a gene list.
    Uses enrichr.

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description for title
    out_dir: output directory
    scan: dictionary with additional enrichr dbs to scan (http://amp.pharm.mssm.edu/Enrichr/#stats)
    max_terms: limit the plot to this number of top terms
    figsize: change fig size

    Returns
    -------
    None

    '''
    out_dir = make_folder(out_dir)

    testscan = {'KEGG': 'KEGG_2016',
                'GO_biological_process': 'GO_Biological_Process_2017b',
                'ChIP-X_Consensus_TFs': 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                'ChEA': 'ChEA_2016',
                'OMIM_Disease': 'OMIM_Disease'
                }

    if isinstance(scan, dict):
        testscan = {**testscan, **scan}

    for nick, name in testscan.items():
        gseapy.enrichr(gene_list=gene_list,
                       figsize=figsize,
                       top_term=max_terms,
                       description=f'{description}_{nick}',
                       gene_sets=name,
                       outdir=out_dir,
                       format='png')

        out_result(f'{out_dir}{nick}.{name}.enrichr.reports.png',
                   f'Enrichr: {nick} for {description}',
                   run_main=run_main)

    out_list = pd.DataFrame({'Gene Name': gene_list}, index=range(len(gene_list)))
    out_list.to_excel(f'{out_dir}{description}_genes.xlsx', index=None)
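
# Illustrative usage sketch (hypothetical gene list and output path; the extra
# 'Tissue' library name passed via scan is an assumption drawn from the Enrichr
# stats page linked in the docstring):
#
#   genes = ['TP53', 'MYC', 'CDKN1A', 'GATA1']
#   enrichr(genes, 'example_signature', 'enrichment/',
#           scan={'Tissue': 'ARCHS4_Tissues'}, max_terms=10)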
def plot_venn3_counts(element_list, set_labels, overlap_name, folder):
    '''
    Plot three way venn based on counts of specific overlapping numbers.
    Saves to file.

    Inputs
    ------
    element_list: tuple with counts of the overlaps from (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    set_labels: list or tuple with names of the overlaps ('A', 'B', 'C')
    overlap_name: string with name of overlap
    folder: output folder

    Returns
    -------
    None

    '''
    folder = make_folder(f'{val_folder(folder)}venn_plot')

    plt.clf()
    plt.figure(figsize=(7, 7))

    font = {'family': 'sans-serif',
            'weight': 'normal',
            'size': 16,
            }

    plt.rc('font', **font)

    # make venn
    venn_plot = venn3(subsets=element_list,
                      set_labels=[name.replace('_', ' ') for name in set_labels])

    patch = ['100', '110', '101', '010', '011', '001', '111']
    for p in patch:
        if venn_plot.get_patch_by_id(p):
            venn_plot.get_patch_by_id(p).set_color('none')
            venn_plot.get_patch_by_id(p).set_alpha(.4)
            venn_plot.get_patch_by_id(p).set_edgecolor('none')

    # make circles
    c = venn3_circles(subsets=element_list)
    colors_list = ['green', 'blue', 'grey']
    for circle, color in zip(c, colors_list):
        circle.set_edgecolor(color)
        circle.set_alpha(0.8)
        circle.set_linewidth(4)

    plt.title(f"{overlap_name.replace('_', ' ')} Overlaps")
    plt.tight_layout()
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.svg")
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.png", dpi=300)
    plt.close()
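
# Illustrative usage sketch (hypothetical counts, in matplotlib-venn's
# (Abc, aBc, ABc, abC, AbC, aBC, ABC) subset order):
#
#   plot_venn3_counts((120, 95, 30, 80, 25, 18, 42),
#                     ('WT', 'KO', 'Rescue'),
#                     'WT_KO_Rescue_Peak', 'overlaps/')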
def fastqc(exp):
    '''
    Performs fastq spec analysis with FastQC
    '''
    output('Assessing fastq quality. \n', log_file=exp.log_file, run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    all_samples = exp.sample_df.Scratch_File1.tolist() + exp.sample_df.Scratch_File2.tolist()
    samples = [file for file in all_samples if is_fastq(file)]

    for sample in samples:
        command_list = [submission_prepend(f'fastqc {sample}')]

        exp.job_id.append(send_job(command_list=command_list,
                                   job_name=f'{sample.split("/")[-1]}_fastqc',
                                   job_log_folder=exp.job_folder,
                                   q='general',
                                   mem=5000,
                                   log_file=exp.log_file,
                                   project=exp.project,
                                   run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # move to qc folder
    fastqc_files = glob.glob(f'{exp.data_folder}*.zip')
    fastqc_files = fastqc_files + glob.glob(f'{exp.data_folder}*.html')
    for f in fastqc_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    exp.tasks_complete.append('FastQC')
    output(f'FastQC complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    return exp
def plot_venn2(Series, overlap_name, folder):
    '''
    Plots a 2 way venn from a Series with overlaps 10, 01, 11.
    Saves to file.
    '''
    folder = make_folder(f'{val_folder(folder)}venn_plot')

    plt.clf()
    plt.figure(figsize=(7, 7))

    font = {'family': 'sans-serif',
            'weight': 'normal',
            'size': 16,
            }

    plt.rc('font', **font)

    # make venn
    venn_plot = venn2(subsets=(Series.iloc[0], Series.iloc[1], Series.iloc[2]),
                      set_labels=[name.replace('_', ' ') for name in Series.index.tolist()])

    patch = ['10', '01', '11']
    for p in patch:
        if venn_plot.get_patch_by_id(p):
            venn_plot.get_patch_by_id(p).set_color('none')
            venn_plot.get_patch_by_id(p).set_alpha(.4)
            venn_plot.get_patch_by_id(p).set_edgecolor('none')

    c = venn2_circles(subsets=(Series.iloc[0], Series.iloc[1], Series.iloc[2]))
    colors = ['green', 'blue']
    for circle, color in zip(c, colors):
        circle.set_edgecolor(color)
        circle.set_alpha(0.8)
        circle.set_linewidth(2)

    plt.title(overlap_name.replace('_', ' ') + " overlaps")
    plt.tight_layout()
    plt.savefig(f'{folder}{overlap_name.replace(" ", "_")}-overlap.svg')
    plt.savefig(f'{folder}{overlap_name.replace(" ", "_")}-overlap.png', dpi=300)
    plt.close()
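
# Illustrative usage sketch (hypothetical counts; the order is left-only,
# right-only, shared, matching matplotlib-venn's (10, 01, 11) convention):
#
#   counts = pd.Series([250, 180, 75], index=['WT_peaks', 'KO_peaks', 'overlap'])
#   plot_venn2(counts, 'WT_v_KO_peaks', 'overlaps/')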
def plot_col(df, title, ylabel, out='', xy=(None, None), xticks=[''],
             plot_type=['violin', 'swarm'], pvalue=False, compare_tags=None,
             log_file=None, run_main=False):
    '''
    One or two column boxplot from dataframe.  Titles x axis based on column names.

    Inputs
    ------
    df: dataframe (uses first two columns)
    title: string of title
    ylabel: string of y label
    xy: if specified, x is the label column and y is the data column
        (default: (None, None); data separated into two columns)
    xticks: list of xtick names (default is none)
    pvalue: bool to perform ttest (default is False).  Will only work if xy=(None, None)
        or there are only two labels in x.
    plot_type: list of one or more of: violin, box, swarm (default: violin, swarm)
    compare_tags: if xy and pvalue are specified and there are more than two tags in x,
        specify the two tags to compare, eg. ['a', 'b']
    out: output parent directory.  Plots are saved in a plots/ subfolder.
    log_file: log_file

    Returns
    ------
    None

    '''
    out = make_folder(f'{val_folder(out)}plots/')

    plt.clf()
    sns.set(context='paper', font='Arial', font_scale=2, style='white',
            rc={'figure.dpi': 300, 'figure.figsize': (5, 6)})

    if type(plot_type) != list:
        plot_type = plot_type.split()
    lower_plot_type = [x.lower() for x in plot_type]

    if len(lower_plot_type) == 0:
        raise IOError('Input a plot type.')
    elif True not in {x in lower_plot_type for x in ['violin', 'box', 'swarm']}:
        raise IOError('Did not recognize plot type.')

    if 'swarm' in lower_plot_type:
        if xy == (None, None):
            fig = sns.swarmplot(data=df, color='black', s=4)
        else:
            fig = sns.swarmplot(data=df, x=xy[0], y=xy[1], color='black', s=4)
    if 'violin' in lower_plot_type:
        if xy == (None, None):
            fig = sns.violinplot(data=df)
        else:
            fig = sns.violinplot(data=df, x=xy[0], y=xy[1])
    if 'box' in lower_plot_type:
        if xy == (None, None):
            fig = sns.boxplot(data=df)
        else:
            fig = sns.boxplot(data=df, x=xy[0], y=xy[1])

    fig.yaxis.set_label_text(ylabel)
    fig.set_title(title)

    if xticks:
        fig.xaxis.set_ticklabels(xticks)

    fig.xaxis.set_label_text('')

    for tick in fig.xaxis.get_ticklabels():
        tick.set_fontsize(12)

    if pvalue:
        if xy == (None, None):
            _, pvalue = stats.ttest_ind(a=df.iloc[:, 0], b=df.iloc[:, 1])
            compare_tags = df.columns
        else:
            _, pvalue = stats.ttest_ind(a=df[df[xy[0]] == compare_tags[0]][xy[1]],
                                        b=df[df[xy[0]] == compare_tags[1]][xy[1]])
        fig.text(s=f'p-value = {pvalue:.03g}, {compare_tags[0]} v {compare_tags[1]}',
                 x=0, y=-.12, transform=fig.axes.transAxes, fontsize=12)

    sns.despine()
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.17, top=0.9)
    plt.savefig(f"{out}{title.replace(' ', '_')}.png", dpi=300)

    if run_main:
        plt.close()

    out_result(f"{out}{title.replace(' ', '_')}.png", f'{title} Plot', run_main=run_main)
    output(f"{title.replace(' ', '_')}.png found in {out}", log_file=log_file, run_main=run_main)
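
# Illustrative usage sketch (hypothetical dataframe with a label column and a
# value column, comparing two conditions with a t-test):
#
#   df = pd.DataFrame({'Condition': ['WT'] * 10 + ['KO'] * 10,
#                      'Signal': np.random.randn(20)})
#   plot_col(df, 'WT v KO Signal', 'log2 signal', xy=('Condition', 'Signal'),
#            plot_type=['violin', 'swarm'], pvalue=True, compare_tags=['WT', 'KO'])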
def deeptools(regions, signals, matrix_name, out_name, pegasus_folder, title='',
              bps=(1500, 1500, 4000), type='center', scaled_names=('TSS', 'TES'),
              make=('matrix', 'heatmap', 'heatmap_group', 'profile', 'profile_group')):
    '''
    Inputs
    ------
    regions: dictionary {'region_name': '/path/to/ssh/bedfile'}
    signals: dictionary {'signal_name': '/path/to/ssh/bigwigfile'}
    matrix_name: string of matrix name or matrix to be named (before .matrix.gz)
    out_name: name for output file
    pegasus_folder: folder where region and signal files are staged
    title: plot title (optional)
    bps: tuple of region width on either side of center or scaled.  center ignores
        the last number.  default is (1500, 1500, 4000)
    type: 'center' or 'scaled'
    scaled_names: optional names for scaled start and end (default ('TSS', 'TES'))
    make: tuple of deeptool commands.  options: matrix, heatmap, heatmap_group,
        profile, profile_group

    Returns
    -------
    list of commands for ssh_job

    '''
    pegasus_folder = make_folder(pegasus_folder)

    make_lower = [x.lower() for x in make]

    if type.lower() == 'center':
        deepMat = 'reference-point --referencePoint center'
        deepHeat = "--refPointLabel 'Peak Center'"
        deepProf = "--refPointLabel 'Peak Center'"
    else:
        deepMat = f'scale-regions --regionBodyLength {str(bps[2])}'
        deepHeat = f'--startLabel {scaled_names[0]} --endLabel {scaled_names[1]}'
        deepProf = f'--startLabel {scaled_names[0]} --endLabel {scaled_names[1]}'

    cmd_list = [submission_prepend()]

    pegasus_region_path = ' '.join([f"{pegasus_folder}{region_path.split('/')[-1]}" for region_path in regions.values()])
    pegasus_signal_path = ' '.join([f"{pegasus_folder}{signal_path.split('/')[-1]}" for signal_path in signals.values()])

    if 'matrix' in make_lower:
        signal_name = ' '.join([signal_name for signal_name in signals.keys()])
        computeMatrix = f"computeMatrix {deepMat} -a {str(bps[0])} -b {str(bps[1])} -p 4 -R {pegasus_region_path} -S {pegasus_signal_path} --samplesLabel {signal_name} -o {matrix_name}.matrix.gz"
        cmd_list.append(computeMatrix)

    if 'heatmap' in make_lower or 'heatmap_group' in make_lower:
        region_name = ' '.join([region_name for region_name in regions.keys()])
        plotHeatmap_base = f"plotHeatmap -m {matrix_name}.matrix.gz --dpi 300 {deepHeat} --regionsLabel {region_name} --plotTitle '{title.replace('_', ' ')}' --whatToShow 'heatmap and colorbar' --colorMap Reds -out {out_name}_heatmap"
        if 'heatmap' in make_lower:
            cmd_list.append(f"{plotHeatmap_base}.png")
        if 'heatmap_group' in make_lower:
            cmd_list.append(f"{plotHeatmap_base}_perGroup.png --perGroup")

    if 'profile' in make_lower or 'profile_group' in make_lower:
        region_name = ' '.join([region_name for region_name in regions.keys()])
        plotProfile_base = f"plotProfile -m {matrix_name}.matrix.gz --dpi 300 {deepProf} --plotTitle '{title.replace('_', ' ')}' --regionsLabel {region_name} -out {out_name}_profile"
        if 'profile' in make_lower:
            cmd_list.append(f"{plotProfile_base}.png")
        if 'profile_group' in make_lower:
            cmd_list.append(f"{plotProfile_base}_perGroup.png --perGroup")

    return cmd_list
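
# Illustrative usage sketch (hypothetical file paths; the returned command list
# would then be handed to the cluster, e.g. via send_job, as elsewhere in this
# module):
#
#   cmds = deeptools(regions={'WT_peaks': '/data/WT_peaks.bed'},
#                    signals={'WT': '/data/WT.bw', 'KO': '/data/KO.bw'},
#                    matrix_name='WT_peaks', out_name='WT_peaks',
#                    pegasus_folder='staging/', title='WT_peak_signal',
#                    type='center', make=('matrix', 'heatmap'))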
def stage(exp):
    '''
    Stages files in scratch folder
    '''
    output(f'Staging in {exp.scratch}\n', log_file=exp.log_file, run_main=exp.run_main)

    exp.data_folder = make_folder(f'{exp.scratch}raw_data/')

    Scratch_File1 = []
    Scratch_File2 = []

    # join multiple files
    for sample in exp.sample_df.Sample_Name.tolist():
        index = exp.sample_df['Sample_Name'] == sample
        paired = exp.sample_df.loc[index, 'paired'].values[0]

        R1_list = ','.join(exp.sample_df.loc[index, 'File1']).split(',')
        R2_list = ','.join(exp.sample_df.loc[index, 'File2']).split(',')

        main_file = R1_list[0]

        # convert from bz2 to fastq
        if main_file.endswith('.txt.bz2'):
            for file in R1_list + R2_list:
                newfile = file.replace('.txt.bz2', '.fastq.gz')
                os.system(f'bunzip2 -c < {file} | gzip -c > {newfile}')

            R1_list = [txt_replace(x) for x in R1_list]
            R2_list = [txt_replace(x) for x in R2_list]
            main_file = txt_replace(main_file)

        if main_file.endswith('.fastq.gz'):
            fileend = '_R1.fastq.gz' if paired else '.fastq.gz'
            filename = f'{exp.data_folder}{sample}{fileend}'
            os.system(f'cat {" ".join(R1_list)} > {filename}')
            Scratch_File1.append(filename)

            if paired:
                fileend = '_R2.fastq.gz'
                filename = f'{exp.data_folder}{sample}{fileend}'
                os.system(f'cat {" ".join(R2_list)} > {filename}')
                Scratch_File2.append(filename)
            else:
                Scratch_File2.append('none')

        elif main_file.endswith('.bam'):
            filename = f'{exp.data_folder}{sample}.bam'
            copy2(main_file, filename)
            Scratch_File1.append(filename)
            Scratch_File2.append('none')

        else:
            raise IOError('Filetype not recognized.')

    exp.sample_df['Scratch_File1'] = Scratch_File1
    exp.sample_df['Scratch_File2'] = Scratch_File2
    exp.sample_df.replace([f'{exp.data_folder}none'], 'none', inplace=True)

    exp.tasks_complete.append('Stage')
    output(f'Staging complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    return exp
def spike(exp):
    '''
    Aligns reads that did not map to the target genome against the drosophila
    genome (spike-in chromatin).
    '''
    import pandas as pd

    if len(exp.spike_samples) == 0:
        output('Not processing Spike-ins', log_file=exp.log_file, run_main=exp.run_main)
        exp.tasks_complete.append('Spike')
        return exp

    # Make spike folder
    spike_folder = make_folder(f'{exp.scratch}spike/')

    output('Processing samples with drosophila spike-in chromatin.',
           log_file=exp.log_file, run_main=exp.run_main)

    for sample in exp.spike_samples:
        bam = exp.sample_files[sample]['bam']

        spike_command = [submission_prepend(),
                         f'samtools view -b -f 4 {bam} | samtools sort -n - | samtools fastq - > {spike_folder}{sample}.bwa_unaligned.fastq',
                         f'bowtie2 -p 8 -x {exp.genome_indicies["spike_index"]} -U {spike_folder}{sample}.bwa_unaligned.fastq -S {spike_folder}{sample}.BDGP6.sam --very-sensitive-local -k 1 --no-unal',
                         f'samtools view -b -F 4 {spike_folder}{sample}.BDGP6.sam | samtools sort - > {spike_folder}{sample}.BDGP6.bam',
                         f'picard MarkDuplicates I={spike_folder}{sample}.BDGP6.bam O={spike_folder}{sample}.BDGP6.nodup.bam M={spike_folder}{sample}.BDGP6.nodups.markdups.qc ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true',
                         f'samtools flagstat {spike_folder}{sample}.BDGP6.nodup.bam > {spike_folder}{sample}.unique_drosophila.flagstat.qc',
                         f'rm {spike_folder}{sample}.BDGP6.sam {spike_folder}{sample}.BDGP6.nodup.bam {spike_folder}{sample}*.fastq'
                         ]

        exp.job_id.append(send_job(command_list=spike_command,
                                   job_name=f'{sample}_spike',
                                   job_log_folder=exp.job_folder,
                                   q='general',
                                   mem=10000,
                                   log_file=exp.log_file,
                                   project=exp.project,
                                   cores=2,
                                   run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file, exp.run_main)

    output('Spike-in alignment jobs finished.', log_file=exp.log_file, run_main=exp.run_main)

    # Generate one dataframe for all spike counts
    spike_reads = pd.DataFrame(index=['spike_reads', 'genome_reads'])

    for sample in exp.spike_samples:
        qc_file = f'{spike_folder}{sample}.unique_drosophila.flagstat.qc'
        exp.sample_files[sample]['drosophila'] = qc_file

        with open(qc_file, 'r') as fp:
            spike_number = fp.read().split(' ')[0]

        with open(exp.sample_files[sample]['nodup_flagstat']) as fp:
            target_number = fp.read().split(' ')[0]

        spike_reads[sample] = [spike_number, target_number]

    exp.spike_reads = spike_reads.T

    condition_dict = pd.Series(exp.sample_df.Condition.values,
                               index=exp.sample_df.Sample_Name).to_dict()

    exp.spike_reads['Replicate'] = [x.split('_')[-1] for x in exp.spike_reads.index.tolist()]
    exp.spike_reads['Condition'] = [condition_dict[x] for x in exp.spike_reads.index.tolist()]

    for name, spike_conditions in exp.spike_comparisons.items():
        out_dir = make_folder(f'{exp.scratch}spike/{name}/')
        plot = spike_in_plot(exp.spike_reads, spike_conditions, name, out_dir)
        out_result(plot, f'{name.replace("_", " ")} Spike-In Comparison', run_main=exp.run_main)
        output(f'Spike-in comparison {name.replace("_", " ")} can be found here: {plot.replace(exp.scratch, "")}',
               log_file=exp.log_file, run_main=exp.run_main)

    output(f'Spike-in counts:\n {spike_reads.T}', log_file=exp.log_file, run_main=exp.run_main)

    output(f'Spike-in processing complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    exp.tasks_complete.append('Spike')

    return exp
def annotate_peaks(dict_of_dfs, folder, genome, log_file, db='UCSC', check=False, run_main=False):
    '''
    Annotate a dictionary of dataframes from bed files to the genome using ChIPseeker
    and Ensembl annotations.

    Inputs
    ------
    dict_of_dfs: dictionary of dataframes from bed files
    folder: output folder
    genome: hg38, hg19, mm10
    log_file: log file
    db: default UCSC, but can also accept Ensembl
    check: bool.  checks whether annotation file already exists

    Returns
    -------
    dictionary of annotated bed files as dataframe

    '''
    pandas2ri.activate()

    ri.set_writeconsole_regular(rout_write)
    ri.set_writeconsole_warnerror(rout_write)

    folder = make_folder(folder)

    chipseeker = importr('ChIPseeker')
    genomicFeatures = importr('GenomicFeatures')
    makeGR = ro.r("makeGRangesFromDataFrame")

    check_df = {key: os.path.isfile(f'{folder}{key.replace(" ", "_")}_annotated.xlsx') for key in dict_of_dfs.keys()}
    return_bool = False not in set(check_df.values())

    if return_bool & check:
        return {f'{key}_annotated': pd.read_excel(f'{folder}{key.replace(" ", "_")}_annotated.xlsx') for key in dict_of_dfs.keys()}

    if db.lower() == 'ucsc':
        species = ('Mmusculus' if genome.lower() == 'mm10' else 'Hsapiens')
        TxDb = importr(f'TxDb.{species}.UCSC.{genome.lower()}.knownGene')
        txdb = ro.r(f'txdb <- TxDb.{species}.UCSC.{genome.lower()}.knownGene')
    elif db.lower() == 'ensembl':
        pwd = 'todo'
        loadDb = ro.r('loadDb')
        txdb = loadDb(pwd.format(genome.lower()))
    else:
        raise ValueError('UCSC or Ensembl only.')

    if genome.lower() == 'mm10':
        annoDb = importr('org.Mm.eg.db')
        anno = 'org.Mm.eg.db'
    elif genome.lower() == 'hg38' or genome.lower() == 'hg19':
        annoDb = importr('org.Hs.eg.db')
        anno = 'org.Hs.eg.db'

    return_dict = {}

    output('Annotating Peaks...', log_file=log_file, run_main=run_main)

    for key, df in dict_of_dfs.items():
        if check & check_df[key]:
            return_dict[f'{key}_annotated'] = pd.read_excel(f'{folder}{key.replace(" ", "_")}_annotated.xlsx')
        else:
            col_len = len(df.columns)
            df.columns = ["chr", "start", "end"] + list(range(col_len - 3))
            GR = makeGR(df)
            GR_anno = chipseeker.annotatePeak(GR, overlap='all', TxDb=txdb, annoDb=anno)
            return_dict[f'{key}_annotated'] = ro.pandas2ri.ri2py(chipseeker.as_data_frame_csAnno(GR_anno))
            return_dict[f'{key}_annotated'].to_excel(f'{folder}{key.replace(" ", "_")}_annotated.xlsx', index=None)

    return return_dict
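
# Illustrative usage sketch (hypothetical bed-derived dataframe; requires the
# rpy2/ChIPseeker environment this module assumes):
#
#   peaks = read_pd('/data/WT_peaks.bed')  # columns: chr, start, end, ...
#   annotated = annotate_peaks({'WT_peaks': peaks}, 'annotation/', 'hg38',
#                              log_file=None, db='UCSC', check=False)
#   genes = annotated['WT_peaks_annotated'].SYMBOL.unique().tolist()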
def overlaps(exp):
    '''
    Performs overlaps of two or three peak lists per comparison.
    '''
    out_dir = make_folder(f'{exp.scratch}/Overlaps/')

    for comparison, overlap_list in exp.overlaps.items():
        comp_dir = make_folder(f'{out_dir}{comparison}_Overlap/')

        peakset = 'overlap_peak' if 'none' in [exp.sample_files[condition]['idr_optimal_peak'] for condition in overlap_list] else 'idr_optimal_peak'

        if (peakset == 'overlap_peak') and ('none' in [exp.sample_files[condition]['overlap_peak'] for condition in overlap_list]):
            output(f'ENCODE processing did not finish for at least one of the samples in the comparison {comparison}. Skipping overlap...',
                   log_file=exp.log_file, run_main=exp.run_main)
            with open(f'{comp_dir}SKIPPING_OVERLAP.txt', 'w') as file:
                file.write('Cannot find the peaks for at least one sample. Skipping overlap...')
            continue

        for condition in overlap_list:
            exp.sample_files[condition]['peaktype'] = peakset

        bed_dict = {condition: load_bedtool(exp.sample_files[condition][peakset]) for condition in overlap_list}

        genome_list = exp.IPs.loc[exp.IPs['Condition'].isin(overlap_list), 'Genome'].unique().tolist()
        if len(genome_list) > 1:
            output(f'Cannot overlap peaks from different genomes for {comparison}.',
                   log_file=exp.log_file, run_main=exp.run_main)
            with open(f'{comp_dir}SKIPPING_OVERLAP.txt', 'w') as file:
                file.write('Cannot overlap peaks from different genomes for this comparison.')
            continue
        else:
            genome = genome_list[0]

        if len(overlap_list) == 2:
            exp.overlap_results[comparison] = overlap_two(bed_dict, comparison, comp_dir, exp.log_file,
                                                          genome=genome, run_main=exp.run_main)
        elif len(overlap_list) == 3:
            exp.overlap_results[comparison] = overlap_three(bed_dict, comparison, comp_dir, exp.log_file,
                                                            genome=genome, run_main=exp.run_main)
        else:
            output(f'Cannot overlap more than three samples for {comparison}.',
                   log_file=exp.log_file, run_main=exp.run_main)
            with open(f'{comp_dir}SKIPPING_OVERLAP.txt', 'w') as file:
                file.write('Cannot overlap more than three samples.')
            continue

    output(f'Overlap analysis complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    return exp
def overlap_two(bed_dict, overlap_name, out_folder, log_file, genome=None, run_main=False):
    '''
    Takes a dictionary of two bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps.
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Inputs
    ------
    bed_dict: dictionary of BedTool files
    genome: 'hg38', 'hg19', 'mm10'

    Returns
    -------
    Returns a dictionary of dataframes from unique and overlap peaks.
    If genome is specified, includes a dictionary of annotated peaks.

    '''
    names = list(bed_dict.keys())

    out_folder = make_folder(out_folder)

    output(f'Output files for {overlap_name} are found in {out_folder}',
           log_file=log_file, run_main=run_main)

    masterfile = bed_dict[names[0]].cat(bed_dict[names[1]]).sort().merge()
    sorted_dict = {key: bed.sort().merge() for key, bed in bed_dict.items()}
    overlap_dict = {'overlap': masterfile.intersect(sorted_dict[names[0]]).intersect(sorted_dict[names[1]])}

    for key, bed in sorted_dict.items():
        other = {other_key: other_bed for other_key, other_bed in sorted_dict.items() if other_key != key}
        overlap_dict[f'{key}_unique_peak'] = masterfile.intersect(sorted_dict[key]).intersect(list(other.values())[0], v=True)

    for key, bed in overlap_dict.items():
        if len(bed) == 0:
            # Can't convert an empty bed file to a dataframe
            open(f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed', 'w').close()
        else:
            bed2df(bed).to_csv(f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed',
                               header=None, index=None, sep="\t")

    overlap_numbers = pd.Series({names[0]: len(overlap_dict[f'{names[0]}_unique_peak']),
                                 names[1]: len(overlap_dict[f'{names[1]}_unique_peak']),
                                 'overlap': len(overlap_dict['overlap'])
                                 },
                                index=[names[0], names[1], 'overlap'])

    # Venn
    plot_venn2(overlap_numbers, overlap_name.replace('_', ' '), out_folder)
    out_result(f'{out_folder}venn_plot/{overlap_name.replace(" ", "_")}-overlap.png',
               f"{overlap_name.replace('_', ' ')} Peak Venn Overlap",
               run_main=run_main)

    if bool(genome):
        # Annotate with ChIPseeker
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks({unikey.format(key): bed2df(bed) for key, bed in overlap_dict.items() if len(bed) > 0},
                                     out_folder, genome=genome, log_file=log_file, run_main=run_main)
        for key, bed in overlap_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        Set1_unique = set() if return_dict[unianno.format(f'{names[0]}_unique_peak')] is None else set(return_dict[unianno.format(f'{names[0]}_unique_peak')].SYMBOL.unique().tolist())
        Set2_unique = set() if return_dict[unianno.format(f'{names[1]}_unique_peak')] is None else set(return_dict[unianno.format(f'{names[1]}_unique_peak')].SYMBOL.unique().tolist())
        Overlap_Set = set() if return_dict[unianno.format('overlap')] is None else set(return_dict[unianno.format('overlap')].SYMBOL.unique().tolist())

        venn2_dict = {names[0]: (Set1_unique | Overlap_Set),
                      names[1]: (Set2_unique | Overlap_Set)
                      }

        plot_name = f'{overlap_name.replace("_", " ")} Annotated Gene'
        plot_venn2_set(venn2_dict, plot_name, out_folder)
        out_result(f'{out_folder}venn_plot/{plot_name.replace(" ", "_")}-overlap.png',
                   f"{overlap_name.replace('_', ' ')} Venn Annotated Gene Overlap",
                   run_main=run_main)

        gene_overlaps = {}
        gene_overlaps[f'{names[0]}_unique_genes'] = Set1_unique - (Set2_unique | Overlap_Set)
        gene_overlaps[f'{names[1]}_unique_genes'] = Set2_unique - (Set1_unique | Overlap_Set)
        gene_overlaps['Overlap_Gene_Set'] = (Set1_unique & Set2_unique) | Overlap_Set

        # Keep the annotated peaks alongside the raw peak dataframes and gene sets.
        return_dict = {**return_dict,
                       **{key: bed2df(bed) for key, bed in overlap_dict.items()}}

        for key, item in gene_overlaps.items():
            return_dict[key] = item

    else:
        return_dict = {key: bed2df(bed) for key, bed in overlap_dict.items()}

    return return_dict
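
# Illustrative usage sketch (hypothetical peak files, loaded with the module's
# load_bedtool helper):
#
#   beds = {'WT': load_bedtool('/data/WT_peaks.bed'),
#           'KO': load_bedtool('/data/KO_peaks.bed')}
#   results = overlap_two(beds, 'WT_v_KO', 'overlaps/WT_v_KO/',
#                         log_file=None, genome='hg38')
#   results['overlap']  # dataframe of shared merged peaks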
def encode3(exp):

    if 'Stage' not in exp.tasks_complete:
        output('Files not staged.\n', log_file=exp.log_file, run_main=exp.run_main)
        exp = stage(exp)

    output('Running alignment and peak calling using ENCODE3 standards.',
           log_file=exp.log_file, run_main=exp.run_main)
    output('ENCODE3 cromwell pipeline.', log_file=exp.log_file, run_main=exp.run_main)

    out_dir = make_folder(f'{exp.scratch}ENCODE3/')

    IPs = exp.IPs

    end_types = {'q.gz': 'fastq', '.bam': 'bam'}

    for experiment in IPs.Condition.unique().tolist():
        exp_dir = make_folder(f'{out_dir}{experiment}/')

        IP_sample_indicies = [(rep, index) for rep, index in enumerate(IPs[IPs.Condition == experiment].index.tolist(), start=1)]

        if len(IP_sample_indicies) > 6:
            raise IOError('Pipeline cannot handle more than 6 replicates.')

        seq_type = 'none' not in IPs[IPs.Condition == experiment]['File2'].tolist()

        aligner = IPs[IPs.Condition == experiment]['Aligner'].unique().tolist()
        if len(aligner) != 1:
            raise IOError('All replicates must be aligned using the same aligner or not, which must be specified.')
        else:
            aligner = aligner[0]

        peak_caller = IPs[IPs.Condition == experiment]['Peak Caller'].unique().tolist()
        if len(peak_caller) != 1:
            raise IOError('All replicate peaks must be called or not using the same peak calling strategy.')
        else:
            peak_caller = peak_caller[0]

        UMI_list = [x.lower() for x in IPs[IPs.Condition == experiment]['UMI'].unique().tolist()]
        if len(set(UMI_list)) > 1:
            raise IOError('All samples must be UMI processed or not for each condition.')
        UMI = UMI_list[0] == 'yes'

        try:
            file_type = end_types[exp.sample_df[exp.sample_df.Condition == experiment]['Scratch_File1'].tolist()[0][-4:]]
        except KeyError:
            output(f"{exp.sample_df[exp.sample_df.Condition == experiment]['Scratch_File1'].tolist()[0]} not a valid file type for this pipeline.",
                   log_file=exp.log_file, run_main=exp.run_main)
            raise

        if UMI and ('UMI' in exp.tasks_complete):
            file_type = 'bam'

        genome = IPs[IPs.Condition == experiment]['Genome'].unique().tolist()
        if len(genome) > 1:
            raise IOError('Cannot align to more than one genome per condition.')

        chip_type = IPs[IPs.Condition == experiment]['ChIP Type'].unique().tolist()
        if len(chip_type) > 1:
            raise IOError('Cannot have more than one chip type (histone or TF) for a condition.')
        chip_type = 'histone' if chip_type[0].lower() == 'histone' else 'tf'

        json_file = {'chip.pipeline_type': chip_type,
                     'chip.paired_end': seq_type,
                     'chip.genome_tsv': exp.genome_indicies['encode_tsv'][genome[0]],
                     'chip.align_mem_mb': 30000,
                     'chip.true_rep_only': False,
                     'chip.dup_marker': 'picard',
                     'chip.mapq_thresh': 30,
                     'chip.filter_chrs': ['chrM'],
                     'chip.subsample_reads': 0,
                     'chip.ctl_subsample_reads': 0,
                     'chip.xcor_subsample_reads': 15000000,
                     'chip.always_use_pooled_ctl': False,
                     'chip.ctl_depth_ratio': 1.2,
                     'chip.cap_num_peak_macs2': 500000,
                     'chip.pval_thresh': 0.01,
                     'chip.idr_thresh': 0.05,
                     'chip.align_cpu': 4,
                     'chip.align_time_hr': 48,
                     'chip.filter_cpu': 2,
                     'chip.filter_mem_mb': 20000,
                     'chip.filter_time_hr': 24,
                     'chip.bam2ta_cpu': 2,
                     'chip.bam2ta_mem_mb': 10000,
                     'chip.bam2ta_time_hr': 6,
                     'chip.jsd_cpu': 2,
                     'chip.jsd_mem_mb': 12000,
                     'chip.jsd_time_hr': 6,
                     'chip.xcor_cpu': 2,
                     'chip.xcor_mem_mb': 16000,
                     'chip.xcor_time_hr': 24,
                     'chip.spr_mem_mb': 16000,
                     'chip.enable_count_signal_track': True,
                     }

        if peak_caller == 'macs2':
            json_file['chip.peak_caller'] = 'macs2'

        if aligner != 'none':
            json_file['chip.aligner'] = aligner

        bams = []
        ctl_bams = []

        for rep, index in IP_sample_indicies:
            sample = exp.sample_df.loc[index, 'Sample_Name']
            input_sample = IPs.loc[index, 'Background_Name']

            if file_type == 'fastq':
                json_file[f'chip.fastqs_rep{rep}_R1'] = [f'{exp.data_folder}{sample}_trim_R1.fastq.gz']
                json_file[f'chip.ctl_fastqs_rep{rep}_R1'] = [f'{exp.data_folder}{input_sample}_trim_R1.fastq.gz']
                if seq_type:
                    json_file[f'chip.fastqs_rep{rep}_R2'] = [f'{exp.data_folder}{sample}_trim_R2.fastq.gz']
                    json_file[f'chip.ctl_fastqs_rep{rep}_R2'] = [f'{exp.data_folder}{input_sample}_trim_R2.fastq.gz']
            else:
                bams.append(f'{exp.data_folder}{sample}.bam')
                ctl_bams.append(f'{exp.data_folder}{input_sample}.bam')

        if file_type == 'bam':
            json_file['chip.bams'] = bams
            json_file['chip.ctl_bams'] = ctl_bams

        json_file['chip.align_only'] = (UMI and file_type == 'fastq') or (peak_caller == 'none')
        json_file['chip.no_dup_removal'] = UMI
        json_file['chip.title'] = f'{experiment}_postUMI_dedup' if UMI and (file_type == 'bam') else experiment
        json_file['chip.description'] = f"Cromwell ENCODE3 {experiment}: {'paired-end' if seq_type else 'single-end'} {chip_type}."

        encode_file = f'{exp_dir}{experiment}_ENCODE3.json'
        with open(encode_file, 'w') as file:
            json.dump(json_file, file, indent=4, sort_keys=True)

        # Locate the cromwell jar relative to the active miniconda install,
        # falling back to the default location.
        pythonpath = shutil.which('python')
        miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x]
        cromwell_jar = re.sub(r'{}/.*'.format(miniconda[0]),
                              '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(miniconda[0]),
                              pythonpath) if miniconda else ''
        jar = cromwell_jar if os.path.isfile(cromwell_jar) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

        command_list = [submission_prepend(source='encode-chip-seq-pipeline'),
                        f'cd {exp_dir}',
                        f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {encode_file}'
                        ]

        sent_job = send_job(command_list=command_list,
                            job_name=f'{experiment}_ENCODE3',
                            job_log_folder=exp.job_folder,
                            q='bigmem',
                            mem=35000,
                            log_file=exp.log_file,
                            project=exp.project,
                            cores=1,
                            run_main=exp.run_main)

        exp.job_id.append(sent_job)
        job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # Check fragment length and resubmit with a fixed 200 bp fraglen for macs2 if xcor failed
    for experiment in exp.IPs.Condition.unique().tolist():
        rep_number = len(exp.IPs[exp.IPs.Condition == experiment])

        frag_list = []
        for rep in range(rep_number):
            file = glob_check(f'{exp.scratch}ENCODE3/{experiment}/cromwell-executions/chip/*/call-xcor/shard-{rep}/execution/*fraglen.txt')
            with open(file, 'r') as f:
                frag_list.append(f.read().split()[0])

        if '-' in [x[0] for x in frag_list]:
            output(f'Xcor failed for {experiment}. Resubmitting with fragment length set to 200 for failed sample/s',
                   log_file=exp.log_file, run_main=exp.run_main)

            frag_list = [x if x[0] != '-' else '200' for x in frag_list]

            exp_dir = f'{exp.scratch}ENCODE3/{experiment}/'
            encode_file = f'{exp_dir}{experiment}_ENCODE3.json'

            with open(encode_file, 'r') as file:
                json_file = json.load(file)

            json_file['chip.fraglen'] = frag_list

            resubmit_file = f'{exp_dir}{experiment}_ENCODE3_setfraglength.json'
            with open(resubmit_file, 'w') as file:
                json.dump(json_file, file, indent=4, sort_keys=True)

            pythonpath = shutil.which('python')
            miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x]
            cromwell_jar = re.sub(r'{}/.*'.format(miniconda[0]),
                                  '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(miniconda[0]),
                                  pythonpath) if miniconda else ''
            jar = cromwell_jar if os.path.isfile(cromwell_jar) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

            command_list = [submission_prepend(source='encode-chip-seq-pipeline'),
                            f'cd {exp_dir}',
                            f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {resubmit_file}'
                            ]

            sent_job = send_job(command_list=command_list,
                                job_name=f'{experiment}_ENCODE3_resubmission',
                                job_log_folder=exp.job_folder,
                                q='bigmem',
                                mem=35000,
                                log_file=exp.log_file,
                                project=exp.project,
                                cores=1,
                                run_main=exp.run_main)

            exp.job_id.append(sent_job)
            job_pending(sent_job, exp.log_file)

    # Wait for resubmitted jobs to finish
    job_wait(exp.job_id, exp.log_file)

    exp = encode_results(exp)

    exp.tasks_complete.append('ENCODE3')

    return exp
def overlap_three(bed_dict, overlap_name, out_folder, log_file, genome=None, run_main=False):
    '''
    Takes a dictionary of three bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps.
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Inputs
    ------
    bed_dict: dictionary of BedTool files
    genome: 'hg38', 'hg19', 'mm10'

    Returns
    -------
    Returns a dictionary of dataframes from unique and overlap peaks.
    If genome is specified, includes a dictionary of annotated peaks.

    '''
    from collections import OrderedDict

    names = list(bed_dict.keys())

    out = make_folder(out_folder)

    output(f'Output files are found in {out}', log_file=log_file, run_main=run_main)
    output(f'A: {names[0]}, B: {names[1]}, C: {names[2]}', log_file=log_file, run_main=run_main)

    with open(f'{out}README.txt', 'w') as file:
        file.write('All peaks are unique, meaning that each peak is in only one group.\n')
        file.write('Capital letter means this sample peak is included in the overlap.\n')
        file.write('Lowercase letter means the sample is excluded in the overlap.\n\n')
        file.write(f'A: {names[0]}\nB: {names[1]}\nC: {names[2]}')

    master = bed_dict[names[0]].cat(bed_dict[names[1]]).cat(bed_dict[names[2]]).sort().merge()

    A = bed_dict[names[0]].sort().merge()
    B = bed_dict[names[1]].sort().merge()
    C = bed_dict[names[2]].sort().merge()

    sorted_dict = OrderedDict({'master': master, 'A': A, 'B': B, 'C': C})
    sorted_dict['Abc'] = master.intersect(A).intersect(B, v=True).intersect(C, v=True)
    sorted_dict['aBc'] = master.intersect(B).intersect(A, v=True).intersect(C, v=True)
    sorted_dict['ABc'] = master.intersect(A).intersect(B).intersect(C, v=True)
    sorted_dict['abC'] = master.intersect(C).intersect(A, v=True).intersect(B, v=True)
    sorted_dict['AbC'] = master.intersect(A).intersect(C).intersect(B, v=True)
    sorted_dict['aBC'] = master.intersect(B).intersect(C).intersect(A, v=True)
    sorted_dict['ABC'] = master.intersect(A).intersect(B).intersect(C)

    labTup = tuple(key for key in sorted_dict.keys())
    lenTup = tuple(len(bed) for bed in sorted_dict.values())

    output(f'{labTup}\n{lenTup}', log_file=log_file, run_main=run_main)

    plot_venn3_counts(lenTup[4:], names, f'{overlap_name} Peak', out)
    out_result(f'{out}venn_plot/{overlap_name}_Peak-overlap.png',
               f'{overlap_name} Peak Venn Overlap',
               run_main=run_main)

    for key, bed in sorted_dict.items():
        if len(bed) == 0:
            # Can't convert an empty bed file to a dataframe
            open(f'{out}{key.replace(" ", "_")}-peaks-from-mergedPeaks.bed', 'w').close()
        else:
            bed2df(bed).to_csv(f"{out}{key.replace(' ', '_')}-peaks-from-mergedPeaks.bed",
                               header=None, index=None, sep="\t")

    if bool(genome):
        output('Annotating overlapped peaks...', log_file=log_file, run_main=run_main)

        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks({unikey.format(key): bed2df(bed) for key, bed in sorted_dict.items() if len(bed) > 0},
                                     out, genome=genome, log_file=log_file, run_main=run_main)
        for key, bed in sorted_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        Set1 = set() if return_dict[unianno.format('A')] is None else set(return_dict[unianno.format('A')].SYMBOL.unique().tolist())
        Set2 = set() if return_dict[unianno.format('B')] is None else set(return_dict[unianno.format('B')].SYMBOL.unique().tolist())
        Set3 = set() if return_dict[unianno.format('C')] is None else set(return_dict[unianno.format('C')].SYMBOL.unique().tolist())

        plot_venn3_set({names[0]: Set1, names[1]: Set2, names[2]: Set3},
                       f'{overlap_name}_annotated_genes', out)
        out_result(f'{out}venn_plot/{overlap_name}_annotated_genes-overlap.png',
                   f"{overlap_name.replace('_', ' ')} Gene Venn Overlap",
                   run_main=run_main)

    return_sorted_dict = {key: bed2df(bed) for key, bed in sorted_dict.items()}

    return return_sorted_dict if genome is None else {**return_sorted_dict, **return_dict}
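
# Illustrative usage sketch (hypothetical peak files; the dictionary keys
# become the A/B/C labels in the README and venn diagram):
#
#   beds = {'WT': load_bedtool('/data/WT_peaks.bed'),
#           'KO': load_bedtool('/data/KO_peaks.bed'),
#           'Rescue': load_bedtool('/data/Rescue_peaks.bed')}
#   results = overlap_three(beds, 'WT_KO_Rescue', 'overlaps/three_way/',
#                           log_file=None, genome='hg38')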
def parse_config(config_file, run_main=False):
    '''
    Parse experimental info from yaml file
    '''
    with open(config_file, 'r') as file:
        yml = yaml.safe_load(file)

    # Make a new experimental object
    exp = Experiment()

    # Project
    exp.project = yml['LSF_Project']

    # Check if running as pipeline
    exp.run_main = run_main

    # Setting Scratch folder
    exp.scratch = f'{os.getcwd()}/{yml["Name"]}_tmp/' if yml["Scratch_folder"] is None else f'{val_folder(yml["Scratch_folder"])}{yml["Name"]}/'
    os.makedirs(exp.scratch, exist_ok=True)

    # check whether experiment has been attempted
    exp.name = yml['Name']
    exp.out_dir = make_folder(f"{val_folder(yml['Output_directory'])}{exp.name}/")
    filename = f'{exp.scratch}{exp.name}_incomplete.pkl'

    if os.path.isfile(filename):
        if yml['Restart'] is False:
            with open(filename, 'rb') as experiment:
                exp = pickle.load(experiment)
            os.remove(filename)

            # set new date
            exp.date = f'{datetime.now():%Y-%m-%d}'

            # For output of R logs into job_log_folder
            os.chdir(exp.job_folder)

            output(f'\n#############\nRestarting pipeline on {datetime.now():%Y-%m-%d %H:%M:%S}, from last completed step.',
                   log_file=exp.log_file, run_main=exp.run_main)

            return exp
        else:
            os.remove(filename)

    # Passing parameters to new object
    exp.date = f'{datetime.now():%Y-%m-%d}'

    # Log file
    exp.log_file = f'{exp.out_dir}{exp.name}-{exp.date}.log'

    output(f'Pipeline version {version()} run on {exp.date} \n', log_file=exp.log_file, run_main=run_main)
    output(f'Beginning ChIPseq Analysis: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)
    output('Reading experimental file...\n', log_file=exp.log_file, run_main=run_main)
    output(f'Pipeline output folder: {exp.out_dir}\n', log_file=exp.log_file, run_main=run_main)

    # Setting Job Folder
    exp.job_folder = f'{val_folder(exp.scratch)}logs/'
    os.makedirs(exp.job_folder, exist_ok=True)

    # Load sample info
    exp.sample_df = read_pd(yml['Sample_file'])

    # Make Sample Name
    exp.sample_df.replace([np.nan], 'none', inplace=True)
    exp.sample_df['Sample_Name'] = exp.sample_df.Condition + '_' + exp.sample_df.Replicate
    output(f'Processing samples:\n{exp.sample_df}', log_file=exp.log_file, run_main=run_main)

    # Paired
    exp.sample_df['paired'] = [x != 'none' for x in exp.sample_df.File2.tolist()]

    exp.IPs = exp.sample_df[exp.sample_df['Background Sample'] != 'none'].copy()
    sample_dict = exp.sample_df.Sample_Name.to_dict()
    exp.IPs['Background_Name'] = exp.IPs['Background Sample'].map(sample_dict)
    exp.samples = exp.IPs.Sample_Name.tolist()

    # Convert Comparisons to a column of lists, then make unique comparisons
    exp.IPs['Comparisons'] = exp.IPs.Comparisons.apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Comparison_names'] = exp.IPs[['Condition', 'Comparisons']].apply(lambda x: ['_v_'.join(sorted([x[0], y])) for y in x[1] if x[1][0] != 'none'], axis=1)

    comparisons = []
    for comparison in exp.IPs.Comparison_names.tolist():
        comparisons += comparison
    exp.overlaps = {o_name: o_name.split('_v_') for o_name in set(comparisons)}

    # Spike-in comparisons
    exp.IPs['Spike-in Comparisons'] = exp.IPs['Spike-in Comparisons'].apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Spike_names'] = exp.IPs[['Condition', 'Spike-in Comparisons']].apply(lambda x: ['_v_'.join(sorted([x[0], y])) for y in x[1] if x[1][0] != 'none'], axis=1)

    sp_comparisons = [comparison for subls in exp.IPs.Spike_names.tolist() for comparison in subls]
    exp.spike_comparisons = {s_name: s_name.split('_v_') for s_name in set(sp_comparisons)}

    spike_samples = [condition for subls in exp.spike_comparisons.values() for condition in subls]
    exp.spike_samples = exp.IPs[exp.IPs.Condition.isin(spike_samples)].Sample_Name.tolist()

    # Make out directory if it doesn't exist
    exp.out_dir = make_folder(f'{val_folder(yml["Output_directory"])}{exp.name}/')

    # Lab specific files
    exp.genome_indicies['spike_index'] = yml['Spike_index']

    # Locating genome indicies
    tsvs = yml['Genome_tsv'].split(',')
    genomes = ['hg38', 'hg19', 'mm10']
    for tsv in tsvs:
        glob_check(tsv)
        exp.genome_indicies['encode_tsv'] = {**exp.genome_indicies['encode_tsv'],
                                             **{genome: tsv for genome in genomes if genome in tsv}
                                             }

    exp.encode3_folder = val_folder(yml['ENCODE3_folder'])

    # Mark the experiment file as parsed
    exp._parsed = True

    output(f'Experiment file parsed: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)

    return exp
def UMI(exp):

    IPs = exp.IPs

    # Only conditions flagged as UMI-processed are deduplicated.
    umi_conditions = [condition for condition in IPs.Condition.unique().tolist()
                      if 'yes' in IPs[IPs.Condition == condition]['UMI'].tolist()]

    if len(umi_conditions) == 0:
        return exp

    out_dir = make_folder(f'{exp.scratch}UMI/')
    output('Deduplicating bam files using UMIs with UMI-tools.',
           log_file=exp.log_file, run_main=exp.run_main)

    for experiment in umi_conditions:
        for index in IPs[IPs.Condition == experiment].index.tolist():
            sample = IPs.loc[index, 'Sample_Name']
            input_sample = IPs.loc[index, 'Background_Name']

            bam = exp.sample_files[sample]['bam']
            input_bam = exp.sample_files[input_sample]['bam']

            nodup_bam = f'{out_dir}{sample}.UMI.dedup.bam'
            nodup_input = f'{out_dir}{input_sample}.UMI.dedup.bam'

            # Plain (non-f) string: placeholders are filled per sample with .format() below.
            umi_string = 'umi_tools dedup --umi-separator=":" --output-stats={out_dir}{sample}deduplicated.qc -I {inbam} -S {outbam} -L {out_dir}{sample}.UMI.log'

            seq_type = 'none' not in IPs[IPs.Condition == experiment]['Scratch_File2'].tolist()
            if seq_type:
                umi_string += ' --paired'

            command_list = [submission_prepend(),
                            f'samtools index {bam}',
                            f'samtools index {input_bam}',
                            umi_string.format(inbam=bam, outbam=nodup_bam, sample=sample, out_dir=out_dir),
                            umi_string.format(inbam=input_bam, outbam=nodup_input, sample=input_sample, out_dir=out_dir)
                            ]

            exp.job_id.append(send_job(command_list=command_list,
                                       job_name=f'{sample}_UMI_dedup',
                                       job_log_folder=exp.job_folder,
                                       q='bigmem',
                                       mem=40000,
                                       log_file=exp.log_file,
                                       project=exp.project,
                                       cores=1,
                                       run_main=exp.run_main))

            exp.sample_files[sample]['nodup_bam'] = nodup_bam
            exp.sample_files[input_sample]['nodup_bam'] = nodup_input

    job_wait(exp.job_id, exp.log_file)

    output('Deduplication complete.  Submitting deduplicated files for the remainder of processing.',
           log_file=exp.log_file, run_main=exp.run_main)

    exp.tasks_complete.append('UMI')

    return encode3(exp)