def annotation(exp):
    '''
    Annotate the peak set of every condition and run Enrichr on the annotated genes.

    A peak type is chosen per condition unless one is already stored in
    exp.sample_files[condition]['peaktype']: 'idr_optimal_peak' when that entry
    is not 'none', otherwise 'overlap_peak'.  Conditions whose chosen peak file
    is 'none' are skipped.  Output folders are created below
    '<exp.scratch>/Annotated/<condition>/'.

    Inputs
    ------
    exp: experiment object.  Reads scratch, IPs, sample_files, log_file,
         run_main and anno_results.

    Returns
    -------
    exp: the same object, with one '<condition>_annotated' entry added to
         anno_results per annotated condition and 'Annotations' appended to
         tasks_complete.
    '''
    from requests.exceptions import RetryError
    from time import sleep

    out_dir = make_folder(f'{exp.scratch}/Annotated/')
    conditions = exp.IPs['Condition'].unique().tolist()

    # Decide which peak set to annotate for every condition that has no choice recorded yet.
    for condition in conditions:
        files = exp.sample_files[condition]
        if 'peaktype' not in files:
            files['peaktype'] = 'overlap_peak' if files['idr_optimal_peak'] == 'none' else 'idr_optimal_peak'

    # Load the chosen peak file of each condition, skipping conditions without one.
    peakfiles = {}
    for condition in conditions:
        files = exp.sample_files[condition]
        peak_path = files[files['peaktype']]
        if peak_path != 'none':
            peakfiles[condition] = read_pd(peak_path)

    for condition, peaks in peakfiles.items():
        genome = exp.IPs.loc[exp.IPs.Condition == condition, 'Genome'].unique().tolist()[0]
        cond_dir = make_folder(f'{out_dir}{condition}/')

        result_key = f'{condition}_annotated'
        annotated = annotate_peaks({condition: peaks},
                                   cond_dir,
                                   genome,
                                   db='UCSC',
                                   check=False,
                                   log_file=exp.log_file,
                                   run_main=exp.run_main)[result_key]
        gene_symbols = annotated.SYMBOL.unique().tolist()

        try:
            # Short pause before every Enrichr request.
            sleep(1)
            enrichr(gene_symbols,
                    f'enrichr_{condition}',
                    cond_dir,
                    scan=None,
                    max_terms=10,
                    figsize=(12, 6),
                    run_main=exp.run_main,
                    log_file=exp.log_file)
        except RetryError:
            # Enrichr is best effort: log the failure and carry on with the next condition.
            output(f'No stable enrichr connection. Skipping enrichr for {condition}.',
                   log_file=exp.log_file,
                   run_main=exp.run_main)

        # Store the annotation under its '<condition>_annotated' name.  Unpacking the
        # per-condition result itself with ** would merge its own keys (for a
        # DataFrame: its column names) and let every condition overwrite the last.
        # A new dict is bound rather than mutating the existing one in place.
        exp.anno_results = {**exp.anno_results, result_key: annotated}

    exp.tasks_complete.append('Annotations')

    return exp
def final_qc(exp):
    '''
    Final QC step: run MultiQC on the scratch folder and plot FastQC summaries.

    (Original note: add preseq)

    MultiQC is only launched when '<exp.scratch>QC/multiqc_data' does not exist
    yet.  When the MultiQC FastQC table is present, total reads (in millions)
    and percent GC are plotted for the samples in exp.sample_df.Sample_Name.
    A local 'plots/' folder, if present, is copied to '<exp.scratch>QC/plots/'
    and then removed.

    Inputs
    ------
    exp: experiment object.  Reads scratch, sample_df, log_file and run_main.

    Returns
    -------
    exp with 'MultiQC' appended to tasks_complete.  On any failure
    close_out('final qc', exp) is called instead and nothing is returned here.
    '''
    try:
        output(f'Beginning final qc: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
               log_file=exp.log_file,
               run_main=exp.run_main)

        # Skip MultiQC when a previous run already produced its data folder.
        multiqc_done = os.path.isdir(f'{exp.scratch}QC/multiqc_data')
        if not multiqc_done:
            os.system(f'multiqc {exp.scratch}* -o {exp.scratch}QC/')

        # Summary plots for FastQC data
        fastqc_file = f'{exp.scratch}/QC/multiqc_data/multiqc_fastqc.txt'
        if os.path.isfile(fastqc_file):
            gen_stats = read_pd(f'{exp.scratch}/QC/multiqc_data/multiqc_general_stats.txt')
            samples = exp.sample_df.Sample_Name.tolist()

            reads_in_millions = gen_stats.loc[samples, 'FastQC_mqc-generalstats-fastqc-total_sequences'] / 1e6
            plot_col(df=reads_in_millions,
                     title='Total Sequencer Reads per Sample',
                     ylabel='Reads (Millions)',
                     log_file=exp.log_file,
                     run_main=exp.run_main)

            percent_gc = gen_stats.loc[samples, 'FastQC_mqc-generalstats-fastqc-percent_gc']
            plot_col(df=percent_gc,
                     title='Percent GC Content per Sample',
                     ylabel='Percentage of Reads with GC Content',
                     log_file=exp.log_file,
                     run_main=exp.run_main)

        # Move plots from the working directory into the QC folder.
        if os.path.isdir('plots/'):
            copytree('plots/', f'{exp.scratch}QC/plots/')
            rmtree('plots')

        display(HTML('<h1>Final QC Summary</h1>'))
        # NOTE(review): MultiQC above writes to '<scratch>QC/', while this points at
        # '<scratch>/multiqc_report.html' and hands HTML() the path string itself --
        # confirm this shows the report as intended.
        display(HTML(f'{exp.scratch}/multiqc_report.html'))

        exp.tasks_complete.append('MultiQC')
    except:
        # Deliberately broad: every failure, interrupts included, is routed to close_out.
        close_out('final qc', exp)
    else:
        return exp
def _split_comparisons(cell):
    ''' Split a comma separated cell into a list of names with every space removed. '''
    return [name.replace(' ', '') for name in cell.split(',')]


def _comparison_names(condition, partners):
    ''' Build sorted '<a>_v_<b>' names for a condition; empty when the first partner is 'none'. '''
    if not partners or partners[0] == 'none':
        return []
    return ['_v_'.join(sorted([condition, partner])) for partner in partners]


def parse_config(config_file, run_main=False):
    '''
    Parse experimental info from yaml file

    Inputs
    ------
    config_file: path to the yaml file.  Keys read: LSF_Project, Name,
                 Scratch_folder, Output_directory, Restart, Sample_file,
                 Spike_index, Genome_tsv (comma separated) and ENCODE3_folder.
    run_main: stored on the experiment and passed through to output().

    The sample file must provide the columns Condition, Replicate, File2,
    'Background Sample', Comparisons and 'Spike-in Comparisons'.

    Returns
    -------
    exp: a freshly populated Experiment, or - when
         '<scratch><Name>_incomplete.pkl' exists and Restart is False - the
         experiment unpickled from that file with only its date refreshed.
    '''
    with open(config_file, 'r') as file:
        yml = yaml.safe_load(file)

    # Make a new experimental object
    exp = Experiment()

    # Project
    exp.project = yml['LSF_Project']

    # Check if running as pipeline
    exp.run_main = run_main

    # Setting Scratch folder: default to '<cwd>/<Name>_tmp/' when none is configured
    if yml["Scratch_folder"] is None:
        exp.scratch = f'{os.getcwd()}/{yml["Name"]}_tmp/'
    else:
        exp.scratch = f'{val_folder(yml["Scratch_folder"])}{yml["Name"]}/'
    os.makedirs(exp.scratch, exist_ok=True)

    # Check whether experiment has been attempted
    exp.name = yml['Name']
    exp.out_dir = make_folder(f"{val_folder(yml['Output_directory'])}{exp.name}/")
    filename = f'{exp.scratch}{exp.name}_incomplete.pkl'

    if os.path.isfile(filename):
        if yml['Restart'] is False:
            # SECURITY NOTE: pickle.load can execute arbitrary code.  Only safe as long
            # as the scratch folder is writable by trusted users alone.
            with open(filename, 'rb') as experiment:
                exp = pickle.load(experiment)
            os.remove(filename)

            # Set new date
            exp.date = f'{datetime.now():%Y-%m-%d}'

            # For output of R logs into job_log_folder
            os.chdir(exp.job_folder)

            output(f'\n#############\nRestarting pipeline on {datetime.now():%Y-%m-%d %H:%M:%S}, from last completed step.',
                   log_file=exp.log_file,
                   run_main=exp.run_main)

            return exp

        # Restart requested: discard the saved state and parse from scratch.
        os.remove(filename)

    # Passing parameters to new object
    exp.date = f'{datetime.now():%Y-%m-%d}'

    # Log file
    exp.log_file = f'{exp.out_dir}{exp.name}-{exp.date}.log'

    output(f'Pipeline version {version()} run on {exp.date} \n', log_file=exp.log_file, run_main=run_main)
    output(f'Beginning ChIPseq Analysis: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)
    output('Reading experimental file...\n', log_file=exp.log_file, run_main=run_main)
    output(f"Pipeline output folder: {exp.out_dir}\n", log_file=exp.log_file, run_main=run_main)

    # Setting Job Folder
    exp.job_folder = f'{val_folder(exp.scratch)}logs/'
    os.makedirs(exp.job_folder, exist_ok=True)

    # Load sample info
    exp.sample_df = read_pd(yml['Sample_file'])

    # Make Sample Name; missing cells become the string 'none'
    exp.sample_df.replace([np.nan], 'none', inplace=True)
    exp.sample_df['Sample_Name'] = exp.sample_df.Condition + '_' + exp.sample_df.Replicate
    output(f'Processing samples:\n{exp.sample_df}', log_file=exp.log_file, run_main=run_main)

    # Paired: a sample is paired when a second file is given
    exp.sample_df['paired'] = [x != 'none' for x in exp.sample_df.File2.tolist()]

    # IP samples are the rows that name a background sample
    exp.IPs = exp.sample_df[exp.sample_df['Background Sample'] != 'none'].copy()

    # 'Background Sample' values are looked up as sample_df index labels to get the Sample_Name
    sample_dict = exp.sample_df.Sample_Name.to_dict()
    exp.IPs['Background_Name'] = exp.IPs['Background Sample'].map(sample_dict)
    exp.samples = exp.IPs.Sample_Name.tolist()

    # Convert Comparisons to a column of lists, then make unique comparisons.
    # Row values are read by column label: integer keys on a label-indexed
    # Series are positional access, which pandas has deprecated.
    exp.IPs['Comparisons'] = exp.IPs.Comparisons.apply(_split_comparisons)
    exp.IPs['Comparison_names'] = exp.IPs[['Condition', 'Comparisons']].apply(
        lambda row: _comparison_names(row['Condition'], row['Comparisons']), axis=1)

    comparisons = [comparison for subls in exp.IPs.Comparison_names.tolist() for comparison in subls]
    exp.overlaps = {o_name: o_name.split('_v_') for o_name in set(comparisons)}

    # Spike-in comparisons
    exp.IPs['Spike-in Comparisons'] = exp.IPs['Spike-in Comparisons'].apply(_split_comparisons)
    exp.IPs['Spike_names'] = exp.IPs[['Condition', 'Spike-in Comparisons']].apply(
        lambda row: _comparison_names(row['Condition'], row['Spike-in Comparisons']), axis=1)

    sp_comparisons = [comparison for subls in exp.IPs.Spike_names.tolist() for comparison in subls]
    exp.spike_comparisons = {s_name: s_name.split('_v_') for s_name in set(sp_comparisons)}
    spike_samples = [condition for subls in exp.spike_comparisons.values() for condition in subls]
    exp.spike_samples = exp.IPs[exp.IPs.Condition.isin(spike_samples)].Sample_Name.tolist()

    # Make out directory if it doesn't exist
    exp.out_dir = make_folder(f'{val_folder(yml["Output_directory"])}{exp.name}/')

    # Lab specific files
    exp.genome_indicies['spike_index'] = yml['Spike_index']

    # Locating genome indices: a tsv is assigned to every known genome named in its path
    tsvs = yml['Genome_tsv'].split(',')
    genomes = ['hg38', 'hg19', 'mm10']
    for tsv in tsvs:
        glob_check(tsv)
        exp.genome_indicies['encode_tsv'] = {**exp.genome_indicies['encode_tsv'],
                                             **{genome: tsv for genome in genomes if genome in tsv}
                                             }

    exp.encode3_folder = val_folder(yml['ENCODE3_folder'])

    # Initialized Process Complete List
    exp._parsed = True

    output(f'Experiment file parsed: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)

    return exp