def extract_AQUAS_report_data(base_folder, out_folder='', histone=False, replicate=False):
    '''
    Extract QC metrics from AQUAS ChIPseq pipeline HTML reports and plot each metric.

    Inputs
    -----
    base_folder: AQUAS results folder.  Will use subfolders for sample name
        and look for a '*report.html' file in those subfolders.
    out_folder: destination folder for the per-metric plots.
    histone: if True, skip the IDR metrics (FrIP_IDR, N_IDR_peaks), which
        are not present in histone-mode reports.
    replicate: whether the ChIPseq was performed as a replicate or not.
        Replicates are not supported yet.

    Returns
    -----
    DataFrame of results: one row per QC metric, one column per sample.

    Raises
    -----
    NotImplementedError: if replicate is True.
    '''
    reports = glob.glob(f'{base_folder}/*/*report.html')
    out_folder = val_folder(out_folder)
    base_folder = val_folder(base_folder)

    if replicate:
        # NotImplementedError (not AssertionError) is the correct signal
        # for a feature that is deliberately not supported yet.
        raise NotImplementedError('Not set up for replicates yet.')

    results_df = pd.DataFrame(index=['Percent_mapped', 'Mapped_Reads', 'Fraction_Duplicated',
                                     'S_JS_Distance', 'PBC1', 'RSC', 'Raw_Peak_Number',
                                     'N_optimal_overlap_peaks', 'FrIP_IDR', 'N_IDR_peaks'])

    for file in reports:
        name = re.findall(r'.*/(.*)_report.html', file)[0]
        report = pd.read_html(file)

        # Explicit dtype avoids the pandas DeprecationWarning for empty Series.
        series = pd.Series(dtype=object)
        # Table/cell positions follow the fixed layout of the AQUAS report HTML.
        series['Percent_mapped'] = report[1].iloc[7, 1]
        series['Mapped_Reads'] = report[2].iloc[5, 1]
        series['Fraction_Duplicated'] = report[3].iloc[7, 1]
        series['S_JS_Distance'] = report[4].iloc[7, 1]
        series['PBC1'] = report[5].iloc[6, 1]
        series['RSC'] = report[6].iloc[8, 1]
        series['Raw_Peak_Number'] = report[7].iloc[0, 1]
        series['N_optimal_overlap_peaks'] = report[10].iloc[4, 1]
        if histone is False:
            # IDR tables are only present for non-histone (TF) runs.
            series['FrIP_IDR'] = report[11].iloc[0, 1]
            series['N_IDR_peaks'] = report[12].iloc[4, 1]
        results_df[name] = series

    for index in results_df.index.tolist():
        # Skip metrics that were never populated (e.g. IDR rows when histone=True).
        if results_df.loc[index].isna().all():
            continue
        plot_col(results_df.loc[index],
                 out=out_folder,
                 title=f'{index}',
                 ylabel=index.replace('_', ' '),
                 plot_type=['violin', 'swarm'])

    return results_df
def fastq_screen(exp):
    '''
    Checks fastq files for contamination with alternative genomes using Bowtie2
    '''
    output(f'Screening for contamination during sequencing: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    # QC output lives under the scratch area
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    # Remember where we were; fastq_screen is run from the data folder
    original_dir = val_folder(os.getcwd())
    os.chdir(exp.data_folder)

    fastq_files = [f for f in exp.sample_df.Scratch_File1.tolist() if is_fastq(f)]

    # Submit one fastq_screen job per fastq file
    for fastq in fastq_files:
        job_command = [submission_prepend(f'fastq_screen --threads 4 --aligner bowtie2 {fastq}')]
        job = send_job(command_list=job_command,
                       job_name=f'{fastq.split("/")[-1]}_fastq_screen',
                       job_log_folder=exp.job_folder,
                       q='general',
                       mem=3000,
                       log_file=exp.log_file,
                       project=exp.project,
                       cores=2,
                       run_main=exp.run_main)
        exp.job_id.append(job)
        time.sleep(1)

    # Block until every submitted job has finished
    job_wait(exp.job_id, exp.log_file)

    # Relocate the screen output into the QC folder
    for screen_file in glob.glob(f'{exp.data_folder}*screen*'):
        copy2(screen_file, exp.qc_folder)
        os.remove(screen_file)

    # Restore the original working directory in scratch
    os.chdir(original_dir)

    exp.tasks_complete.append('Fastq_screen')
    output(f'Screening complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file, run_main=exp.run_main)

    return exp
def parse_config(config_file, run_main=False):
    '''
    Parse experimental info from a yaml file into an Experiment object.

    Inputs
    -----
    config_file: path to the yaml experiment description.
    run_main: whether this is being run as part of the full pipeline
        (passed through to logging).

    Returns
    -----
    Experiment object: either freshly populated from the yaml file, or
    reloaded from a pickled incomplete run when Restart is False.
    '''
    with open(config_file, 'r') as file:
        yml = yaml.safe_load(file)

    # Make a new experimental object
    exp = Experiment()

    # Project
    exp.project = yml['LSF_Project']

    # Check if running as pipeline
    exp.run_main = run_main

    # Setting Scratch folder
    exp.scratch = f'{os.getcwd()}/{yml["Name"]}_tmp/' if yml["Scratch_folder"] is None else f'{val_folder(yml["Scratch_folder"])}{yml["Name"]}/'
    os.makedirs(exp.scratch, exist_ok=True)

    # check whether experiment has been attempted
    exp.name = yml['Name']
    exp.out_dir = make_folder(f"{val_folder(yml['Output_directory'])}{exp.name}/")
    filename = f'{exp.scratch}{exp.name}_incomplete.pkl'

    if os.path.isfile(filename):
        if yml['Restart'] is False:
            # Resume the previous incomplete run from its checkpoint pickle.
            with open(filename, 'rb') as experiment:
                exp = pickle.load(experiment)
            os.remove(filename)

            # set new date
            exp.date = f'{datetime.now():%Y-%m-%d}'

            # For output of R logs into job_log_folder
            os.chdir(exp.job_folder)

            output(f'\n#############\nRestarting pipeline on {datetime.now():%Y-%m-%d %H:%M:%S}, from last completed step.',
                   log_file=exp.log_file, run_main=exp.run_main)

            return exp
        else:
            # Restart requested: discard the stale checkpoint and start fresh.
            os.remove(filename)

    # Passing parameters to new object
    exp.date = f'{datetime.now():%Y-%m-%d}'

    # Log file
    exp.log_file = f'{exp.out_dir}{exp.name}-{exp.date}.log'

    output(f'Pipeline version {version()} run on {exp.date} \n', log_file=exp.log_file, run_main=run_main)
    output(f'Beginning ChIPseq Analysis: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)
    output('Reading experimental file...\n', log_file=exp.log_file, run_main=run_main)
    output(f"Pipeline output folder: {exp.out_dir}\n", log_file=exp.log_file, run_main=run_main)

    # Setting Job Folder
    exp.job_folder = f'{val_folder(exp.scratch)}logs/'
    os.makedirs(exp.job_folder, exist_ok=True)

    # Load sample info
    exp.sample_df = read_pd(yml['Sample_file'])

    # Make Sample Name; 'none' is the sentinel for missing values throughout.
    exp.sample_df.replace([np.nan], 'none', inplace=True)
    exp.sample_df['Sample_Name'] = exp.sample_df.Condition + '_' + exp.sample_df.Replicate
    output(f'Processing samples:\n{exp.sample_df}', log_file=exp.log_file, run_main=run_main)

    # Paired
    exp.sample_df['paired'] = [x != 'none' for x in exp.sample_df.File2.tolist()]

    # IP samples are those with a background (input) sample declared.
    exp.IPs = exp.sample_df[exp.sample_df['Background Sample'] != 'none'].copy()
    sample_dict = exp.sample_df.Sample_Name.to_dict()
    exp.IPs['Background_Name'] = exp.IPs['Background Sample'].map(sample_dict)
    exp.samples = exp.IPs.Sample_Name.tolist()

    # Convert Comparisons to a column of lists, then make unique comparisons.
    # Label-based access (x['Condition']) replaces positional x[0]/x[1],
    # which is deprecated for label-indexed Series and removed in pandas 3.0.
    exp.IPs['Comparisons'] = exp.IPs.Comparisons.apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Comparison_names'] = exp.IPs[['Condition', 'Comparisons']].apply(
        lambda x: ['_v_'.join(sorted([x['Condition'], y])) for y in x['Comparisons'] if x['Comparisons'][0] != 'none'],
        axis=1)
    comparisons = [comparison for subls in exp.IPs.Comparison_names.tolist() for comparison in subls]
    exp.overlaps = {o_name: o_name.split('_v_') for o_name in set(comparisons)}

    # Spike-in comparisons
    exp.IPs['Spike-in Comparisons'] = exp.IPs['Spike-in Comparisons'].apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Spike_names'] = exp.IPs[['Condition', 'Spike-in Comparisons']].apply(
        lambda x: ['_v_'.join(sorted([x['Condition'], y])) for y in x['Spike-in Comparisons'] if x['Spike-in Comparisons'][0] != 'none'],
        axis=1)
    sp_comparisons = [comparison for subls in exp.IPs.Spike_names.tolist() for comparison in subls]
    exp.spike_comparisons = {s_name: s_name.split('_v_') for s_name in set(sp_comparisons)}

    spike_samples = [condition for subls in exp.spike_comparisons.values() for condition in subls]
    exp.spike_samples = exp.IPs[exp.IPs.Condition.isin(spike_samples)].Sample_Name.tolist()

    # Lab specific files
    exp.genome_indicies['spike_index'] = yml['Spike_index']

    # Locating genome indices: map each tsv onto the genome named in its path.
    tsvs = yml['Genome_tsv'].split(',')
    genomes = ['hg38', 'hg19', 'mm10']
    for tsv in tsvs:
        glob_check(tsv)
        exp.genome_indicies['encode_tsv'] = {**exp.genome_indicies['encode_tsv'],
                                             **{genome: tsv for genome in genomes if genome in tsv}
                                             }

    exp.encode3_folder = val_folder(yml['ENCODE3_folder'])

    # Initialized Process Complete List
    exp._parsed = True
    output(f'Experiment file parsed: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)

    return exp