def main():
    """Run the CRISPRessoBatch command-line workflow.

    Steps, in order:

    1. Parse command-line arguments and read the tab-separated batch
       settings file (one row per batch).
    2. Merge command-line values into the per-batch table, clean the batch
       names and validate input files, amplicon sequences and guides.
    3. Build one CRISPResso command per batch and run them through
       ``CRISPRessoMultiProcessing.run_crispresso_cmds``.
    4. Load each run's ``CRISPResso2_info.pickle`` and, for every amplicon
       seen in at least two batches, write nucleotide / modification summary
       tables and (unless suppressed) summary plots.
    5. Concatenate the per-run editing-frequency and mapping-statistics
       tables, optionally write the HTML report, and pickle the batch info.

    The function always terminates the process: ``sys.exit(0)`` on success,
    ``sys.exit(-1)`` after reporting any ``Exception``.
    """

    def _numeric_counts(summary_df):
        # Keep the two leading label columns as they are and coerce every
        # per-position column to a numeric dtype.
        return pd.concat([
            summary_df.iloc[:, 0:2],
            summary_df.iloc[:, 2:].apply(pd.to_numeric)
        ], axis=1)

    try:
        description = [
            '~~~CRISPRessoBatch~~~',
            '-Analysis of CRISPR/Cas9 outcomes from batch deep sequencing data-'
        ]
        # NOTE(review): the banner below is reproduced exactly as it appears
        # in the reviewed text; its multi-line layout looks flattened there.
        batch_string = r''' _________________ | __ ___ __ | ||__) /\ | / |__|| ||__)/--\| \__| || |_________________| '''
        print(CRISPRessoShared.get_crispresso_header(description, batch_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoBatch Parameters')

        # batch specific params
        parser.add_argument(
            '-bs', '--batch_settings', type=str,
            help='Settings file for batch. Must be tab-separated text file. The header row contains CRISPResso parameters (e.g., fastq_r1, fastq_r2, amplicon_seq, and other optional parameters). Each following row sets parameters for an additional batch.',
            required=True)
        parser.add_argument(
            '--skip_failed',
            help='Continue with batch analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--min_reads_for_inclusion',
            help='Minimum number of reads for a batch to be included in the batch summary',
            type=int)
        parser.add_argument(
            '-p', '--n_processes', type=int,
            help='Specify the number of processes to use for quantification. '
            'Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.',
            default=1)
        parser.add_argument(
            '-bo', '--batch_output_folder',
            help='Directory where batch analysis output will be stored')
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        debug_flag = args.debug

        # Options forwarded to each CRISPResso call; name and output folder
        # are set per batch below, so they are excluded here.
        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set(['name', 'output_folder'])
        crispresso_options_for_batch = list(crispresso_options -
                                            options_to_ignore)

        CRISPRessoShared.check_file(args.batch_settings)

        # parse the batch settings sheet
        # pandas either allows for auto-detect sep or for comment, not both
        batch_params = pd.read_csv(args.batch_settings, comment='#', sep='\t')
        batch_params.columns = batch_params.columns.str.strip(' -\xd0')

        # rename column "a" to "amplicon_seq", etc
        batch_params.rename(
            index=str,
            columns=CRISPRessoShared.get_crispresso_options_lookup(),
            inplace=True)
        batch_count = batch_params.shape[0]
        batch_params.index = range(batch_count)

        # NOTE(review): this check uses the heading 'bam_input' while the
        # per-row validation below looks for 'input_bam' -- confirm which
        # name the argument parser actually defines.
        if 'fastq_r1' not in batch_params and 'bam_input' not in batch_params:
            raise CRISPRessoShared.BadParameterException(
                "fastq_r1 must be specified in the batch settings file. Current headings are: "
                + str(batch_params.columns.values))

        # add args from the command line to batch_params: a missing column
        # takes the command-line value for every row, an existing column
        # only has its empty cells filled.
        for arg in vars(args):
            arg_value = getattr(args, arg)
            if arg not in batch_params:
                batch_params[arg] = arg_value
            elif arg_value is not None:
                # Assign the result instead of a chained in-place fill so
                # the update does not depend on pandas' view/copy behaviour.
                batch_params[arg] = batch_params[arg].fillna(value=arg_value)

        # assert that all names are unique and clean names
        for i in range(batch_count):
            if batch_params.loc[i, 'name'] == '':
                batch_params.at[i, 'name'] = i
            batch_params.at[i, 'name'] = CRISPRessoShared.clean_filename(
                batch_params.loc[i, 'name'])

        if batch_params.drop_duplicates(
                'name').shape[0] != batch_params.shape[0]:
            raise CRISPRessoShared.BadParameterException(
                'Batch input names must be unique. The given names are not unique: '
                + str(batch_params.loc[:, 'name']))

        # Check files
        # create an empty list per row for sgRNA intervals
        batch_params["sgRNA_intervals"] = ''
        batch_params["sgRNA_intervals"] = batch_params[
            "sgRNA_intervals"].apply(list)
        # create an empty list per row for cut point intervals based on sgRNA
        batch_params["cut_point_include_idx"] = ''
        batch_params["cut_point_include_idx"] = batch_params[
            "cut_point_include_idx"].apply(list)

        for idx, row in batch_params.iterrows():
            if 'fastq_r1' in row:
                if row.fastq_r1 is None:
                    raise CRISPRessoShared.BadParameterException(
                        "At least one fastq file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)"
                        % (int(idx) + 1, row.fastq_r1))
                else:
                    CRISPRessoShared.check_file(row.fastq_r1)

            if 'fastq_r2' in row and row.fastq_r2 != "":
                CRISPRessoShared.check_file(row.fastq_r2)

            if 'input_bam' in row:
                if row.input_bam is None:
                    raise CRISPRessoShared.BadParameterException(
                        "At least one input file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' or 'input_bam' (input_bam on row %s '%s' is invalid)"
                        % (int(idx) + 1, row.input_bam))
                else:
                    CRISPRessoShared.check_file(row.input_bam)

            # In auto mode the amplicon/guide checks are skipped.
            if args.auto:
                continue

            curr_amplicon_seq_str = row.amplicon_seq
            if curr_amplicon_seq_str is None:
                raise CRISPRessoShared.BadParameterException(
                    "Amplicon sequence must be given as a command line parameter or be specified in the batch settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)"
                    % (int(idx) + 1, curr_amplicon_seq_str))

            # dict of whether a guide is in at least one amplicon sequence
            guides_are_in_amplicon = {}

            # iterate through amplicons
            for curr_amplicon_seq in curr_amplicon_seq_str.split(','):
                # mask for bp to include for this amplicon seq, as specified
                # by sgRNA cut points
                this_include_idxs = []
                this_sgRNA_intervals = []
                wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq)
                if wrong_nt:
                    raise CRISPRessoShared.NTException(
                        'The amplicon sequence in row %d (%s) contains incorrect characters:%s'
                        % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt)))

                # iterate through guides
                curr_guide_seq_string = row.guide_seq
                if curr_guide_seq_string is not None and curr_guide_seq_string != "":
                    guides = curr_guide_seq_string.strip().upper().split(',')
                    for curr_guide_seq in guides:
                        wrong_nt = CRISPRessoShared.find_wrong_nt(
                            curr_guide_seq)
                        if wrong_nt:
                            raise CRISPRessoShared.NTException(
                                'The sgRNA sequence in row %d (%s) contains incorrect characters:%s'
                                % (idx + 1, curr_guide_seq, ' '.join(wrong_nt)))
                    guide_mismatches = [[]] * len(guides)
                    guide_names = [""] * len(guides)
                    guide_qw_centers = CRISPRessoShared.set_guide_array(
                        row.quantification_window_center, guides,
                        'guide quantification center')
                    guide_qw_sizes = CRISPRessoShared.set_guide_array(
                        row.quantification_window_size, guides,
                        'guide quantification size')
                    guide_plot_cut_points = [1] * len(guides)
                    (this_sgRNA_sequences, this_sgRNA_intervals,
                     this_sgRNA_cut_points, this_sgRNA_plot_cut_points,
                     this_sgRNA_plot_idxs, this_sgRNA_mismatches,
                     this_sgRNA_names, this_include_idxs, this_exclude_idxs
                     ) = CRISPRessoShared.get_amplicon_info_for_guides(
                         curr_amplicon_seq, guides, guide_mismatches,
                         guide_names, guide_qw_centers, guide_qw_sizes,
                         row.quantification_window_coordinates,
                         row.exclude_bp_from_left, row.exclude_bp_from_right,
                         row.plot_window_size, guide_plot_cut_points)
                    for guide_seq in this_sgRNA_sequences:
                        guides_are_in_amplicon[guide_seq] = 1

                # .at fetches the list stored in the cell so it can be
                # appended to in place (the removed .ix indexer did the same).
                batch_params.at[idx, "cut_point_include_idx"].append(
                    this_include_idxs)
                batch_params.at[idx, "sgRNA_intervals"].append(
                    this_sgRNA_intervals)

            # NOTE(review): entries are only ever set to 1 above, so this
            # warning cannot currently fire.
            for guide_seq in guides_are_in_amplicon:
                if guides_are_in_amplicon[guide_seq] != 1:
                    warn(
                        '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!'
                        % (idx + 1, row.guide_seq, curr_amplicon_seq))

        # Output folder: named after --name if given, otherwise after the
        # batch settings file.
        batch_folder_name = os.path.splitext(
            os.path.basename(args.batch_settings))[0]
        if args.name and args.name != "":
            batch_folder_name = args.name

        output_folder_name = 'CRISPRessoBatch_on_%s' % batch_folder_name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        if args.batch_output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.batch_output_folder), output_folder_name)

        def _jp(filename):
            # handy function to put a file in the output directory
            return os.path.join(OUTPUT_DIRECTORY, filename)

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
        except OSError:
            # Only filesystem errors are treated as "already exists";
            # anything else reaches the outer handler.
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoBatch_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Batch_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Batch_info.pickle')
        # keep track of all information for this run to be pickled and saved
        # at the end of the run
        crispresso2_info = {}
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        # Build and run one CRISPResso command per batch row.
        crispresso_cmds = []
        batch_names_arr = []
        batch_input_names = {}
        for idx, row in batch_params.iterrows():
            batchName = CRISPRessoShared.slugify(row["name"])
            batch_names_arr.append(batchName)
            batch_input_names[batchName] = row["name"]

            crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % (
                OUTPUT_DIRECTORY, batchName)
            crispresso_cmd = propagate_options(crispresso_cmd,
                                               crispresso_options_for_batch,
                                               batch_params, idx)
            crispresso_cmds.append(crispresso_cmd)

        crispresso2_info['batch_names_arr'] = batch_names_arr
        crispresso2_info['batch_input_names'] = batch_input_names

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      args.n_processes,
                                                      'batch',
                                                      args.skip_failed)

        # Collect the run data of every batch; run_datas stays aligned with
        # the rows of batch_params (None marks a missing run).
        run_datas = []  # crispresso2 info from each row
        all_amplicons = set()
        amplicon_names = {}  # amplicon sequence -> display name
        amplicon_counts = {}  # amplicon sequence -> number of runs with it
        completed_batch_arr = []
        for idx, row in batch_params.iterrows():
            batchName = CRISPRessoShared.slugify(row["name"])
            folder_name = os.path.join(OUTPUT_DIRECTORY,
                                       'CRISPResso_on_%s' % batchName)
            run_data_file = os.path.join(folder_name,
                                         'CRISPResso2_info.pickle')
            if not os.path.isfile(run_data_file):
                info("Skipping folder '%s'. Cannot find run data at '%s'." %
                     (folder_name, run_data_file))
                run_datas.append(None)
                continue

            # NOTE: unpickling executes arbitrary code; these files are read
            # from the output folder this run just wrote to.
            with open(run_data_file, 'rb') as run_data_handle:
                run_data = cp.load(run_data_handle)
            run_datas.append(run_data)
            for ref_name in run_data['ref_names']:
                ref_seq = run_data['refs'][ref_name]['sequence']
                all_amplicons.add(ref_seq)
                # if this amplicon is called something else in another
                # sample, just call it the amplicon
                # (amplicon_names is keyed by sequence, so the membership
                # test must use ref_seq, not ref_name)
                if ref_seq in amplicon_names and amplicon_names[
                        ref_seq] != ref_name:
                    amplicon_names[ref_seq] = ref_seq
                else:
                    amplicon_names[ref_seq] = ref_name
                if ref_seq not in amplicon_counts:
                    amplicon_counts[ref_seq] = 0
                amplicon_counts[ref_seq] += 1

            completed_batch_arr.append(batchName)

        crispresso2_info['completed_batch_arr'] = completed_batch_arr

        # make sure amplicon names aren't super long
        for amplicon in all_amplicons:
            if len(amplicon_names[amplicon]) > 20:
                amplicon_names[amplicon] = amplicon_names[amplicon][0:20]

        # make sure no duplicate names (same name for the different amplicons)
        seen_names = {}
        for amplicon in all_amplicons:
            suffix_counter = 2
            orig_name = amplicon_names[amplicon]
            while amplicon_names[amplicon] in seen_names:
                amplicon_names[amplicon] = orig_name + "_" + str(
                    suffix_counter)
                suffix_counter += 1
            seen_names[amplicon_names[amplicon]] = 1

        save_png = True
        if args.suppress_report:
            save_png = False

        window_nuc_pct_quilt_plot_names = []
        nuc_pct_quilt_plot_names = []
        window_nuc_conv_plot_names = []
        nuc_conv_plot_names = []

        # Plot metadata is shared by all amplicons, so it is created once
        # here; resetting it per amplicon would drop the entries of earlier
        # amplicons whose plot names are still listed above.
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}

        # report for amplicons that appear multiple times
        for amplicon_seq in all_amplicons:
            # only perform comparison if amplicon seen in more than one sample
            if amplicon_counts[amplicon_seq] < 2:
                continue

            amplicon_name = amplicon_names[amplicon_seq]
            info('Reporting summary for amplicon: "' + amplicon_name + '"')

            consensus_sequence = ""
            nucleotide_frequency_summary = []
            nucleotide_percentage_summary = []
            modification_frequency_summary = []
            modification_percentage_summary = []

            # how many folders had information for this amplicon
            amp_found_count = 0
            consensus_guides = []
            consensus_include_idxs = []
            consensus_sgRNA_plot_idxs = []
            consensus_sgRNA_intervals = []
            guides_all_same = True
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue
                batch_has_amplicon = False
                batch_amplicon_name = ''
                for ref_name in run_data['ref_names']:
                    if amplicon_seq == run_data['refs'][ref_name]['sequence']:
                        batch_has_amplicon = True
                        batch_amplicon_name = ref_name
                if not batch_has_amplicon:
                    continue
                batch_ref = run_data['refs'][batch_amplicon_name]

                # The first batch carrying this amplicon defines the
                # consensus guides that later batches are compared against.
                if consensus_guides == []:
                    consensus_guides = batch_ref['sgRNA_sequences']
                    consensus_include_idxs = batch_ref['include_idxs']
                    consensus_sgRNA_intervals = batch_ref['sgRNA_intervals']
                    consensus_sgRNA_plot_idxs = batch_ref['sgRNA_plot_idxs']

                if batch_ref['sgRNA_sequences'] != consensus_guides:
                    guides_all_same = False
                if set(batch_ref['include_idxs']) != set(
                        consensus_include_idxs):
                    guides_all_same = False

                if 'nuc_freq_filename' not in batch_ref:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                        % (batch_amplicon_name, folder_name))
                    continue

                nucleotide_frequency_file = os.path.join(
                    folder_name, batch_ref['nuc_freq_filename'])
                ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
                    nucleotide_frequency_file)

                nucleotide_pct_file = os.path.join(
                    folder_name, batch_ref['nuc_pct_filename'])
                ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
                    nucleotide_pct_file)

                count_file = os.path.join(folder_name,
                                          batch_ref['mod_count_filename'])
                ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
                    count_file)

                if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Could not parse batch output."
                        % (batch_amplicon_name, folder_name))
                    info(
                        "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                        % (ampSeq_nf, ampSeq_np, ampSeq_cf))
                    continue
                if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
                    warn(
                        "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                        % (batch_amplicon_name, folder_name, ampSeq_nf,
                           ampSeq_np, ampSeq_cf, amplicon_seq))
                    continue
                if consensus_sequence == "":
                    consensus_sequence = ampSeq_nf
                if ampSeq_nf != consensus_sequence:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                        % (batch_amplicon_name, folder_name))
                    continue
                if 'Total' not in mod_freqs:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                        % (batch_amplicon_name, folder_name))
                    continue
                if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][0] == "0":
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                        % (batch_amplicon_name, folder_name))
                    continue
                if (args.min_reads_for_inclusion is not None) and (int(
                        mod_freqs['Total'][0]) < args.min_reads_for_inclusion):
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                        % (batch_amplicon_name, folder_name,
                           str(mod_freqs['Total'][0]),
                           args.min_reads_for_inclusion))
                    continue

                # Convert modification counts into fractions of total reads.
                # (builtin float replaces the removed np.float alias)
                total_reads = float(mod_freqs['Total'][0])
                mod_pcts = {}
                for key in mod_freqs:
                    mod_pcts[key] = np.array(
                        mod_freqs[key]).astype(float) / total_reads

                amp_found_count += 1

                for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
                    freq_row = [batchName, nuc]
                    freq_row.extend(nuc_freqs[nuc])
                    nucleotide_frequency_summary.append(freq_row)

                    pct_row = [batchName, nuc]
                    pct_row.extend(nuc_pcts[nuc])
                    nucleotide_percentage_summary.append(pct_row)

                for mod in [
                        'Insertions', 'Insertions_Left', 'Deletions',
                        'Substitutions', 'All_modifications'
                ]:
                    freq_row = [batchName, mod]
                    freq_row.extend(mod_freqs[mod])
                    modification_frequency_summary.append(freq_row)

                    pct_row = [batchName, mod]
                    pct_row.extend(mod_pcts[mod])
                    modification_percentage_summary.append(pct_row)

            if amp_found_count == 0:
                info(
                    "Couldn't find any data for amplicon '%s'. Not compiling results."
                    % amplicon_name)
            else:
                # Output files are prefixed with the amplicon name, except
                # for a single amplicon called "Reference".
                amplicon_plot_name = amplicon_name + "."
                if len(amplicon_names) == 1 and amplicon_name == "Reference":
                    amplicon_plot_name = ""

                colnames = ['Batch', 'Nucleotide']
                colnames.extend(list(consensus_sequence))
                nucleotide_frequency_summary_df = _numeric_counts(
                    pd.DataFrame(nucleotide_frequency_summary,
                                 columns=colnames))
                nucleotide_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_frequency_summary.txt')
                nucleotide_frequency_summary_df.to_csv(
                    nucleotide_frequency_summary_filename,
                    sep='\t',
                    index=None)

                nucleotide_percentage_summary_df = _numeric_counts(
                    pd.DataFrame(nucleotide_percentage_summary,
                                 columns=colnames))
                nucleotide_percentage_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_percentage_summary.txt')
                nucleotide_percentage_summary_df.to_csv(
                    nucleotide_percentage_summary_filename,
                    sep='\t',
                    index=None)

                colnames = ['Batch', 'Modification']
                colnames.extend(list(consensus_sequence))
                modification_frequency_summary_df = _numeric_counts(
                    pd.DataFrame(modification_frequency_summary,
                                 columns=colnames))
                modification_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'MODIFICATION_FREQUENCY_SUMMARY.txt')
                modification_frequency_summary_df.to_csv(
                    modification_frequency_summary_filename,
                    sep='\t',
                    index=None)

                modification_percentage_summary_df = _numeric_counts(
                    pd.DataFrame(modification_percentage_summary,
                                 columns=colnames))
                modification_percentage_summary_filename = _jp(
                    amplicon_plot_name +
                    'MODIFICATION_PERCENTAGE_SUMMARY.txt')
                modification_percentage_summary_df.to_csv(
                    modification_percentage_summary_filename,
                    sep='\t',
                    index=None)

                crispresso2_info[
                    'nucleotide_frequency_summary_filename'] = os.path.basename(
                        nucleotide_frequency_summary_filename)
                crispresso2_info[
                    'nucleotide_percentage_summary_filename'] = os.path.basename(
                        nucleotide_percentage_summary_filename)

                crispresso2_info[
                    'modification_frequency_summary_filename'] = os.path.basename(
                        modification_frequency_summary_filename)
                crispresso2_info[
                    'modification_percentage_summary_filename'] = os.path.basename(
                        modification_percentage_summary_filename)

                # if guides are all the same, merge substitutions and perform
                # base editor comparison at guide quantification window
                if guides_all_same and consensus_guides != []:
                    info(
                        "All guides are equal. Performing comparison of batches for amplicon '%s'"
                        % amplicon_name)
                    # include indexes are the same for all guides
                    include_idxs = consensus_include_idxs
                    for guide_idx, sgRNA in enumerate(consensus_guides):
                        sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[guide_idx]
                        # columns 0 and 1 hold the batch and nucleotide labels
                        plot_idxs_flat = [0, 1]
                        plot_idxs_flat.extend(
                            [plot_idx + 2 for plot_idx in sgRNA_plot_idxs])
                        sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:, plot_idxs_flat]
                        sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:, plot_idxs_flat]
                        sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:, plot_idxs_flat]

                        # show all sgRNA's on the plot: translate each
                        # amplicon-level interval into window positions
                        sub_sgRNA_intervals = []
                        for sgRNA_interval in consensus_sgRNA_intervals:
                            newstart = None
                            newend = None
                            for plot_pos, amp_pos in enumerate(
                                    sgRNA_plot_idxs):
                                if amp_pos <= sgRNA_interval[0]:
                                    newstart = plot_pos
                                if newend is None and amp_pos >= sgRNA_interval[1]:
                                    newend = plot_pos

                            # if guide doesn't overlap with plot idxs
                            if newend == 0 or newstart == len(sgRNA_plot_idxs):
                                continue
                            # otherwise, correct partial overlaps
                            elif newstart is None and newend is None:
                                newstart = 0
                                newend = len(include_idxs) - 1
                            elif newstart is None:
                                newstart = 0
                            elif newend is None:
                                newend = len(include_idxs) - 1
                            # and add it to the list
                            sub_sgRNA_intervals.append((newstart, newend))

                        if not args.suppress_plots:
                            # plot for each guide
                            this_window_nuc_pct_quilt_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt_around_sgRNA_' +
                                sgRNA)
                            CRISPRessoPlot.plot_nucleotide_quilt(
                                sub_nucleotide_percentage_summary_df,
                                sub_modification_percentage_summary_df,
                                this_window_nuc_pct_quilt_plot_name,
                                save_png,
                                sgRNA_intervals=sub_sgRNA_intervals,
                                quantification_window_idxs=include_idxs)
                            plot_name = os.path.basename(
                                this_window_nuc_pct_quilt_plot_name)
                            window_nuc_pct_quilt_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name
                            if len(consensus_guides) == 1:
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = ''
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = 'Composition of each base around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

                            sub_nucleotide_frequency_summary_df = _numeric_counts(
                                sub_nucleotide_frequency_summary_df)
                            sub_nucleotide_frequency_summary_filename = _jp(
                                amplicon_plot_name +
                                'Nucleotide_frequency_summary_around_sgRNA_' +
                                sgRNA + '.txt')
                            sub_nucleotide_frequency_summary_df.to_csv(
                                sub_nucleotide_frequency_summary_filename,
                                sep='\t',
                                index=None)

                            sub_nucleotide_percentage_summary_df = _numeric_counts(
                                sub_nucleotide_percentage_summary_df)
                            sub_nucleotide_percentage_summary_filename = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_summary_around_sgRNA_' +
                                sgRNA + '.txt')
                            sub_nucleotide_percentage_summary_df.to_csv(
                                sub_nucleotide_percentage_summary_filename,
                                sep='\t',
                                index=None)

                            if args.base_editor_output:
                                this_window_nuc_conv_plot_name = _jp(
                                    amplicon_plot_name +
                                    'Nucleotide_conversion_map_around_sgRNA_' +
                                    sgRNA)
                                CRISPRessoPlot.plot_conversion_map(
                                    sub_nucleotide_percentage_summary_df,
                                    this_window_nuc_conv_plot_name,
                                    args.conversion_nuc_from,
                                    args.conversion_nuc_to,
                                    save_png,
                                    sgRNA_intervals=sub_sgRNA_intervals,
                                    quantification_window_idxs=include_idxs)
                                plot_name = os.path.basename(
                                    this_window_nuc_conv_plot_name)
                                window_nuc_conv_plot_names.append(plot_name)
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name
                                if len(consensus_guides) == 1:
                                    crispresso2_info['summary_plot_titles'][
                                        plot_name] = ''
                                crispresso2_info['summary_plot_labels'][
                                    plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name
                                crispresso2_info['summary_plot_datas'][
                                    plot_name] = [
                                        ('Nucleotide frequencies around sgRNA',
                                         os.path.basename(
                                             sub_nucleotide_frequency_summary_filename
                                         )),
                                        ('Nucleotide percentages around sgRNA',
                                         os.path.basename(
                                             sub_nucleotide_percentage_summary_filename
                                         ))
                                    ]

                    if not args.suppress_plots:
                        # plot the whole region
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name,
                            save_png,
                            sgRNA_intervals=consensus_sgRNA_intervals,
                            quantification_window_idxs=include_idxs)
                        plot_name = os.path.basename(
                            this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_titles'][
                            plot_name] = 'Amplicon: ' + amplicon_name
                        if len(amplicon_names) == 1:
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = ''
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Nucleotide frequencies',
                             os.path.basename(
                                 nucleotide_frequency_summary_filename)),
                            ('Modification frequencies',
                             os.path.basename(
                                 modification_frequency_summary_filename))
                        ]
                        if args.base_editor_output:
                            this_nuc_conv_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_conversion_map')
                            CRISPRessoPlot.plot_conversion_map(
                                nucleotide_percentage_summary_df,
                                this_nuc_conv_plot_name,
                                args.conversion_nuc_from,
                                args.conversion_nuc_to,
                                save_png,
                                sgRNA_intervals=consensus_sgRNA_intervals,
                                quantification_window_idxs=include_idxs)
                            plot_name = os.path.basename(
                                this_nuc_conv_plot_name)
                            nuc_conv_plot_names.append(plot_name)
                            # Title is blank only when there is a single
                            # amplicon, as for the quilt plot above.
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = 'Amplicon: ' + amplicon_name
                            if len(amplicon_names) == 1:
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = ''
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]
                else:  # guides are not the same
                    # NOTE(review): no 'summary_plot_titles' entry is written
                    # for the plots in this branch -- confirm the report
                    # tolerates a missing title.
                    if not args.suppress_plots:
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name, save_png)
                        plot_name = os.path.basename(
                            this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Nucleotide frequencies',
                             os.path.basename(
                                 nucleotide_frequency_summary_filename)),
                            ('Modification frequencies',
                             os.path.basename(
                                 modification_frequency_summary_filename))
                        ]
                        if args.base_editor_output:
                            # Use the conversion-map name so this plot does
                            # not overwrite the quilt plot written just above.
                            this_nuc_conv_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_conversion_map')
                            CRISPRessoPlot.plot_conversion_map(
                                nucleotide_percentage_summary_df,
                                this_nuc_conv_plot_name,
                                args.conversion_nuc_from,
                                args.conversion_nuc_to, save_png)
                            plot_name = os.path.basename(
                                this_nuc_conv_plot_name)
                            nuc_conv_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

        crispresso2_info[
            'window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
        crispresso2_info['nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
        crispresso2_info[
            'window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
        crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names

        # summarize amplicon modifications: concatenate the per-run tables,
        # keeping one header and prefixing each line with the batch name
        with open(
                _jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'),
                'w') as outfile:
            wrote_header = False
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                amplicon_modification_file = os.path.join(
                    folder_name, run_data['quant_of_editing_freq_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(batchName + "\t" + line)

        # summarize alignment
        with open(_jp('CRISPRessoBatch_mapping_statistics.txt'),
                  'w') as outfile:
            wrote_header = False
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)

                run_data = run_datas[idx]
                if run_data is None:
                    continue
                amplicon_modification_file = os.path.join(
                    folder_name, run_data['mapping_stats_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(batchName + "\t" + line)

        if not args.suppress_report:
            if (args.place_report_in_output_folder):
                report_name = _jp("CRISPResso2Batch_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_batch_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        with open(crispresso2Batch_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # args may not exist yet if argument parsing itself failed.
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards.

    NOTE(security): unpickling can execute arbitrary code, so only info files
    written by CRISPResso itself in trusted folders should be aggregated.
    """
    with open(path, 'rb') as pickle_handle:
        return cp.load(pickle_handle)


def _import_sub_runs(parent_folder, run_names, folder_infos):
    """Load every ``CRISPResso_on_<run_name>`` run found under ``parent_folder``.

    Successfully loaded run infos are stored in ``folder_infos`` keyed by the
    run folder location. Returns an ``(imported, failed)`` tuple of counts.
    """
    imported = 0
    failed = 0
    for run_name in run_names:
        run_folder_loc = os.path.join(parent_folder,
                                      'CRISPResso_on_%s' % run_name)
        try:
            run_data = CRISPRessoShared.load_crispresso_info(run_folder_loc)
            folder_infos[run_folder_loc] = run_data
            imported += 1
        except Exception:
            # best effort: one unreadable sub-run must not abort the aggregation
            warn('Could not open CRISPResso2 info file in ' + run_folder_loc)
            failed += 1
    return imported, failed


def _numeric_value_columns(summary_df):
    """Return ``summary_df`` with every column after the first two coerced to numeric."""
    return pd.concat([
        summary_df.iloc[:, 0:2],
        summary_df.iloc[:, 2:].apply(pd.to_numeric)
    ], axis=1)


def _append_folder_rows(outfile, source_filename, folder, wrote_header):
    """Copy the data rows of ``source_filename`` to ``outfile``, prefixed by ``folder``.

    The header line of the source file is written (prefixed by ``Folder``)
    only when ``wrote_header`` is false. Returns True so the caller can record
    that the header has been written.
    """
    with open(source_filename, 'r') as infile:
        file_head = infile.readline()
        if not wrote_header:
            outfile.write('Folder\t' + file_head)
        for line in infile:
            outfile.write(folder + "\t" + line)
    return True


def main():
    """Command-line entry point for CRISPRessoAggregate.

    Parses the command line, collects every CRISPResso2 run found under the
    given prefixes (plain runs as well as the sub-runs of pooled, batch and WGS
    analyses), and writes the aggregate output into
    ``CRISPRessoAggregate_on_<name>``:

    * an amplicon information table,
    * per-amplicon nucleotide / modification summaries (and quilt plots) for
      amplicons that appear in more than one run,
    * quantification-of-editing and mapping-statistics tables,
    * summary plots and an HTML report (unless suppressed),
    * a pickle with the information collected for this aggregation.

    The function always terminates the process: ``sys.exit(0)`` on success and
    ``sys.exit(-1)`` after logging the error when any exception is raised.
    """
    try:
        start_time = datetime.now()

        description = [
            '~~~CRISPRessoAggregate~~~',
            '-Aggregation of CRISPResso Run Data-'
        ]
        aggregate_string = r'''
___________________________________
|      __  __  _   _  __     ___ _ |
| /\  /__ /__ |_) |_ /__  /\  | |_ |
|/--\ \_| \_| | \ |_ \_| /--\ | |_ |
|__________________________________|
        '''
        print(
            CRISPRessoShared.get_crispresso_header(description,
                                                   aggregate_string))

        parser = argparse.ArgumentParser(
            description="Aggreate CRISPResso2 Runs")
        parser.add_argument(
            "-p",
            "--prefix",
            action='append',
            help=
            "Prefix for CRISPResso folders to aggregate (may be specified multiple times)",
            default=[])
        parser.add_argument("-s",
                            "--suffix",
                            type=str,
                            help="Suffix for CRISPResso folders to aggregate",
                            default="")
        parser.add_argument("-n",
                            "--name",
                            type=str,
                            help="Output name of the report",
                            required=True)
        parser.add_argument(
            '--min_reads_for_inclusion',
            help=
            'Minimum number of reads for a run to be included in the run summary',
            type=int,
            default=0)
        parser.add_argument(
            '--place_report_in_output_folder',
            help=
            'If true, report will be written inside the CRISPResso output folder. By default, the report will be written one directory up from the report output.',
            action='store_true')
        parser.add_argument('--suppress_report',
                            help='Suppress output report',
                            action='store_true')
        parser.add_argument('--suppress_plots',
                            help='Suppress output plots',
                            action='store_true')
        parser.add_argument('--debug',
                            help='Show debug messages',
                            action='store_true')

        args = parser.parse_args()

        output_folder_name = 'CRISPRessoAggregate_on_%s' % args.name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        def _jp(filename):
            # handy function to put a file in the output directory
            return os.path.join(OUTPUT_DIRECTORY, filename)

        info('Creating Folder %s' % OUTPUT_DIRECTORY)
        try:
            os.makedirs(OUTPUT_DIRECTORY)
        except OSError:
            # only an already-existing folder is tolerated; any other failure
            # (e.g. missing permissions) is reported by the outer handler
            if not os.path.isdir(OUTPUT_DIRECTORY):
                raise
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoAggregate_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Aggregate_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Aggregate_info.pickle')
        # keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info = {}
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)
        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        # glob returns paths including the original prefix
        all_files = []
        for prefix in args.prefix:
            all_files.extend(glob.glob(prefix + '*' + args.suffix))
            # if a folder is given, add all subfolders; an empty prefix is
            # skipped so that '/*' (the filesystem root) is never globbed
            if prefix != "":
                all_files.extend(glob.glob(prefix + '/*' + args.suffix))

        # (info file, key listing the sub-run names, label used in warnings)
        # for the analysis types that contain several CRISPResso runs
        sub_run_sources = [
            ('CRISPResso2Pooled_info.pickle', 'good_region_names', 'pooled'),
            ('CRISPResso2Batch_info.pickle', 'completed_batch_arr', 'batch'),
            ('CRISPResso2WGS_info.pickle', 'good_region_folders', 'WGS'),
        ]

        seen_folders = set()
        # file_loc->crispresso_info; these are only CRISPResso runs -- this bit unrolls batch, pooled, and wgs runs
        crispresso2_folder_infos = {}
        successfully_imported_count = 0
        not_imported_count = 0
        for folder in all_files:
            # skip if we've seen this folder (glob could have added it twice)
            if folder in seen_folders:
                continue
            seen_folders.add(folder)
            if not (os.path.isdir(folder)
                    and str(folder).endswith(args.suffix)):
                continue

            # first, try to import a plain CRISPResso2 run
            crispresso_info_file = os.path.join(folder,
                                                'CRISPResso2_info.pickle')
            if os.path.exists(crispresso_info_file):
                try:
                    run_data = CRISPRessoShared.load_crispresso_info(folder)
                    crispresso2_folder_infos[folder] = run_data
                    successfully_imported_count += 1
                except Exception:
                    warn('Could not open CRISPResso2 info file in ' + folder)
                    not_imported_count += 1

            # then check for pooled, batch and WGS analyses in that order
            for info_basename, names_key, analysis_label in sub_run_sources:
                sub_info_file = os.path.join(folder, info_basename)
                if not os.path.exists(sub_info_file):
                    continue
                sub_data = _load_pickle(sub_info_file)
                if names_key in sub_data:
                    imported, failed = _import_sub_runs(
                        folder, sub_data[names_key], crispresso2_folder_infos)
                    successfully_imported_count += imported
                    not_imported_count += failed
                else:
                    warn('Could not process ' + analysis_label + ' folder ' +
                         folder)
                    not_imported_count += 1

        info('Read ' + str(successfully_imported_count) + ' folders (' +
             str(not_imported_count) + ' not imported)')

        # png copies of the plots are only needed for the html report
        save_png = not args.suppress_report

        if successfully_imported_count > 0:

            crispresso2_folders = crispresso2_folder_infos.keys()
            crispresso2_folder_names = {}
            crispresso2_folder_htmls = {}  # file_loc->html folder loc
            for crispresso2_folder in crispresso2_folders:
                folder_info = crispresso2_folder_infos[crispresso2_folder]
                crispresso2_folder_names[
                    crispresso2_folder] = CRISPRessoShared.slugify(
                        crispresso2_folder)
                this_sub_html_file = crispresso2_folder + ".html"
                if folder_info['args'].place_report_in_output_folder:
                    this_sub_html_file = os.path.join(
                        crispresso2_folder, folder_info['report_filename'])
                crispresso2_folder_htmls[crispresso2_folder] = os.path.abspath(
                    this_sub_html_file)

            all_amplicons = set()
            # sequence -> ref name (to check for amplicons with the same name but different sequences)
            amplicon_names = {}
            amplicon_counts = {}
            amplicon_sources = {}
            for crispresso2_folder in crispresso2_folders:
                run_data = crispresso2_folder_infos[crispresso2_folder]
                for ref_name in run_data['ref_names']:
                    ref_seq = run_data['refs'][ref_name]['sequence']
                    all_amplicons.add(ref_seq)
                    # if this amplicon is called something else in another sample, just call it the amplicon
                    # (amplicon_names is keyed by sequence, so the membership test must use ref_seq)
                    if ref_seq in amplicon_names and amplicon_names[
                            ref_seq] != ref_name:
                        amplicon_names[ref_seq] = ref_seq
                    else:
                        amplicon_names[ref_seq] = ref_name
                    if ref_seq not in amplicon_counts:
                        amplicon_counts[ref_seq] = 0
                        amplicon_sources[ref_seq] = []
                    amplicon_counts[ref_seq] += 1
                    amplicon_sources[ref_seq].append(crispresso2_folder +
                                                     '(' + ref_name + ')')

            # make sure amplicon names aren't super long
            for amplicon in all_amplicons:
                if len(amplicon_names[amplicon]) > 21:
                    amplicon_names[amplicon] = amplicon_names[amplicon][0:21]

            # make sure no duplicate amplicon names (same name for the different amplicons)
            seen_names = []
            for amplicon in all_amplicons:
                suffix_counter = 2
                orig_name = amplicon_names[amplicon]
                while amplicon_names[amplicon] in seen_names:
                    amplicon_names[amplicon] = orig_name + "_" + str(
                        suffix_counter)
                    suffix_counter += 1
                seen_names.append(amplicon_names[amplicon])

            crispresso2_info['ref_names'] = seen_names
            crispresso2_info['refs'] = {}
            crispresso2_info['summary_plot_names'] = []
            crispresso2_info['summary_plot_titles'] = {}
            crispresso2_info['summary_plot_labels'] = {}
            crispresso2_info['summary_plot_datas'] = {}

            with open(_jp('CRISPRessoAggregate_amplicon_information.txt'),
                      'w') as outfile:
                outfile.write("\t".join([
                    'Amplicon Name', 'Number of sources', 'Amplicon sources',
                    'Amplicon sequence'
                ]) + "\n")
                for amplicon in all_amplicons:
                    outfile.write("\t".join([
                        amplicon_names[amplicon],
                        str(amplicon_counts[amplicon]), ';'.join(
                            amplicon_sources[amplicon]), amplicon
                    ]) + "\n")

            window_nuc_pct_quilt_plot_names = []
            nuc_pct_quilt_plot_names = []
            window_nuc_conv_plot_names = []
            nuc_conv_plot_names = []

            # report for amplicons that appear multiple times
            for amplicon_seq in all_amplicons:
                amplicon_name = amplicon_names[amplicon_seq]
                crispresso2_info['refs'][amplicon_name] = {}
                # only perform comparison if amplicon seen in more than one sample
                if amplicon_counts[amplicon_seq] < 2:
                    continue

                info('Reporting summary for amplicon: "' + amplicon_name +
                     '"')

                consensus_sequence = ""
                nucleotide_frequency_summary = []
                nucleotide_percentage_summary = []
                modification_frequency_summary = []
                modification_percentage_summary = []

                amp_found_count = 0  # how many folders had information for this amplicon
                consensus_guides = []
                consensus_include_idxs = []
                consensus_sgRNA_plot_idxs = []
                consensus_sgRNA_intervals = []
                guides_all_same = True
                for crispresso2_folder in crispresso2_folders:
                    run_data = crispresso2_folder_infos[crispresso2_folder]
                    run_has_amplicon = False
                    run_amplicon_name = ''
                    # no break: the last reference with this sequence wins
                    for ref_name in run_data['ref_names']:
                        if amplicon_seq == run_data['refs'][ref_name][
                                'sequence']:
                            run_has_amplicon = True
                            run_amplicon_name = ref_name
                    if not run_has_amplicon:
                        continue

                    ref_info = run_data['refs'][run_amplicon_name]

                    # the first run with this amplicon defines the consensus guides
                    if consensus_guides == []:
                        consensus_guides = ref_info['sgRNA_sequences']
                        consensus_include_idxs = ref_info['include_idxs']
                        consensus_sgRNA_intervals = ref_info['sgRNA_intervals']
                        consensus_sgRNA_plot_idxs = ref_info['sgRNA_plot_idxs']

                    if ref_info['sgRNA_sequences'] != consensus_guides:
                        guides_all_same = False
                    if set(ref_info['include_idxs']) != set(
                            consensus_include_idxs):
                        guides_all_same = False

                    if 'nuc_freq_filename' not in ref_info:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                            % (run_amplicon_name, crispresso2_folder))
                        continue

                    nucleotide_frequency_file = os.path.join(
                        crispresso2_folder, ref_info['nuc_freq_filename'])
                    ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
                        nucleotide_frequency_file)

                    nucleotide_pct_file = os.path.join(
                        crispresso2_folder, ref_info['nuc_pct_filename'])
                    ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
                        nucleotide_pct_file)

                    count_file = os.path.join(crispresso2_folder,
                                              ref_info['mod_count_filename'])
                    ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
                        count_file)

                    if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Could not parse run output."
                            % (run_amplicon_name, crispresso2_folder))
                        info(
                            "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                            % (ampSeq_nf, ampSeq_np, ampSeq_cf))
                        continue
                    if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
                        warn(
                            "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                            % (run_amplicon_name, crispresso2_folder,
                               ampSeq_nf, ampSeq_np, ampSeq_cf, amplicon_seq))
                        continue
                    if consensus_sequence == "":
                        consensus_sequence = ampSeq_nf
                    if ampSeq_nf != consensus_sequence:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                            % (run_amplicon_name, crispresso2_folder))
                        continue
                    if 'Total' not in mod_freqs:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                            % (run_amplicon_name, crispresso2_folder))
                        continue
                    if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][
                            0] == "0":
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                            % (run_amplicon_name, crispresso2_folder))
                        continue
                    this_amp_total_reads = run_data['counts_total'][
                        run_amplicon_name]
                    if this_amp_total_reads < args.min_reads_for_inclusion:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                            % (run_amplicon_name, crispresso2_folder,
                               str(this_amp_total_reads),
                               args.min_reads_for_inclusion))
                        continue

                    # the builtin float replaces the np.float alias, which
                    # was removed from NumPy
                    mod_pcts = {}
                    for key in mod_freqs:
                        mod_pcts[key] = np.array(mod_freqs[key]).astype(
                            float) / float(this_amp_total_reads)

                    amp_found_count += 1

                    run_name = crispresso2_folder_names[crispresso2_folder]

                    for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
                        row = [run_name, nuc]
                        row.extend(nuc_freqs[nuc])
                        nucleotide_frequency_summary.append(row)

                        pct_row = [run_name, nuc]
                        pct_row.extend(nuc_pcts[nuc])
                        nucleotide_percentage_summary.append(pct_row)

                    for mod in [
                            'Insertions', 'Insertions_Left', 'Deletions',
                            'Substitutions', 'All_modifications'
                    ]:
                        row = [run_name, mod]
                        row.extend(mod_freqs[mod])
                        modification_frequency_summary.append(row)

                        pct_row = [run_name, mod]
                        pct_row.extend(mod_pcts[mod])
                        modification_percentage_summary.append(pct_row)

                if amp_found_count == 0:
                    info(
                        "Couldn't find any data for amplicon '%s'. Not compiling results."
                        % amplicon_name)
                    continue

                amplicon_plot_name = amplicon_name + "."
                if len(amplicon_names
                       ) == 1 and amplicon_name == "Reference":
                    amplicon_plot_name = ""

                colnames = ['Folder', 'Nucleotide']
                colnames.extend(list(consensus_sequence))
                nucleotide_frequency_summary_df = _numeric_value_columns(
                    pd.DataFrame(nucleotide_frequency_summary,
                                 columns=colnames))
                nucleotide_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_frequency_summary.txt')
                nucleotide_frequency_summary_df.to_csv(
                    nucleotide_frequency_summary_filename,
                    sep='\t',
                    index=None)

                nucleotide_percentage_summary_df = _numeric_value_columns(
                    pd.DataFrame(nucleotide_percentage_summary,
                                 columns=colnames))
                nucleotide_percentage_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_percentage_summary.txt')
                nucleotide_percentage_summary_df.to_csv(
                    nucleotide_percentage_summary_filename,
                    sep='\t',
                    index=None)

                colnames = ['Folder', 'Modification']
                colnames.extend(list(consensus_sequence))
                modification_frequency_summary_df = _numeric_value_columns(
                    pd.DataFrame(modification_frequency_summary,
                                 columns=colnames))
                modification_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'MODIFICATION_FREQUENCY_SUMMARY.txt')
                modification_frequency_summary_df.to_csv(
                    modification_frequency_summary_filename,
                    sep='\t',
                    index=None)

                modification_percentage_summary_df = _numeric_value_columns(
                    pd.DataFrame(modification_percentage_summary,
                                 columns=colnames))
                modification_percentage_summary_filename = _jp(
                    amplicon_plot_name +
                    'MODIFICATION_PERCENTAGE_SUMMARY.txt')
                modification_percentage_summary_df.to_csv(
                    modification_percentage_summary_filename,
                    sep='\t',
                    index=None)

                amplicon_info = crispresso2_info['refs'][amplicon_name]
                amplicon_info[
                    'nucleotide_frequency_summary_filename'] = os.path.basename(
                        nucleotide_frequency_summary_filename)
                amplicon_info[
                    'nucleotide_percentage_summary_filename'] = os.path.basename(
                        nucleotide_percentage_summary_filename)
                amplicon_info[
                    'modification_frequency_summary_filename'] = os.path.basename(
                        modification_frequency_summary_filename)
                amplicon_info[
                    'modification_percentage_summary_filename'] = os.path.basename(
                        modification_percentage_summary_filename)

                # if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window
                if guides_all_same and consensus_guides != []:
                    info(
                        "All guides are equal. Performing comparison of runs for amplicon '%s'"
                        % amplicon_name)
                    include_idxs = consensus_include_idxs  # include indexes are the same for all guides
                    for guide_idx, sgRNA in enumerate(consensus_guides):
                        sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[guide_idx]
                        plot_idxs_flat = [0, 1]  # guide, nucleotide
                        plot_idxs_flat.extend(
                            [plot_idx + 2 for plot_idx in sgRNA_plot_idxs])
                        sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:, plot_idxs_flat]
                        sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:, plot_idxs_flat]
                        sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:, plot_idxs_flat]

                        # show all sgRNA's on the plot
                        sub_sgRNA_intervals = []
                        for sgRNA_interval in consensus_sgRNA_intervals:
                            newstart = None
                            newend = None
                            # translate amplicon coordinates into positions
                            # within this guide's plotting window
                            for plot_pos, ref_pos in enumerate(
                                    sgRNA_plot_idxs):
                                if ref_pos <= sgRNA_interval[0]:
                                    newstart = plot_pos
                                if newend is None and ref_pos >= sgRNA_interval[
                                        1]:
                                    newend = plot_pos

                            # if guide doesn't overlap with plot idxs
                            if newend == 0 or newstart == len(
                                    sgRNA_plot_idxs):
                                continue
                            # otherwise, correct partial overlaps
                            elif newstart is None and newend is None:
                                newstart = 0
                                newend = len(include_idxs) - 1
                            elif newstart is None:
                                newstart = 0
                            elif newend is None:
                                newend = len(include_idxs) - 1
                            # and add it to the list
                            sub_sgRNA_intervals.append((newstart, newend))

                        if not args.suppress_plots:
                            # plot for each guide
                            this_window_nuc_pct_quilt_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt_around_sgRNA_' +
                                sgRNA)
                            CRISPRessoPlot.plot_nucleotide_quilt(
                                sub_nucleotide_percentage_summary_df,
                                sub_modification_percentage_summary_df,
                                this_window_nuc_pct_quilt_plot_name,
                                save_png,
                                sgRNA_intervals=sub_sgRNA_intervals,
                                quantification_window_idxs=include_idxs,
                                group_column='Folder')
                            plot_name = os.path.basename(
                                this_window_nuc_pct_quilt_plot_name)
                            window_nuc_pct_quilt_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name
                            if len(consensus_guides) == 1:
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = ''
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = 'Composition of each base around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][
                                plot_name] = [
                                    (amplicon_name + ' nucleotide frequencies',
                                     os.path.basename(
                                         nucleotide_frequency_summary_filename
                                     )),
                                    (amplicon_name +
                                     ' modification frequencies',
                                     os.path.basename(
                                         modification_frequency_summary_filename
                                     ))
                                ]

                        sub_nucleotide_frequency_summary_df = _numeric_value_columns(
                            sub_nucleotide_frequency_summary_df)
                        sub_nucleotide_frequency_summary_filename = _jp(
                            amplicon_plot_name +
                            'Nucleotide_frequency_summary_around_sgRNA_' +
                            sgRNA + '.txt')
                        sub_nucleotide_frequency_summary_df.to_csv(
                            sub_nucleotide_frequency_summary_filename,
                            sep='\t',
                            index=None)

                        sub_nucleotide_percentage_summary_df = _numeric_value_columns(
                            sub_nucleotide_percentage_summary_df)
                        sub_nucleotide_percentage_summary_filename = _jp(
                            amplicon_plot_name +
                            'Nucleotide_percentage_summary_around_sgRNA_' +
                            sgRNA + '.txt')
                        sub_nucleotide_percentage_summary_df.to_csv(
                            sub_nucleotide_percentage_summary_filename,
                            sep='\t',
                            index=None)

                    if not args.suppress_plots:
                        # plot the whole region
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name,
                            save_png,
                            sgRNA_intervals=consensus_sgRNA_intervals,
                            quantification_window_idxs=include_idxs,
                            group_column='Folder')
                        plot_name = os.path.basename(
                            this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_titles'][
                            plot_name] = 'Amplicon: ' + amplicon_name
                        if len(amplicon_names) == 1:
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = ''
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            (amplicon_name + ' nucleotide frequencies',
                             os.path.basename(
                                 nucleotide_frequency_summary_filename)),
                            (amplicon_name + ' modification frequencies',
                             os.path.basename(
                                 modification_frequency_summary_filename))
                        ]

                else:  # guides are not the same
                    if not args.suppress_plots:
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name,
                            save_png,
                            group_column='Folder')
                        plot_name = os.path.basename(
                            this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            (amplicon_name + ' nucleotide frequencies',
                             os.path.basename(
                                 nucleotide_frequency_summary_filename)),
                            (amplicon_name + ' modification frequencies',
                             os.path.basename(
                                 modification_frequency_summary_filename))
                        ]

            crispresso2_info[
                'window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
            crispresso2_info[
                'nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
            crispresso2_info[
                'window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
            crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names

            # per-reference count tables summed over all references of a run,
            # in the order of the summary columns that follow the read totals
            count_keys = [
                'counts_total', 'counts_unmodified', 'counts_modified',
                'counts_discarded', 'counts_insertion', 'counts_deletion',
                'counts_substitution', 'counts_only_insertion',
                'counts_only_deletion', 'counts_only_substitution',
                'counts_insertion_and_deletion',
                'counts_insertion_and_substitution',
                'counts_deletion_and_substitution',
                'counts_insertion_and_deletion_and_substitution'
            ]

            quantification_summary = []

            # summarize amplicon modifications
            # this file has separate lines for each amplicon in each run
            samples_quantification_summary_by_amplicon_filename = _jp(
                'CRISPRessoAggregate_quantification_of_editing_frequency_by_amplicon.txt'
            )
            with open(samples_quantification_summary_by_amplicon_filename,
                      'w') as outfile:
                wrote_header = False
                for crispresso2_folder in crispresso2_folders:
                    run_data = crispresso2_folder_infos[crispresso2_folder]
                    run_name = crispresso2_folder_names[crispresso2_folder]
                    amplicon_modification_file = os.path.join(
                        crispresso2_folder,
                        run_data['quant_of_editing_freq_filename'])
                    wrote_header = _append_folder_rows(
                        outfile, amplicon_modification_file,
                        crispresso2_folder, wrote_header)

                    n_tot = run_data['aln_stats']['N_TOT_READS']
                    # multiple alleles could be provided
                    count_sums = [
                        sum(run_data[count_key][ref_name]
                            for ref_name in run_data['ref_names'])
                        for count_key in count_keys
                    ]
                    n_aligned, n_unmod, n_mod = count_sums[0:3]

                    unmod_pct = np.nan
                    mod_pct = np.nan
                    if n_aligned > 0:
                        unmod_pct = 100 * n_unmod / float(n_aligned)
                        mod_pct = 100 * n_mod / float(n_aligned)

                    # NOTE(review): n_aligned is written under the 'Reads_total'
                    # header and n_tot under 'Reads_aligned'; this looks swapped
                    # but is kept because the plots read these columns -- confirm
                    vals = [
                        run_name,
                        round(unmod_pct, 8),
                        round(mod_pct, 8), n_aligned, n_tot
                    ]
                    vals.extend(count_sums[1:])
                    quantification_summary.append(vals)

            header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
            header_els = header.split("\t")
            df_summary_quantification = pd.DataFrame(quantification_summary,
                                                     columns=header_els)
            # this file has one line for each run (sum of all amplicons)
            samples_quantification_summary_filename = _jp(
                'CRISPRessoAggregate_quantification_of_editing_frequency.txt')
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename, sep='\t', index=None)
            crispresso2_info[
                'samples_quantification_summary_filename'] = os.path.basename(
                    samples_quantification_summary_filename)
            crispresso2_info[
                'samples_quantification_summary_by_amplicon_filename'] = os.path.basename(
                    samples_quantification_summary_by_amplicon_filename)

            if not args.suppress_plots:
                summary_datas = [
                    ('CRISPRessoAggregate summary',
                     os.path.basename(samples_quantification_summary_filename)
                     ),
                    ('CRISPRessoAggregate summary by amplicon',
                     os.path.basename(
                         samples_quantification_summary_by_amplicon_filename))
                ]

                plot_root = _jp("CRISPRessoAggregate_reads_summary")
                CRISPRessoPlot.plot_reads_total(plot_root,
                                                df_summary_quantification,
                                                save_png,
                                                args.min_reads_for_inclusion)
                plot_name = os.path.basename(plot_root)
                crispresso2_info['summary_plot_root'] = plot_name
                crispresso2_info['summary_plot_names'].append(plot_name)
                crispresso2_info['summary_plot_titles'][
                    plot_name] = 'CRISPRessoAggregate Mapping Statistics Summary'
                crispresso2_info['summary_plot_labels'][
                    plot_name] = 'Each bar shows the total number of reads in each sample. The vertical line shows the cutoff for analysis, set using the --min_reads_for_inclusion parameter.'
                crispresso2_info['summary_plot_datas'][plot_name] = list(
                    summary_datas)

                plot_root = _jp(
                    "CRISPRessoAggregate_quantification_of_editing_frequency")
                CRISPRessoPlot.plot_unmod_mod_pcts(
                    plot_root, df_summary_quantification, save_png,
                    args.min_reads_for_inclusion)
                plot_name = os.path.basename(plot_root)
                crispresso2_info['summary_plot_root'] = plot_name
                crispresso2_info['summary_plot_names'].append(plot_name)
                crispresso2_info['summary_plot_titles'][
                    plot_name] = 'CRISPRessoAggregate Modification Summary'
                crispresso2_info['summary_plot_labels'][
                    plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_for_inclusion parameter.'
                crispresso2_info['summary_plot_datas'][plot_name] = list(
                    summary_datas)

            # summarize alignment
            with open(_jp('CRISPRessoAggregate_mapping_statistics.txt'),
                      'w') as outfile:
                wrote_header = False
                for crispresso2_folder in crispresso2_folders:
                    run_data = crispresso2_folder_infos[crispresso2_folder]
                    mapping_file = os.path.join(
                        crispresso2_folder, run_data['mapping_stats_filename'])
                    wrote_header = _append_folder_rows(outfile, mapping_file,
                                                       crispresso2_folder,
                                                       wrote_header)

            if not args.suppress_report:
                report_filename = OUTPUT_DIRECTORY + '.html'
                if args.place_report_in_output_folder:
                    report_filename = _jp("CRISPResso2Aggregate_report.html")
                CRISPRessoReport.make_aggregate_report(
                    crispresso2_info, args.name, report_filename,
                    OUTPUT_DIRECTORY, _ROOT, crispresso2_folders,
                    crispresso2_folder_htmls)
                crispresso2_info['report_location'] = report_filename
                crispresso2_info['report_filename'] = os.path.basename(
                    report_filename)

        end_time = datetime.now()
        end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S')
        running_time = end_time - start_time
        running_time_string = str(running_time)

        crispresso2_info['end_time'] = end_time
        crispresso2_info['end_time_string'] = end_time_string
        crispresso2_info['running_time'] = running_time
        crispresso2_info['running_time_string'] = running_time_string

        with open(crispresso2Aggregate_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # sys.exit above raises SystemExit, which is not an Exception subclass
        # and therefore passes through this handler untouched
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
def report_nucleotide_summary(amplicon_seq, amplicon_name, amplicon_index):
    """Compile per-batch nucleotide and modification summaries for one amplicon.

    For every row of ``batch_params`` the matching entry of ``run_datas`` is
    searched for a reference whose sequence equals ``amplicon_seq``. The
    nucleotide frequency, nucleotide percentage and modification count tables
    of that reference are parsed and, when they pass the sanity checks below,
    appended to four summary tables that are written as tab-separated files
    named ``<amplicon_name>.*_SUMMARY.txt``. Unless ``args.suppress_plots`` is
    set, nucleotide quilt plots (and conversion maps when
    ``args.base_editor_output`` is set) are produced as well.

    This function reads names from its enclosing scope: ``batch_params``,
    ``run_datas``, ``OUTPUT_DIRECTORY``, ``args``, ``_jp`` and ``save_png``.

    Args:
        amplicon_seq: Reference sequence used to find this amplicon in each
            batch's run data.
        amplicon_name: Prefix for output file names and label used in log
            messages.
        amplicon_index: Unused; kept so existing callers keep working.

    Returns:
        An empty tuple when no batch contributed data for this amplicon;
        otherwise ``None``.
    """
    consensus_sequence = ""
    nucleotide_frequency_summary = []
    nucleotide_percentage_summary = []
    modification_frequency_summary = []
    modification_percentage_summary = []

    amp_found_count = 0  # how many folders had information for this amplicon
    consensus_guides = []
    consensus_include_idxs = []
    consensus_sgRNA_intervals = []
    guides_all_same = True

    for idx, batch_row in batch_params.iterrows():
        batchName = CRISPRessoShared.slugify(batch_row["name"])
        folder_name = os.path.join(OUTPUT_DIRECTORY,
                                   'CRISPResso_on_%s' % batchName)
        run_data = run_datas[idx]
        if run_data is None:
            continue

        # Find the reference in this batch with the requested sequence. There
        # is deliberately no break: as before, the last match wins.
        batch_amplicon_name = None
        for ref_name in run_data['ref_names']:
            if amplicon_seq == run_data['refs'][ref_name]['sequence']:
                batch_amplicon_name = ref_name
        if batch_amplicon_name is None:
            continue

        ref_info = run_data['refs'][batch_amplicon_name]

        # The first batch that reports guides defines the consensus guides.
        if consensus_guides == []:
            consensus_guides = ref_info['sgRNA_sequences']
            consensus_include_idxs = ref_info['include_idxs']
            consensus_sgRNA_intervals = ref_info['sgRNA_intervals']

        if ref_info['sgRNA_sequences'] != consensus_guides:
            guides_all_same = False

        if 'nuc_freq_filename' not in ref_info:
            info(
                "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                % (batch_amplicon_name, folder_name))
            continue

        ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
            ref_info['nuc_freq_filename'])
        ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
            ref_info['nuc_pct_filename'])
        ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
            ref_info['mod_count_filename'])

        if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
            info(
                "Skipping the amplicon '%s' in folder '%s'. Could not parse batch output."
                % (batch_amplicon_name, folder_name))
            info(
                "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                % (ampSeq_nf, ampSeq_np, ampSeq_cf))
            continue
        if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
            warn(
                "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                % (batch_amplicon_name, folder_name, ampSeq_nf, ampSeq_np,
                   ampSeq_cf, amplicon_seq))
            continue
        if consensus_sequence == "":
            consensus_sequence = ampSeq_nf
        if ampSeq_nf != consensus_sequence:
            info(
                "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                % (batch_amplicon_name, folder_name))
            continue
        if 'Total' not in mod_freqs:
            info(
                "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                % (batch_amplicon_name, folder_name))
            continue

        total_reads = mod_freqs['Total'][0]
        # The parsed count may be numeric or a string, so test both forms.
        if total_reads == 0 or total_reads == "0":
            info(
                "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                % (batch_amplicon_name, folder_name))
            continue
        if (args.min_reads_for_inclusion is not None) and (
                int(total_reads) < args.min_reads_for_inclusion):
            info(
                "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                % (batch_amplicon_name, folder_name, str(total_reads),
                   args.min_reads_for_inclusion))
            continue

        # Use the builtin float: the ``np.float`` alias is no longer provided
        # by current NumPy releases and raises AttributeError there.
        total_reads_float = float(total_reads)
        mod_pcts = {}
        for key in mod_freqs:
            mod_pcts[key] = (np.array(mod_freqs[key]).astype(float) /
                             total_reads_float)

        amp_found_count += 1

        for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
            freq_row = [batchName, nuc]
            freq_row.extend(nuc_freqs[nuc])
            nucleotide_frequency_summary.append(freq_row)

            pct_row = [batchName, nuc]
            pct_row.extend(nuc_pcts[nuc])
            nucleotide_percentage_summary.append(pct_row)

        for mod in [
                'Insertions', 'Insertions_Left', 'Deletions',
                'Substitutions', 'All_modifications'
        ]:
            freq_row = [batchName, mod]
            freq_row.extend(mod_freqs[mod])
            modification_frequency_summary.append(freq_row)

            pct_row = [batchName, mod]
            pct_row.extend(mod_pcts[mod])
            modification_percentage_summary.append(pct_row)

    if amp_found_count == 0:
        info(
            "Couldn't find any data for amplicon '%s'. Not compiling results."
            % amplicon_name)
        return ()

    def build_summary_table(rows, label_columns):
        """Build a DataFrame with two label columns and numeric per-base columns."""
        colnames = list(label_columns)
        colnames.extend(list(consensus_sequence))
        table = pd.DataFrame(rows, columns=colnames)
        return pd.concat(
            [table.iloc[:, 0:2], table.iloc[:, 2:].apply(pd.to_numeric)],
            axis=1)

    nucleotide_frequency_summary_df = build_summary_table(
        nucleotide_frequency_summary, ['Batch', 'Nucleotide'])
    nucleotide_frequency_summary_df.to_csv(
        _jp(amplicon_name + '.NUCLEOTIDE_FREQUENCY_SUMMARY.txt'),
        sep='\t',
        index=None)

    nucleotide_percentage_summary_df = build_summary_table(
        nucleotide_percentage_summary, ['Batch', 'Nucleotide'])
    nucleotide_percentage_summary_df.to_csv(
        _jp(amplicon_name + '.NUCLEOTIDE_PERCENTAGE_SUMMARY.txt'),
        sep='\t',
        index=None)

    modification_frequency_summary_df = build_summary_table(
        modification_frequency_summary, ['Batch', 'Modification'])
    modification_frequency_summary_df.to_csv(
        _jp(amplicon_name + '.MODIFICATION_FREQUENCY_SUMMARY.txt'),
        sep='\t',
        index=None)

    modification_percentage_summary_df = build_summary_table(
        modification_percentage_summary, ['Batch', 'Modification'])
    modification_percentage_summary_df.to_csv(
        _jp(amplicon_name + '.MODIFICATION_PERCENTAGE_SUMMARY.txt'),
        sep='\t',
        index=None)

    # if guides are all the same, merge substitutions and perform base editor
    # comparison at guide quantification window
    if guides_all_same and consensus_guides != []:
        include_idxs = consensus_include_idxs
        sgRNA_intervals = consensus_sgRNA_intervals
        info(
            "All guides are equal. Performing comparison of batches for amplicon '%s'"
            % amplicon_name)

        # Columns 0 and 1 hold the batch and nucleotide/modification labels,
        # so sequence positions are shifted by two.
        include_idxs_flat = [0, 1]
        include_idxs_flat.extend([cutidx + 2 for cutidx in include_idxs])
        sub_nucleotide_percentage_summary_df = (
            nucleotide_percentage_summary_df.iloc[:, include_idxs_flat])
        sub_modification_percentage_summary_df = (
            modification_percentage_summary_df.iloc[:, include_idxs_flat])

        # Re-express each sgRNA interval as positions within include_idxs.
        sub_sgRNA_intervals = []
        for sgRNA_interval in sgRNA_intervals:
            newstart = None
            newend = None
            for pos, ref_idx in enumerate(include_idxs):
                if ref_idx <= sgRNA_interval[0]:
                    newstart = pos
                if newend is None and ref_idx >= sgRNA_interval[1]:
                    newend = pos

            # if guide doesn't overlap with include indexes
            if newend == 0 or newstart == len(include_idxs):
                continue
            # otherwise, correct partial overlaps
            elif newstart is None and newend is None:
                newstart = 0
                newend = len(include_idxs) - 1
            elif newstart is None:
                newstart = 0
            elif newend is None:
                newend = len(include_idxs) - 1
            # and add it to the list
            sub_sgRNA_intervals.append((newstart, newend))

        if not args.suppress_plots:
            CRISPRessoPlot.plot_nucleotide_quilt(
                sub_nucleotide_percentage_summary_df,
                sub_modification_percentage_summary_df,
                _jp(amplicon_name +
                    '.Quantification_Window_Nucleotide_Percentage_Quilt'),
                save_png,
                sgRNA_intervals=sub_sgRNA_intervals)
            if args.base_editor_output:
                CRISPRessoPlot.plot_conversion_map(
                    sub_nucleotide_percentage_summary_df,
                    _jp(amplicon_name +
                        '.Quantification_Window_Nucleotide_Conversion'),
                    args.conversion_nuc_from,
                    args.conversion_nuc_to,
                    save_png,
                    sgRNA_intervals=sub_sgRNA_intervals)

            CRISPRessoPlot.plot_nucleotide_quilt(
                nucleotide_percentage_summary_df,
                modification_percentage_summary_df,
                _jp(amplicon_name + '.Nucleotide_Percentage_Quilt'),
                save_png,
                sgRNA_intervals=sgRNA_intervals,
                quantification_window_idxs=include_idxs)
            if args.base_editor_output:
                CRISPRessoPlot.plot_conversion_map(
                    nucleotide_percentage_summary_df,
                    _jp(amplicon_name + '.Nucleotide_Conversion'),
                    args.conversion_nuc_from,
                    args.conversion_nuc_to,
                    save_png,
                    sgRNA_intervals=sgRNA_intervals)
    else:  # guides are not the same
        if not args.suppress_plots:
            CRISPRessoPlot.plot_nucleotide_quilt(
                nucleotide_percentage_summary_df,
                modification_percentage_summary_df,
                _jp(amplicon_name + '.Nucleotide_Percentage_Quilt'),
                save_png)
            if args.base_editor_output:
                CRISPRessoPlot.plot_conversion_map(
                    nucleotide_percentage_summary_df,
                    _jp(amplicon_name + '.Nucleotide_Conversion'),
                    args.conversion_nuc_from, args.conversion_nuc_to,
                    save_png)
def main():
    """Command-line entry point for CRISPRessoCompare.

    Parses the command line, checks the two CRISPResso output folders and,
    for every amplicon present in both, writes into
    ``CRISPRessoCompare_on_<name>``:

    * ``1.<amplicon>.Comparison_Efficiency`` - bar charts of the modified and
      unmodified read percentages and their difference;
    * ``2.<amplicon>.Comparison_*_Locations`` - per-position modification
      profiles and their difference;
    * ``<amplicon>.<mod>_quantification.txt`` and
      ``2.<amplicon>.<mod>.quantification`` - per-position counts with
      Fisher exact test odds ratios and p-values, and the matching plot;
    * one merged allele table plus ``3.*_top`` / ``3.*_bottom`` allele plots
      for every allele file name found in both folders.

    The process always ends through ``sys.exit``: status 0 on success, -1
    after any exception (the traceback is printed when ``--debug`` is given).
    """
    try:
        description = [
            '~~~CRISPRessoCompare~~~',
            '-Comparison of two CRISPResso analyses-'
        ]

        # NOTE(review): the banner's line breaks and spacing were restored
        # from a whitespace-collapsed copy - confirm against the upstream file.
        compare_header = r'''
 ___________________________
| __ __      __      __  __ |
|/  /  \|\/||__) /\ |__)|_  |
|\__\__/|  ||   /--\| \ |__ |
|___________________________|
        '''
        compare_header = CRISPRessoShared.get_crispresso_header(
            description, compare_header)
        print(compare_header)

        parser = argparse.ArgumentParser(
            description='CRISPRessoCompare Parameters',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument(
            'crispresso_output_folder_1',
            type=str,
            help='First output folder with CRISPResso analysis')
        parser.add_argument(
            'crispresso_output_folder_2',
            type=str,
            help='Second output folder with CRISPResso analysis')

        # OPTIONALS
        parser.add_argument('-n', '--name', help='Output name', default='')
        parser.add_argument('-n1',
                            '--sample_1_name',
                            help='Sample 1 name',
                            default='Sample_1')
        parser.add_argument('-n2',
                            '--sample_2_name',
                            help='Sample 2 name',
                            default='Sample_2')
        parser.add_argument('-o', '--output_folder', help='', default='')
        parser.add_argument(
            '--min_frequency_alleles_around_cut_to_plot',
            type=float,
            help=
            'Minimum %% reads required to report an allele in the alleles table plot.',
            default=0.2)
        parser.add_argument(
            '--max_rows_alleles_around_cut_to_plot',
            type=int,
            help='Maximum number of rows to report in the alleles table plot. ',
            default=50)
        parser.add_argument(
            '--save_also_png',
            help='Save also .png images additionally to .pdf files',
            action='store_true')
        parser.add_argument('--debug',
                            help='Show debug messages',
                            action='store_true')

        args = parser.parse_args()
        sample_1_name = args.sample_1_name
        sample_2_name = args.sample_2_name

        # check that the CRISPResso output is present and fill amplicon_info
        _quant_file_1, amplicon_names_1, amplicon_info_1 = (
            CRISPRessoShared.check_output_folder(
                args.crispresso_output_folder_1))
        _quant_file_2, amplicon_names_2, amplicon_info_2 = (
            CRISPRessoShared.check_output_folder(
                args.crispresso_output_folder_2))

        def get_name_from_folder(folder):
            """Return the folder's base name without the 'CRISPResso_on_' prefix."""
            return os.path.basename(os.path.abspath(folder)).replace(
                'CRISPResso_on_', '')

        if not args.name:
            database_id = '%s_VS_%s' % (
                get_name_from_folder(args.crispresso_output_folder_1),
                get_name_from_folder(args.crispresso_output_folder_2))
        else:
            database_id = args.name

        OUTPUT_DIRECTORY = 'CRISPRessoCompare_on_%s' % database_id
        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        def _jp(filename):
            """Return the path of ``filename`` inside the output directory."""
            return os.path.join(OUTPUT_DIRECTORY, filename)

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except OSError:
            # Only an already-existing directory is tolerated; any other
            # failure (e.g. permissions) must not be reported as "exists".
            if not os.path.isdir(OUTPUT_DIRECTORY):
                raise
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoCompare_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                '[Command used]:\nCRISPRessoCompare %s\n\n[Execution log]:\n'
                % ' '.join(sys.argv))

        # LOAD DATA
        amplicon_names_in_both = [
            amplicon_name for amplicon_name in amplicon_names_1
            if amplicon_name in amplicon_names_2
        ]
        n_refs = len(amplicon_names_in_both)

        def get_plot_title_with_ref_name(plotTitle, refName):
            """Append the reference name to the title when there are several."""
            if n_refs > 1:
                return (plotTitle + ": " + refName)
            return plotTitle

        def mark_cuts_and_guides(ax, cut_points, sgRNA_intervals, cut_y_range,
                                 guide_y):
            """Draw cut-point lines and sgRNA bars on ``ax``.

            Cut points become dashed vertical lines spanning ``cut_y_range``
            and sgRNA intervals become thick horizontal bars at ``guide_y``.
            As in the inline code this replaces, nothing is drawn when
            ``cut_points`` is empty. Only the first line of each kind gets a
            legend label; those labelled lines are returned.
            """
            labelled = []
            if cut_points:
                for idx, cut_point in enumerate(cut_points):
                    lines = ax.plot(
                        [cut_point, cut_point],
                        [cut_y_range[0], cut_y_range[1]],
                        '--k',
                        lw=2,
                        label=('Predicted cleavage position'
                               if idx == 0 else '_nolegend_'))
                    if idx == 0:
                        labelled.extend(lines)

                for idx, sgRNA_int in enumerate(sgRNA_intervals):
                    lines = ax.plot(
                        [sgRNA_int[0], sgRNA_int[1]], [guide_y, guide_y],
                        lw=10,
                        c=(0, 0, 0, 0.15),
                        label='sgRNA' if idx == 0 else '_nolegend_')
                    if idx == 0:
                        labelled.extend(lines)
            return labelled

        for amplicon_name in amplicon_names_in_both:
            profile_1 = parse_profile(
                amplicon_info_1[amplicon_name]['quantification_file'])
            profile_2 = parse_profile(
                amplicon_info_2[amplicon_name]['quantification_file'])

            # Explicit check instead of ``assert``, which is stripped under
            # ``python -O`` and would let mismatched profiles through.
            if (profile_1.shape[0] != profile_2.shape[0]
                    or not np.all(profile_1[:, 0] == profile_2[:, 0])):
                raise DifferentAmpliconLengthException(
                    'Different amplicon lengths for the two amplicons.')

            len_amplicon = profile_1.shape[0]
            effect_vector_any_1 = profile_1[:, 1]
            effect_vector_any_2 = profile_2[:, 1]
            cut_points, sgRNA_intervals = load_cut_points_sgRNA_intervals(
                args.crispresso_output_folder_1, amplicon_name)

            # The same tick positions are used on every position axis below.
            xtick_positions = np.arange(
                0, len_amplicon,
                max(3, (len_amplicon / 6) -
                    (len_amplicon / 6) % 5)).astype(int)

            # Quantification comparison barchart
            fig = plt.figure(figsize=(30, 15))
            n_groups = 2

            N_TOTAL_1 = float(amplicon_info_1[amplicon_name]['Total'])
            N_UNMODIFIED_1 = float(
                amplicon_info_1[amplicon_name]['Unmodified'])
            N_MODIFIED_1 = float(amplicon_info_1[amplicon_name]['Modified'])

            N_TOTAL_2 = float(amplicon_info_2[amplicon_name]['Total'])
            N_UNMODIFIED_2 = float(
                amplicon_info_2[amplicon_name]['Unmodified'])
            N_MODIFIED_2 = float(amplicon_info_2[amplicon_name]['Modified'])

            means_sample_1 = np.array([N_UNMODIFIED_1, N_MODIFIED_1
                                       ]) / N_TOTAL_1 * 100
            means_sample_2 = np.array([N_UNMODIFIED_2, N_MODIFIED_2
                                       ]) / N_TOTAL_2 * 100

            ax1 = fig.add_subplot(1, 2, 1)

            index = np.arange(n_groups)
            bar_width = 0.35
            opacity = 0.4

            ax1.bar(index,
                    means_sample_1,
                    bar_width,
                    alpha=opacity,
                    color=(0, 0, 1, 0.4),
                    label=sample_1_name)
            ax1.bar(index + bar_width,
                    means_sample_2,
                    bar_width,
                    alpha=opacity,
                    color=(1, 0, 0, 0.4),
                    label=sample_2_name)

            plt.ylabel('% Sequences')
            plt.title(
                get_plot_title_with_ref_name(
                    '%s VS %s' % (sample_1_name, sample_2_name),
                    amplicon_name))
            plt.xticks(index + bar_width / 2.0, ('Unmodified', 'Modified'))
            plt.legend()
            plt.tight_layout()

            ax2 = fig.add_subplot(1, 2, 2)
            ax2.bar(index,
                    means_sample_1 - means_sample_2,
                    bar_width + 0.35,
                    alpha=opacity,
                    color=(0, 1, 1, 0.4),
                    label='')

            plt.ylabel('% Sequences Difference')
            plt.title(
                get_plot_title_with_ref_name(
                    '%s - %s' % (sample_1_name, sample_2_name),
                    amplicon_name))
            plt.xticks(index, ['Unmodified', 'Modified'])
            plt.tight_layout()
            plt.savefig(_jp('1.' + amplicon_name +
                            '.Comparison_Efficiency.pdf'),
                        bbox_inches='tight')
            if args.save_also_png:
                plt.savefig(_jp('1.' + amplicon_name +
                                '.Comparison_Efficiency.png'),
                            bbox_inches='tight')
            # Release the figure; otherwise every amplicon leaves open
            # figures behind for the rest of the run.
            plt.close(fig)

            # profile comparison
            fig = plt.figure(figsize=(20, 10))
            ax1 = fig.add_subplot(1, 2, 1)
            plt.title(
                get_plot_title_with_ref_name('Mutation position distribution',
                                             amplicon_name))
            y_max = max(effect_vector_any_1.max(),
                        effect_vector_any_2.max()) * 1.2

            plt.plot(effect_vector_any_1,
                     color=(0, 0, 1, 0.3),
                     lw=4,
                     label='%s combined mutations' % sample_1_name)
            plt.plot(effect_vector_any_2,
                     color=(1, 0, 0, 0.3),
                     lw=4,
                     label='%s combined mutations' % sample_2_name)

            mark_cuts_and_guides(ax1, cut_points, sgRNA_intervals,
                                 (0, y_max), 0)

            lgd = plt.legend(loc='center',
                             bbox_to_anchor=(0.5, -0.3),
                             ncol=1,
                             fancybox=True,
                             shadow=False)

            plt.xticks(xtick_positions)
            plt.xlabel('Reference amplicon position (bp)')
            plt.ylabel('Sequences %')
            plt.ylim(0, max(1, y_max))
            plt.xlim(xmax=len_amplicon - 1)

            ax2 = fig.add_subplot(1, 2, 2)

            effect_vector_any_diff = effect_vector_any_1 - effect_vector_any_2

            y_max = effect_vector_any_diff.max() * 1.2
            y_min = effect_vector_any_diff.min() * 1.2
            # The difference axis always covers at least -1..1.
            diff_low = min(-1, y_min)
            diff_high = max(1, y_max)

            plt.title(
                get_plot_title_with_ref_name(
                    '%s - %s' % (sample_1_name, sample_2_name),
                    amplicon_name))
            plt.plot(effect_vector_any_diff,
                     color=(0, 1, 0, 0.4),
                     lw=3,
                     label='Difference')

            mark_cuts_and_guides(ax2, cut_points, sgRNA_intervals,
                                 (diff_low, diff_high), diff_low)

            lgd2 = plt.legend(loc='center',
                              bbox_to_anchor=(0.5, -0.2),
                              ncol=1,
                              fancybox=True,
                              shadow=False)
            plt.xticks(xtick_positions)

            plt.xlabel('Reference amplicon position (bp)')
            plt.ylabel('Sequences Difference %')
            plt.xlim(xmax=len_amplicon - 1)
            plt.ylim(diff_low, diff_high)

            # Both legends sit below their axes, so both are handed to
            # savefig to keep them inside the tight bounding box.
            # NOTE(review): the PDF name contains 'Combined_' and the PNG
            # name does not; kept as-is because consumers may rely on the
            # existing file names - confirm which one is intended.
            plt.savefig(_jp(
                '2.' + amplicon_name +
                '.Comparison_Combined_Insertion_Deletion_Substitution_Locations.pdf'
            ),
                        bbox_extra_artists=(lgd, lgd2),
                        bbox_inches='tight')
            if args.save_also_png:
                plt.savefig(_jp(
                    '2.' + amplicon_name +
                    '.Comparison_Insertion_Deletion_Substitution_Locations.png'
                ),
                            bbox_extra_artists=(lgd, lgd2),
                            bbox_inches='tight')
            plt.close(fig)

            mod_file_1 = amplicon_info_1[amplicon_name][
                'modification_count_file']
            amp_seq_1, mod_freqs_1 = CRISPRessoShared.parse_count_file(
                mod_file_1)
            mod_file_2 = amplicon_info_2[amplicon_name][
                'modification_count_file']
            amp_seq_2, mod_freqs_2 = CRISPRessoShared.parse_count_file(
                mod_file_2)
            consensus_sequence = amp_seq_1
            if amp_seq_2 != consensus_sequence:
                raise DifferentAmpliconLengthException(
                    'Different amplicon lengths for the two amplicons.')

            for mod in [
                    'Insertions', 'Deletions', 'Substitutions',
                    'All_modifications'
            ]:
                mod_counts_1 = np.array(mod_freqs_1[mod], dtype=float)
                tot_counts_1 = np.array(mod_freqs_1['Total'], dtype=float)
                unmod_counts_1 = tot_counts_1 - mod_counts_1

                mod_counts_2 = np.array(mod_freqs_2[mod], dtype=float)
                tot_counts_2 = np.array(mod_freqs_2['Total'], dtype=float)
                unmod_counts_2 = tot_counts_2 - mod_counts_2

                # Positions where all four counts are zero get a NaN odds
                # ratio and a p-value of 1 instead of running the test.
                fisher_results = [
                    stats.fisher_exact([[z[0], z[1]], [z[2], z[3]]])
                    if max(z) > 0 else [np.nan, 1.0]
                    for z in zip(mod_counts_1, unmod_counts_1, mod_counts_2,
                                 unmod_counts_2)
                ]
                oddsratios = [a for a, b in fisher_results]
                pvalues = [b for a, b in fisher_results]

                table_rows = []
                for row_label, row_values in (
                        (sample_1_name + '_' + mod, mod_counts_1),
                        (sample_1_name + '_total', tot_counts_1),
                        (sample_2_name + '_' + mod, mod_counts_2),
                        (sample_2_name + '_total', tot_counts_2),
                        ('odds_ratios', oddsratios),
                        ('pvalues', pvalues)):
                    table_row = [row_label]
                    table_row.extend(row_values)
                    table_rows.append(table_row)

                colnames = ['Reference']
                colnames.extend(list(consensus_sequence))
                mod_df = pd.DataFrame(table_rows, columns=colnames)
                # write to file
                mod_df.to_csv(_jp(amplicon_name + '.' + mod +
                                  '_quantification.txt'),
                              sep='\t',
                              index=None)

                # plot
                fig = plt.figure(figsize=(20, 10))
                ax1 = fig.add_subplot(2, 1, 1)

                diff = np.divide(mod_counts_1, tot_counts_1) - np.divide(
                    mod_counts_2, tot_counts_2)
                diff_plot = ax1.plot(diff,
                                     color=(0, 1, 0, 0.4),
                                     lw=3,
                                     label='Difference')
                ax1.set_title(
                    get_plot_title_with_ref_name(
                        '%s: %s - %s' % (mod, sample_1_name, sample_2_name),
                        amplicon_name))
                ax1.set_xticks(xtick_positions)
                ax1.set_ylabel('Sequences Difference %')
                ax1.set_xlim(xmin=0, xmax=len_amplicon - 1)

                # Replace exact zeros by the smallest non-zero p-value so
                # that -log10 stays finite.
                pvalues = np.array(pvalues)
                min_nonzero = np.min(pvalues[np.nonzero(pvalues)])
                pvalues[pvalues == 0] = min_nonzero

                ax2 = fig.add_subplot(2, 1, 2)
                pval_plot = ax2.plot(-1 * np.log10(pvalues),
                                     color=(1, 0, 0, 0.4),
                                     lw=2,
                                     label='-log10 P-value')
                ax2.set_ylabel('-log10 P-value')
                ax2.set_xlim(xmin=0, xmax=len_amplicon - 1)
                ax2.set_xticks(xtick_positions)
                ax2.set_xlabel('Reference amplicon position (bp)')

                # bonferroni correction
                corrected_p = -1 * np.log10(
                    0.01 / float(len(consensus_sequence)))
                cutoff_plot = ax2.plot([0, len(consensus_sequence)],
                                       [corrected_p, corrected_p],
                                       color='k',
                                       dashes=(5, 10),
                                       label='Bonferronni corrected cutoff')

                plots = diff_plot + pval_plot + cutoff_plot

                diff_y_min, diff_y_max = ax1.get_ylim()
                p_y_min, p_y_max = ax2.get_ylim()
                # Each axis uses its own y-limits. Previously every cut
                # point after the first was drawn on the p-value axis with
                # the difference axis' limits.
                plots = plots + mark_cuts_and_guides(
                    ax1, cut_points, sgRNA_intervals,
                    (diff_y_min, diff_y_max), diff_y_min)
                mark_cuts_and_guides(ax2, cut_points, sgRNA_intervals,
                                     (p_y_min, p_y_max), p_y_min)

                labs = [p.get_label() for p in plots]
                lgd = plt.legend(plots,
                                 labs,
                                 loc='upper center',
                                 bbox_to_anchor=(0.5, -0.2),
                                 ncol=1,
                                 fancybox=True,
                                 shadow=False)
                plt.savefig(_jp('2.' + amplicon_name + '.' + mod +
                                '.quantification.pdf'),
                            bbox_extra_artists=(lgd, ),
                            bbox_inches='tight')
                if args.save_also_png:
                    plt.savefig(_jp('2.' + amplicon_name + '.' + mod +
                                    '.quantification.png'),
                                bbox_extra_artists=(lgd, ),
                                bbox_inches='tight')
                plt.close(fig)

            # create merged heatmaps for each cut site
            allele_files_1 = amplicon_info_1[amplicon_name]['allele_files']
            allele_files_2 = amplicon_info_2[amplicon_name]['allele_files']
            for allele_file_1 in allele_files_1:
                # get file part of path
                allele_file_1_name = os.path.split(allele_file_1)[1]
                for allele_file_2 in allele_files_2:
                    allele_file_2_name = os.path.split(allele_file_2)[1]
                    # only files with the same name (same amplicon, cut
                    # site, guide) are compared
                    if allele_file_1_name != allele_file_2_name:
                        continue

                    df1 = pd.read_csv(allele_file_1, sep="\t")
                    df2 = pd.read_csv(allele_file_2, sep="\t")

                    # find unmodified reference for comparison (if it
                    # exists): first sample first, then the second
                    ref_seq_around_cut = ""
                    ungapped = df1.loc[df1['Reference_Sequence'].str.
                                       contains('-') == False]
                    if len(ungapped) == 0:
                        ungapped = df2.loc[df2['Reference_Sequence'].str.
                                           contains('-') == False]

                    if len(ungapped) > 0:
                        ref_seq_around_cut = ungapped[
                            'Reference_Sequence'].iloc[0]
                    else:
                        # otherwise figure out which sgRNA was used for this
                        # comparison and cut the window out of the amplicon.
                        # This parser defines no offset_around_cut_to_plot
                        # option, so fall back to half the width of a table
                        # row. NOTE(review): assumes every Reference_Sequence
                        # entry spans the full 2*offset window - confirm.
                        offset = getattr(args, 'offset_around_cut_to_plot',
                                         None)
                        if offset is None:
                            for table in (df1, df2):
                                windows = table['Reference_Sequence'].dropna()
                                if len(windows) > 0:
                                    offset = len(windows.iloc[0]) // 2
                                    break
                        if offset is not None:
                            for sgRNA_interval, cut_point in zip(
                                    sgRNA_intervals, cut_points):
                                sgRNA_seq = consensus_sequence[
                                    sgRNA_interval[0]:sgRNA_interval[1]]
                                if sgRNA_seq in allele_file_1_name:
                                    ref_seq_around_cut = consensus_sequence[
                                        max(0, cut_point - offset + 1):min(
                                            len(consensus_sequence),
                                            cut_point + offset + 1)]
                                    break

                    merged = pd.merge(df1,
                                      df2,
                                      on=[
                                          'Aligned_Sequence',
                                          'Reference_Sequence', 'Unedited',
                                          'n_deleted', 'n_inserted',
                                          'n_mutated'
                                      ],
                                      suffixes=('_' + sample_1_name,
                                                '_' + sample_2_name),
                                      how='outer')
                    quant_cols = [
                        '#Reads_' + sample_1_name, '%Reads_' + sample_1_name,
                        '#Reads_' + sample_2_name, '%Reads_' + sample_2_name
                    ]
                    # Alleles seen in only one sample count as 0 in the other.
                    merged[quant_cols] = merged[quant_cols].fillna(0)
                    # Pseudo-count that keeps the log fold change finite.
                    lfc_error = 0.1
                    merged['each_LFC'] = np.log2(
                        ((merged['%Reads_' + sample_1_name] + lfc_error) /
                         (merged['%Reads_' + sample_2_name] +
                          lfc_error)).astype(float)).replace(
                              [np.inf, np.nan], 0)
                    merged = merged.reset_index().set_index(
                        'Aligned_Sequence')
                    output_root = allele_file_1_name.replace(".txt", "")
                    # NOTE(review): index=None omits the Aligned_Sequence
                    # index from the written table - confirm this is wanted.
                    merged.to_csv(_jp(output_root + ".txt"),
                                  sep="\t",
                                  index=None)
                    CRISPRessoPlot.plot_alleles_table_compare(
                        ref_seq_around_cut,
                        merged.sort_values(['each_LFC'], ascending=True),
                        sample_1_name,
                        sample_2_name,
                        _jp('3.' + output_root + "_top"),
                        MIN_FREQUENCY=args.
                        min_frequency_alleles_around_cut_to_plot,
                        MAX_N_ROWS=args.max_rows_alleles_around_cut_to_plot,
                        SAVE_ALSO_PNG=args.save_also_png)
                    CRISPRessoPlot.plot_alleles_table_compare(
                        ref_seq_around_cut,
                        merged.sort_values(['each_LFC'], ascending=False),
                        sample_1_name,
                        sample_2_name,
                        _jp('3.' + output_root + "_bottom"),
                        MIN_FREQUENCY=args.
                        min_frequency_alleles_around_cut_to_plot,
                        MAX_N_ROWS=args.max_rows_alleles_around_cut_to_plot,
                        SAVE_ALSO_PNG=args.save_also_png)

        info('All Done!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # ``args`` is missing when the failure happened before or during
        # argument parsing, so look it up defensively.
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
def main(): try: description = [ '~~~CRISPRessoCompare~~~', '-Comparison of two CRISPResso analyses-' ] compare_header = r''' ___________________________ | __ __ __ __ __ | |/ / \|\/||__) /\ |__)|_ | |\__\__/| || /--\| \ |__ | |___________________________| ''' compare_header = CRISPRessoShared.get_crispresso_header( description, compare_header) print(compare_header) parser = argparse.ArgumentParser( description='CRISPRessoCompare Parameters', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( 'crispresso_output_folder_1', type=str, help='First output folder with CRISPResso analysis') parser.add_argument( 'crispresso_output_folder_2', type=str, help='Second output folder with CRISPResso analysis') #OPTIONALS parser.add_argument('-n', '--name', help='Output name', default='') parser.add_argument('-n1', '--sample_1_name', help='Sample 1 name') parser.add_argument('-n2', '--sample_2_name', help='Sample 2 name') parser.add_argument('-o', '--output_folder', help='', default='') parser.add_argument( '--min_frequency_alleles_around_cut_to_plot', type=float, help= 'Minimum %% reads required to report an allele in the alleles table plot.', default=0.2) parser.add_argument( '--max_rows_alleles_around_cut_to_plot', type=int, help='Maximum number of rows to report in the alleles table plot. ', default=50) parser.add_argument('--suppress_report', help='Suppress output report', action='store_true') parser.add_argument( '--place_report_in_output_folder', help= 'If true, report will be written inside the CRISPResso output folder. 
By default, the report will be written one directory up from the report output.', action='store_true') parser.add_argument('--debug', help='Show debug messages', action='store_true') args = parser.parse_args() debug_flag = args.debug #check that the CRISPResso output is present and fill amplicon_info quantification_file_1, amplicon_names_1, amplicon_info_1 = CRISPRessoShared.check_output_folder( args.crispresso_output_folder_1) quantification_file_2, amplicon_names_2, amplicon_info_2 = CRISPRessoShared.check_output_folder( args.crispresso_output_folder_2) run_info_1_file = os.path.join(args.crispresso_output_folder_1, 'CRISPResso2_info.pickle') if os.path.isfile(run_info_1_file) is False: raise CRISPRessoShared.OutputFolderIncompleteException( 'The folder %s is not a valid CRISPResso2 output folder. Cannot find run data at %s' % (args.crispresso_output_folder_1, run_info_1_file)) run_info_1 = cp.load(open(run_info_1_file, 'rb')) run_info_2_file = os.path.join(args.crispresso_output_folder_2, 'CRISPResso2_info.pickle') if os.path.isfile(run_info_2_file) is False: raise CRISPRessoShared.OutputFolderIncompleteException( 'The folder %s is not a valid CRISPResso2 output folder. 
Cannot find run data at %s' % (args.crispresso_output_folder_2, run_info_2_file)) run_info_2 = cp.load(open(run_info_2_file, 'rb')) sample_1_name = args.sample_1_name if args.sample_1_name is None: sample_1_name = "Sample 1" if 'name' in run_info_1 and run_info_1['name'] != '': sample_1_name = run_info_1['name'] sample_2_name = args.sample_2_name if args.sample_2_name is None: sample_2_name = "Sample 2" if 'name' in run_info_2 and run_info_2['name'] != '': sample_2_name = run_info_2['name'] get_name_from_folder = lambda x: os.path.basename(os.path.abspath( x)).replace('CRISPResso_on_', '') if not args.name: database_id = '%s_VS_%s' % ( get_name_from_folder(args.crispresso_output_folder_1), get_name_from_folder(args.crispresso_output_folder_2)) else: database_id = args.name OUTPUT_DIRECTORY = 'CRISPRessoCompare_on_%s' % database_id if args.output_folder: OUTPUT_DIRECTORY = os.path.join( os.path.abspath(args.output_folder), OUTPUT_DIRECTORY) _jp = lambda filename: os.path.join( OUTPUT_DIRECTORY, filename ) #handy function to put a file in the output directory log_filename = _jp('CRISPRessoCompare_RUNNING_LOG.txt') try: info('Creating Folder %s' % OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY) info('Done!') except: warn('Folder %s already exists.' 
% OUTPUT_DIRECTORY) log_filename = _jp('CRISPRessoCompare_RUNNING_LOG.txt') logging.getLogger().addHandler(logging.FileHandler(log_filename)) with open(log_filename, 'w+') as outfile: outfile.write( '[Command used]:\nCRISPRessoCompare %s\n\n[Execution log]:\n' % ' '.join(sys.argv)) crispresso2Compare_info_file = os.path.join( OUTPUT_DIRECTORY, 'CRISPResso2Compare_info.pickle') crispresso2_info = { } #keep track of all information for this run to be pickled and saved at the end of the run crispresso2_info['version'] = CRISPRessoShared.__version__ crispresso2_info['args'] = deepcopy(args) crispresso2_info['log_filename'] = os.path.basename(log_filename) crispresso2_info['summary_plot_names'] = [] crispresso2_info['summary_plot_titles'] = {} crispresso2_info['summary_plot_labels'] = {} crispresso2_info['summary_plot_datas'] = {} save_png = True if args.suppress_report: save_png = False #LOAD DATA amplicon_names_in_both = [ amplicon_name for amplicon_name in amplicon_names_1 if amplicon_name in amplicon_names_2 ] n_refs = len(amplicon_names_in_both) def get_plot_title_with_ref_name(plotTitle, refName): if n_refs > 1: return (plotTitle + ": " + refName) return plotTitle for amplicon_name in amplicon_names_in_both: profile_1 = parse_profile( amplicon_info_1[amplicon_name]['quantification_file']) profile_2 = parse_profile( amplicon_info_2[amplicon_name]['quantification_file']) amplicon_plot_name = amplicon_name + "." 
if len(amplicon_names_in_both ) == 1 and amplicon_name == "Reference": amplicon_plot_name = "" try: assert np.all(profile_1[:, 0] == profile_2[:, 0]) except: raise DifferentAmpliconLengthException( 'Different amplicon lengths for the two amplicons.') len_amplicon = profile_1.shape[0] effect_vector_any_1 = profile_1[:, 1] effect_vector_any_2 = profile_2[:, 1] cut_points = run_info_1['refs'][amplicon_name]['sgRNA_cut_points'] sgRNA_intervals = run_info_1['refs'][amplicon_name][ 'sgRNA_intervals'] #Quantification comparison barchart fig = plt.figure(figsize=(30, 15)) n_groups = 2 N_TOTAL_1 = float(amplicon_info_1[amplicon_name]['Reads_aligned']) N_UNMODIFIED_1 = float( amplicon_info_1[amplicon_name]['Unmodified']) N_MODIFIED_1 = float(amplicon_info_1[amplicon_name]['Modified']) N_TOTAL_2 = float(amplicon_info_2[amplicon_name]['Reads_aligned']) N_UNMODIFIED_2 = float( amplicon_info_2[amplicon_name]['Unmodified']) N_MODIFIED_2 = float(amplicon_info_2[amplicon_name]['Modified']) means_sample_1 = np.array([N_UNMODIFIED_1, N_MODIFIED_1 ]) / N_TOTAL_1 * 100 means_sample_2 = np.array([N_UNMODIFIED_2, N_MODIFIED_2 ]) / N_TOTAL_2 * 100 ax1 = fig.add_subplot(1, 2, 1) index = np.arange(n_groups) bar_width = 0.35 opacity = 0.4 error_config = {'ecolor': '0.3'} rects1 = ax1.bar(index, means_sample_1, bar_width, alpha=opacity, color=(0, 0, 1, 0.4), label=sample_1_name) rects2 = ax1.bar(index + bar_width, means_sample_2, bar_width, alpha=opacity, color=(1, 0, 0, 0.4), label=sample_2_name) plt.ylabel('% Sequences') plt.title( get_plot_title_with_ref_name( '%s VS %s' % (sample_1_name, sample_2_name), amplicon_name)) plt.xticks(index + bar_width / 2.0, ('Unmodified', 'Modified')) plt.legend() # plt.xlim(index[0]-0.2,(index + bar_width)[-1]+bar_width+0.2) plt.tight_layout() ax2 = fig.add_subplot(1, 2, 2) ax2.bar(index, means_sample_1 - means_sample_2, bar_width + 0.35, alpha=opacity, color=(0, 1, 1, 0.4), label='') plt.ylabel('% Sequences Difference') plt.title( 
get_plot_title_with_ref_name( '%s - %s' % (sample_1_name, sample_2_name), amplicon_name)) plt.xticks(index, ['Unmodified', 'Modified']) # plt.xlim(index[0]-bar_width/2, (index+bar_width)[-1]+2*bar_width) plt.tight_layout() plot_name = '1.' + amplicon_plot_name + 'Editing_comparison' plt.savefig(_jp(plot_name) + '.pdf', bbox_inches='tight') if save_png: plt.savefig(_jp(plot_name) + '.png', bbox_inches='tight') crispresso2_info['summary_plot_names'].append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'Editing efficiency comparison' crispresso2_info['summary_plot_labels'][ plot_name] = 'Figure 1: Comparison for amplicon ' + amplicon_name + '; Left: Percentage of modified and unmodified reads in each sample; Right: relative percentage of modified and unmodified reads' output_1 = os.path.join(args.crispresso_output_folder_1, run_info_1['report_filename']) output_2 = os.path.join(args.crispresso_output_folder_1, run_info_2['report_filename']) crispresso2_info['summary_plot_datas'][plot_name] = [] if os.path.isfile(output_1): crispresso2_info['summary_plot_datas'][plot_name].append( (sample_1_name + ' output', os.path.relpath(output_1, OUTPUT_DIRECTORY))) if os.path.isfile(output_2): crispresso2_info['summary_plot_datas'][plot_name].append( (sample_2_name + ' output', os.path.relpath(output_2, OUTPUT_DIRECTORY))) mod_file_1 = amplicon_info_1[amplicon_name][ 'modification_count_file'] amp_seq_1, mod_freqs_1 = CRISPRessoShared.parse_count_file( mod_file_1) mod_file_2 = amplicon_info_2[amplicon_name][ 'modification_count_file'] amp_seq_2, mod_freqs_2 = CRISPRessoShared.parse_count_file( mod_file_2) consensus_sequence = amp_seq_1 if amp_seq_2 != consensus_sequence: raise DifferentAmpliconLengthException( 'Different amplicon lengths for the two amplicons.') for mod in [ 'Insertions', 'Deletions', 'Substitutions', 'All_modifications' ]: mod_name = mod if mod == "All_modifications": mod_name = "Combined modifications (insertions, deletions and substitutions)" 
mod_counts_1 = np.array(mod_freqs_1[mod], dtype=float) tot_counts_1 = np.array(mod_freqs_1['Total'], dtype=float) unmod_counts_1 = tot_counts_1 - mod_counts_1 mod_counts_2 = np.array(mod_freqs_2[mod], dtype=float) tot_counts_2 = np.array(mod_freqs_2['Total'], dtype=float) unmod_counts_2 = tot_counts_2 - mod_counts_2 fisher_results = [ stats.fisher_exact([[z[0], z[1]], [z[2], z[3]]]) if max(z) > 0 else [nan, 1.0] for z in zip(mod_counts_1, unmod_counts_1, mod_counts_2, unmod_counts_2) ] oddsratios, pvalues = [a for a, b in fisher_results ], [b for a, b in fisher_results] mod_df = [] row = [sample_1_name + '_' + mod] row.extend(mod_counts_1) mod_df.append(row) row = [sample_1_name + '_total'] row.extend(tot_counts_1) mod_df.append(row) row = [sample_2_name + '_' + mod] row.extend(mod_counts_2) mod_df.append(row) row = [sample_2_name + '_total'] row.extend(tot_counts_2) mod_df.append(row) row = ['odds_ratios'] row.extend(oddsratios) mod_df.append(row) row = ['pvalues'] row.extend(pvalues) mod_df.append(row) colnames = ['Reference'] colnames.extend(list(consensus_sequence)) mod_df = pd.DataFrame(mod_df, columns=colnames) # mod_df = pd.concat([mod_df.iloc[:,0:2], mod_df.iloc[:,2:].apply(pd.to_numeric)],axis=1) #write to file mod_filename = _jp(amplicon_plot_name + mod + "_quantification.txt") mod_df.to_csv(mod_filename, sep='\t', index=None) #plot fig = plt.figure(figsize=(20, 10)) ax1 = fig.add_subplot(2, 1, 1) diff = np.divide(mod_counts_1, tot_counts_1) - np.divide( mod_counts_2, tot_counts_2) diff_plot = ax1.plot(diff, color=(0, 1, 0, 0.4), lw=3, label='Difference') ax1.set_title( get_plot_title_with_ref_name( '%s: %s - %s' % (mod, sample_1_name, sample_2_name), amplicon_name)) ax1.set_xticks( np.arange( 0, len_amplicon, max(3, (len_amplicon / 6) - (len_amplicon / 6) % 5)).astype(int)) ax1.set_ylabel('Sequences Difference %') ax1.set_xlim(xmin=0, xmax=len_amplicon - 1) pvalues = np.array(pvalues) min_nonzero = np.min(pvalues[np.nonzero(pvalues)]) pvalues[pvalues == 
0] = min_nonzero #ax2 = ax1.twinx() ax2 = fig.add_subplot(2, 1, 2) pval_plot = ax2.plot(-1 * np.log10(pvalues), color=(1, 0, 0, 0.4), lw=2, label='-log10 P-value') ax2.set_ylabel('-log10 P-value') ax2.set_xlim(xmin=0, xmax=len_amplicon - 1) ax2.set_xticks( np.arange( 0, len_amplicon, max(3, (len_amplicon / 6) - (len_amplicon / 6) % 5)).astype(int)) ax2.set_xlabel('Reference amplicon position (bp)') #bonferroni correction corrected_p = -1 * np.log10( 0.01 / float(len(consensus_sequence))) cutoff_plot = ax2.plot([0, len(consensus_sequence)], [corrected_p, corrected_p], color='k', dashes=(5, 10), label='Bonferronni corrected cutoff') plots = diff_plot + pval_plot + cutoff_plot diff_y_min, diff_y_max = ax1.get_ylim() p_y_min, p_y_max = ax2.get_ylim() if cut_points: for idx, cut_point in enumerate(cut_points): if idx == 0: plot_cleavage = ax1.plot( [cut_point, cut_point], [diff_y_min, diff_y_max], '--k', lw=2, label='Predicted cleavage position') ax2.plot([cut_point, cut_point], [p_y_min, p_y_max], '--k', lw=2, label='Predicted cleavage position') plots = plots + plot_cleavage else: ax1.plot([cut_point, cut_point], [diff_y_min, diff_y_max], '--k', lw=2, label='_nolegend_') ax2.plot([cut_point, cut_point], [diff_y_min, diff_y_max], '--k', lw=2, label='_nolegend_') for idx, sgRNA_int in enumerate(sgRNA_intervals): if idx == 0: p2 = ax1.plot([sgRNA_int[0], sgRNA_int[1]], [diff_y_min, diff_y_min], lw=10, c=(0, 0, 0, 0.15), label='sgRNA') ax2.plot([sgRNA_int[0], sgRNA_int[1]], [p_y_min, p_y_min], lw=10, c=(0, 0, 0, 0.15), label='sgRNA') plots = plots + p2 else: ax1.plot([sgRNA_int[0], sgRNA_int[1]], [diff_y_min, diff_y_min], lw=10, c=(0, 0, 0, 0.15), label='_nolegend_') ax2.plot([sgRNA_int[0], sgRNA_int[1]], [p_y_min, p_y_min], lw=10, c=(0, 0, 0, 0.15), label='_nolegend_') labs = [p.get_label() for p in plots] lgd = plt.legend(plots, labs, loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=1, fancybox=True, shadow=False) plot_name = '2.' 
+ amplicon_plot_name + mod + '_quantification' plt.savefig(_jp(plot_name + '.pdf'), bbox_inches='tight', bbox_extra_artists=(lgd, )) if save_png: plt.savefig(_jp(plot_name + '.png'), bbox_inches='tight', bbox_extra_artists=(lgd, )) crispresso2_info['summary_plot_names'].append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = mod_name + ' locations' crispresso2_info['summary_plot_labels'][ plot_name] = mod_name + ' location comparison for amplicon ' + amplicon_name + '; Top: percent difference; Bottom: p-value.' crispresso2_info['summary_plot_datas'][plot_name] = [ (mod_name + ' quantification', os.path.basename(mod_filename)) ] #create merged heatmaps for each cut site allele_files_1 = amplicon_info_1[amplicon_name]['allele_files'] allele_files_2 = amplicon_info_2[amplicon_name]['allele_files'] for allele_file_1 in allele_files_1: allele_file_1_name = os.path.split(allele_file_1)[ 1] #get file part of path for allele_file_2 in allele_files_2: allele_file_2_name = os.path.split(allele_file_2)[ 1] #get file part of path #if files are the same (same amplicon, cut site, guide), run comparison if allele_file_1_name == allele_file_2_name: df1 = pd.read_csv(allele_file_1, sep="\t") df2 = pd.read_csv(allele_file_2, sep="\t") #find unmodified reference for comparison (if it exists) ref_seq_around_cut = "" if len(df1.loc[df1['Reference_Sequence'].str.contains( '-') == False]) > 0: ref_seq_around_cut = df1.loc[ df1['Reference_Sequence'].str.contains('-') == False]['Reference_Sequence'].iloc[0] #otherwise figure out which sgRNA was used for this comparison elif len(df2.loc[df2['Reference_Sequence'].str. 
contains('-') == False]) > 0: ref_seq_around_cut = df2.loc[ df2['Reference_Sequence'].str.contains('-') == False]['Reference_Sequence'].iloc[0] else: seq_len = df2[df2['Unedited'] == True]['Reference_Sequence'].iloc[0] for sgRNA_interval, cut_point in zip( sgRNA_intervals, cut_points): sgRNA_seq = consensus_sequence[ sgRNA_interval[0]:sgRNA_interval[1]] if sgRNA_seq in allele_file_1_name: this_sgRNA_seq = sgRNA_seq this_cut_point = cut_point ref_seq_around_cut = consensus_sequence[max( 0, this_cut_point - args.offset_around_cut_to_plot + 1):min( len(reference_seq), cut_point + args.offset_around_cut_to_plot + 1)] break merged = pd.merge(df1, df2, on=[ 'Aligned_Sequence', 'Reference_Sequence', 'Unedited', 'n_deleted', 'n_inserted', 'n_mutated' ], suffixes=('_' + sample_1_name, '_' + sample_2_name), how='outer') quant_cols = [ '#Reads_' + sample_1_name, '%Reads_' + sample_1_name, '#Reads_' + sample_2_name, '%Reads_' + sample_2_name ] merged[quant_cols] = merged[quant_cols].fillna(0) lfc_error = 0.1 merged['each_LFC'] = np.log2( ((merged['%Reads_' + sample_1_name] + lfc_error) / (merged['%Reads_' + sample_2_name] + lfc_error) ).astype(float)).replace([np.inf, np.NaN], 0) merged = merged.reset_index().set_index( 'Aligned_Sequence') output_root = allele_file_1_name.replace(".txt", "") allele_comparison_file = _jp(output_root + '.txt') merged.to_csv(allele_comparison_file, sep="\t", index=None) plot_name = '3.' + output_root + '_top' CRISPRessoPlot.plot_alleles_table_compare( ref_seq_around_cut, merged.sort_values(['each_LFC'], ascending=True), sample_1_name, sample_2_name, _jp(plot_name), MIN_FREQUENCY=args. min_frequency_alleles_around_cut_to_plot, MAX_N_ROWS=args. max_rows_alleles_around_cut_to_plot, SAVE_ALSO_PNG=save_png) crispresso2_info['summary_plot_names'].append( plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'Alleles enriched in ' + sample_1_name crispresso2_info['summary_plot_labels'][plot_name] = 'Distribution comparison of alleles. 
Nucleotides are indicated by unique colors (A = green; C = red; G = yellow; T = purple). Substitutions are shown in bold font. Red rectangles highlight inserted sequences. Horizontal dashed lines indicate deleted sequences. The vertical dashed line indicates the predicted cleavage site. '+ \ 'The proportion and number of reads is shown for each sample on the right, with the values for ' + sample_1_name + ' followed by the values for ' + sample_2_name +'. Alleles are sorted for enrichment in ' + sample_1_name+'.' crispresso2_info['summary_plot_datas'][plot_name] = [ ('Allele comparison table', os.path.basename(allele_comparison_file)) ] plot_name = '3.' + output_root + '_bottom' CRISPRessoPlot.plot_alleles_table_compare( ref_seq_around_cut, merged.sort_values(['each_LFC'], ascending=False), sample_1_name, sample_2_name, _jp(plot_name), MIN_FREQUENCY=args. min_frequency_alleles_around_cut_to_plot, MAX_N_ROWS=args. max_rows_alleles_around_cut_to_plot, SAVE_ALSO_PNG=save_png) crispresso2_info['summary_plot_names'].append( plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'Alleles enriched in ' + sample_2_name crispresso2_info['summary_plot_labels'][plot_name] = 'Distribution comparison of alleles. Nucleotides are indicated by unique colors (A = green; C = red; G = yellow; T = purple). Substitutions are shown in bold font. Red rectangles highlight inserted sequences. Horizontal dashed lines indicate deleted sequences. The vertical dashed line indicates the predicted cleavage site. '+ \ 'The proportion and number of reads is shown for each sample on the right, with the values for ' + sample_1_name + ' followed by the values for ' + sample_2_name +'. Alleles are sorted for enrichment in ' + sample_2_name+'.' 
crispresso2_info['summary_plot_datas'][plot_name] = [ ('Allele comparison table', os.path.basename(allele_comparison_file)) ] if not args.suppress_report: if (args.place_report_in_output_folder): report_name = _jp("CRISPResso2Batch_report.html") else: report_name = OUTPUT_DIRECTORY + '.html' CRISPRessoReport.make_compare_report_from_folder( report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT) crispresso2_info['report_location'] = report_name crispresso2_info['report_filename'] = os.path.basename(report_name) cp.dump(crispresso2_info, open(crispresso2Compare_info_file, 'wb')) info('Analysis Complete!') print(CRISPRessoShared.get_crispresso_footer()) sys.exit(0) except Exception as e: debug_flag = False if 'args' in vars() and 'debug' in args: debug_flag = args.debug if debug_flag: traceback.print_exc(file=sys.stdout) error('\n\nERROR: %s' % e) sys.exit(-1)