def main(): try: description = [ '~~~CRISPRessoMeta~~~', '-Analysis of CRISPR/Cas9 outcomes from deep sequencing data using a metadata file-' ] meta_string = r''' ________________________________________ | _________ ______ _______ ______ | | | | | | | \ | | | | | | | | | | | | | | | | | |---- | | | |__| | | | |_| |_| |_| |_|____ |_| |_| |_| | |________________________________________| ''' print(CRISPRessoShared.get_crispresso_header(description, meta_string)) parser = CRISPRessoShared.getCRISPRessoArgParser( parserTitle='CRISPRessoMeta Parameters') #batch specific params parser.add_argument( '--metadata', type=str, help='Metadata file according to NIST specification', required=True) parser.add_argument( '-mo', '--meta_output_folder', help='Directory where analysis output will be stored') parser.add_argument( '-p', '--n_processes', type=int, help='Specify the number of processes to use for quantification.\ Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.', default=1) parser.add_argument('--crispresso_command', help='CRISPResso command to call', default='CRISPResso') args = parser.parse_args() debug_flag = args.debug crispresso_options = CRISPRessoShared.get_crispresso_options() options_to_ignore = set(['name', 'output_folder']) crispresso_options_for_meta = list(crispresso_options - options_to_ignore) CRISPRessoShared.check_file(args.metadata) meta_params = pd.DataFrame( columns=['name', 'guide_seq', 'amplicon_seq']) with open(args.metadata) as metadata_file: metadata = json.load(metadata_file) exp = metadata['Experiment'] for guide in data['Experiment']: print('Guide: ' + guide['name']) print('Sequence: ' + guide['sequence']) print('Amplicon: ' + guide['amplicon']) print('Fastq_R1: ' + guide['fastq_r1']) print('Fastq_R2: ' + guide['fastq_r2']) meta_params.append({ 'name': guide['name'], 'guide_seq': guide['sequence'], 'amplicon_seq': guide['amplicon'], 'fastq_r1': guide['fastq_r1'], 'fastq_r2': guide['fastq_r2'] }) print('table:') print(meta_params) #rename column "a" to "amplicon_seq", etc meta_params.rename( index=str, columns=CRISPRessoShared.get_crispresso_options_lookup(), inplace=True) meta_count = meta_params.shape[0] meta_params.index = range(meta_count) if 'fastq_r1' not in meta_params: raise CRISPRessoShared.BadParameterException( "fastq_r1 must be specified in the meta settings file. Current headings are: " + str(meta_params.columns.values)) #add args from the command line to meta_params for arg in vars(args): if arg not in meta_params: meta_params[arg] = getattr(args, arg) else: if (getattr(args, arg) is not None): meta_params[arg].fillna(value=getattr(args, arg), inplace=True) #assert that all names are unique #and clean names for i in range(meta_count): if meta_params.loc[i, 'name'] == '': meta_params.at[i, 'name'] = i meta_params.at[i, 'name'] = CRISPRessoShared.clean_filename( meta_params.loc[i, 'name']) if meta_params.drop_duplicates( 'name').shape[0] != meta_params.shape[0]: raise CRISPRessoShared.BadParameterException( 'Sample input names must be unique. The given names are not unique: ' + str(meta_params.loc[:, 'name'])) #Check files meta_params[ "sgRNA_intervals"] = '' #create empty array for sgRNA intervals meta_params["sgRNA_intervals"] = meta_params["sgRNA_intervals"].apply( list) meta_params[ "cut_point_include_idx"] = '' #create empty array for cut point intervals for each batch based on sgRNA meta_params["cut_point_include_idx"] = meta_params[ "cut_point_include_idx"].apply(list) for idx, row in meta_params.iterrows(): if row.fastq_r1 is None: raise CRISPRessoShared.BadParameterException( "At least one fastq file must be given as a command line parameter or be specified in the meta settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)" % (int(idx) + 1, row.fastq_r1)) CRISPRessoShared.check_file(row.fastq_r1) if row.fastq_r2 != "": CRISPRessoShared.check_file(row.fastq_r2) if args.auto: continue curr_amplicon_seq_str = row.amplicon_seq if curr_amplicon_seq_str is None: raise CRISPRessoShared.BadParameterException( "Amplicon sequence must be given as a command line parameter or be specified in the meta settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)" % (int(idx) + 1, curr_amplicon_seq_str)) guides_are_in_amplicon = { } #dict of whether a guide is in at least one amplicon sequence #iterate through amplicons for curr_amplicon_seq in curr_amplicon_seq_str.split(','): this_include_idxs = [ ] #mask for bp to include for this amplicon seq, as specified by sgRNA cut points this_sgRNA_intervals = [] wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq) if wrong_nt: raise CRISPRessoShared.NTException( 'The amplicon sequence in row %d (%s) contains incorrect characters:%s' % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt))) #iterate through guides curr_guide_seq_string = row.guide_seq if curr_guide_seq_string is not None and curr_guide_seq_string != "": guides = curr_guide_seq_string.strip().upper().split(',') for curr_guide_seq in guides: wrong_nt = CRISPRessoShared.find_wrong_nt( curr_guide_seq) if wrong_nt: raise CRISPRessoShared.NTException( 'The sgRNA sequence in row %d (%s) contains incorrect characters:%s' % (idx + 1, curr_guide_seq, ' '.join(wrong_nt))) guide_names = [''] * len(guides) guide_mismatches = [[]] * len(guides) guide_qw_centers = CRISPRessoShared.set_guide_array( row.quantification_window_center, guides, 'guide quantification center') guide_qw_sizes = CRISPRessoShared.set_guide_array( row.quantification_window_size, guides, 'guide quantification size') guide_plot_cut_points = [1] * len(guides) discard_guide_positions_overhanging_amplicon_edge = False if 'discard_guide_positions_overhanging_amplicon_edge' in row: discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_names, this_include_idxs, this_exclude_idxs ) = CRISPRessoShared.get_amplicon_info_for_guides( curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers, guide_qw_sizes, row.quantification_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points, discard_guide_positions_overhanging_amplicon_edge) for guide_seq in this_sgRNA_sequences: guides_are_in_amplicon[guide_seq] = 1 meta_params.ix[idx, "cut_point_include_idx"].append( this_include_idxs) meta_params.ix[idx, "sgRNA_intervals"].append(this_sgRNA_intervals) for guide_seq in guides_are_in_amplicon: if guides_are_in_amplicon[guide_seq] != 1: warn( '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!' % (idx + 1, row.guide_seq, curr_amplicon_seq)) meta_folder_name = os.path.splitext(os.path.basename(args.metadata))[0] if args.name and args.name != "": meta_folder_name = args.name output_folder_name = 'CRISPRessoMeta_on_%s' % meta_folder_name OUTPUT_DIRECTORY = os.path.abspath(output_folder_name) if args.meta_output_folder: OUTPUT_DIRECTORY = os.path.join( os.path.abspath(args.meta_output_folder), output_folder_name) _jp = lambda filename: os.path.join( OUTPUT_DIRECTORY, filename ) #handy function to put a file in the output directory try: info('Creating Folder %s' % OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY) except: warn('Folder %s already exists.' % OUTPUT_DIRECTORY) log_filename = _jp('CRISPRessoMeta_RUNNING_LOG.txt') logging.getLogger().addHandler(logging.FileHandler(log_filename)) with open(log_filename, 'w+') as outfile: outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' % ' '.join(sys.argv)) crispresso2Meta_info_file = os.path.join( OUTPUT_DIRECTORY, 'CRISPResso2Meta_info.pickle') crispresso2_info = { } #keep track of all information for this run to be pickled and saved at the end of the run crispresso2_info['version'] = CRISPRessoShared.__version__ crispresso2_info['args'] = deepcopy(args) crispresso2_info['log_filename'] = os.path.basename(log_filename) crispresso_cmds = [] meta_names_arr = [] meta_input_names = {} for idx, row in meta_params.iterrows(): metaName = CRISPRessoShared.slugify(row["name"]) meta_names_arr.append(metaName) meta_input_names[metaName] = row["name"] crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % ( OUTPUT_DIRECTORY, metaName) crispresso_cmd = propagate_options(crispresso_cmd, crispresso_options_for_meta, meta_params, idx) crispresso_cmds.append(crispresso_cmd) crispresso2_info['meta_names_arr'] = meta_names_arr crispresso2_info['meta_input_names'] = meta_input_names CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds, args.n_processes, 'meta', args.skip_failed) run_datas = [] #crispresso2 info from each row all_amplicons = set() amplicon_names = {} amplicon_counts = {} completed_meta_arr = [] for idx, row in meta_params.iterrows(): metaName = CRISPRessoShared.slugify(row["name"]) folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % metaName) run_data_file = os.path.join(folder_name, 'CRISPResso2_info.pickle') if os.path.isfile(run_data_file) is False: info("Skipping folder '%s'. Cannot find run data at '%s'." % (folder_name, run_data_file)) run_datas.append(None) continue run_data = cp.load(open(run_data_file, 'rb')) run_datas.append(run_data) for ref_name in run_data['ref_names']: ref_seq = run_data['refs'][ref_name]['sequence'] all_amplicons.add(ref_seq) #if this amplicon is called something else in another sample, just call it the amplicon if ref_name in amplicon_names and amplicon_names[ ref_seq] != ref_name: amplicon_names[ref_seq] = ref_seq else: amplicon_names[ref_seq] = ref_name if ref_seq not in amplicon_counts: amplicon_counts[ref_seq] = 0 amplicon_counts[ref_seq] += 1 completed_meta_arr.append(metaName) crispresso2_info['completed_meta_arr'] = completed_meta_arr #make sure amplicon names aren't super long for amplicon in all_amplicons: if len(amplicon_names[amplicon]) > 20: amplicon_names[amplicon] = amplicon_names[amplicon][0:20] #make sure no duplicate names (same name for the different amplicons) seen_names = {} for amplicon in all_amplicons: suffix_counter = 2 while amplicon_names[amplicon] in seen_names: amplicon_names[amplicon] = amplicon_names[ amplicon] + "_" + str(suffix_counter) suffix_counter += 1 seen_names[amplicon_names[amplicon]] = 1 save_png = True if args.suppress_report: save_png = False #summarize amplicon modifications with open( _jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'), 'w') as outfile: wrote_header = False for idx, row in meta_params.iterrows(): metaName = CRISPRessoShared.slugify(row["name"]) folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % metaName) run_data = run_datas[idx] if run_data is None: continue amplicon_modification_file = os.path.join( folder_name, run_data['quant_of_editing_freq_filename']) with open(amplicon_modification_file, 'r') as infile: file_head = infile.readline() if not wrote_header: outfile.write('Batch\t' + file_head) wrote_header = True for line in infile: outfile.write(metaName + "\t" + line) #summarize alignment with open(_jp('CRISPRessoBatch_mapping_statistics.txt'), 'w') as outfile: wrote_header = False for idx, row in meta_params.iterrows(): metaName = CRISPRessoShared.slugify(row["name"]) folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % metaName) run_data = run_datas[idx] if run_data is None: continue amplicon_modification_file = os.path.join( folder_name, run_data['mapping_stats_filename']) with open(amplicon_modification_file, 'r') as infile: file_head = infile.readline() if not wrote_header: outfile.write('Batch\t' + file_head) wrote_header = True for line in infile: outfile.write(metaName + "\t" + line) if not args.suppress_report: if (args.place_report_in_output_folder): report_name = _jp("CRISPResso2Meta_report.html") else: report_name = OUTPUT_DIRECTORY + '.html' CRISPRessoReport.make_meta_report_from_folder( report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT) crispresso2_info['report_location'] = report_name crispresso2_info['report_filename'] = os.path.basename(report_name) cp.dump(crispresso2_info, open(crispresso2Meta_info_file, 'wb')) info('Analysis Complete!') print(CRISPRessoShared.get_crispresso_footer()) sys.exit(0) except Exception as e: debug_flag = False if 'args' in vars() and 'debug' in args: debug_flag = args.debug if debug_flag: traceback.print_exc(file=sys.stdout) error('\n\nERROR: %s' % e) sys.exit(-1)
def main(): try: description = [ '~~~CRISPRessoBatch~~~', '-Analysis of CRISPR/Cas9 outcomes from batch deep sequencing data-' ] batch_string = r''' _________________ | __ ___ __ | ||__) /\ | / |__|| ||__)/--\| \__| || |_________________| ''' print(CRISPRessoShared.get_crispresso_header(description, batch_string)) parser = CRISPRessoShared.getCRISPRessoArgParser( parserTitle='CRISPRessoBatch Parameters') #batch specific params parser.add_argument( '-bs', '--batch_settings', type=str, help= 'Settings file for batch. Must be tab-separated text file. The header row contains CRISPResso parameters (e.g., fastq_r1, fastq_r2, amplicon_seq, and other optional parameters). Each following row sets parameters for an additional batch.', required=True) parser.add_argument( '--skip_failed', help='Continue with batch analysis even if one sample fails', action='store_true') parser.add_argument( '--min_reads_for_inclusion', help= 'Minimum number of reads for a batch to be included in the batch summary', type=int) parser.add_argument( '-p', '--n_processes', type=int, help='Specify the number of processes to use for quantification.\ Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.', default=1) parser.add_argument( '-bo', '--batch_output_folder', help='Directory where batch analysis output will be stored') parser.add_argument('--crispresso_command', help='CRISPResso command to call', default='CRISPResso') args = parser.parse_args() debug_flag = args.debug crispresso_options = CRISPRessoShared.get_crispresso_options() options_to_ignore = set(['name', 'output_folder']) crispresso_options_for_batch = list(crispresso_options - options_to_ignore) CRISPRessoShared.check_file(args.batch_settings) ##parse excel sheet batch_params = pd.read_csv(args.batch_settings, comment='#', sep='\t') #pandas either allows for auto-detect sep or for comment. not both # batch_params=pd.read_csv(args.batch_settings,sep=None,engine='python',error_bad_lines=False) batch_params.columns = batch_params.columns.str.strip(' -\xd0') #rename column "a" to "amplicon_seq", etc batch_params.rename( index=str, columns=CRISPRessoShared.get_crispresso_options_lookup(), inplace=True) batch_count = batch_params.shape[0] batch_params.index = range(batch_count) if 'fastq_r1' not in batch_params and 'bam_input' not in batch_params: raise CRISPRessoShared.BadParameterException( "fastq_r1 must be specified in the batch settings file. Current headings are: " + str(batch_params.columns.values)) #add args from the command line to batch_params_df for arg in vars(args): if arg not in batch_params: batch_params[arg] = getattr(args, arg) else: if (getattr(args, arg) is not None): batch_params[arg].fillna(value=getattr(args, arg), inplace=True) #assert that all names are unique #and clean names for i in range(batch_count): if batch_params.loc[i, 'name'] == '': batch_params.at[i, 'name'] = i batch_params.at[i, 'name'] = CRISPRessoShared.clean_filename( batch_params.loc[i, 'name']) if batch_params.drop_duplicates( 'name').shape[0] != batch_params.shape[0]: raise CRISPRessoShared.BadParameterException( 'Batch input names must be unique. The given names are not unique: ' + str(batch_params.loc[:, 'name'])) #Check files batch_params[ "sgRNA_intervals"] = '' #create empty array for sgRNA intervals batch_params["sgRNA_intervals"] = batch_params[ "sgRNA_intervals"].apply(list) batch_params[ "cut_point_include_idx"] = '' #create empty array for cut point intervals for each batch based on sgRNA batch_params["cut_point_include_idx"] = batch_params[ "cut_point_include_idx"].apply(list) for idx, row in batch_params.iterrows(): if 'fastq_r1' in row: if row.fastq_r1 is None: raise CRISPRessoShared.BadParameterException( "At least one fastq file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)" % (int(idx) + 1, row.fastq_r1)) else: CRISPRessoShared.check_file(row.fastq_r1) if 'fastq_r2' in row and row.fastq_r2 != "": CRISPRessoShared.check_file(row.fastq_r2) if 'input_bam' in row: if row.input_bam is None: raise CRISPRessoShared.BadParameterException( "At least one input file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' or 'input_bam' (input_bam on row %s '%s' is invalid)" % (int(idx) + 1, row.input_bam)) else: CRISPRessoShared.check_file(row.input_bam) if args.auto: continue curr_amplicon_seq_str = row.amplicon_seq if curr_amplicon_seq_str is None: raise CRISPRessoShared.BadParameterException( "Amplicon sequence must be given as a command line parameter or be specified in the batch settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)" % (int(idx) + 1, curr_amplicon_seq_str)) guides_are_in_amplicon = { } #dict of whether a guide is in at least one amplicon sequence #iterate through amplicons for curr_amplicon_seq in curr_amplicon_seq_str.split(','): this_include_idxs = [ ] #mask for bp to include for this amplicon seq, as specified by sgRNA cut points this_sgRNA_intervals = [] wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq) if wrong_nt: raise CRISPRessoShared.NTException( 'The amplicon sequence in row %d (%s) contains incorrect characters:%s' % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt))) #iterate through guides curr_guide_seq_string = row.guide_seq if curr_guide_seq_string is not None and curr_guide_seq_string != "": guides = curr_guide_seq_string.strip().upper().split(',') for curr_guide_seq in guides: wrong_nt = CRISPRessoShared.find_wrong_nt( curr_guide_seq) if wrong_nt: raise CRISPRessoShared.NTException( 'The sgRNA sequence in row %d (%s) contains incorrect characters:%s' % (idx + 1, curr_guide_seq, ' '.join(wrong_nt))) guide_mismatches = [[]] * len(guides) guide_names = [""] * len(guides) guide_qw_centers = CRISPRessoShared.set_guide_array( row.quantification_window_center, guides, 'guide quantification center') guide_qw_sizes = CRISPRessoShared.set_guide_array( row.quantification_window_size, guides, 'guide quantification size') guide_plot_cut_points = [1] * len(guides) (this_sgRNA_sequences, this_sgRNA_intervals, this_sgRNA_cut_points, this_sgRNA_plot_cut_points, this_sgRNA_plot_idxs, this_sgRNA_mismatches, this_sgRNA_names, this_include_idxs, this_exclude_idxs ) = CRISPRessoShared.get_amplicon_info_for_guides( curr_amplicon_seq, guides, guide_mismatches, guide_names, guide_qw_centers, guide_qw_sizes, row.quantification_window_coordinates, row.exclude_bp_from_left, row.exclude_bp_from_right, row.plot_window_size, guide_plot_cut_points) for guide_seq in this_sgRNA_sequences: guides_are_in_amplicon[guide_seq] = 1 batch_params.ix[idx, "cut_point_include_idx"].append( this_include_idxs) batch_params.ix[idx, "sgRNA_intervals"].append(this_sgRNA_intervals) for guide_seq in guides_are_in_amplicon: if guides_are_in_amplicon[guide_seq] != 1: warn( '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!' % (idx + 1, row.guide_seq, curr_amplicon_seq)) batch_folder_name = os.path.splitext( os.path.basename(args.batch_settings))[0] if args.name and args.name != "": batch_folder_name = args.name output_folder_name = 'CRISPRessoBatch_on_%s' % batch_folder_name OUTPUT_DIRECTORY = os.path.abspath(output_folder_name) if args.batch_output_folder: OUTPUT_DIRECTORY = os.path.join( os.path.abspath(args.batch_output_folder), output_folder_name) _jp = lambda filename: os.path.join( OUTPUT_DIRECTORY, filename ) #handy function to put a file in the output directory try: info('Creating Folder %s' % OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY) except: warn('Folder %s already exists.' % OUTPUT_DIRECTORY) log_filename = _jp('CRISPRessoBatch_RUNNING_LOG.txt') logging.getLogger().addHandler(logging.FileHandler(log_filename)) with open(log_filename, 'w+') as outfile: outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' % ' '.join(sys.argv)) crispresso2Batch_info_file = os.path.join( OUTPUT_DIRECTORY, 'CRISPResso2Batch_info.pickle') crispresso2_info = { } #keep track of all information for this run to be pickled and saved at the end of the run crispresso2_info['version'] = CRISPRessoShared.__version__ crispresso2_info['args'] = deepcopy(args) crispresso2_info['log_filename'] = os.path.basename(log_filename) crispresso_cmds = [] batch_names_arr = [] batch_input_names = {} for idx, row in batch_params.iterrows(): batchName = CRISPRessoShared.slugify(row["name"]) batch_names_arr.append(batchName) batch_input_names[batchName] = row["name"] crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % ( OUTPUT_DIRECTORY, batchName) crispresso_cmd = propagate_options(crispresso_cmd, crispresso_options_for_batch, batch_params, idx) crispresso_cmds.append(crispresso_cmd) crispresso2_info['batch_names_arr'] = batch_names_arr crispresso2_info['batch_input_names'] = batch_input_names CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds, args.n_processes, 'batch', args.skip_failed) run_datas = [] #crispresso2 info from each row all_amplicons = set() amplicon_names = {} amplicon_counts = {} completed_batch_arr = [] for idx, row in batch_params.iterrows(): batchName = CRISPRessoShared.slugify(row["name"]) file_prefix = row['file_prefix'] folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % batchName) run_data_file = os.path.join(folder_name, 'CRISPResso2_info.pickle') if os.path.isfile(run_data_file) is False: info("Skipping folder '%s'. Cannot find run data at '%s'." % (folder_name, run_data_file)) run_datas.append(None) continue run_data = cp.load(open(run_data_file, 'rb')) run_datas.append(run_data) for ref_name in run_data['ref_names']: ref_seq = run_data['refs'][ref_name]['sequence'] all_amplicons.add(ref_seq) #if this amplicon is called something else in another sample, just call it the amplicon if ref_name in amplicon_names and amplicon_names[ ref_seq] != ref_name: amplicon_names[ref_seq] = ref_seq else: amplicon_names[ref_seq] = ref_name if ref_seq not in amplicon_counts: amplicon_counts[ref_seq] = 0 amplicon_counts[ref_seq] += 1 completed_batch_arr.append(batchName) crispresso2_info['completed_batch_arr'] = completed_batch_arr #make sure amplicon names aren't super long for amplicon in all_amplicons: if len(amplicon_names[amplicon]) > 20: amplicon_names[amplicon] = amplicon_names[amplicon][0:20] #make sure no duplicate names (same name for the different amplicons) seen_names = {} for amplicon in all_amplicons: suffix_counter = 2 orig_name = amplicon_names[amplicon] while amplicon_names[amplicon] in seen_names: amplicon_names[amplicon] = orig_name + "_" + str( suffix_counter) suffix_counter += 1 seen_names[amplicon_names[amplicon]] = 1 save_png = True if args.suppress_report: save_png = False window_nuc_pct_quilt_plot_names = [] nuc_pct_quilt_plot_names = [] window_nuc_conv_plot_names = [] nuc_conv_plot_names = [] #report for amplicons that appear multiple times for amplicon_index, amplicon_seq in enumerate(all_amplicons): #only perform comparison if amplicon seen in more than one sample if amplicon_counts[amplicon_seq] < 2: continue amplicon_name = amplicon_names[amplicon_seq] info('Reporting summary for amplicon: "' + amplicon_name + '"') consensus_sequence = "" nucleotide_frequency_summary = [] nucleotide_percentage_summary = [] modification_frequency_summary = [] modification_percentage_summary = [] amp_found_count = 0 #how many folders had information for this amplicon consensus_guides = [] consensus_include_idxs = [] consensus_sgRNA_plot_idxs = [] consensus_sgRNA_intervals = [] guides_all_same = True batches_with_this_amplicon = [] for idx, row in batch_params.iterrows(): batchName = CRISPRessoShared.slugify(row["name"]) file_prefix = row['file_prefix'] folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % batchName) run_data = run_datas[idx] if run_data is None: continue batch_has_amplicon = False batch_amplicon_name = '' for ref_name in run_data['ref_names']: if amplicon_seq == run_data['refs'][ref_name]['sequence']: batch_has_amplicon = True batch_amplicon_name = ref_name if not batch_has_amplicon: continue batches_with_this_amplicon.append(idx) if consensus_guides == []: consensus_guides = run_data['refs'][batch_amplicon_name][ 'sgRNA_sequences'] consensus_include_idxs = run_data['refs'][ batch_amplicon_name]['include_idxs'] consensus_sgRNA_intervals = run_data['refs'][ batch_amplicon_name]['sgRNA_intervals'] consensus_sgRNA_plot_idxs = run_data['refs'][ batch_amplicon_name]['sgRNA_plot_idxs'] if run_data['refs'][batch_amplicon_name][ 'sgRNA_sequences'] != consensus_guides: guides_all_same = False if set(run_data['refs'][batch_amplicon_name] ['include_idxs']) != set(consensus_include_idxs): guides_all_same = False if 'nuc_freq_filename' not in run_data['refs'][ batch_amplicon_name]: info( "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information." % (batch_amplicon_name, folder_name)) continue nucleotide_frequency_file = os.path.join( folder_name, run_data['refs'][batch_amplicon_name]['nuc_freq_filename']) ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file( nucleotide_frequency_file) nucleotide_pct_file = os.path.join( folder_name, run_data['refs'][batch_amplicon_name]['nuc_pct_filename']) ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file( nucleotide_pct_file) count_file = os.path.join( folder_name, run_data['refs'][batch_amplicon_name] ['mod_count_filename']) ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file( count_file) if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None: info( "Skipping the amplicon '%s' in folder '%s'. Could not parse batch output." % (batch_amplicon_name, folder_name)) info( "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'" % (ampSeq_nf, ampSeq_np, ampSeq_cf)) continue if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf: warn( "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s" % (batch_amplicon_name, folder_name, ampSeq_nf, ampSeq_np, ampSeq_cf, amplicon_seq)) continue if consensus_sequence == "": consensus_sequence = ampSeq_nf if ampSeq_nf != consensus_sequence: info( "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match." % (batch_amplicon_name, folder_name)) continue if 'Total' not in mod_freqs: info( "Skipping the amplicon '%s' in folder '%s'. Processing did not complete." % (batch_amplicon_name, folder_name)) continue if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][0] == "0": info( "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon." % (batch_amplicon_name, folder_name)) continue if (args.min_reads_for_inclusion is not None) and (int( mod_freqs['Total'][0]) < args.min_reads_for_inclusion): info( "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)." % (batch_amplicon_name, folder_name, str(mod_freqs['Total'][0]), args.min_reads_for_inclusion)) continue mod_pcts = {} for key in mod_freqs: mod_pcts[key] = np.array(mod_freqs[key]).astype( np.float) / float(mod_freqs['Total'][0]) amp_found_count += 1 for nuc in ['A', 'T', 'C', 'G', 'N', '-']: row = [batchName, nuc] row.extend(nuc_freqs[nuc]) nucleotide_frequency_summary.append(row) pct_row = [batchName, nuc] pct_row.extend(nuc_pcts[nuc]) nucleotide_percentage_summary.append(pct_row) for mod in [ 'Insertions', 'Insertions_Left', 'Deletions', 'Substitutions', 'All_modifications' ]: row = [batchName, mod] row.extend(mod_freqs[mod]) modification_frequency_summary.append(row) pct_row = [batchName, mod] pct_row.extend(mod_pcts[mod]) modification_percentage_summary.append(pct_row) if amp_found_count == 0: info( "Couldn't find any data for amplicon '%s'. Not compiling results." % amplicon_name) else: amplicon_plot_name = amplicon_name + "." if len(amplicon_names) == 1 and amplicon_name == "Reference": amplicon_plot_name = "" colnames = ['Batch', 'Nucleotide'] colnames.extend(list(consensus_sequence)) nucleotide_frequency_summary_df = pd.DataFrame( nucleotide_frequency_summary, columns=colnames) nucleotide_frequency_summary_df = pd.concat([ nucleotide_frequency_summary_df.iloc[:, 0:2], nucleotide_frequency_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) nucleotide_frequency_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_frequency_summary.txt') nucleotide_frequency_summary_df.to_csv( nucleotide_frequency_summary_filename, sep='\t', index=None) nucleotide_percentage_summary_df = pd.DataFrame( nucleotide_percentage_summary, columns=colnames) nucleotide_percentage_summary_df = pd.concat([ nucleotide_percentage_summary_df.iloc[:, 0:2], nucleotide_percentage_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) nucleotide_percentage_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_percentage_summary.txt') nucleotide_percentage_summary_df.to_csv( nucleotide_percentage_summary_filename, sep='\t', index=None) colnames = ['Batch', 'Modification'] colnames.extend(list(consensus_sequence)) modification_frequency_summary_df = pd.DataFrame( modification_frequency_summary, columns=colnames) modification_frequency_summary_df = pd.concat([ modification_frequency_summary_df.iloc[:, 0:2], modification_frequency_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) modification_frequency_summary_filename = _jp( amplicon_plot_name + 'MODIFICATION_FREQUENCY_SUMMARY.txt') modification_frequency_summary_df.to_csv( modification_frequency_summary_filename, sep='\t', index=None) modification_percentage_summary_df = pd.DataFrame( modification_percentage_summary, columns=colnames) modification_percentage_summary_df = pd.concat([ modification_percentage_summary_df.iloc[:, 0:2], modification_percentage_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) modification_percentage_summary_filename = _jp( amplicon_plot_name + 'MODIFICATION_PERCENTAGE_SUMMARY.txt') modification_percentage_summary_df.to_csv( modification_percentage_summary_filename, sep='\t', index=None) crispresso2_info[ 'nucleotide_frequency_summary_filename'] = os.path.basename( nucleotide_frequency_summary_filename) crispresso2_info[ 'nucleotide_percentage_summary_filename'] = os.path.basename( nucleotide_percentage_summary_filename) crispresso2_info[ 'modification_frequency_summary_filename'] = os.path.basename( modification_frequency_summary_filename) crispresso2_info[ 'modification_percentage_summary_filename'] = os.path.basename( modification_percentage_summary_filename) crispresso2_info['summary_plot_titles'] = {} crispresso2_info['summary_plot_labels'] = {} crispresso2_info['summary_plot_datas'] = {} #if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window if guides_all_same and consensus_guides != []: info( "All guides are equal. Performing comparison of batches for amplicon '%s'" % amplicon_name) include_idxs = consensus_include_idxs #include indexes are the same for all guides for idx, sgRNA in enumerate(consensus_guides): sgRNA_intervals = consensus_sgRNA_intervals[idx] sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[idx] plot_idxs_flat = [0, 1] # guide, nucleotide plot_idxs_flat.extend( [plot_idx + 2 for plot_idx in sgRNA_plot_idxs]) sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:, plot_idxs_flat] sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:, plot_idxs_flat] sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:, plot_idxs_flat] #show all sgRNA's on the plot sub_sgRNA_intervals = [] for sgRNA_interval in consensus_sgRNA_intervals: newstart = None newend = None for idx, i in enumerate(sgRNA_plot_idxs): if i <= sgRNA_interval[0]: newstart = idx if newend is None and i >= sgRNA_interval[1]: newend = idx #if guide doesn't overlap with plot idxs if newend == 0 or newstart == len(sgRNA_plot_idxs): continue #otherwise, correct partial overlaps elif newstart == None and newend == None: newstart = 0 newend = len(include_idxs) - 1 elif newstart == None: newstart = 0 elif newend == None: newend = len(include_idxs) - 1 #and add it to the list sub_sgRNA_intervals.append((newstart, newend)) if not args.suppress_plots: #plot for each guide this_window_nuc_pct_quilt_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt_around_sgRNA_' + sgRNA) CRISPRessoPlot.plot_nucleotide_quilt( sub_nucleotide_percentage_summary_df, sub_modification_percentage_summary_df, this_window_nuc_pct_quilt_plot_name, save_png, sgRNA_intervals=sub_sgRNA_intervals, quantification_window_idxs=include_idxs) plot_name = os.path.basename( this_window_nuc_pct_quilt_plot_name) window_nuc_pct_quilt_plot_names.append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name if len(consensus_guides) == 1: crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_labels'][ plot_name] = 'Composition of each base around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ ('Nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] sub_nucleotide_frequency_summary_df = pd.concat( [ sub_nucleotide_frequency_summary_df. iloc[:, 0:2], sub_nucleotide_frequency_summary_df. iloc[:, 2:].apply(pd.to_numeric) ], axis=1) sub_nucleotide_frequency_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_frequency_summary_around_sgRNA_' + sgRNA + '.txt') sub_nucleotide_frequency_summary_df.to_csv( sub_nucleotide_frequency_summary_filename, sep='\t', index=None) sub_nucleotide_percentage_summary_df = pd.concat( [ sub_nucleotide_percentage_summary_df. iloc[:, 0:2], sub_nucleotide_percentage_summary_df. iloc[:, 2:].apply(pd.to_numeric) ], axis=1) sub_nucleotide_percentage_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_percentage_summary_around_sgRNA_' + sgRNA + '.txt') sub_nucleotide_percentage_summary_df.to_csv( sub_nucleotide_percentage_summary_filename, sep='\t', index=None) if args.base_editor_output: this_window_nuc_conv_plot_name = _jp( amplicon_plot_name + 'Nucleotide_conversion_map_around_sgRNA_' + sgRNA) CRISPRessoPlot.plot_conversion_map( sub_nucleotide_percentage_summary_df, this_window_nuc_conv_plot_name, args.conversion_nuc_from, args.conversion_nuc_to, save_png, sgRNA_intervals=sub_sgRNA_intervals, quantification_window_idxs=include_idxs) plot_name = os.path.basename( this_window_nuc_conv_plot_name) window_nuc_conv_plot_names.append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name if len(consensus_guides) == 1: crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_labels'][ plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][ plot_name] = [ ('Nucleotide frequencies around sgRNA', os.path.basename( sub_nucleotide_frequency_summary_filename )), ('Nucleotide percentages around sgRNA', os.path.basename( sub_nucleotide_percentage_summary_filename )) ] if not args.suppress_plots: # plot the whole region this_nuc_pct_quilt_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt') CRISPRessoPlot.plot_nucleotide_quilt( nucleotide_percentage_summary_df, modification_percentage_summary_df, this_nuc_pct_quilt_plot_name, save_png, sgRNA_intervals=consensus_sgRNA_intervals, quantification_window_idxs=include_idxs) plot_name = os.path.basename( this_nuc_pct_quilt_plot_name) nuc_pct_quilt_plot_names.append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'Amplicon: ' + amplicon_name if len(amplicon_names) == 1: crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_labels'][ plot_name] = 'Composition of each base for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ ('Nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] if args.base_editor_output: this_nuc_conv_plot_name = _jp( amplicon_plot_name + 'Nucleotide_conversion_map') CRISPRessoPlot.plot_conversion_map( nucleotide_percentage_summary_df, this_nuc_conv_plot_name, args.conversion_nuc_from, args.conversion_nuc_to, save_png, sgRNA_intervals=consensus_sgRNA_intervals, quantification_window_idxs=include_idxs) plot_name = os.path.basename( this_nuc_conv_plot_name) nuc_conv_plot_names.append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'Amplicon: ' + amplicon_name if len(amplicon_names) == 1: crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_labels'][ plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ ('Nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] else: #guides are not the same if not args.suppress_plots: this_nuc_pct_quilt_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt') CRISPRessoPlot.plot_nucleotide_quilt( nucleotide_percentage_summary_df, modification_percentage_summary_df, this_nuc_pct_quilt_plot_name, save_png) plot_name = os.path.basename( this_nuc_pct_quilt_plot_name) nuc_pct_quilt_plot_names.append(plot_name) crispresso2_info['summary_plot_labels'][ plot_name] = 'Composition of each base for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ ('Nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] if args.base_editor_output: this_nuc_conv_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt') CRISPRessoPlot.plot_conversion_map( nucleotide_percentage_summary_df, this_nuc_conv_plot_name, args.conversion_nuc_from, args.conversion_nuc_to, save_png) plot_name = os.path.basename( this_nuc_conv_plot_name) nuc_conv_plot_names.append(plot_name) crispresso2_info['summary_plot_labels'][ plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ ('Nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] crispresso2_info[ 'window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names crispresso2_info['nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names crispresso2_info[ 'window_nuc_conv_plot_names'] = window_nuc_conv_plot_names crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names #summarize amplicon modifications with open( _jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'), 'w') as outfile: wrote_header = False for idx, row in batch_params.iterrows(): batchName = CRISPRessoShared.slugify(row["name"]) file_prefix = row['file_prefix'] folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % batchName) run_data = run_datas[idx] if run_data is None: continue amplicon_modification_file = os.path.join( folder_name, run_data['quant_of_editing_freq_filename']) with open(amplicon_modification_file, 'r') as infile: file_head = infile.readline() if not wrote_header: outfile.write('Batch\t' + file_head) wrote_header = True for line in infile: outfile.write(batchName + "\t" + line) #summarize alignment with open(_jp('CRISPRessoBatch_mapping_statistics.txt'), 'w') as outfile: wrote_header = False for idx, row in batch_params.iterrows(): batchName = CRISPRessoShared.slugify(row["name"]) folder_name = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso_on_%s' % batchName) run_data = run_datas[idx] if run_data is None: continue amplicon_modification_file = os.path.join( folder_name, run_data['mapping_stats_filename']) with open(amplicon_modification_file, 'r') as infile: file_head = infile.readline() if not wrote_header: outfile.write('Batch\t' + file_head) wrote_header = True for line in infile: outfile.write(batchName + "\t" + line) if not args.suppress_report: if (args.place_report_in_output_folder): report_name = _jp("CRISPResso2Batch_report.html") else: report_name = OUTPUT_DIRECTORY + '.html' CRISPRessoReport.make_batch_report_from_folder( report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT) crispresso2_info['report_location'] = report_name crispresso2_info['report_filename'] = os.path.basename(report_name) cp.dump(crispresso2_info, open(crispresso2Batch_info_file, 'wb')) info('Analysis Complete!') print(CRISPRessoShared.get_crispresso_footer()) sys.exit(0) except Exception as e: debug_flag = False if 'args' in vars() and 'debug' in args: debug_flag = args.debug if debug_flag: traceback.print_exc(file=sys.stdout) error('\n\nERROR: %s' % e) sys.exit(-1)