def main():
    """Run the CRISPRessoMeta command-line workflow.

    Steps, in order:
      1. Parse the command line (the shared CRISPResso arguments plus the
         meta-specific ones added here).
      2. Read the JSON metadata file and build one table row per entry of its
         'Experiment' list (name, guide sequence, amplicon, fastq_r1, fastq_r2).
      3. Fill in the remaining CRISPResso parameters from the command line and
         validate names, input files, amplicons and guides.
      4. Build one CRISPResso command per row and run them through
         CRISPRessoMultiProcessing.run_crispresso_cmds.
      5. Concatenate the per-sample editing-frequency and mapping-statistics
         tables, optionally build the HTML report, and pickle the run info.

    The function never returns normally: it calls sys.exit(0) on success and
    sys.exit(-1) after logging the error if any Exception is raised.
    """
    try:
        description = [
            '~~~CRISPRessoMeta~~~',
            '-Analysis of CRISPR/Cas9 outcomes from deep sequencing data using a metadata file-'
        ]
        # NOTE(review): the banner spacing was reconstructed; confirm it
        # renders as intended.
        meta_string = r'''
 ________________________________________
|   _________   ______ _______  ______   |
|  | | | | | \ | |       | |   | |  | |  |
|  | | | | | | | |----   | |   | |__| |  |
|  |_| |_| |_| |_|____   |_|   |_|  |_|  |
|________________________________________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, meta_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoMeta Parameters')

        #batch specific params
        parser.add_argument(
            '--metadata',
            type=str,
            help='Metadata file according to NIST specification',
            required=True)
        parser.add_argument(
            '-mo',
            '--meta_output_folder',
            help='Directory where analysis output will be stored')
        parser.add_argument(
            '-p',
            '--n_processes',
            type=int,
            help='Specify the number of processes to use for quantification. '
            'Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.',
            default=1)
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set(['name', 'output_folder'])
        crispresso_options_for_meta = list(crispresso_options -
                                           options_to_ignore)

        CRISPRessoShared.check_file(args.metadata)

        #collect one row per experiment entry, then build the table in one go
        #(DataFrame.append returns a new frame, so appending in a loop without
        #keeping the result leaves the table empty)
        meta_rows = []
        with open(args.metadata) as metadata_file:
            metadata = json.load(metadata_file)

        for guide in metadata['Experiment']:
            print('Guide: ' + guide['name'])
            print('Sequence: ' + guide['sequence'])
            print('Amplicon: ' + guide['amplicon'])
            print('Fastq_R1: ' + guide['fastq_r1'])
            print('Fastq_R2: ' + guide['fastq_r2'])
            meta_rows.append({
                'name': guide['name'],
                'guide_seq': guide['sequence'],
                'amplicon_seq': guide['amplicon'],
                'fastq_r1': guide['fastq_r1'],
                'fastq_r2': guide['fastq_r2']
            })

        meta_params = pd.DataFrame(
            meta_rows,
            columns=['name', 'guide_seq', 'amplicon_seq', 'fastq_r1', 'fastq_r2'])

        print('table:')
        print(meta_params)

        #rename column "a" to "amplicon_seq", etc
        meta_params.rename(
            index=str,
            columns=CRISPRessoShared.get_crispresso_options_lookup(),
            inplace=True)
        meta_count = meta_params.shape[0]
        meta_params.index = range(meta_count)

        if 'fastq_r1' not in meta_params:
            raise CRISPRessoShared.BadParameterException(
                "fastq_r1 must be specified in the meta settings file. Current headings are: "
                + str(meta_params.columns.values))

        #add args from the command line to meta_params
        for arg in vars(args):
            arg_value = getattr(args, arg)
            if arg not in meta_params:
                meta_params[arg] = arg_value
            elif arg_value is not None:
                #assign the result back instead of a chained in-place fillna,
                #which may operate on a temporary copy
                meta_params[arg] = meta_params[arg].fillna(value=arg_value)

        #assert that all names are unique
        #and clean names
        for i in range(meta_count):
            if meta_params.loc[i, 'name'] == '':
                meta_params.at[i, 'name'] = i
            meta_params.at[i, 'name'] = CRISPRessoShared.clean_filename(
                meta_params.loc[i, 'name'])

        if meta_params.drop_duplicates(
                'name').shape[0] != meta_params.shape[0]:
            raise CRISPRessoShared.BadParameterException(
                'Sample input names must be unique. The given names are not unique: '
                + str(meta_params.loc[:, 'name']))

        #Check files
        #each cell of these two columns holds its own (initially empty) list
        meta_params["sgRNA_intervals"] = ''
        meta_params["sgRNA_intervals"] = meta_params["sgRNA_intervals"].apply(
            list)
        meta_params["cut_point_include_idx"] = ''
        meta_params["cut_point_include_idx"] = meta_params[
            "cut_point_include_idx"].apply(list)

        for idx, row in meta_params.iterrows():
            if row.fastq_r1 is None:
                raise CRISPRessoShared.BadParameterException(
                    "At least one fastq file must be given as a command line parameter or be specified in the meta settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)"
                    % (int(idx) + 1, row.fastq_r1))
            CRISPRessoShared.check_file(row.fastq_r1)

            if row.fastq_r2 != "":
                CRISPRessoShared.check_file(row.fastq_r2)

            #in auto mode the amplicon/guide validation below is skipped
            if args.auto:
                continue

            curr_amplicon_seq_str = row.amplicon_seq
            if curr_amplicon_seq_str is None:
                raise CRISPRessoShared.BadParameterException(
                    "Amplicon sequence must be given as a command line parameter or be specified in the meta settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)"
                    % (int(idx) + 1, curr_amplicon_seq_str))

            guides_are_in_amplicon = {
            }  #dict of whether a guide is in at least one amplicon sequence

            #iterate through amplicons
            for curr_amplicon_seq in curr_amplicon_seq_str.split(','):
                this_include_idxs = [
                ]  #mask for bp to include for this amplicon seq, as specified by sgRNA cut points
                this_sgRNA_intervals = []
                wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq)
                if wrong_nt:
                    raise CRISPRessoShared.NTException(
                        'The amplicon sequence in row %d (%s) contains incorrect characters:%s'
                        % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt)))

                #iterate through guides
                curr_guide_seq_string = row.guide_seq
                if curr_guide_seq_string is not None and curr_guide_seq_string != "":
                    guides = curr_guide_seq_string.strip().upper().split(',')
                    for curr_guide_seq in guides:
                        wrong_nt = CRISPRessoShared.find_wrong_nt(
                            curr_guide_seq)
                        if wrong_nt:
                            raise CRISPRessoShared.NTException(
                                'The sgRNA sequence in row %d (%s) contains incorrect characters:%s'
                                % (idx + 1, curr_guide_seq,
                                   ' '.join(wrong_nt)))
                    guide_names = [''] * len(guides)
                    #one independent list per guide ([[]] * n would share a
                    #single list object between all guides)
                    guide_mismatches = [[] for _ in guides]
                    guide_qw_centers = CRISPRessoShared.set_guide_array(
                        row.quantification_window_center, guides,
                        'guide quantification center')
                    guide_qw_sizes = CRISPRessoShared.set_guide_array(
                        row.quantification_window_size, guides,
                        'guide quantification size')
                    guide_plot_cut_points = [1] * len(guides)
                    discard_guide_positions_overhanging_amplicon_edge = False
                    if 'discard_guide_positions_overhanging_amplicon_edge' in row:
                        discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge
                    (this_sgRNA_sequences, this_sgRNA_intervals,
                     this_sgRNA_cut_points, this_sgRNA_plot_cut_points,
                     this_sgRNA_plot_idxs, this_sgRNA_names, this_include_idxs,
                     this_exclude_idxs
                     ) = CRISPRessoShared.get_amplicon_info_for_guides(
                         curr_amplicon_seq, guides, guide_mismatches,
                         guide_names, guide_qw_centers, guide_qw_sizes,
                         row.quantification_window_coordinates,
                         row.exclude_bp_from_left, row.exclude_bp_from_right,
                         row.plot_window_size, guide_plot_cut_points,
                         discard_guide_positions_overhanging_amplicon_edge)
                    for guide_seq in this_sgRNA_sequences:
                        guides_are_in_amplicon[guide_seq] = 1

                #the index was reset to 0..n-1 above, so label-based .loc
                #addresses the current row
                meta_params.loc[idx, "cut_point_include_idx"].append(
                    this_include_idxs)
                meta_params.loc[idx, "sgRNA_intervals"].append(
                    this_sgRNA_intervals)

            # NOTE(review): entries are only ever set to 1 above, so this
            # warning cannot fire as written. Fixing it needs knowledge of what
            # get_amplicon_info_for_guides returns for unmatched guides - confirm.
            for guide_seq in guides_are_in_amplicon:
                if guides_are_in_amplicon[guide_seq] != 1:
                    warn(
                        '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!'
                        % (idx + 1, row.guide_seq, curr_amplicon_seq))

        meta_folder_name = os.path.splitext(os.path.basename(args.metadata))[0]
        if args.name and args.name != "":
            meta_folder_name = args.name

        output_folder_name = 'CRISPRessoMeta_on_%s' % meta_folder_name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        if args.meta_output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.meta_output_folder), output_folder_name)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
        except OSError:
            #only an already-existing folder is tolerated; anything else
            #(e.g. permission denied) is a real failure
            if not os.path.isdir(OUTPUT_DIRECTORY):
                raise
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoMeta_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Meta_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Meta_info.pickle')
        crispresso2_info = {
        }  #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        crispresso_cmds = []
        meta_names_arr = []
        meta_input_names = {}
        for idx, row in meta_params.iterrows():
            metaName = CRISPRessoShared.slugify(row["name"])
            meta_names_arr.append(metaName)
            meta_input_names[metaName] = row["name"]

            crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % (
                OUTPUT_DIRECTORY, metaName)
            crispresso_cmd = propagate_options(crispresso_cmd,
                                               crispresso_options_for_meta,
                                               meta_params, idx)
            crispresso_cmds.append(crispresso_cmd)

        crispresso2_info['meta_names_arr'] = meta_names_arr
        crispresso2_info['meta_input_names'] = meta_input_names

        #this parser does not define --skip_failed itself, so fall back to
        #False when the shared parser does not provide it either
        skip_failed = getattr(args, 'skip_failed', False)
        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      args.n_processes, 'meta',
                                                      skip_failed)

        run_datas = []  #crispresso2 info from each row

        all_amplicons = set()
        amplicon_names = {}  #reference sequence -> display name
        amplicon_counts = {}  #reference sequence -> number of samples using it
        completed_meta_arr = []
        for idx, row in meta_params.iterrows():
            metaName = CRISPRessoShared.slugify(row["name"])
            folder_name = os.path.join(OUTPUT_DIRECTORY,
                                       'CRISPResso_on_%s' % metaName)
            run_data_file = os.path.join(folder_name,
                                         'CRISPResso2_info.pickle')
            if not os.path.isfile(run_data_file):
                info("Skipping folder '%s'. Cannot find run data at '%s'." %
                     (folder_name, run_data_file))
                run_datas.append(None)
                continue

            #the pickle is produced by the CRISPResso run launched above
            with open(run_data_file, 'rb') as run_data_handle:
                run_data = cp.load(run_data_handle)
            run_datas.append(run_data)
            for ref_name in run_data['ref_names']:
                ref_seq = run_data['refs'][ref_name]['sequence']
                all_amplicons.add(ref_seq)
                #if this amplicon is called something else in another sample, just call it the amplicon
                #(amplicon_names is keyed by sequence, so the lookup must use ref_seq)
                if ref_seq in amplicon_names and amplicon_names[
                        ref_seq] != ref_name:
                    amplicon_names[ref_seq] = ref_seq
                else:
                    amplicon_names[ref_seq] = ref_name
                if ref_seq not in amplicon_counts:
                    amplicon_counts[ref_seq] = 0
                amplicon_counts[ref_seq] += 1

            completed_meta_arr.append(metaName)

        crispresso2_info['completed_meta_arr'] = completed_meta_arr

        #make sure amplicon names aren't super long
        for amplicon in all_amplicons:
            if len(amplicon_names[amplicon]) > 20:
                amplicon_names[amplicon] = amplicon_names[amplicon][0:20]

        #make sure no duplicate names (same name for the different amplicons)
        seen_names = {}
        for amplicon in all_amplicons:
            suffix_counter = 2
            while amplicon_names[amplicon] in seen_names:
                amplicon_names[amplicon] = amplicon_names[
                    amplicon] + "_" + str(suffix_counter)
                suffix_counter += 1
            seen_names[amplicon_names[amplicon]] = 1

        def write_combined_table(output_filename, run_data_key):
            #Concatenate one per-sample table from every completed run into a
            #single file: the header of the first table is written once with a
            #leading 'Batch' column, and every data line is prefixed with the
            #sample name. run_data_key names the run-info entry that holds the
            #per-sample table's filename.
            with open(_jp(output_filename), 'w') as outfile:
                wrote_header = False
                for idx, row in meta_params.iterrows():
                    run_data = run_datas[idx]
                    if run_data is None:
                        continue
                    metaName = CRISPRessoShared.slugify(row["name"])
                    folder_name = os.path.join(OUTPUT_DIRECTORY,
                                               'CRISPResso_on_%s' % metaName)
                    sample_table = os.path.join(folder_name,
                                                run_data[run_data_key])
                    with open(sample_table, 'r') as infile:
                        file_head = infile.readline()
                        if not wrote_header:
                            outfile.write('Batch\t' + file_head)
                            wrote_header = True
                        for line in infile:
                            outfile.write(metaName + "\t" + line)

        #summarize amplicon modifications
        write_combined_table(
            'CRISPRessoBatch_quantification_of_editing_frequency.txt',
            'quant_of_editing_freq_filename')

        #summarize alignment
        write_combined_table('CRISPRessoBatch_mapping_statistics.txt',
                             'mapping_stats_filename')

        if not args.suppress_report:
            if args.place_report_in_output_folder:
                report_name = _jp("CRISPResso2Meta_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_meta_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        with open(crispresso2Meta_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        #args may not exist yet if the failure happened before/while parsing
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
def main():
    """Run the CRISPRessoWGS command-line workflow.

    Steps, in order:
      1. Parse the command line (the shared CRISPResso arguments plus the
         WGS-specific ones added here) and check dependencies and input files.
      2. Create the output folder, the running log and the run-info pickle;
         with --no_rerun, compare against a previous run's pickle to decide
         whether the analysis is already complete or can be resumed.
      3. Make sure the bam file and the reference fasta are indexed (calls
         samtools when they are not).
      4. Load the region file, name the regions, fetch each region's reference
         sequence and check that the given guides occur in it.
      5. Extract the reads of every region (unless a resumable previous run
         already did so) and run CRISPResso on each region with enough reads.
      6. Write SAMPLES_QUANTIFICATION_SUMMARY.txt, the summary plots and the
         HTML report (unless suppressed), then pickle the final run info.

    The function never returns normally: it calls sys.exit(0) on success,
    sys.exit(1) when a dependency is missing and sys.exit(-1) after logging
    the error if any Exception is raised.
    """

    def print_stacktrace_if_debug():
        #args is looked up in the enclosing scope; it is absent from vars()
        #when the failure happened before the command line was parsed
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)
            error(traceback.format_exc())

    try:
        start_time = datetime.now()
        start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S')

        description = [
            '~~~CRISPRessoWGS~~~',
            '-Analysis of CRISPR/Cas9 outcomes from WGS data-'
        ]
        # NOTE(review): the banner spacing was reconstructed; confirm it
        # renders as intended.
        wgs_string = r'''
 ____________
|     __  __ |
||  |/ _ (_  |
||/\|\__)__) |
|____________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, wgs_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoWGS Parameters', requiredParams={})

        #tool specific optional
        parser.add_argument('-b',
                            '--bam_file',
                            type=str,
                            help='WGS aligned bam file',
                            required=True,
                            default='bam filename')
        parser.add_argument(
            '-f',
            '--region_file',
            type=str,
            help='Regions description file. A BED format file containing the regions to analyze, one per line. The REQUIRED '
            'columns are: chr_id(chromosome name), bpstart(start position), bpend(end position), the optional columns are:name (an unique indentifier for the region), guide_seq, expected_hdr_amplicon_seq,coding_seq, see CRISPResso help for more details on these last 3 parameters)',
            required=True)
        parser.add_argument(
            '-r',
            '--reference_file',
            type=str,
            help='A FASTA format reference file (for example hg19.fa for the human genome)',
            default='',
            required=True)
        parser.add_argument(
            '--min_reads_to_use_region',
            type=float,
            help='Minimum number of reads that align to a region to perform the CRISPResso analysis',
            default=10)
        parser.add_argument(
            '--skip_failed',
            help='Continue with pooled analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--gene_annotations',
            type=str,
            help='Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), '
            'please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"',
            default='')
        parser.add_argument(
            '-p',
            '--n_processes',
            type=str,
            help='The number of processes to use for the quantification. '
            'Please use with caution since increasing this parameter will increase the memory required to run CRISPResso. Can be set to \'max\'.',
            default="1")
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set([
            'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name',
            'output_folder', 'name'
        ])
        crispresso_options_for_wgs = list(crispresso_options -
                                          options_to_ignore)

        info('Checking dependencies...')

        if check_samtools() and check_bowtie2():
            info('\n All the required dependencies are present!')
        else:
            sys.exit(1)

        #check files
        check_file(args.bam_file)
        check_file(args.reference_file)
        check_file(args.region_file)

        if args.gene_annotations:
            check_file(args.gene_annotations)

        if args.n_processes == "max":
            n_processes = CRISPRessoMultiProcessing.get_max_processes()
        else:
            n_processes = int(args.n_processes)

        #INIT
        get_name_from_bam = lambda x: os.path.basename(x).replace('.bam', '')

        if not args.name:
            database_id = '%s' % get_name_from_bam(args.bam_file)
        else:
            database_id = args.name

        OUTPUT_DIRECTORY = 'CRISPRessoWGS_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except OSError:
            #only an already-existing folder is tolerated; anything else
            #(e.g. permission denied) is a real failure
            if not os.path.isdir(OUTPUT_DIRECTORY):
                raise
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoWGS_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        crispresso2_info_file = os.path.join(OUTPUT_DIRECTORY,
                                             'CRISPResso2WGS_info.pickle')
        crispresso2_info = {
        }  #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)
        crispresso2_info['finished_steps'] = {}

        crispresso_cmd_to_write = ' '.join(sys.argv)
        if args.write_cleaned_report:
            cmd_copy = sys.argv[:]
            cmd_copy[0] = 'CRISPRessoWGS'
            for i, cmd_part in enumerate(cmd_copy):
                if os.sep in cmd_part:
                    cmd_copy[i] = os.path.basename(cmd_part)

            crispresso_cmd_to_write = ' '.join(
                cmd_copy
            )  #clean command doesn't show the absolute path to the executable or other files
        crispresso2_info['command_used'] = crispresso_cmd_to_write

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                'CRISPResso version %s\n[Command used]:\n%s\n\n[Execution log]:\n'
                % (CRISPRessoShared.__version__, crispresso_cmd_to_write))

        #keep track of args to see if it is possible to skip computation steps on rerun
        can_finish_incomplete_run = False
        if args.no_rerun:
            if os.path.exists(crispresso2_info_file):
                #this pickle was written by a previous run of this tool
                with open(crispresso2_info_file, 'rb') as previous_handle:
                    previous_run_data = cp.load(previous_handle)
                if previous_run_data[
                        'version'] == CRISPRessoShared.__version__:
                    args_are_same = True
                    previous_args = previous_run_data['args']
                    for arg in vars(args):
                        #these arguments do not affect the results; compare by
                        #value (identity checks on strings are unreliable)
                        if arg in ("no_rerun", "debug", "n_processes"):
                            continue
                        if arg not in vars(previous_args):
                            info(
                                'Comparing current run to previous run: old run had argument '
                                + str(arg) + ' \nRerunning.')
                            args_are_same = False
                        elif str(getattr(previous_args, arg)) != str(
                                getattr(args, arg)):
                            info(
                                'Comparing current run to previous run:\n\told argument '
                                + str(arg) + ' = ' +
                                str(getattr(previous_args, arg)) +
                                '\n\tnew argument: ' + str(arg) + ' = ' +
                                str(getattr(args, arg)) + '\nRerunning.')
                            args_are_same = False

                    if args_are_same:
                        if 'end_time_string' in previous_run_data:
                            info('Analysis already completed on %s!' %
                                 previous_run_data['end_time_string'])
                            sys.exit(0)
                        else:
                            can_finish_incomplete_run = True
                            if 'finished_steps' in previous_run_data:
                                for key in previous_run_data['finished_steps']:
                                    crispresso2_info['finished_steps'][
                                        key] = previous_run_data[
                                            'finished_steps'][key]
                                    if args.debug:
                                        info('finished: ' + key)
                else:
                    info(
                        'The no_rerun flag is set, but this analysis will be rerun because the existing run was performed using an old version of CRISPResso ('
                        + str(previous_run_data['version']) + ').')

        #write this file early on so we can check the params if we have to rerun
        with open(crispresso2_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)

        def rreplace(s, old, new):
            #replace only the last occurrence of old in s
            li = s.rsplit(old, 1)
            return new.join(li)

        bam_index = ''
        #check if bam has the index already
        sibling_bam_index = rreplace(args.bam_file, ".bam", ".bai")
        if os.path.exists(sibling_bam_index):
            info('Index file for input .bam file exists, skipping generation.')
            #store exactly the path that was checked
            bam_index = sibling_bam_index
        elif os.path.exists(args.bam_file + '.bai'):
            info('Index file for input .bam file exists, skipping generation.')
            bam_index = args.bam_file + '.bai'
        else:
            info('Creating index file for input .bam file...')
            #argument list instead of a shell string so that paths containing
            #spaces or shell metacharacters are passed through unchanged
            sb.call(['samtools', 'index', args.bam_file])
            bam_index = args.bam_file + '.bai'

        #load gene annotation
        if args.gene_annotations:
            info('Loading gene coordinates from annotation file: %s...' %
                 args.gene_annotations)
            try:
                df_genes = pd.read_csv(args.gene_annotations,
                                       compression='gzip',
                                       sep="\t")
                df_genes.txEnd = df_genes.txEnd.astype(int)
                df_genes.txStart = df_genes.txStart.astype(int)
            except Exception:
                raise Exception('Failed to load the gene annotations file.')

        #Load and validate the REGION FILE
        df_regions = pd.read_csv(args.region_file,
                                 names=[
                                     'chr_id', 'bpstart', 'bpend', 'Name',
                                     'sgRNA', 'Expected_HDR', 'Coding_sequence'
                                 ],
                                 comment='#',
                                 sep='\t',
                                 dtype={'Name': str})

        #remove empty amplicons/lines
        df_regions.dropna(subset=['chr_id', 'bpstart', 'bpend'], inplace=True)

        df_regions.Expected_HDR = df_regions.Expected_HDR.apply(
            capitalize_sequence)
        df_regions.sgRNA = df_regions.sgRNA.apply(capitalize_sequence)
        df_regions.Coding_sequence = df_regions.Coding_sequence.apply(
            capitalize_sequence)

        #check or create names
        for idx, row in df_regions.iterrows():
            if pd.isnull(row.Name):
                df_regions.loc[idx, 'Name'] = '_'.join(
                    map(str, [row['chr_id'], row['bpstart'], row['bpend']]))

        if not len(df_regions.Name.unique()) == df_regions.shape[0]:
            raise Exception('The amplicon names should be all distinct!')

        df_regions.set_index('Name', inplace=True)
        #df_regions.index=df_regions.index.str.replace(' ','_')
        df_regions.index = df_regions.index.to_series().str.replace(' ', '_')

        #extract sequence for each region
        uncompressed_reference = args.reference_file

        if os.path.exists(uncompressed_reference + '.fai'):
            info(
                'The index for the reference fasta file is already present! Skipping generation.'
            )
        else:
            info('Indexing reference file... Please be patient!')
            #append samtools' stdout and stderr to the running log
            with open(log_filename, 'a') as log_handle:
                sb.call(['samtools', 'faidx', uncompressed_reference],
                        stdout=log_handle,
                        stderr=sb.STDOUT)

        info(
            'Retrieving reference sequences for amplicons and checking for sgRNAs'
        )
        df_regions['sequence'] = df_regions.apply(
            lambda row: get_region_from_fa(row.chr_id, row.bpstart, row.bpend,
                                           uncompressed_reference),
            axis=1)

        for idx, row in df_regions.iterrows():
            if not pd.isnull(row.sgRNA):
                cut_points = []
                guides = row.sgRNA.strip().upper().split(',')
                guide_qw_centers = CRISPRessoShared.set_guide_array(
                    args.quantification_window_center, guides,
                    'guide quantification center')
                #guide_idx must not reuse the name idx: idx is the region
                #label and is needed below to address this region's row
                for guide_idx, current_guide_seq in enumerate(guides):
                    wrong_nt = find_wrong_nt(current_guide_seq)
                    if wrong_nt:
                        raise NTException(
                            'The sgRNA sequence %s contains wrong characters:%s'
                            % (current_guide_seq, ' '.join(wrong_nt)))

                    offset_fw = guide_qw_centers[guide_idx] + len(
                        current_guide_seq) - 1
                    offset_rc = (-guide_qw_centers[guide_idx]) - 1
                    #matches of the guide and of its reverse complement
                    cut_points += [
                        m.start() + offset_fw
                        for m in re.finditer(current_guide_seq, row.sequence)
                    ] + [
                        m.start() + offset_rc for m in re.finditer(
                            CRISPRessoShared.reverse_complement(
                                current_guide_seq), row.sequence)
                    ]

                if not cut_points:
                    df_regions.loc[idx, 'sgRNA'] = ''
                    info('Cannot find guide ' + str(row.sgRNA) +
                         ' in amplicon ' + str(idx) + ' (' + str(row) + ')')

        df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
        df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])

        df_regions.bpstart = df_regions.bpstart.astype(int)
        df_regions.bpend = df_regions.bpend.astype(int)

        if args.gene_annotations:
            df_regions = df_regions.apply(
                lambda row: find_overlapping_genes(row, df_genes), axis=1)

        #extract reads with samtools in that region and create a bam
        #create a fasta file with all the trimmed reads
        info('\nProcessing each region...')

        ANALYZED_REGIONS = _jp('ANALYZED_REGIONS/')
        if not os.path.exists(ANALYZED_REGIONS):
            os.mkdir(ANALYZED_REGIONS)

        df_regions['region_number'] = np.arange(len(df_regions))

        def set_filenames(row):
            #returns (bam path, fastq.gz path, whether the fastq.gz already
            #exists) for one region, named after its region_number
            region_label = clean_filename('REGION_' + str(row.region_number))
            fastq_gz_filename = os.path.join(ANALYZED_REGIONS,
                                             '%s.fastq.gz' % region_label)
            bam_region_filename = os.path.join(ANALYZED_REGIONS,
                                               '%s.bam' % region_label)
            #if the fastq file already exists, don't regenerate it
            row_fastq_exists = os.path.isfile(fastq_gz_filename)
            return bam_region_filename, fastq_gz_filename, row_fastq_exists

        df_regions['bam_file_with_reads_in_region'], df_regions[
            'fastq_file_trimmed_reads_in_region'], df_regions[
                'row_fastq_exists'] = zip(
                    *df_regions.apply(set_filenames, axis=1))
        df_regions['n_reads'] = 0
        df_regions[
            'original_bam'] = args.bam_file  #stick this in the df so we can parallelize the analysis and not pass params

        report_reads_aligned_filename = _jp(
            'REPORT_READS_ALIGNED_TO_SELECTED_REGIONS_WGS.txt')
        num_rows_without_fastq = len(
            df_regions[df_regions.row_fastq_exists == False])

        if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile(
                report_reads_aligned_filename
        ) and 'generation_of_fastq_files_for_each_amplicon' in crispresso2_info[
                'finished_steps']:
            info('Skipping generation of fastq files for each amplicon.')
            df_regions = pd.read_csv(report_reads_aligned_filename, sep="\t")
            df_regions.set_index('Name', inplace=True)

        else:
            #run region extraction here
            df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(
                df_regions, extract_reads_chunk, n_processes)
            df_regions.sort_values('region_number', inplace=True)
            cols_to_print = [
                "chr_id", "bpstart", "bpend", "sgRNA", "Expected_HDR",
                "Coding_sequence", "sequence", "n_reads",
                "bam_file_with_reads_in_region",
                "fastq_file_trimmed_reads_in_region"
            ]
            if args.gene_annotations:
                cols_to_print.append('gene_overlapping')
            df_regions.fillna('NA').to_csv(report_reads_aligned_filename,
                                           sep='\t',
                                           columns=cols_to_print,
                                           index_label="Name")

            #save progress
            crispresso2_info['finished_steps'][
                'generation_of_fastq_files_for_each_amplicon'] = True
            with open(crispresso2_info_file, "wb") as info_file:
                cp.dump(crispresso2_info, info_file)

        #Run Crispresso
        info('Running CRISPResso on each region...')
        crispresso_cmds = []
        for idx, row in df_regions.iterrows():

            if row['n_reads'] >= args.min_reads_to_use_region:
                info('\nThe region [%s] has enough reads (%d) mapped to it!' %
                     (idx, row['n_reads']))

                crispresso_cmd = args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (
                    row['fastq_file_trimmed_reads_in_region'], row['sequence'],
                    OUTPUT_DIRECTORY, idx)

                if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                    crispresso_cmd += ' -g %s' % row['sgRNA']

                if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                    crispresso_cmd += ' -e %s' % row['Expected_HDR']

                if row['Coding_sequence'] and not pd.isnull(
                        row['Coding_sequence']):
                    crispresso_cmd += ' -c %s' % row['Coding_sequence']

                crispresso_cmd = CRISPRessoShared.propagate_crispresso_options(
                    crispresso_cmd, crispresso_options_for_wgs, args)

                #logging like this causes the multiprocessing step to not block for some reason
                #mysteriesOfThPythonUniverse
                #log_name = _jp("CRISPResso_on_"+idx) +".log"
                #crispresso_cmd += " &> %s"%log_name

                crispresso_cmds.append(crispresso_cmd)
                # info('Running CRISPResso:%s' % crispresso_cmd)
                # sb.call(crispresso_cmd,shell=True)

            else:
                info(
                    '\nThe region [%s] has too few reads mapped to it (%d)! Not running CRISPResso!'
                    % (idx, row['n_reads']))

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      n_processes, 'region',
                                                      args.skip_failed)

        quantification_summary = []
        all_region_names = []
        all_region_read_counts = {}
        good_region_names = []
        good_region_folders = {}
        header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
        header_els = header.split("\t")
        header_el_count = len(header_els)
        empty_line_els = [np.nan] * (header_el_count - 1)
        #position of Reads_total in a row that excludes the leading Name
        n_reads_index = header_els.index('Reads_total') - 1
        for idx, row in df_regions.iterrows():
            folder_name = 'CRISPResso_on_%s' % idx
            run_name = idx

            all_region_names.append(run_name)
            all_region_read_counts[run_name] = row.n_reads

            run_file = os.path.join(_jp(folder_name),
                                    'CRISPResso2_info.pickle')
            if not os.path.exists(run_file):
                warn(
                    'Skipping the folder %s: not enough reads, incomplete, or empty folder.'
                    % folder_name)
                this_els = empty_line_els[:]
                this_els[n_reads_index] = row.n_reads
                to_add = [run_name]
                to_add.extend(this_els)
                quantification_summary.append(to_add)
            else:
                #the pickle is produced by the CRISPResso run launched above
                with open(run_file, 'rb') as run_handle:
                    run_data = cp.load(run_handle)
                ref_name = run_data['ref_names'][
                    0]  #only expect one amplicon sequence
                n_tot = row.n_reads
                n_aligned = run_data['counts_total'][ref_name]
                n_unmod = run_data['counts_unmodified'][ref_name]
                n_mod = run_data['counts_modified'][ref_name]
                n_discarded = run_data['counts_discarded'][ref_name]

                n_insertion = run_data['counts_insertion'][ref_name]
                n_deletion = run_data['counts_deletion'][ref_name]
                n_substitution = run_data['counts_substitution'][ref_name]
                n_only_insertion = run_data['counts_only_insertion'][ref_name]
                n_only_deletion = run_data['counts_only_deletion'][ref_name]
                n_only_substitution = run_data['counts_only_substitution'][
                    ref_name]
                n_insertion_and_deletion = run_data[
                    'counts_insertion_and_deletion'][ref_name]
                n_insertion_and_substitution = run_data[
                    'counts_insertion_and_substitution'][ref_name]
                n_deletion_and_substitution = run_data[
                    'counts_deletion_and_substitution'][ref_name]
                n_insertion_and_deletion_and_substitution = run_data[
                    'counts_insertion_and_deletion_and_substitution'][ref_name]

                #without aligned reads the percentages are undefined; NaN is
                #written as 'NA' by the fillna('NA') below, like the rows of
                #skipped regions
                unmod_pct = np.nan
                mod_pct = np.nan
                if n_aligned > 0:
                    unmod_pct = round(100 * n_unmod / float(n_aligned), 8)
                    mod_pct = round(100 * n_mod / float(n_aligned), 8)

                #value order must follow header_els: Reads_total (n_tot)
                #comes before Reads_aligned (n_aligned)
                vals = [run_name]
                vals.extend([
                    unmod_pct, mod_pct, n_tot, n_aligned, n_unmod, n_mod,
                    n_discarded, n_insertion, n_deletion, n_substitution,
                    n_only_insertion, n_only_deletion, n_only_substitution,
                    n_insertion_and_deletion, n_insertion_and_substitution,
                    n_deletion_and_substitution,
                    n_insertion_and_deletion_and_substitution
                ])
                quantification_summary.append(vals)

                good_region_names.append(idx)
                good_region_folders[idx] = folder_name

        samples_quantification_summary_filename = _jp(
            'SAMPLES_QUANTIFICATION_SUMMARY.txt')

        df_summary_quantification = pd.DataFrame(quantification_summary,
                                                 columns=header_els)
        if args.crispresso1_mode:
            crispresso1_columns = [
                'Name', 'Unmodified%', 'Modified%', 'Reads_aligned',
                'Reads_total'
            ]
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename,
                sep='\t',
                index=None,
                columns=crispresso1_columns)
        else:
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename, sep='\t', index=None)

        crispresso2_info[
            'samples_quantification_summary_filename'] = os.path.basename(
                samples_quantification_summary_filename)
        crispresso2_info['regions'] = df_regions
        crispresso2_info['all_region_names'] = all_region_names
        crispresso2_info['all_region_read_counts'] = all_region_read_counts
        crispresso2_info['good_region_names'] = good_region_names
        crispresso2_info['good_region_folders'] = good_region_folders

        crispresso2_info['summary_plot_names'] = []
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}

        save_png = True
        if args.suppress_report:
            save_png = False

        if not args.suppress_plots:
            summary_data_link = [
                ('CRISPRessoWGS summary',
                 os.path.basename(samples_quantification_summary_filename))
            ]

            plot_root = _jp("CRISPRessoWGS_reads_summary")
            CRISPRessoPlot.plot_reads_total(plot_root,
                                            df_summary_quantification,
                                            save_png,
                                            args.min_reads_to_use_region)
            plot_name = os.path.basename(plot_root)
            crispresso2_info['reads_summary_plot'] = plot_name
            crispresso2_info['summary_plot_names'].append(plot_name)
            crispresso2_info['summary_plot_titles'][
                plot_name] = 'CRISPRessoWGS Read Allocation Summary'
            crispresso2_info['summary_plot_labels'][
                plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
            crispresso2_info['summary_plot_datas'][plot_name] = list(
                summary_data_link)

            plot_root = _jp("CRISPRessoWGS_modification_summary")
            CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,
                                               df_summary_quantification,
                                               save_png,
                                               args.min_reads_to_use_region)
            plot_name = os.path.basename(plot_root)
            crispresso2_info['modification_summary_plot'] = plot_name
            crispresso2_info['summary_plot_names'].append(plot_name)
            crispresso2_info['summary_plot_titles'][
                plot_name] = 'CRISPRessoWGS Modification Summary'
            crispresso2_info['summary_plot_labels'][
                plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
            crispresso2_info['summary_plot_datas'][plot_name] = list(
                summary_data_link)

        if not args.suppress_report and not args.suppress_plots:
            if args.place_report_in_output_folder:
                report_name = _jp("CRISPResso2WGS_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_wgs_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        end_time = datetime.now()
        end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S')
        running_time = end_time - start_time
        running_time_string = str(running_time)

        #end_time_string marks the run as complete for later --no_rerun checks
        crispresso2_info['end_time'] = end_time
        crispresso2_info['end_time_string'] = end_time_string
        crispresso2_info['running_time'] = running_time
        crispresso2_info['running_time_string'] = running_time_string

        with open(crispresso2_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)

        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        print_stacktrace_if_debug()
        error('\n\nERROR: %s' % e)
        sys.exit(-1)
def main():
    """Run the CRISPRessoBatch command-line workflow.

    Steps performed, in order:

    1. Parse the command line (standard CRISPResso arguments plus the
       batch-specific ones added below) and read the tab-separated batch
       settings file into a DataFrame with one row per batch.
    2. Fill missing per-row parameters from the command line, clean the batch
       names, require them to be unique and check the input files, amplicon
       sequences and guide sequences of every row.
    3. Build one CRISPResso command per row and run them through
       ``CRISPRessoMultiProcessing.run_crispresso_cmds``.
    4. Load the ``CRISPResso2_info.pickle`` of every finished run and, for
       each amplicon seen in at least two runs, write nucleotide/modification
       summary tables and (unless plots are suppressed) summary plots.
    5. Concatenate the per-run editing-frequency and mapping-statistics
       tables, optionally build the HTML report, and pickle the run info.

    The function never returns normally: it calls ``sys.exit(0)`` on success
    and ``sys.exit(-1)`` after logging any ``Exception``.
    """
    try:
        description = [
            '~~~CRISPRessoBatch~~~',
            '-Analysis of CRISPR/Cas9 outcomes from batch deep sequencing data-'
        ]
        batch_string = r'''
 _________________
| __    ___ __    |
||__) /\ | /  |__||
||__)/--\| \__|  ||
|_________________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, batch_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoBatch Parameters')

        # batch specific params
        parser.add_argument(
            '-bs', '--batch_settings', type=str,
            help='Settings file for batch. Must be tab-separated text file. '
                 'The header row contains CRISPResso parameters (e.g., fastq_r1, '
                 'fastq_r2, amplicon_seq, and other optional parameters). Each '
                 'following row sets parameters for an additional batch.',
            required=True)
        parser.add_argument(
            '--skip_failed',
            help='Continue with batch analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--min_reads_for_inclusion',
            help='Minimum number of reads for a batch to be included in the batch summary',
            type=int)
        parser.add_argument(
            '-p', '--n_processes', type=int,
            help='Specify the number of processes to use for quantification. '
                 'Please use with caution since increasing this parameter will '
                 'increase the memory required to run CRISPResso.',
            default=1)
        parser.add_argument(
            '-bo', '--batch_output_folder',
            help='Directory where batch analysis output will be stored')
        parser.add_argument(
            '--crispresso_command',
            help='CRISPResso command to call',
            default='CRISPResso')

        args = parser.parse_args()

        # 'name' and 'output_folder' are set explicitly for every sub-run, so
        # they must not be propagated from the batch table.
        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set(['name', 'output_folder'])
        crispresso_options_for_batch = list(crispresso_options - options_to_ignore)

        CRISPRessoShared.check_file(args.batch_settings)

        # parse the batch settings table
        # pandas either allows for auto-detect sep or for comment. not both
        batch_params = pd.read_csv(args.batch_settings, comment='#', sep='\t')
        batch_params.columns = batch_params.columns.str.strip(' -\xd0')

        # rename column "a" to "amplicon_seq", etc
        batch_params.rename(
            index=str,
            columns=CRISPRessoShared.get_crispresso_options_lookup(),
            inplace=True)
        batch_count = batch_params.shape[0]
        batch_params.index = range(batch_count)

        if 'fastq_r1' not in batch_params and 'bam_input' not in batch_params:
            raise CRISPRessoShared.BadParameterException(
                "fastq_r1 must be specified in the batch settings file. Current headings are: "
                + str(batch_params.columns.values))

        # add args from the command line to batch_params
        for arg in vars(args):
            arg_value = getattr(args, arg)
            if arg not in batch_params:
                batch_params[arg] = arg_value
            elif arg_value is not None:
                # Assign the result back instead of a chained in-place fillna,
                # which operates on a column selection rather than the frame.
                batch_params[arg] = batch_params[arg].fillna(value=arg_value)

        # assert that all names are unique
        # and clean names
        for i in range(batch_count):
            if batch_params.loc[i, 'name'] == '':
                batch_params.at[i, 'name'] = i
            batch_params.at[i, 'name'] = CRISPRessoShared.clean_filename(
                batch_params.loc[i, 'name'])

        if batch_params.drop_duplicates('name').shape[0] != batch_params.shape[0]:
            raise CRISPRessoShared.BadParameterException(
                'Batch input names must be unique. The given names are not unique: '
                + str(batch_params.loc[:, 'name']))

        # Check files
        # list('') == [], so every row gets its own, independent empty list.
        batch_params["sgRNA_intervals"] = ''  # create empty array for sgRNA intervals
        batch_params["sgRNA_intervals"] = batch_params["sgRNA_intervals"].apply(list)
        # create empty array for cut point intervals for each batch based on sgRNA
        batch_params["cut_point_include_idx"] = ''
        batch_params["cut_point_include_idx"] = batch_params["cut_point_include_idx"].apply(list)

        for idx, row in batch_params.iterrows():
            if 'fastq_r1' in row:
                if row.fastq_r1 is None:
                    raise CRISPRessoShared.BadParameterException(
                        "At least one fastq file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)"
                        % (int(idx) + 1, row.fastq_r1))
                else:
                    CRISPRessoShared.check_file(row.fastq_r1)

            if 'fastq_r2' in row and row.fastq_r2 != "":
                CRISPRessoShared.check_file(row.fastq_r2)

            # NOTE(review): the header check above looks for 'bam_input' while
            # this one looks for 'input_bam'; kept as-is - confirm the intended
            # column name.
            if 'input_bam' in row:
                if row.input_bam is None:
                    raise CRISPRessoShared.BadParameterException(
                        "At least one input file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' or 'input_bam' (input_bam on row %s '%s' is invalid)"
                        % (int(idx) + 1, row.input_bam))
                else:
                    CRISPRessoShared.check_file(row.input_bam)

            # in auto mode the amplicon/guide checks below are skipped
            if args.auto:
                continue

            curr_amplicon_seq_str = row.amplicon_seq
            if curr_amplicon_seq_str is None:
                raise CRISPRessoShared.BadParameterException(
                    "Amplicon sequence must be given as a command line parameter or be specified in the batch settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)"
                    % (int(idx) + 1, curr_amplicon_seq_str))

            # dict of whether a guide is in at least one amplicon sequence
            guides_are_in_amplicon = {}

            # iterate through amplicons
            for curr_amplicon_seq in curr_amplicon_seq_str.split(','):
                # mask for bp to include for this amplicon seq, as specified by sgRNA cut points
                this_include_idxs = []
                this_sgRNA_intervals = []
                wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq)
                if wrong_nt:
                    raise CRISPRessoShared.NTException(
                        'The amplicon sequence in row %d (%s) contains incorrect characters:%s'
                        % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt)))

                # iterate through guides
                curr_guide_seq_string = row.guide_seq
                if curr_guide_seq_string is not None and curr_guide_seq_string != "":
                    guides = curr_guide_seq_string.strip().upper().split(',')
                    for curr_guide_seq in guides:
                        wrong_nt = CRISPRessoShared.find_wrong_nt(curr_guide_seq)
                        if wrong_nt:
                            raise CRISPRessoShared.NTException(
                                'The sgRNA sequence in row %d (%s) contains incorrect characters:%s'
                                % (idx + 1, curr_guide_seq, ' '.join(wrong_nt)))

                    guide_mismatches = [[]] * len(guides)
                    guide_names = [""] * len(guides)
                    guide_qw_centers = CRISPRessoShared.set_guide_array(
                        row.quantification_window_center, guides,
                        'guide quantification center')
                    guide_qw_sizes = CRISPRessoShared.set_guide_array(
                        row.quantification_window_size, guides,
                        'guide quantification size')
                    guide_plot_cut_points = [1] * len(guides)

                    (this_sgRNA_sequences, this_sgRNA_intervals,
                     this_sgRNA_cut_points, this_sgRNA_plot_cut_points,
                     this_sgRNA_plot_idxs, this_sgRNA_mismatches,
                     this_sgRNA_names, this_include_idxs, this_exclude_idxs
                     ) = CRISPRessoShared.get_amplicon_info_for_guides(
                         curr_amplicon_seq, guides, guide_mismatches,
                         guide_names, guide_qw_centers, guide_qw_sizes,
                         row.quantification_window_coordinates,
                         row.exclude_bp_from_left, row.exclude_bp_from_right,
                         row.plot_window_size, guide_plot_cut_points)

                    for guide_seq in this_sgRNA_sequences:
                        guides_are_in_amplicon[guide_seq] = 1

                # `.ix` was removed from pandas; `.at` returns the list stored
                # in the cell, which is then extended in place.
                batch_params.at[idx, "cut_point_include_idx"].append(this_include_idxs)
                batch_params.at[idx, "sgRNA_intervals"].append(this_sgRNA_intervals)

            # NOTE(review): only guides that were found are ever stored (with
            # value 1), so this warning cannot currently fire - confirm intent.
            for guide_seq in guides_are_in_amplicon:
                if guides_are_in_amplicon[guide_seq] != 1:
                    warn(
                        '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!'
                        % (idx + 1, row.guide_seq, curr_amplicon_seq))

        # Output folder: named after --name if given, else after the settings file.
        batch_folder_name = os.path.splitext(
            os.path.basename(args.batch_settings))[0]
        if args.name and args.name != "":
            batch_folder_name = args.name

        output_folder_name = 'CRISPRessoBatch_on_%s' % batch_folder_name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        if args.batch_output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.batch_output_folder), output_folder_name)

        def _jp(filename):
            # handy function to put a file in the output directory
            return os.path.join(OUTPUT_DIRECTORY, filename)

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
        except OSError:
            # best effort: an existing folder is reused
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoBatch_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Batch_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Batch_info.pickle')
        # keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info = {}
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)
        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        # Build and run one CRISPResso command per batch row.
        crispresso_cmds = []
        batch_names_arr = []
        batch_input_names = {}
        for idx, row in batch_params.iterrows():
            batchName = CRISPRessoShared.slugify(row["name"])
            batch_names_arr.append(batchName)
            batch_input_names[batchName] = row["name"]

            crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % (
                OUTPUT_DIRECTORY, batchName)
            crispresso_cmd = propagate_options(
                crispresso_cmd, crispresso_options_for_batch, batch_params, idx)
            crispresso_cmds.append(crispresso_cmd)

        crispresso2_info['batch_names_arr'] = batch_names_arr
        crispresso2_info['batch_input_names'] = batch_input_names

        CRISPRessoMultiProcessing.run_crispresso_cmds(
            crispresso_cmds, args.n_processes, 'batch', args.skip_failed)

        # Collect the info pickles written by the individual runs.
        run_datas = []  # crispresso2 info from each row (None if the run left no data)
        all_amplicons = set()
        amplicon_names = {}  # amplicon sequence -> display name
        amplicon_counts = {}  # amplicon sequence -> number of runs containing it
        completed_batch_arr = []
        for idx, row in batch_params.iterrows():
            batchName = CRISPRessoShared.slugify(row["name"])
            folder_name = os.path.join(OUTPUT_DIRECTORY,
                                       'CRISPResso_on_%s' % batchName)
            run_data_file = os.path.join(folder_name, 'CRISPResso2_info.pickle')
            if not os.path.isfile(run_data_file):
                info("Skipping folder '%s'. Cannot find run data at '%s'." %
                     (folder_name, run_data_file))
                run_datas.append(None)
                continue

            # The pickle is read from this run's own output folder.
            with open(run_data_file, 'rb') as run_data_handle:
                run_data = cp.load(run_data_handle)
            run_datas.append(run_data)
            for ref_name in run_data['ref_names']:
                ref_seq = run_data['refs'][ref_name]['sequence']
                all_amplicons.add(ref_seq)
                # if this amplicon is called something else in another sample, just call it the amplicon
                # (amplicon_names is keyed by sequence, so the lookup must use ref_seq)
                if ref_seq in amplicon_names and amplicon_names[ref_seq] != ref_name:
                    amplicon_names[ref_seq] = ref_seq
                else:
                    amplicon_names[ref_seq] = ref_name
                if ref_seq not in amplicon_counts:
                    amplicon_counts[ref_seq] = 0
                amplicon_counts[ref_seq] += 1

            completed_batch_arr.append(batchName)

        crispresso2_info['completed_batch_arr'] = completed_batch_arr

        # make sure amplicon names aren't super long
        for amplicon in all_amplicons:
            if len(amplicon_names[amplicon]) > 20:
                amplicon_names[amplicon] = amplicon_names[amplicon][0:20]

        # make sure no duplicate names (same name for the different amplicons)
        seen_names = {}
        for amplicon in all_amplicons:
            suffix_counter = 2
            orig_name = amplicon_names[amplicon]
            while amplicon_names[amplicon] in seen_names:
                amplicon_names[amplicon] = orig_name + "_" + str(suffix_counter)
                suffix_counter += 1
            seen_names[amplicon_names[amplicon]] = 1

        save_png = not args.suppress_report

        window_nuc_pct_quilt_plot_names = []
        nuc_pct_quilt_plot_names = []
        window_nuc_conv_plot_names = []
        nuc_conv_plot_names = []

        # Initialised once, before the loop, so that entries recorded for one
        # amplicon are not discarded when the next amplicon is processed.
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}

        # report for amplicons that appear multiple times
        for amplicon_seq in all_amplicons:
            # only perform comparison if amplicon seen in more than one sample
            if amplicon_counts[amplicon_seq] < 2:
                continue

            amplicon_name = amplicon_names[amplicon_seq]
            info('Reporting summary for amplicon: "' + amplicon_name + '"')

            consensus_sequence = ""
            nucleotide_frequency_summary = []
            nucleotide_percentage_summary = []
            modification_frequency_summary = []
            modification_percentage_summary = []

            amp_found_count = 0  # how many folders had information for this amplicon
            consensus_guides = []
            consensus_include_idxs = []
            consensus_sgRNA_plot_idxs = []
            consensus_sgRNA_intervals = []
            guides_all_same = True
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                # find the name this run uses for the current amplicon sequence
                batch_has_amplicon = False
                batch_amplicon_name = ''
                for ref_name in run_data['ref_names']:
                    if amplicon_seq == run_data['refs'][ref_name]['sequence']:
                        batch_has_amplicon = True
                        batch_amplicon_name = ref_name

                if not batch_has_amplicon:
                    continue

                ref_info = run_data['refs'][batch_amplicon_name]

                # the first run that has this amplicon defines the consensus
                if consensus_guides == []:
                    consensus_guides = ref_info['sgRNA_sequences']
                    consensus_include_idxs = ref_info['include_idxs']
                    consensus_sgRNA_intervals = ref_info['sgRNA_intervals']
                    consensus_sgRNA_plot_idxs = ref_info['sgRNA_plot_idxs']

                if ref_info['sgRNA_sequences'] != consensus_guides:
                    guides_all_same = False
                if set(ref_info['include_idxs']) != set(consensus_include_idxs):
                    guides_all_same = False

                if 'nuc_freq_filename' not in ref_info:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                        % (batch_amplicon_name, folder_name))
                    continue

                nucleotide_frequency_file = os.path.join(
                    folder_name, ref_info['nuc_freq_filename'])
                ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
                    nucleotide_frequency_file)

                nucleotide_pct_file = os.path.join(
                    folder_name, ref_info['nuc_pct_filename'])
                ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
                    nucleotide_pct_file)

                count_file = os.path.join(
                    folder_name, ref_info['mod_count_filename'])
                ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
                    count_file)

                if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Could not parse batch output."
                        % (batch_amplicon_name, folder_name))
                    info(
                        "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                        % (ampSeq_nf, ampSeq_np, ampSeq_cf))
                    continue
                if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
                    warn(
                        "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                        % (batch_amplicon_name, folder_name, ampSeq_nf,
                           ampSeq_np, ampSeq_cf, amplicon_seq))
                    continue
                if consensus_sequence == "":
                    consensus_sequence = ampSeq_nf
                if ampSeq_nf != consensus_sequence:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                        % (batch_amplicon_name, folder_name))
                    continue
                if 'Total' not in mod_freqs:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                        % (batch_amplicon_name, folder_name))
                    continue
                if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][0] == "0":
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                        % (batch_amplicon_name, folder_name))
                    continue
                if (args.min_reads_for_inclusion is not None) and (
                        int(mod_freqs['Total'][0]) < args.min_reads_for_inclusion):
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                        % (batch_amplicon_name, folder_name,
                           str(mod_freqs['Total'][0]),
                           args.min_reads_for_inclusion))
                    continue

                # per-position modification counts divided by the first 'Total' entry
                # (builtin float replaces the removed np.float alias)
                total_reads = float(mod_freqs['Total'][0])
                mod_pcts = {}
                for key in mod_freqs:
                    mod_pcts[key] = np.array(mod_freqs[key]).astype(float) / total_reads

                amp_found_count += 1

                for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
                    freq_row = [batchName, nuc]
                    freq_row.extend(nuc_freqs[nuc])
                    nucleotide_frequency_summary.append(freq_row)

                    pct_row = [batchName, nuc]
                    pct_row.extend(nuc_pcts[nuc])
                    nucleotide_percentage_summary.append(pct_row)

                for mod in ['Insertions', 'Insertions_Left', 'Deletions',
                            'Substitutions', 'All_modifications']:
                    freq_row = [batchName, mod]
                    freq_row.extend(mod_freqs[mod])
                    modification_frequency_summary.append(freq_row)

                    pct_row = [batchName, mod]
                    pct_row.extend(mod_pcts[mod])
                    modification_percentage_summary.append(pct_row)

            if amp_found_count == 0:
                info(
                    "Couldn't find any data for amplicon '%s'. Not compiling results."
                    % amplicon_name)
            else:
                # file-name prefix; empty when the only amplicon is the default "Reference"
                amplicon_plot_name = amplicon_name + "."
                if len(amplicon_names) == 1 and amplicon_name == "Reference":
                    amplicon_plot_name = ""

                colnames = ['Batch', 'Nucleotide']
                colnames.extend(list(consensus_sequence))
                nucleotide_frequency_summary_df = pd.DataFrame(
                    nucleotide_frequency_summary, columns=colnames)
                nucleotide_frequency_summary_df = pd.concat([
                    nucleotide_frequency_summary_df.iloc[:, 0:2],
                    nucleotide_frequency_summary_df.iloc[:, 2:].apply(pd.to_numeric)
                ], axis=1)
                nucleotide_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_frequency_summary.txt')
                nucleotide_frequency_summary_df.to_csv(
                    nucleotide_frequency_summary_filename, sep='\t', index=None)

                nucleotide_percentage_summary_df = pd.DataFrame(
                    nucleotide_percentage_summary, columns=colnames)
                nucleotide_percentage_summary_df = pd.concat([
                    nucleotide_percentage_summary_df.iloc[:, 0:2],
                    nucleotide_percentage_summary_df.iloc[:, 2:].apply(pd.to_numeric)
                ], axis=1)
                nucleotide_percentage_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_percentage_summary.txt')
                nucleotide_percentage_summary_df.to_csv(
                    nucleotide_percentage_summary_filename, sep='\t', index=None)

                colnames = ['Batch', 'Modification']
                colnames.extend(list(consensus_sequence))
                modification_frequency_summary_df = pd.DataFrame(
                    modification_frequency_summary, columns=colnames)
                modification_frequency_summary_df = pd.concat([
                    modification_frequency_summary_df.iloc[:, 0:2],
                    modification_frequency_summary_df.iloc[:, 2:].apply(pd.to_numeric)
                ], axis=1)
                modification_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'MODIFICATION_FREQUENCY_SUMMARY.txt')
                modification_frequency_summary_df.to_csv(
                    modification_frequency_summary_filename, sep='\t', index=None)

                modification_percentage_summary_df = pd.DataFrame(
                    modification_percentage_summary, columns=colnames)
                modification_percentage_summary_df = pd.concat([
                    modification_percentage_summary_df.iloc[:, 0:2],
                    modification_percentage_summary_df.iloc[:, 2:].apply(pd.to_numeric)
                ], axis=1)
                modification_percentage_summary_filename = _jp(
                    amplicon_plot_name + 'MODIFICATION_PERCENTAGE_SUMMARY.txt')
                modification_percentage_summary_df.to_csv(
                    modification_percentage_summary_filename, sep='\t', index=None)

                crispresso2_info['nucleotide_frequency_summary_filename'] = os.path.basename(
                    nucleotide_frequency_summary_filename)
                crispresso2_info['nucleotide_percentage_summary_filename'] = os.path.basename(
                    nucleotide_percentage_summary_filename)

                crispresso2_info['modification_frequency_summary_filename'] = os.path.basename(
                    modification_frequency_summary_filename)
                crispresso2_info['modification_percentage_summary_filename'] = os.path.basename(
                    modification_percentage_summary_filename)

                # if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window
                if guides_all_same and consensus_guides != []:
                    info(
                        "All guides are equal. Performing comparison of batches for amplicon '%s'"
                        % amplicon_name)
                    include_idxs = consensus_include_idxs  # include indexes are the same for all guides
                    for guide_idx, sgRNA in enumerate(consensus_guides):
                        sgRNA_intervals = consensus_sgRNA_intervals[guide_idx]
                        sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[guide_idx]
                        plot_idxs_flat = [0, 1]  # guide, nucleotide
                        # +2 skips the two leading label columns of the summary tables
                        plot_idxs_flat.extend(
                            [plot_idx + 2 for plot_idx in sgRNA_plot_idxs])
                        sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:, plot_idxs_flat]
                        sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:, plot_idxs_flat]
                        sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:, plot_idxs_flat]

                        # show all sgRNA's on the plot
                        sub_sgRNA_intervals = []
                        for sgRNA_interval in consensus_sgRNA_intervals:
                            newstart = None
                            newend = None
                            # translate amplicon coordinates into positions within the plot window
                            for plot_pos, amp_pos in enumerate(sgRNA_plot_idxs):
                                if amp_pos <= sgRNA_interval[0]:
                                    newstart = plot_pos
                                if newend is None and amp_pos >= sgRNA_interval[1]:
                                    newend = plot_pos

                            # if guide doesn't overlap with plot idxs
                            if newend == 0 or newstart == len(sgRNA_plot_idxs):
                                continue
                            # otherwise, correct partial overlaps
                            elif newstart is None and newend is None:
                                newstart = 0
                                newend = len(include_idxs) - 1
                            elif newstart is None:
                                newstart = 0
                            elif newend is None:
                                newend = len(include_idxs) - 1
                            # and add it to the list
                            sub_sgRNA_intervals.append((newstart, newend))

                        if not args.suppress_plots:
                            # plot for each guide
                            this_window_nuc_pct_quilt_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt_around_sgRNA_' + sgRNA)
                            CRISPRessoPlot.plot_nucleotide_quilt(
                                sub_nucleotide_percentage_summary_df,
                                sub_modification_percentage_summary_df,
                                this_window_nuc_pct_quilt_plot_name,
                                save_png,
                                sgRNA_intervals=sub_sgRNA_intervals,
                                quantification_window_idxs=include_idxs)
                            plot_name = os.path.basename(
                                this_window_nuc_pct_quilt_plot_name)
                            window_nuc_pct_quilt_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_titles'][plot_name] = (
                                'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name)
                            if len(consensus_guides) == 1:
                                crispresso2_info['summary_plot_titles'][plot_name] = ''
                            crispresso2_info['summary_plot_labels'][plot_name] = (
                                'Composition of each base around the guide ' +
                                sgRNA + ' for the amplicon ' + amplicon_name)
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(modification_frequency_summary_filename))
                            ]

                            sub_nucleotide_frequency_summary_df = pd.concat([
                                sub_nucleotide_frequency_summary_df.iloc[:, 0:2],
                                sub_nucleotide_frequency_summary_df.iloc[:, 2:].apply(pd.to_numeric)
                            ], axis=1)
                            sub_nucleotide_frequency_summary_filename = _jp(
                                amplicon_plot_name +
                                'Nucleotide_frequency_summary_around_sgRNA_' +
                                sgRNA + '.txt')
                            sub_nucleotide_frequency_summary_df.to_csv(
                                sub_nucleotide_frequency_summary_filename,
                                sep='\t', index=None)

                            sub_nucleotide_percentage_summary_df = pd.concat([
                                sub_nucleotide_percentage_summary_df.iloc[:, 0:2],
                                sub_nucleotide_percentage_summary_df.iloc[:, 2:].apply(pd.to_numeric)
                            ], axis=1)
                            sub_nucleotide_percentage_summary_filename = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_summary_around_sgRNA_' +
                                sgRNA + '.txt')
                            sub_nucleotide_percentage_summary_df.to_csv(
                                sub_nucleotide_percentage_summary_filename,
                                sep='\t', index=None)

                            if args.base_editor_output:
                                this_window_nuc_conv_plot_name = _jp(
                                    amplicon_plot_name +
                                    'Nucleotide_conversion_map_around_sgRNA_' + sgRNA)
                                CRISPRessoPlot.plot_conversion_map(
                                    sub_nucleotide_percentage_summary_df,
                                    this_window_nuc_conv_plot_name,
                                    args.conversion_nuc_from,
                                    args.conversion_nuc_to,
                                    save_png,
                                    sgRNA_intervals=sub_sgRNA_intervals,
                                    quantification_window_idxs=include_idxs)
                                plot_name = os.path.basename(
                                    this_window_nuc_conv_plot_name)
                                window_nuc_conv_plot_names.append(plot_name)
                                crispresso2_info['summary_plot_titles'][plot_name] = (
                                    'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name)
                                if len(consensus_guides) == 1:
                                    crispresso2_info['summary_plot_titles'][plot_name] = ''
                                crispresso2_info['summary_plot_labels'][plot_name] = (
                                    args.conversion_nuc_from + '->' +
                                    args.conversion_nuc_to +
                                    ' conversion rates around the guide ' +
                                    sgRNA + ' for the amplicon ' + amplicon_name)
                                crispresso2_info['summary_plot_datas'][plot_name] = [
                                    ('Nucleotide frequencies around sgRNA',
                                     os.path.basename(sub_nucleotide_frequency_summary_filename)),
                                    ('Nucleotide percentages around sgRNA',
                                     os.path.basename(sub_nucleotide_percentage_summary_filename))
                                ]

                    if not args.suppress_plots:  # plot the whole region
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name,
                            save_png,
                            sgRNA_intervals=consensus_sgRNA_intervals,
                            quantification_window_idxs=include_idxs)
                        plot_name = os.path.basename(this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_titles'][plot_name] = (
                            'Amplicon: ' + amplicon_name)
                        if len(amplicon_names) == 1:
                            crispresso2_info['summary_plot_titles'][plot_name] = ''
                        crispresso2_info['summary_plot_labels'][plot_name] = (
                            'Composition of each base for the amplicon ' + amplicon_name)
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Nucleotide frequencies',
                             os.path.basename(nucleotide_frequency_summary_filename)),
                            ('Modification frequencies',
                             os.path.basename(modification_frequency_summary_filename))
                        ]

                        if args.base_editor_output:
                            this_nuc_conv_plot_name = _jp(
                                amplicon_plot_name + 'Nucleotide_conversion_map')
                            CRISPRessoPlot.plot_conversion_map(
                                nucleotide_percentage_summary_df,
                                this_nuc_conv_plot_name,
                                args.conversion_nuc_from,
                                args.conversion_nuc_to,
                                save_png,
                                sgRNA_intervals=consensus_sgRNA_intervals,
                                quantification_window_idxs=include_idxs)
                            plot_name = os.path.basename(this_nuc_conv_plot_name)
                            nuc_conv_plot_names.append(plot_name)
                            # NOTE(review): the original code computed an
                            # 'Amplicon: <name>' title here and then blanked it
                            # unconditionally; the resulting empty title is
                            # kept - confirm whether the title was intended.
                            crispresso2_info['summary_plot_titles'][plot_name] = ''
                            crispresso2_info['summary_plot_labels'][plot_name] = (
                                args.conversion_nuc_from + '->' +
                                args.conversion_nuc_to +
                                ' conversion rates for the amplicon ' + amplicon_name)
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(modification_frequency_summary_filename))
                            ]
                else:  # guides are not the same
                    if not args.suppress_plots:
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name,
                            save_png)
                        plot_name = os.path.basename(this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_labels'][plot_name] = (
                            'Composition of each base for the amplicon ' + amplicon_name)
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Nucleotide frequencies',
                             os.path.basename(nucleotide_frequency_summary_filename)),
                            ('Modification frequencies',
                             os.path.basename(modification_frequency_summary_filename))
                        ]

                        if args.base_editor_output:
                            # Use the conversion-map name (as in the branch
                            # above) so this plot does not overwrite the quilt
                            # plot and its label written just before.
                            this_nuc_conv_plot_name = _jp(
                                amplicon_plot_name + 'Nucleotide_conversion_map')
                            CRISPRessoPlot.plot_conversion_map(
                                nucleotide_percentage_summary_df,
                                this_nuc_conv_plot_name,
                                args.conversion_nuc_from,
                                args.conversion_nuc_to,
                                save_png)
                            plot_name = os.path.basename(this_nuc_conv_plot_name)
                            nuc_conv_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_labels'][plot_name] = (
                                args.conversion_nuc_from + '->' +
                                args.conversion_nuc_to +
                                ' conversion rates for the amplicon ' + amplicon_name)
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(modification_frequency_summary_filename))
                            ]

        crispresso2_info['window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
        crispresso2_info['nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
        crispresso2_info['window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
        crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names

        # summarize amplicon modifications
        with open(_jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'),
                  'w') as outfile:
            wrote_header = False
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                amplicon_modification_file = os.path.join(
                    folder_name, run_data['quant_of_editing_freq_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    # the header of the first available file is reused for the merged table
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(batchName + "\t" + line)

        # summarize alignment
        with open(_jp('CRISPRessoBatch_mapping_statistics.txt'), 'w') as outfile:
            wrote_header = False
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                mapping_stats_file = os.path.join(
                    folder_name, run_data['mapping_stats_filename'])
                with open(mapping_stats_file, 'r') as infile:
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(batchName + "\t" + line)

        if not args.suppress_report:
            if args.place_report_in_output_folder:
                report_name = _jp("CRISPResso2Batch_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_batch_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        with open(crispresso2Batch_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # args may not exist yet if the failure happened during parsing
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)