def main():
    """Command-line entry point for CRISPRessoWGS.

    Workflow, as implemented below:

    1. Parse the command-line arguments (shared CRISPResso options plus the
       WGS-specific ones added here) and check that samtools/bowtie2 and all
       input files are available.
    2. Create the output directory and the running log, and make sure the
       input BAM and the reference FASTA are indexed.
    3. Load the region (BED-like) file, name each region, extract the
       reference sequence of each region, and blank out sgRNAs that are not
       found in their region.
    4. For every region, extract the aligned reads with samtools, trim them
       to the region and write them to a fastq.gz file.
    5. Run CRISPResso on every region with at least
       ``--min_reads_to_use_region`` reads.
    6. Collect the per-region results into a summary table, produce the
       summary plots and (unless suppressed) the HTML report, then pickle
       the run information.

    The function never returns normally: it exits with status 0 on success,
    1 if a dependency is missing and -1 if any exception is raised.
    """

    def print_stacktrace_if_debug():
        # 'args' is only bound in the enclosing scope once argument parsing
        # succeeded, so guard against failures that happen before that.
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)
            error(traceback.format_exc())

    try:
        description = [
            '~~~CRISPRessoWGS~~~',
            '-Analysis of CRISPR/Cas9 outcomes from WGS data-'
        ]
        wgs_string = r'''
 ____________
|     __  __ |
||  |/ _ (_  |
||/\|\__)__) |
|____________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, wgs_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoWGS Parameters', requiredParams={})

        #tool specific optional
        parser.add_argument('-b',
                            '--bam_file',
                            type=str,
                            help='WGS aligned bam file',
                            required=True,
                            default='bam filename')
        parser.add_argument(
            '-f',
            '--region_file',
            type=str,
            help=
            'Regions description file. A BED format file containing the regions to analyze, one per line. The REQUIRED '
            'columns are: chr_id(chromosome name), bpstart(start position), bpend(end position), the optional columns are:name (an unique indentifier for the region), guide_seq, expected_hdr_amplicon_seq,coding_seq, see CRISPResso help for more details on these last 3 parameters)',
            required=True)
        parser.add_argument(
            '-r',
            '--reference_file',
            type=str,
            help=
            'A FASTA format reference file (for example hg19.fa for the human genome)',
            default='',
            required=True)
        parser.add_argument(
            '--min_reads_to_use_region',
            type=float,
            help=
            'Minimum number of reads that align to a region to perform the CRISPResso analysis',
            default=10)
        parser.add_argument(
            '--skip_failed',
            help='Continue with pooled analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--gene_annotations',
            type=str,
            help=
            'Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), '
            'please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"',
            default='')
        parser.add_argument(
            '-p',
            '--n_processes',
            type=int,
            help='Specify the number of processes to use for the quantification. '
            'Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.',
            default=1)
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        # Options that are set per region by this tool and therefore must not
        # be propagated from the WGS command line to the CRISPResso sub-runs.
        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set([
            'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name',
            'output_folder', 'name'
        ])
        crispresso_options_for_wgs = list(crispresso_options -
                                          options_to_ignore)

        info('Checking dependencies...')

        if check_samtools() and check_bowtie2():
            info('\n All the required dependencies are present!')
        else:
            sys.exit(1)

        #check files
        check_file(args.bam_file)
        check_file(args.reference_file)
        check_file(args.region_file)

        if args.gene_annotations:
            check_file(args.gene_annotations)

        #INIT
        get_name_from_bam = lambda x: os.path.basename(x).replace('.bam', '')

        if not args.name:
            database_id = '%s' % get_name_from_bam(args.bam_file)
        else:
            database_id = args.name

        OUTPUT_DIRECTORY = 'CRISPRessoWGS_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except OSError:
            # Only filesystem errors are treated as "folder already there";
            # anything else is a programming error and should surface.
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoWGS_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2WGS_info_file = os.path.join(OUTPUT_DIRECTORY,
                                                'CRISPResso2WGS_info.pickle')
        crispresso2_info = {
        }  #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        def rreplace(s, old, new):
            # Replace only the LAST occurrence of `old`, so that a '.bam'
            # appearing earlier in the path (e.g. a directory name) is kept.
            li = s.rsplit(old, 1)
            return new.join(li)

        #check if bam has the index already (either <name>.bai or <name>.bam.bai)
        if os.path.exists(rreplace(args.bam_file, ".bam", ".bai")):
            info('Index file for input .bam file exists, skipping generation.')
        elif os.path.exists(args.bam_file + '.bai'):
            info('Index file for input .bam file exists, skipping generation.')
        else:
            info('Creating index file for input .bam file...')
            sb.call('samtools index %s ' % (args.bam_file), shell=True)

        #load gene annotation
        if args.gene_annotations:
            info('Loading gene coordinates from annotation file: %s...' %
                 args.gene_annotations)
            try:
                df_genes = pd.read_table(args.gene_annotations,
                                         compression='gzip')
                df_genes.txEnd = df_genes.txEnd.astype(int)
                df_genes.txStart = df_genes.txStart.astype(int)
                df_genes.head()
            except Exception:
                # df_genes is required further down; failing here gives a
                # clear message instead of a later NameError.
                raise Exception('Failed to load the gene annotations file.')

        #Load and validate the REGION FILE
        df_regions = pd.read_csv(args.region_file,
                                 names=[
                                     'chr_id', 'bpstart', 'bpend', 'Name',
                                     'sgRNA', 'Expected_HDR',
                                     'Coding_sequence'
                                 ],
                                 comment='#',
                                 sep='\t',
                                 dtype={'Name': str})

        #remove empty amplicons/lines
        df_regions.dropna(subset=['chr_id', 'bpstart', 'bpend'], inplace=True)

        df_regions.Expected_HDR = df_regions.Expected_HDR.apply(
            capitalize_sequence)
        df_regions.sgRNA = df_regions.sgRNA.apply(capitalize_sequence)
        df_regions.Coding_sequence = df_regions.Coding_sequence.apply(
            capitalize_sequence)

        #check or create names
        for idx, row in df_regions.iterrows():
            if pd.isnull(row.Name):
                # idx is the row label kept from read_csv, so use label-based .loc
                df_regions.loc[idx, 'Name'] = '_'.join(
                    map(str, [row['chr_id'], row['bpstart'], row['bpend']]))

        if not len(df_regions.Name.unique()) == df_regions.shape[0]:
            raise Exception('The amplicon names should be all distinct!')

        df_regions = df_regions.set_index('Name')
        #df_regions.index=df_regions.index.str.replace(' ','_')
        df_regions.index = df_regions.index.to_series().str.replace(' ', '_')

        #extract sequence for each region
        uncompressed_reference = args.reference_file

        if os.path.exists(uncompressed_reference + '.fai'):
            info(
                'The index for the reference fasta file is already present! Skipping generation.'
            )
        else:
            info('Indexing reference file... Please be patient!')
            sb.call('samtools faidx %s >>%s 2>&1' %
                    (uncompressed_reference, log_filename),
                    shell=True)

        df_regions['sequence'] = df_regions.apply(
            lambda row: get_region_from_fa(row.chr_id, row.bpstart, row.bpend,
                                           uncompressed_reference),
            axis=1)

        # Drop the sgRNA of a region when none of its guides (forward or
        # reverse complement) occurs in the region's reference sequence.
        for idx, row in df_regions.iterrows():
            if not pd.isnull(row.sgRNA):
                cut_points = []

                for current_guide_seq in row.sgRNA.strip().upper().split(','):
                    wrong_nt = find_wrong_nt(current_guide_seq)
                    if wrong_nt:
                        raise NTException(
                            'The sgRNA sequence %s contains wrong characters:%s'
                            % (current_guide_seq, ' '.join(wrong_nt)))

                    offset_fw = args.quantification_window_center + len(
                        current_guide_seq) - 1
                    offset_rc = (-args.quantification_window_center) - 1
                    guide_rc = CRISPRessoShared.reverse_complement(
                        current_guide_seq)
                    cut_points += [
                        m.start() + offset_fw
                        for m in re.finditer(current_guide_seq, row.sequence)
                    ] + [
                        m.start() + offset_rc
                        for m in re.finditer(guide_rc, row.sequence)
                    ]

                if not cut_points:
                    df_regions.loc[idx, 'sgRNA'] = ''

        df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
        df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])

        df_regions.bpstart = df_regions.bpstart.astype(int)
        df_regions.bpend = df_regions.bpend.astype(int)

        if args.gene_annotations:
            df_regions = df_regions.apply(
                lambda row: find_overlapping_genes(row, df_genes), axis=1)

        #extract reads with samtools in that region and create a bam
        #create a fasta file with all the trimmed reads
        info('\nProcessing each region...')

        ANALYZED_REGIONS = _jp('ANALYZED_REGIONS/')
        if not os.path.exists(ANALYZED_REGIONS):
            os.mkdir(ANALYZED_REGIONS)

        df_regions['n_reads'] = 0
        df_regions['bam_file_with_reads_in_region'] = ''
        df_regions['fastq.gz_file_trimmed_reads_in_region'] = ''

        for idx, row in df_regions.iterrows():

            if row['sequence']:

                fastq_gz_filename = os.path.join(
                    ANALYZED_REGIONS,
                    '%s.fastq.gz' % clean_filename('REGION_' + str(idx)))
                bam_region_filename = os.path.join(
                    ANALYZED_REGIONS,
                    '%s.bam' % clean_filename('REGION_' + str(idx)))

                #create place-holder fastq files
                open(fastq_gz_filename, 'w+').close()

                region = '%s:%d-%d' % (row.chr_id, row.bpstart,
                                       row.bpend - 1)
                info('\nExtracting reads in:%s and create the .bam file: %s' %
                     (region, bam_region_filename))

                #extract reads in region
                cmd = r'''samtools view -b -F 4 %s %s > %s ''' % (
                    args.bam_file, region, bam_region_filename)
                #print cmd
                sb.call(cmd, shell=True)

                #index bam file
                cmd = r'''samtools index %s ''' % (bam_region_filename)
                #print cmd
                sb.call(cmd, shell=True)

                info('Trim reads and create a fastq.gz file in: %s' %
                     fastq_gz_filename)
                #trim reads in bam and convert in fastq
                n_reads = write_trimmed_fastq(bam_region_filename,
                                              row['bpstart'], row['bpend'],
                                              fastq_gz_filename)
                df_regions.loc[idx, 'n_reads'] = n_reads
                df_regions.loc[
                    idx, 'bam_file_with_reads_in_region'] = bam_region_filename
                df_regions.loc[
                    idx,
                    'fastq.gz_file_trimmed_reads_in_region'] = fastq_gz_filename

        df_regions.fillna('NA').to_csv(
            _jp('REPORT_READS_ALIGNED_TO_SELECTED_REGIONS_WGS.txt'), sep='\t')

        #Run Crispresso
        info('\nRunning CRISPResso on each region...')
        crispresso_cmds = []
        for idx, row in df_regions.iterrows():

            if row['n_reads'] >= args.min_reads_to_use_region:
                info('\nThe region [%s] has enough reads (%d) mapped to it!' %
                     (idx, row['n_reads']))

                crispresso_cmd = args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (
                    row['fastq.gz_file_trimmed_reads_in_region'],
                    row['sequence'], OUTPUT_DIRECTORY, idx)

                if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                    crispresso_cmd += ' -g %s' % row['sgRNA']

                if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                    crispresso_cmd += ' -e %s' % row['Expected_HDR']

                if row['Coding_sequence'] and not pd.isnull(
                        row['Coding_sequence']):
                    crispresso_cmd += ' -c %s' % row['Coding_sequence']

                crispresso_cmd = CRISPRessoShared.propagate_crispresso_options(
                    crispresso_cmd, crispresso_options_for_wgs, args)
                crispresso_cmds.append(crispresso_cmd)
            else:
                info(
                    '\nThe region [%s] has too few reads mapped to it (%d)! Not running CRISPResso!'
                    % (idx, row['n_reads']))

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      args.n_processes,
                                                      'region',
                                                      args.skip_failed)

        quantification_summary = []
        all_region_names = []
        all_region_read_counts = {}
        good_region_names = []
        good_region_folders = {}
        header = 'Name\tUnmodified%\tModified%\tReads_aligned\tReads_total\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
        header_els = header.split("\t")
        header_el_count = len(header_els)
        # Template for regions without results: every value column is NaN
        # (the leading 'Name' column is added separately, hence the -1).
        empty_line_els = [np.nan] * (header_el_count - 1)
        n_reads_index = header_els.index('Reads_total') - 1
        for idx, row in df_regions.iterrows():
            folder_name = 'CRISPResso_on_%s' % idx
            run_name = idx

            all_region_names.append(run_name)
            all_region_read_counts[run_name] = row.n_reads

            run_file = os.path.join(_jp(folder_name),
                                    'CRISPResso2_info.pickle')
            if not os.path.exists(run_file):
                warn(
                    'Skipping the folder %s: not enough reads, incomplete, or empty folder.'
                    % folder_name)
                this_els = empty_line_els[:]
                this_els[n_reads_index] = row.n_reads
                to_add = [run_name]
                to_add.extend(this_els)
                quantification_summary.append(to_add)
            else:
                with open(run_file, 'rb') as run_handle:
                    run_data = cp.load(run_handle)
                ref_name = run_data['ref_names'][
                    0]  #only expect one amplicon sequence
                n_tot = row.n_reads
                n_aligned = run_data['counts_total'][ref_name]
                n_unmod = run_data['counts_unmodified'][ref_name]
                n_mod = run_data['counts_modified'][ref_name]
                n_discarded = run_data['counts_discarded'][ref_name]

                n_insertion = run_data['counts_insertion'][ref_name]
                n_deletion = run_data['counts_deletion'][ref_name]
                n_substitution = run_data['counts_substitution'][ref_name]
                n_only_insertion = run_data['counts_only_insertion'][ref_name]
                n_only_deletion = run_data['counts_only_deletion'][ref_name]
                n_only_substitution = run_data['counts_only_substitution'][
                    ref_name]
                n_insertion_and_deletion = run_data[
                    'counts_insertion_and_deletion'][ref_name]
                n_insertion_and_substitution = run_data[
                    'counts_insertion_and_substitution'][ref_name]
                n_deletion_and_substitution = run_data[
                    'counts_deletion_and_substitution'][ref_name]
                n_insertion_and_deletion_and_substitution = run_data[
                    'counts_insertion_and_deletion_and_substitution'][ref_name]

                # With no aligned reads the percentages are undefined. Use NaN
                # (written as 'NA' by fillna below, like the skipped regions)
                # instead of rounding a string, which would raise TypeError.
                if n_aligned > 0:
                    unmod_pct = round(100 * n_unmod / float(n_aligned), 8)
                    mod_pct = round(100 * n_mod / float(n_aligned), 8)
                else:
                    unmod_pct = np.nan
                    mod_pct = np.nan

                vals = [run_name]
                vals.extend([
                    unmod_pct, mod_pct, n_aligned, n_tot, n_unmod, n_mod,
                    n_discarded, n_insertion, n_deletion, n_substitution,
                    n_only_insertion, n_only_deletion, n_only_substitution,
                    n_insertion_and_deletion, n_insertion_and_substitution,
                    n_deletion_and_substitution,
                    n_insertion_and_deletion_and_substitution
                ])
                quantification_summary.append(vals)

                good_region_names.append(idx)
                good_region_folders[idx] = folder_name

        samples_quantification_summary_filename = _jp(
            'SAMPLES_QUANTIFICATION_SUMMARY.txt')

        df_summary_quantification = pd.DataFrame(quantification_summary,
                                                 columns=header_els)
        if args.crispresso1_mode:
            crispresso1_columns = [
                'Name', 'Unmodified%', 'Modified%', 'Reads_aligned',
                'Reads_total'
            ]
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename,
                sep='\t',
                index=None,
                columns=crispresso1_columns)
        else:
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename,
                sep='\t',
                index=None)

        crispresso2_info[
            'samples_quantification_summary_filename'] = os.path.basename(
                samples_quantification_summary_filename)
        crispresso2_info['regions'] = df_regions
        crispresso2_info['all_region_names'] = all_region_names
        crispresso2_info['all_region_read_counts'] = all_region_read_counts
        crispresso2_info['good_region_names'] = good_region_names
        crispresso2_info['good_region_folders'] = good_region_folders

        crispresso2_info['summary_plot_names'] = []
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}

        save_png = True
        if args.suppress_report:
            save_png = False

        plot_root = _jp("CRISPRessoWGS_modification_summary")
        CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,
                                           df_summary_quantification,
                                           save_png,
                                           args.min_reads_to_use_region)
        plot_name = os.path.basename(plot_root)
        crispresso2_info['summary_plot_root'] = plot_name
        crispresso2_info['summary_plot_names'].append(plot_name)
        crispresso2_info['summary_plot_titles'][
            plot_name] = 'CRISPRessoWGS Modification Summary'
        crispresso2_info['summary_plot_labels'][
            plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
        crispresso2_info['summary_plot_datas'][plot_name] = [
            ('CRISPRessoWGS summary',
             os.path.basename(samples_quantification_summary_filename))
        ]

        plot_root = _jp("CRISPRessoWGS_reads_summary")
        CRISPRessoPlot.plot_reads_total(plot_root, df_summary_quantification,
                                        save_png,
                                        args.min_reads_to_use_region)
        plot_name = os.path.basename(plot_root)
        crispresso2_info['summary_plot_root'] = plot_name
        crispresso2_info['summary_plot_names'].append(plot_name)
        crispresso2_info['summary_plot_titles'][
            plot_name] = 'CRISPRessoWGS Read Allocation Summary'
        crispresso2_info['summary_plot_labels'][
            plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
        crispresso2_info['summary_plot_datas'][plot_name] = [
            ('CRISPRessoWGS summary',
             os.path.basename(samples_quantification_summary_filename))
        ]

        if not args.suppress_report:
            report_name = _jp('CRISPResso2WGS_report.html')
            CRISPRessoReport.make_wgs_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)

        with open(crispresso2WGS_info_file, 'wb') as info_handle:
            cp.dump(crispresso2_info, info_handle)

        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        print_stacktrace_if_debug()
        error('\n\nERROR: %s' % e)
        sys.exit(-1)
def main(): def print_stacktrace_if_debug(): debug_flag = False if 'args' in vars() and 'debug' in args: debug_flag = args.debug if debug_flag: traceback.print_exc(file=sys.stdout) error(traceback.format_exc()) try: start_time = datetime.now() start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S') description = [ '~~~CRISPRessoWGS~~~', '-Analysis of CRISPR/Cas9 outcomes from WGS data-' ] wgs_string = r''' ____________ | __ __ | || |/ _ (_ | ||/\|\__)__) | |____________| ''' print(CRISPRessoShared.get_crispresso_header(description, wgs_string)) parser = CRISPRessoShared.getCRISPRessoArgParser( parserTitle='CRISPRessoWGS Parameters', requiredParams={}) #tool specific optional parser.add_argument('-b', '--bam_file', type=str, help='WGS aligned bam file', required=True, default='bam filename') parser.add_argument( '-f', '--region_file', type=str, help= 'Regions description file. A BED format file containing the regions to analyze, one per line. The REQUIRED\ columns are: chr_id(chromosome name), bpstart(start position), bpend(end position), the optional columns are:name (an unique indentifier for the region), guide_seq, expected_hdr_amplicon_seq,coding_seq, see CRISPResso help for more details on these last 3 parameters)', required=True) parser.add_argument( '-r', '--reference_file', type=str, help= 'A FASTA format reference file (for example hg19.fa for the human genome)', default='', required=True) parser.add_argument( '--min_reads_to_use_region', type=float, help= 'Minimum number of reads that align to a region to perform the CRISPResso analysis', default=10) parser.add_argument( '--skip_failed', help='Continue with pooled analysis even if one sample fails', action='store_true') parser.add_argument( '--gene_annotations', type=str, help= 'Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \ please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip 
compressed"', default='') parser.add_argument('--crispresso_command', help='CRISPResso command to call', default='CRISPResso') args = parser.parse_args() crispresso_options = CRISPRessoShared.get_crispresso_options() options_to_ignore = { 'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name', 'output_folder', 'name' } crispresso_options_for_wgs = list(crispresso_options - options_to_ignore) info('Checking dependencies...') if check_samtools() and check_bowtie2(): info('\n All the required dependencies are present!') else: sys.exit(1) #check files check_file(args.bam_file) check_file(args.reference_file) check_file(args.region_file) if args.gene_annotations: check_file(args.gene_annotations) # for computation performed in CRISPRessoWGS (e.g. bowtie alignment, etc) use n_processes_for_wgs n_processes_for_wgs = 1 if args.n_processes == "max": n_processes_for_wgs = CRISPRessoMultiProcessing.get_max_processes() else: n_processes_for_wgs = int(args.n_processes) # here, we set args.n_processes as 1 because this value is propagated to sub-CRISPResso runs (not for usage in CRISPRessoWGS) args.n_processes = 1 #INIT get_name_from_bam = lambda x: os.path.basename(x).replace('.bam', '') if not args.name: database_id = '%s' % get_name_from_bam(args.bam_file) else: clean_name = CRISPRessoShared.slugify(args.name) if args.name != clean_name: warn( 'The specified name {0} contained invalid characters and was changed to: {1}' .format( args.name, clean_name, ), ) database_id = clean_name OUTPUT_DIRECTORY = 'CRISPRessoWGS_on_%s' % database_id if args.output_folder: OUTPUT_DIRECTORY = os.path.join( os.path.abspath(args.output_folder), OUTPUT_DIRECTORY) _jp = lambda filename: os.path.join( OUTPUT_DIRECTORY, filename ) #handy function to put a file in the output directory try: info('Creating Folder %s' % OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY) info('Done!') except: warn('Folder %s already exists.' 
% OUTPUT_DIRECTORY) log_filename = _jp('CRISPRessoWGS_RUNNING_LOG.txt') logger.addHandler(logging.FileHandler(log_filename)) crispresso2_info_file = os.path.join(OUTPUT_DIRECTORY, 'CRISPResso2WGS_info.json') crispresso2_info = { 'running_info': {}, 'results': { 'alignment_stats': {}, 'general_plots': {} } } #keep track of all information for this run to be pickled and saved at the end of the run crispresso2_info['running_info'][ 'version'] = CRISPRessoShared.__version__ crispresso2_info['running_info']['args'] = deepcopy(args) crispresso2_info['running_info']['log_filename'] = os.path.basename( log_filename) crispresso2_info['running_info']['finished_steps'] = {} crispresso_cmd_to_write = ' '.join(sys.argv) if args.write_cleaned_report: cmd_copy = sys.argv[:] cmd_copy[0] = 'CRISPRessoWGS' for i in range(len(cmd_copy)): if os.sep in cmd_copy[i]: cmd_copy[i] = os.path.basename(cmd_copy[i]) crispresso_cmd_to_write = ' '.join( cmd_copy ) #clean command doesn't show the absolute path to the executable or other files crispresso2_info['running_info'][ 'command_used'] = crispresso_cmd_to_write with open(log_filename, 'w+') as outfile: outfile.write( 'CRISPResso version %s\n[Command used]:\n%s\n\n[Execution log]:\n' % (CRISPRessoShared.__version__, crispresso_cmd_to_write)) #keep track of args to see if it is possible to skip computation steps on rerun can_finish_incomplete_run = False if args.no_rerun: if os.path.exists(crispresso2_info_file): previous_run_data = CRISPRessoShared.load_crispresso_info( OUTPUT_DIRECTORY) if previous_run_data['running_info'][ 'version'] == CRISPRessoShared.__version__: args_are_same = True for arg in vars(args): if arg == "no_rerun" or arg == "debug" or arg == "n_processes": continue if arg not in vars( previous_run_data['running_info']['args']): info( 'Comparing current run to previous run: old run had argument ' + str(arg) + ' \nRerunning.') args_are_same = False elif str( getattr( previous_run_data['running_info']['args'], arg)) != 
str(getattr(args, arg)): info( 'Comparing current run to previous run:\n\told argument ' + str(arg) + ' = ' + str( getattr( previous_run_data['running_info'] ['args'], arg)) + '\n\tnew argument: ' + str(arg) + ' = ' + str(getattr(args, arg)) + '\nRerunning.') args_are_same = False if args_are_same: if 'end_time_string' in previous_run_data: info('Analysis already completed on %s!' % previous_run_data['running_info'] ['end_time_string']) sys.exit(0) else: can_finish_incomplete_run = True if 'finished_steps' in previous_run_data[ 'running_info']: for key in previous_run_data['running_info'][ 'finished_steps'].keys(): crispresso2_info['running_info'][ 'finished_steps'][ key] = previous_run_data[ 'running_info'][ 'finished_steps'][key] if args.debug: info('finished: ' + key) else: info( 'The no_rerun flag is set, but this analysis will be rerun because the existing run was performed using an old version of CRISPResso (' + str(previous_run_data['running_info']['version']) + ').') #write this file early on so we can check the params if we have to rerun CRISPRessoShared.write_crispresso_info( crispresso2_info_file, crispresso2_info, ) def rreplace(s, old, new): li = s.rsplit(old) return new.join(li) #check if bam has the index already if os.path.exists(rreplace(args.bam_file, ".bam", ".bai")): info('Index file for input .bam file exists, skipping generation.') elif os.path.exists(args.bam_file + '.bai'): info('Index file for input .bam file exists, skipping generation.') else: info('Creating index file for input .bam file...') sb.call('samtools index %s ' % (args.bam_file), shell=True) #load gene annotation if args.gene_annotations: info('Loading gene coordinates from annotation file: %s...' 
% args.gene_annotations) try: df_genes = pd.read_csv(args.gene_annotations, compression='gzip', sep="\t") df_genes.txEnd = df_genes.txEnd.astype(int) df_genes.txStart = df_genes.txStart.astype(int) df_genes.head() except: raise Exception('Failed to load the gene annotations file.') #Load and validate the REGION FILE df_regions = pd.read_csv(args.region_file, names=[ 'chr_id', 'bpstart', 'bpend', 'Name', 'sgRNA', 'Expected_HDR', 'Coding_sequence' ], comment='#', sep='\t', dtype={ 'Name': str, 'chr_id': str }) #remove empty amplicons/lines df_regions.dropna(subset=['chr_id', 'bpstart', 'bpend'], inplace=True) df_regions.Expected_HDR = df_regions.Expected_HDR.apply( capitalize_sequence) df_regions.sgRNA = df_regions.sgRNA.apply(capitalize_sequence) df_regions.Coding_sequence = df_regions.Coding_sequence.apply( capitalize_sequence) #check or create names for idx, row in df_regions.iterrows(): if pd.isnull(row.Name): df_regions.iloc[idx, ]['Name'] = '_'.join( map(str, [row['chr_id'], row['bpstart'], row['bpend']])) if not len(df_regions.Name.unique()) == df_regions.shape[0]: raise Exception('The amplicon names should be all distinct!') df_regions.set_index('Name', inplace=True) #df_regions.index=df_regions.index.str.replace(' ','_') df_regions.index = df_regions.index.to_series().str.replace(' ', '_') #extract sequence for each region uncompressed_reference = args.reference_file if os.path.exists(uncompressed_reference + '.fai'): info( 'The index for the reference fasta file is already present! Skipping generation.' ) else: info('Indexing reference file... 
Please be patient!') sb.call('samtools faidx %s >>%s 2>&1' % (uncompressed_reference, log_filename), shell=True) info( 'Retrieving reference sequences for amplicons and checking for sgRNAs' ) df_regions['sequence'] = df_regions.apply( lambda row: get_region_from_fa(row.chr_id, row.bpstart, row.bpend, uncompressed_reference), axis=1) for idx, row in df_regions.iterrows(): if not pd.isnull(row.sgRNA): cut_points = [] guides = row.sgRNA.strip().upper().split(',') guide_qw_centers = CRISPRessoShared.set_guide_array( args.quantification_window_center, guides, 'guide quantification center') for idx, current_guide_seq in enumerate(guides): wrong_nt = find_wrong_nt(current_guide_seq) if wrong_nt: raise NTException( 'The sgRNA sequence %s contains wrong characters:%s' % (current_guide_seq, ' '.join(wrong_nt))) offset_fw = guide_qw_centers[idx] + len( current_guide_seq) - 1 offset_rc = (-guide_qw_centers[idx]) - 1 cut_points+=[m.start() + offset_fw for \ m in re.finditer(current_guide_seq, row.sequence)]+[m.start() + offset_rc for m in re.finditer(CRISPRessoShared.reverse_complement(current_guide_seq), row.sequence)] if not cut_points: df_regions.iloc[idx, :]['sgRNA'] = '' info('Cannot find guide ' + str(row.sgRNA) + ' in amplicon ' + str(idx) + ' (' + str(row) + ')') df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart']) df_regions['bpend'] = pd.to_numeric(df_regions['bpend']) df_regions.bpstart = df_regions.bpstart.astype(int) df_regions.bpend = df_regions.bpend.astype(int) if args.gene_annotations: df_regions = df_regions.apply( lambda row: find_overlapping_genes(row, df_genes), axis=1) #extract reads with samtools in that region and create a bam #create a fasta file with all the trimmed reads info('\nProcessing each region...') ANALYZED_REGIONS = _jp('ANALYZED_REGIONS/') if not os.path.exists(ANALYZED_REGIONS): os.mkdir(ANALYZED_REGIONS) df_regions['region_number'] = np.arange(len(df_regions)) def set_filenames(row): row_fastq_exists = False fastq_gz_filename = 
os.path.join( ANALYZED_REGIONS, '%s.fastq.gz' % clean_filename('REGION_' + str(row.region_number))) bam_region_filename = os.path.join( ANALYZED_REGIONS, '%s.bam' % clean_filename('REGION_' + str(row.region_number))) #if bam file already exists, don't regenerate it if os.path.isfile(fastq_gz_filename): row_fastq_exists = True return bam_region_filename, fastq_gz_filename, row_fastq_exists df_regions['bam_file_with_reads_in_region'], df_regions[ 'fastq_file_trimmed_reads_in_region'], df_regions[ 'row_fastq_exists'] = zip( *df_regions.apply(set_filenames, axis=1)) df_regions['n_reads'] = 0 df_regions[ 'original_bam'] = args.bam_file #stick this in the df so we can parallelize the analysis and not pass params report_reads_aligned_filename = _jp( 'REPORT_READS_ALIGNED_TO_SELECTED_REGIONS_WGS.txt') num_rows_without_fastq = len( df_regions[df_regions.row_fastq_exists == False]) if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile( report_reads_aligned_filename ) and 'generation_of_fastq_files_for_each_amplicon' in crispresso2_info[ 'running_info']['finished_steps']: info('Skipping generation of fastq files for each amplicon.') df_regions = pd.read_csv(report_reads_aligned_filename, comment='#', sep='\t', dtype={ 'Name': str, 'chr_id': str }) df_regions.set_index('Name', inplace=True) else: #run region extraction here df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel( df_regions, extract_reads_chunk, n_processes_for_wgs) df_regions.sort_values('region_number', inplace=True) cols_to_print = [ "chr_id", "bpstart", "bpend", "sgRNA", "Expected_HDR", "Coding_sequence", "sequence", "n_reads", "bam_file_with_reads_in_region", "fastq_file_trimmed_reads_in_region" ] if args.gene_annotations: cols_to_print.append('gene_overlapping') df_regions.fillna('NA').to_csv(report_reads_aligned_filename, sep='\t', columns=cols_to_print, index_label="Name") #save progress crispresso2_info['running_info']['finished_steps'][ 
'generation_of_fastq_files_for_each_amplicon'] = True CRISPRessoShared.write_crispresso_info( crispresso2_info_file, crispresso2_info, ) #Run Crispresso info('Running CRISPResso on each region...') crispresso_cmds = [] for idx, row in df_regions.iterrows(): if row['n_reads'] >= args.min_reads_to_use_region: info('\nThe region [%s] has enough reads (%d) mapped to it!' % (idx, row['n_reads'])) crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' %\ (row['fastq_file_trimmed_reads_in_region'], row['sequence'], OUTPUT_DIRECTORY, idx) if row['sgRNA'] and not pd.isnull(row['sgRNA']): crispresso_cmd += ' -g %s' % row['sgRNA'] if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']): crispresso_cmd += ' -e %s' % row['Expected_HDR'] if row['Coding_sequence'] and not pd.isnull( row['Coding_sequence']): crispresso_cmd += ' -c %s' % row['Coding_sequence'] crispresso_cmd = CRISPRessoShared.propagate_crispresso_options( crispresso_cmd, crispresso_options_for_wgs, args) #logging like this causes the multiprocessing step to not block for some reason #mysteriesOfThPythonUniverse #log_name = _jp("CRISPResso_on_"+idx) +".log" #crispresso_cmd += " &> %s"%log_name crispresso_cmds.append(crispresso_cmd) # info('Running CRISPResso:%s' % crispresso_cmd) # sb.call(crispresso_cmd,shell=True) else: info( '\nThe region [%s] has too few reads mapped to it (%d)! Not running CRISPResso!' 
% (idx, row['n_reads'])) CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds, n_processes_for_wgs, 'region', args.skip_failed) quantification_summary = [] all_region_names = [] all_region_read_counts = {} good_region_names = [] good_region_folders = {} header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions' header_els = header.split("\t") header_el_count = len(header_els) empty_line_els = [np.nan] * (header_el_count - 1) n_reads_index = header_els.index('Reads_total') - 1 for idx, row in df_regions.iterrows(): folder_name = 'CRISPResso_on_%s' % idx run_name = idx all_region_names.append(run_name) all_region_read_counts[run_name] = row.n_reads run_file = os.path.join(_jp(folder_name), 'CRISPResso2_info.json') if not os.path.exists(run_file): warn( 'Skipping the folder %s: not enough reads, incomplete, or empty folder.' 
% folder_name) this_els = empty_line_els[:] this_els[n_reads_index] = row.n_reads to_add = [run_name] to_add.extend(this_els) quantification_summary.append(to_add) else: run_data = CRISPRessoShared.load_crispresso_info( _jp(folder_name), ) ref_name = run_data['results']['ref_names'][ 0] #only expect one amplicon sequence n_tot = row.n_reads n_aligned = run_data['results']['alignment_stats'][ 'counts_total'][ref_name] n_unmod = run_data['results']['alignment_stats'][ 'counts_unmodified'][ref_name] n_mod = run_data['results']['alignment_stats'][ 'counts_modified'][ref_name] n_discarded = run_data['results']['alignment_stats'][ 'counts_discarded'][ref_name] n_insertion = run_data['results']['alignment_stats'][ 'counts_insertion'][ref_name] n_deletion = run_data['results']['alignment_stats'][ 'counts_deletion'][ref_name] n_substitution = run_data['results']['alignment_stats'][ 'counts_substitution'][ref_name] n_only_insertion = run_data['results']['alignment_stats'][ 'counts_only_insertion'][ref_name] n_only_deletion = run_data['results']['alignment_stats'][ 'counts_only_deletion'][ref_name] n_only_substitution = run_data['results']['alignment_stats'][ 'counts_only_substitution'][ref_name] n_insertion_and_deletion = run_data['results'][ 'alignment_stats']['counts_insertion_and_deletion'][ ref_name] n_insertion_and_substitution = run_data['results'][ 'alignment_stats']['counts_insertion_and_substitution'][ ref_name] n_deletion_and_substitution = run_data['results'][ 'alignment_stats']['counts_deletion_and_substitution'][ ref_name] n_insertion_and_deletion_and_substitution = run_data[ 'results']['alignment_stats'][ 'counts_insertion_and_deletion_and_substitution'][ ref_name] unmod_pct = "NA" mod_pct = "NA" if n_aligned > 0: unmod_pct = 100 * n_unmod / float(n_aligned) mod_pct = 100 * n_mod / float(n_aligned) vals = [run_name] vals.extend([ round(unmod_pct, 8), round(mod_pct, 8), n_aligned, n_tot, n_unmod, n_mod, n_discarded, n_insertion, n_deletion, n_substitution, 
n_only_insertion, n_only_deletion, n_only_substitution, n_insertion_and_deletion, n_insertion_and_substitution, n_deletion_and_substitution, n_insertion_and_deletion_and_substitution ]) quantification_summary.append(vals) good_region_names.append(idx) good_region_folders[idx] = folder_name samples_quantification_summary_filename = _jp( 'SAMPLES_QUANTIFICATION_SUMMARY.txt') df_summary_quantification = pd.DataFrame(quantification_summary, columns=header_els) if args.crispresso1_mode: crispresso1_columns = [ 'Name', 'Unmodified%', 'Modified%', 'Reads_aligned', 'Reads_total' ] df_summary_quantification.fillna('NA').to_csv( samples_quantification_summary_filename, sep='\t', index=None, columns=crispresso1_columns) else: df_summary_quantification.fillna('NA').to_csv( samples_quantification_summary_filename, sep='\t', index=None) crispresso2_info['results']['alignment_stats'][ 'samples_quantification_summary_filename'] = os.path.basename( samples_quantification_summary_filename) crispresso2_info['results']['regions'] = df_regions crispresso2_info['results']['all_region_names'] = all_region_names crispresso2_info['results'][ 'all_region_read_counts'] = all_region_read_counts crispresso2_info['results']['good_region_names'] = good_region_names crispresso2_info['results'][ 'good_region_folders'] = good_region_folders crispresso2_info['results']['general_plots']['summary_plot_names'] = [] crispresso2_info['results']['general_plots'][ 'summary_plot_titles'] = {} crispresso2_info['results']['general_plots'][ 'summary_plot_labels'] = {} crispresso2_info['results']['general_plots']['summary_plot_datas'] = {} df_summary_quantification.set_index('Name') save_png = True if args.suppress_report: save_png = False if not args.suppress_plots: plot_root = _jp("CRISPRessoWGS_reads_summary") CRISPRessoPlot.plot_reads_total(plot_root, df_summary_quantification, save_png, args.min_reads_to_use_region) plot_name = os.path.basename(plot_root) crispresso2_info['results']['general_plots'][ 
'reads_summary_plot'] = plot_name crispresso2_info['results']['general_plots'][ 'summary_plot_names'].append(plot_name) crispresso2_info['results']['general_plots'][ 'summary_plot_titles'][ plot_name] = 'CRISPRessoWGS Read Allocation Summary' crispresso2_info['results']['general_plots']['summary_plot_labels'][ plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.' crispresso2_info['results']['general_plots']['summary_plot_datas'][ plot_name] = [ ('CRISPRessoWGS summary', os.path.basename(samples_quantification_summary_filename)) ] plot_root = _jp("CRISPRessoWGS_modification_summary") CRISPRessoPlot.plot_unmod_mod_pcts(plot_root, df_summary_quantification, save_png, args.min_reads_to_use_region) plot_name = os.path.basename(plot_root) crispresso2_info['results']['general_plots'][ 'modification_summary_plot'] = plot_name crispresso2_info['results']['general_plots'][ 'summary_plot_names'].append(plot_name) crispresso2_info['results']['general_plots'][ 'summary_plot_titles'][ plot_name] = 'CRISPRessoWGS Modification Summary' crispresso2_info['results']['general_plots']['summary_plot_labels'][ plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.' 
crispresso2_info['results']['general_plots']['summary_plot_datas'][ plot_name] = [ ('CRISPRessoWGS summary', os.path.basename(samples_quantification_summary_filename)) ] if not args.suppress_report and not args.suppress_plots: if (args.place_report_in_output_folder): report_name = _jp("CRISPResso2WGS_report.html") else: report_name = OUTPUT_DIRECTORY + '.html' CRISPRessoReport.make_wgs_report_from_folder( report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT) crispresso2_info['running_info']['report_location'] = report_name crispresso2_info['running_info'][ 'report_filename'] = os.path.basename(report_name) end_time = datetime.now() end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S') running_time = end_time - start_time running_time_string = str(running_time) crispresso2_info['running_info']['end_time'] = end_time crispresso2_info['running_info']['end_time_string'] = end_time_string crispresso2_info['running_info']['running_time'] = running_time crispresso2_info['running_info'][ 'running_time_string'] = running_time_string CRISPRessoShared.write_crispresso_info( crispresso2_info_file, crispresso2_info, ) info('Analysis Complete!') print(CRISPRessoShared.get_crispresso_footer()) sys.exit(0) except Exception as e: print_stacktrace_if_debug() error('\n\nERROR: %s' % e) sys.exit(-1)
def main(): try: start_time = datetime.now() start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S') description = [ '~~~CRISPRessoAggregate~~~', '-Aggregation of CRISPResso Run Data-' ] aggregate_string = r''' ___________________________________ | __ __ _ _ __ ___ _ | | /\ /__ /__ |_) |_ /__ /\ | |_ | |/--\ \_| \_| | \ |_ \_| /--\ | |_ | |__________________________________| ''' print( CRISPRessoShared.get_crispresso_header(description, aggregate_string)) parser = argparse.ArgumentParser( description="Aggreate CRISPResso2 Runs") parser.add_argument( "-p", "--prefix", action='append', help= "Prefix for CRISPResso folders to aggregate (may be specified multiple times)", default=[]) parser.add_argument("-s", "--suffix", type=str, help="Suffix for CRISPResso folders to aggregate", default="") parser.add_argument("-n", "--name", type=str, help="Output name of the report", required=True) parser.add_argument( '--min_reads_for_inclusion', help= 'Minimum number of reads for a run to be included in the run summary', type=int, default=0) parser.add_argument( '--place_report_in_output_folder', help= 'If true, report will be written inside the CRISPResso output folder. By default, the report will be written one directory up from the report output.', action='store_true') parser.add_argument('--suppress_report', help='Suppress output report', action='store_true') parser.add_argument('--suppress_plots', help='Suppress output plots', action='store_true') parser.add_argument('--debug', help='Show debug messages', action='store_true') args = parser.parse_args() output_folder_name = 'CRISPRessoAggregate_on_%s' % args.name OUTPUT_DIRECTORY = os.path.abspath(output_folder_name) _jp = lambda filename: os.path.join( OUTPUT_DIRECTORY, filename ) #handy function to put a file in the output directory try: info('Creating Folder %s' % OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY) except: warn('Folder %s already exists.' 
% OUTPUT_DIRECTORY) log_filename = _jp('CRISPRessoAggregate_RUNNING_LOG.txt') logging.getLogger().addHandler(logging.FileHandler(log_filename)) with open(log_filename, 'w+') as outfile: outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' % ' '.join(sys.argv)) crispresso2Aggregate_info_file = os.path.join( OUTPUT_DIRECTORY, 'CRISPResso2Aggregate_info.pickle') crispresso2_info = { } #keep track of all information for this run to be pickled and saved at the end of the run crispresso2_info['version'] = CRISPRessoShared.__version__ crispresso2_info['args'] = deepcopy(args) crispresso2_info['log_filename'] = os.path.basename(log_filename) #glob returns paths including the original prefix all_files = [] for prefix in args.prefix: all_files.extend(glob.glob(prefix + '*' + args.suffix)) if args.prefix != "": all_files.extend(glob.glob( prefix + '/*' + args.suffix)) #if a folder is given, add all subfolders seen_folders = {} crispresso2_folder_infos = { } #file_loc->crispresso_info; these are only CRISPResso runs -- this bit unrolls batch, pooled, and wgs runs successfully_imported_count = 0 not_imported_count = 0 for folder in all_files: if folder in seen_folders: #skip if we've seen this folder (glob could have added it twice) continue seen_folders[folder] = 1 if os.path.isdir(folder) and str(folder).endswith(args.suffix): #first, try to import a plain CRISPResso2 run crispresso_info_file = os.path.join(folder, 'CRISPResso2_info.pickle') if os.path.exists(crispresso_info_file): try: run_data = CRISPRessoShared.load_crispresso_info( folder) crispresso2_folder_infos[folder] = run_data successfully_imported_count += 1 except Exception as e: warn('Could not open CRISPResso2 info file in ' + folder) not_imported_count += 1 #second, check pooled pooled_info_file = os.path.join( folder, 'CRISPResso2Pooled_info.pickle') if os.path.exists(pooled_info_file): pooled_data = cp.load(open(pooled_info_file, 'rb')) if 'good_region_names' in pooled_data: run_names = 
pooled_data['good_region_names'] for run_name in run_names: run_folder_loc = os.path.join( folder, 'CRISPResso_on_%s' % run_name) try: run_data = CRISPRessoShared.load_crispresso_info( run_folder_loc) crispresso2_folder_infos[ run_folder_loc] = run_data successfully_imported_count += 1 except Exception as e: warn('Could not open CRISPResso2 info file in ' + run_folder_loc) not_imported_count += 1 else: warn('Could not process pooled folder ' + folder) not_imported_count += 1 #third, check batch batch_info_file = os.path.join(folder, 'CRISPResso2Batch_info.pickle') if os.path.exists(batch_info_file): batch_data = cp.load(open(batch_info_file, 'rb')) if 'completed_batch_arr' in batch_data: run_names = batch_data['completed_batch_arr'] for run_name in run_names: run_folder_loc = os.path.join( folder, 'CRISPResso_on_%s' % run_name) try: run_data = CRISPRessoShared.load_crispresso_info( run_folder_loc) crispresso2_folder_infos[ run_folder_loc] = run_data successfully_imported_count += 1 except Exception as e: warn('Could not open CRISPResso2 info file in ' + run_folder_loc) not_imported_count += 1 else: warn('Could not process batch folder ' + folder) not_imported_count += 1 #fourth, check WGS wgs_info_file = os.path.join(folder, 'CRISPResso2WGS_info.pickle') if os.path.exists(wgs_info_file): wgs_data = cp.load(open(wgs_info_file, 'rb')) if 'good_region_folders' in wgs_data: run_names = wgs_data['good_region_folders'] for run_name in run_names: run_folder_loc = os.path.join( folder, 'CRISPResso_on_%s' % run_name) try: run_data = CRISPRessoShared.load_crispresso_info( run_folder_loc) crispresso2_folder_infos[ run_folder_loc] = run_data successfully_imported_count += 1 except Exception as e: warn('Could not open CRISPResso2 info file in ' + run_folder_loc) not_imported_count += 1 else: warn('Could not process WGS folder ' + folder) not_imported_count += 1 info('Read ' + str(successfully_imported_count) + ' folders (' + str(not_imported_count) + ' not imported)') save_png 
= True if args.suppress_report: save_png = False if successfully_imported_count > 0: crispresso2_folders = crispresso2_folder_infos.keys() crispresso2_folder_names = {} crispresso2_folder_htmls = {} #file_loc->html folder loc for crispresso2_folder in crispresso2_folders: crispresso2_folder_names[ crispresso2_folder] = CRISPRessoShared.slugify( crispresso2_folder) this_sub_html_file = crispresso2_folder + ".html" if crispresso2_folder_infos[crispresso2_folder][ 'args'].place_report_in_output_folder: this_sub_html_file = os.path.join( crispresso2_folder, crispresso2_folder_infos[crispresso2_folder] ['report_filename']) crispresso2_folder_htmls[crispresso2_folder] = os.path.abspath( this_sub_html_file) all_amplicons = set() amplicon_names = { } #sequence -> ref name (to check for amplicons with the same name but different sequences) amplicon_counts = {} amplicon_sources = {} completed_batch_arr = [] for crispresso2_folder in crispresso2_folders: run_data = crispresso2_folder_infos[crispresso2_folder] for ref_name in run_data['ref_names']: ref_seq = run_data['refs'][ref_name]['sequence'] all_amplicons.add(ref_seq) #if this amplicon is called something else in another sample, just call it the amplicon if ref_name in amplicon_names and amplicon_names[ ref_seq] != ref_name: amplicon_names[ref_seq] = ref_seq else: amplicon_names[ref_seq] = ref_name if ref_seq not in amplicon_counts: amplicon_counts[ref_seq] = 0 amplicon_sources[ref_seq] = [] amplicon_counts[ref_seq] += 1 amplicon_sources[ref_seq].append(crispresso2_folder + '(' + ref_name + ')') #make sure amplicon names aren't super long for amplicon in all_amplicons: if len(amplicon_names[amplicon]) > 21: amplicon_names[amplicon] = amplicon_names[amplicon][0:21] #make sure no duplicate amplicon names (same name for the different amplicons) seen_names = [] for amplicon in all_amplicons: suffix_counter = 2 orig_name = amplicon_names[amplicon] while amplicon_names[amplicon] in seen_names: amplicon_names[amplicon] = 
orig_name + "_" + str( suffix_counter) suffix_counter += 1 seen_names.append(amplicon_names[amplicon]) crispresso2_info['ref_names'] = seen_names crispresso2_info['refs'] = {} crispresso2_info['summary_plot_names'] = [] crispresso2_info['summary_plot_titles'] = {} crispresso2_info['summary_plot_labels'] = {} crispresso2_info['summary_plot_datas'] = {} with open(_jp('CRISPRessoAggregate_amplicon_information.txt'), 'w') as outfile: outfile.write("\t".join([ 'Amplicon Name', 'Number of sources', 'Amplicon sources', 'Amplicon sequence' ]) + "\n") for amplicon in all_amplicons: outfile.write("\t".join([ amplicon_names[amplicon], str(amplicon_counts[amplicon]), ';'.join( amplicon_sources[amplicon]), amplicon ]) + "\n") window_nuc_pct_quilt_plot_names = [] nuc_pct_quilt_plot_names = [] window_nuc_conv_plot_names = [] nuc_conv_plot_names = [] #report for amplicons that appear multiple times for amplicon_index, amplicon_seq in enumerate(all_amplicons): amplicon_name = amplicon_names[amplicon_seq] crispresso2_info['refs'][amplicon_name] = {} #only perform comparison if amplicon seen in more than one sample if amplicon_counts[amplicon_seq] < 2: continue info('Reporting summary for amplicon: "' + amplicon_name + '"') consensus_sequence = "" nucleotide_frequency_summary = [] nucleotide_percentage_summary = [] modification_frequency_summary = [] modification_percentage_summary = [] amp_found_count = 0 #how many folders had information for this amplicon consensus_guides = [] consensus_include_idxs = [] consensus_sgRNA_plot_idxs = [] consensus_sgRNA_intervals = [] guides_all_same = True runs_with_this_amplicon = [] for crispresso2_folder in crispresso2_folders: run_data = crispresso2_folder_infos[crispresso2_folder] run_has_amplicon = False run_amplicon_name = '' for ref_name in run_data['ref_names']: if amplicon_seq == run_data['refs'][ref_name][ 'sequence']: run_has_amplicon = True run_amplicon_name = ref_name if not run_has_amplicon: continue 
runs_with_this_amplicon.append(crispresso2_folder) if consensus_guides == []: consensus_guides = run_data['refs'][run_amplicon_name][ 'sgRNA_sequences'] consensus_include_idxs = run_data['refs'][ run_amplicon_name]['include_idxs'] consensus_sgRNA_intervals = run_data['refs'][ run_amplicon_name]['sgRNA_intervals'] consensus_sgRNA_plot_idxs = run_data['refs'][ run_amplicon_name]['sgRNA_plot_idxs'] if run_data['refs'][run_amplicon_name][ 'sgRNA_sequences'] != consensus_guides: guides_all_same = False if set(run_data['refs'][run_amplicon_name] ['include_idxs']) != set(consensus_include_idxs): guides_all_same = False if 'nuc_freq_filename' not in run_data['refs'][ run_amplicon_name]: info( "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information." % (run_amplicon_name, crispresso2_folder)) continue nucleotide_frequency_file = os.path.join( crispresso2_folder, run_data['refs'][run_amplicon_name] ['nuc_freq_filename']) ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file( nucleotide_frequency_file) nucleotide_pct_file = os.path.join( crispresso2_folder, run_data['refs'][run_amplicon_name] ['nuc_pct_filename']) ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file( nucleotide_pct_file) count_file = os.path.join( crispresso2_folder, run_data['refs'][run_amplicon_name] ['mod_count_filename']) ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file( count_file) if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None: info( "Skipping the amplicon '%s' in folder '%s'. Could not parse run output." % (run_amplicon_name, crispresso2_folder)) info( "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'" % (ampSeq_nf, ampSeq_np, ampSeq_cf)) continue if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf: warn( "Skipping the amplicon '%s' in folder '%s'. 
Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s" % (run_amplicon_name, crispresso2_folder, ampSeq_nf, ampSeq_np, ampSeq_cf, amplicon_seq)) continue if consensus_sequence == "": consensus_sequence = ampSeq_nf if ampSeq_nf != consensus_sequence: info( "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match." % (run_amplicon_name, crispresso2_folder)) continue if 'Total' not in mod_freqs: info( "Skipping the amplicon '%s' in folder '%s'. Processing did not complete." % (run_amplicon_name, crispresso2_folder)) continue if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][ 0] == "0": info( "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon." % (run_amplicon_name, crispresso2_folder)) continue this_amp_total_reads = run_data['counts_total'][ run_amplicon_name] if this_amp_total_reads < args.min_reads_for_inclusion: info( "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)." % (run_amplicon_name, crispresso2_folder, str(this_amp_total_reads), args.min_reads_for_inclusion)) continue mod_pcts = {} for key in mod_freqs: mod_pcts[key] = np.array(mod_freqs[key]).astype( np.float) / float(this_amp_total_reads) amp_found_count += 1 run_name = crispresso2_folder_names[crispresso2_folder] for nuc in ['A', 'T', 'C', 'G', 'N', '-']: row = [run_name, nuc] row.extend(nuc_freqs[nuc]) nucleotide_frequency_summary.append(row) pct_row = [run_name, nuc] pct_row.extend(nuc_pcts[nuc]) nucleotide_percentage_summary.append(pct_row) for mod in [ 'Insertions', 'Insertions_Left', 'Deletions', 'Substitutions', 'All_modifications' ]: row = [run_name, mod] row.extend(mod_freqs[mod]) modification_frequency_summary.append(row) pct_row = [run_name, mod] pct_row.extend(mod_pcts[mod]) modification_percentage_summary.append(pct_row) if amp_found_count == 0: info( "Couldn't find any data for amplicon '%s'. Not compiling results." % amplicon_name) else: amplicon_plot_name = amplicon_name + "." 
if len(amplicon_names ) == 1 and amplicon_name == "Reference": amplicon_plot_name = "" colnames = ['Folder', 'Nucleotide'] colnames.extend(list(consensus_sequence)) nucleotide_frequency_summary_df = pd.DataFrame( nucleotide_frequency_summary, columns=colnames) nucleotide_frequency_summary_df = pd.concat([ nucleotide_frequency_summary_df.iloc[:, 0:2], nucleotide_frequency_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) nucleotide_frequency_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_frequency_summary.txt') nucleotide_frequency_summary_df.to_csv( nucleotide_frequency_summary_filename, sep='\t', index=None) nucleotide_percentage_summary_df = pd.DataFrame( nucleotide_percentage_summary, columns=colnames) nucleotide_percentage_summary_df = pd.concat([ nucleotide_percentage_summary_df.iloc[:, 0:2], nucleotide_percentage_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) nucleotide_percentage_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_percentage_summary.txt') nucleotide_percentage_summary_df.to_csv( nucleotide_percentage_summary_filename, sep='\t', index=None) colnames = ['Folder', 'Modification'] colnames.extend(list(consensus_sequence)) modification_frequency_summary_df = pd.DataFrame( modification_frequency_summary, columns=colnames) modification_frequency_summary_df = pd.concat([ modification_frequency_summary_df.iloc[:, 0:2], modification_frequency_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) modification_frequency_summary_filename = _jp( amplicon_plot_name + 'MODIFICATION_FREQUENCY_SUMMARY.txt') modification_frequency_summary_df.to_csv( modification_frequency_summary_filename, sep='\t', index=None) modification_percentage_summary_df = pd.DataFrame( modification_percentage_summary, columns=colnames) modification_percentage_summary_df = pd.concat([ modification_percentage_summary_df.iloc[:, 0:2], modification_percentage_summary_df.iloc[:, 2:].apply( pd.to_numeric) ], axis=1) modification_percentage_summary_filename = 
_jp( amplicon_plot_name + 'MODIFICATION_PERCENTAGE_SUMMARY.txt') modification_percentage_summary_df.to_csv( modification_percentage_summary_filename, sep='\t', index=None) crispresso2_info['refs'][amplicon_name][ 'nucleotide_frequency_summary_filename'] = os.path.basename( nucleotide_frequency_summary_filename) crispresso2_info['refs'][amplicon_name][ 'nucleotide_percentage_summary_filename'] = os.path.basename( nucleotide_percentage_summary_filename) crispresso2_info['refs'][amplicon_name][ 'modification_frequency_summary_filename'] = os.path.basename( modification_frequency_summary_filename) crispresso2_info['refs'][amplicon_name][ 'modification_percentage_summary_filename'] = os.path.basename( modification_percentage_summary_filename) #if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window if guides_all_same and consensus_guides != []: info( "All guides are equal. Performing comparison of runs for amplicon '%s'" % amplicon_name) include_idxs = consensus_include_idxs #include indexes are the same for all guides for idx, sgRNA in enumerate(consensus_guides): sgRNA_intervals = consensus_sgRNA_intervals[idx] sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[idx] plot_idxs_flat = [0, 1] # guide, nucleotide plot_idxs_flat.extend( [plot_idx + 2 for plot_idx in sgRNA_plot_idxs]) sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:, plot_idxs_flat] sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:, plot_idxs_flat] sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:, plot_idxs_flat] #show all sgRNA's on the plot sub_sgRNA_intervals = [] for sgRNA_interval in consensus_sgRNA_intervals: newstart = None newend = None for idx, i in enumerate(sgRNA_plot_idxs): if i <= sgRNA_interval[0]: newstart = idx if newend is None and i >= sgRNA_interval[ 1]: newend = idx #if guide doesn't overlap with plot idxs if newend == 0 or newstart == len( 
sgRNA_plot_idxs): continue #otherwise, correct partial overlaps elif newstart == None and newend == None: newstart = 0 newend = len(include_idxs) - 1 elif newstart == None: newstart = 0 elif newend == None: newend = len(include_idxs) - 1 #and add it to the list sub_sgRNA_intervals.append((newstart, newend)) if not args.suppress_plots: #plot for each guide this_window_nuc_pct_quilt_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt_around_sgRNA_' + sgRNA) CRISPRessoPlot.plot_nucleotide_quilt( sub_nucleotide_percentage_summary_df, sub_modification_percentage_summary_df, this_window_nuc_pct_quilt_plot_name, save_png, sgRNA_intervals=sub_sgRNA_intervals, quantification_window_idxs=include_idxs, group_column='Folder') plot_name = os.path.basename( this_window_nuc_pct_quilt_plot_name) window_nuc_pct_quilt_plot_names.append( plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name if len(consensus_guides) == 1: crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_labels'][ plot_name] = 'Composition of each base around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][ plot_name] = [ (amplicon_name + ' nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename )), (amplicon_name + ' modification frequencies', os.path.basename( modification_frequency_summary_filename )) ] sub_nucleotide_frequency_summary_df = pd.concat( [ sub_nucleotide_frequency_summary_df. iloc[:, 0:2], sub_nucleotide_frequency_summary_df. iloc[:, 2:].apply(pd.to_numeric) ], axis=1) sub_nucleotide_frequency_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_frequency_summary_around_sgRNA_' + sgRNA + '.txt') sub_nucleotide_frequency_summary_df.to_csv( sub_nucleotide_frequency_summary_filename, sep='\t', index=None) sub_nucleotide_percentage_summary_df = pd.concat( [ sub_nucleotide_percentage_summary_df. 
iloc[:, 0:2], sub_nucleotide_percentage_summary_df. iloc[:, 2:].apply(pd.to_numeric) ], axis=1) sub_nucleotide_percentage_summary_filename = _jp( amplicon_plot_name + 'Nucleotide_percentage_summary_around_sgRNA_' + sgRNA + '.txt') sub_nucleotide_percentage_summary_df.to_csv( sub_nucleotide_percentage_summary_filename, sep='\t', index=None) if not args.suppress_plots: # plot the whole region this_nuc_pct_quilt_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt') CRISPRessoPlot.plot_nucleotide_quilt( nucleotide_percentage_summary_df, modification_percentage_summary_df, this_nuc_pct_quilt_plot_name, save_png, sgRNA_intervals=consensus_sgRNA_intervals, quantification_window_idxs=include_idxs, group_column='Folder') plot_name = os.path.basename( this_nuc_pct_quilt_plot_name) nuc_pct_quilt_plot_names.append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'Amplicon: ' + amplicon_name if len(amplicon_names) == 1: crispresso2_info['summary_plot_titles'][ plot_name] = '' crispresso2_info['summary_plot_labels'][ plot_name] = 'Composition of each base for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ (amplicon_name + ' nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), (amplicon_name + ' modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] else: #guides are not the same if not args.suppress_plots: this_nuc_pct_quilt_plot_name = _jp( amplicon_plot_name + 'Nucleotide_percentage_quilt') CRISPRessoPlot.plot_nucleotide_quilt( nucleotide_percentage_summary_df, modification_percentage_summary_df, this_nuc_pct_quilt_plot_name, save_png, group_column='Folder') plot_name = os.path.basename( this_nuc_pct_quilt_plot_name) nuc_pct_quilt_plot_names.append(plot_name) crispresso2_info['summary_plot_labels'][ plot_name] = 'Composition of each base for the amplicon ' + amplicon_name crispresso2_info['summary_plot_datas'][plot_name] = [ (amplicon_name + ' 
nucleotide frequencies', os.path.basename( nucleotide_frequency_summary_filename)), (amplicon_name + ' modification frequencies', os.path.basename( modification_frequency_summary_filename)) ] crispresso2_info[ 'window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names crispresso2_info[ 'nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names crispresso2_info[ 'window_nuc_conv_plot_names'] = window_nuc_conv_plot_names crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names quantification_summary = [] #summarize amplicon modifications samples_quantification_summary_by_amplicon_filename = _jp( 'CRISPRessoAggregate_quantification_of_editing_frequency_by_amplicon.txt' ) #this file has separate lines for each amplicon in each run with open(samples_quantification_summary_by_amplicon_filename, 'w') as outfile: wrote_header = False for crispresso2_folder in crispresso2_folders: run_data = crispresso2_folder_infos[crispresso2_folder] run_name = crispresso2_folder_names[crispresso2_folder] amplicon_modification_file = os.path.join( crispresso2_folder, run_data['quant_of_editing_freq_filename']) with open(amplicon_modification_file, 'r') as infile: file_head = infile.readline() if not wrote_header: outfile.write('Folder\t' + file_head) wrote_header = True for line in infile: outfile.write(crispresso2_folder + "\t" + line) n_tot = run_data['aln_stats']['N_TOT_READS'] n_aligned = 0 n_unmod = 0 n_mod = 0 n_discarded = 0 n_insertion = 0 n_deletion = 0 n_substitution = 0 n_only_insertion = 0 n_only_deletion = 0 n_only_substitution = 0 n_insertion_and_deletion = 0 n_insertion_and_substitution = 0 n_deletion_and_substitution = 0 n_insertion_and_deletion_and_substitution = 0 for ref_name in run_data[ 'ref_names']: #multiple alleles could be provided n_aligned += run_data['counts_total'][ref_name] n_unmod += run_data['counts_unmodified'][ref_name] n_mod += run_data['counts_modified'][ref_name] n_discarded += run_data['counts_discarded'][ref_name] n_insertion += 
run_data['counts_insertion'][ref_name] n_deletion += run_data['counts_deletion'][ref_name] n_substitution += run_data['counts_substitution'][ ref_name] n_only_insertion += run_data['counts_only_insertion'][ ref_name] n_only_deletion += run_data['counts_only_deletion'][ ref_name] n_only_substitution += run_data[ 'counts_only_substitution'][ref_name] n_insertion_and_deletion += run_data[ 'counts_insertion_and_deletion'][ref_name] n_insertion_and_substitution += run_data[ 'counts_insertion_and_substitution'][ref_name] n_deletion_and_substitution += run_data[ 'counts_deletion_and_substitution'][ref_name] n_insertion_and_deletion_and_substitution += run_data[ 'counts_insertion_and_deletion_and_substitution'][ ref_name] unmod_pct = np.nan mod_pct = np.nan if n_aligned > 0: unmod_pct = 100 * n_unmod / float(n_aligned) mod_pct = 100 * n_mod / float(n_aligned) vals = [run_name] vals.extend([ round(unmod_pct, 8), round(mod_pct, 8), n_aligned, n_tot, n_unmod, n_mod, n_discarded, n_insertion, n_deletion, n_substitution, n_only_insertion, n_only_deletion, n_only_substitution, n_insertion_and_deletion, n_insertion_and_substitution, n_deletion_and_substitution, n_insertion_and_deletion_and_substitution ]) quantification_summary.append(vals) header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions' header_els = header.split("\t") df_summary_quantification = pd.DataFrame(quantification_summary, columns=header_els) samples_quantification_summary_filename = _jp( 'CRISPRessoAggregate_quantification_of_editing_frequency.txt' ) #this file has one line for each run (sum of all amplicons) df_summary_quantification.fillna('NA').to_csv( samples_quantification_summary_filename, sep='\t', index=None) crispresso2_info[ 
'samples_quantification_summary_filename'] = os.path.basename( samples_quantification_summary_filename) crispresso2_info[ 'samples_quantification_summary_by_amplicon_filename'] = os.path.basename( samples_quantification_summary_by_amplicon_filename) df_summary_quantification.set_index('Name') if not args.suppress_plots: plot_root = _jp("CRISPRessoAggregate_reads_summary") CRISPRessoPlot.plot_reads_total(plot_root, df_summary_quantification, save_png, args.min_reads_for_inclusion) plot_name = os.path.basename(plot_root) crispresso2_info['summary_plot_root'] = plot_name crispresso2_info['summary_plot_names'].append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'CRISPRessoAggregate Mapping Statistics Summary' crispresso2_info['summary_plot_labels'][ plot_name] = 'Each bar shows the total number of reads in each sample. The vertical line shows the cutoff for analysis, set using the --min_reads_for_inclusion parameter.' crispresso2_info['summary_plot_datas'][plot_name] = [ ('CRISPRessoAggregate summary', os.path.basename(samples_quantification_summary_filename) ), ('CRISPRessoAggregate summary by amplicon', os.path.basename( samples_quantification_summary_by_amplicon_filename)) ] plot_root = _jp( "CRISPRessoAggregate_quantification_of_editing_frequency") CRISPRessoPlot.plot_unmod_mod_pcts( plot_root, df_summary_quantification, save_png, args.min_reads_for_inclusion) plot_name = os.path.basename(plot_root) crispresso2_info['summary_plot_root'] = plot_name crispresso2_info['summary_plot_names'].append(plot_name) crispresso2_info['summary_plot_titles'][ plot_name] = 'CRISPRessoAggregate Modification Summary' crispresso2_info['summary_plot_labels'][ plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_for_inclusion parameter.' 
crispresso2_info['summary_plot_datas'][plot_name] = [ ('CRISPRessoAggregate summary', os.path.basename(samples_quantification_summary_filename) ), ('CRISPRessoAggregate summary by amplicon', os.path.basename( samples_quantification_summary_by_amplicon_filename)) ] #summarize alignment with open(_jp('CRISPRessoAggregate_mapping_statistics.txt'), 'w') as outfile: wrote_header = False for crispresso2_folder in crispresso2_folders: run_data = crispresso2_folder_infos[crispresso2_folder] run_name = crispresso2_folder_names[crispresso2_folder] mapping_file = os.path.join( crispresso2_folder, run_data['mapping_stats_filename']) with open(mapping_file, 'r') as infile: file_head = infile.readline() if not wrote_header: outfile.write('Folder\t' + file_head) wrote_header = True for line in infile: outfile.write(crispresso2_folder + "\t" + line) if not args.suppress_report: report_filename = OUTPUT_DIRECTORY + '.html' if (args.place_report_in_output_folder): report_filename = _jp("CRISPResso2Aggregate_report.html") CRISPRessoReport.make_aggregate_report( crispresso2_info, args.name, report_filename, OUTPUT_DIRECTORY, _ROOT, crispresso2_folders, crispresso2_folder_htmls) crispresso2_info['report_location'] = report_filename crispresso2_info['report_filename'] = os.path.basename( report_filename) end_time = datetime.now() end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S') running_time = end_time - start_time running_time_string = str(running_time) crispresso2_info['end_time'] = end_time crispresso2_info['end_time_string'] = end_time_string crispresso2_info['running_time'] = running_time crispresso2_info['running_time_string'] = running_time_string cp.dump(crispresso2_info, open(crispresso2Aggregate_info_file, 'wb')) info('Analysis Complete!') print(CRISPRessoShared.get_crispresso_footer()) sys.exit(0) except Exception as e: debug_flag = False if 'args' in vars() and 'debug' in args: debug_flag = args.debug if debug_flag: traceback.print_exc(file=sys.stdout) 
error('\n\nERROR: %s' % e) sys.exit(-1)
def main(): try: description = ['~~~CRISPRessoPooled~~~','-Analysis of CRISPR/Cas9 outcomes from POOLED deep sequencing data-'] pooled_string = r''' _______________________ | __ __ __ __ __ | ||__)/ \/ \| |_ | \ | || \__/\__/|__|__|__/ | |_______________________| ''' print(CRISPRessoShared.get_crispresso_header(description,pooled_string)) parser = CRISPRessoShared.getCRISPRessoArgParser(parserTitle = 'CRISPRessoPooled Parameters',requiredParams={'fastq_r1':True}) parser.add_argument('-f','--amplicons_file', type=str, help='Amplicons description file. This file is a tab-delimited text file with up to 5 columns (2 required):\ \nAMPLICON_NAME: an identifier for the amplicon (must be unique)\nAMPLICON_SEQUENCE: amplicon sequence used in the experiment\n\ \nsgRNA_SEQUENCE (OPTIONAL): sgRNA sequence used for this amplicon without the PAM sequence. Multiple guides can be given separated by commas and not spaces. If not available enter NA.\ \nEXPECTED_AMPLICON_AFTER_HDR (OPTIONAL): expected amplicon sequence in case of HDR. If not available enter NA.\ \nCODING_SEQUENCE (OPTIONAL): Subsequence(s) of the amplicon corresponding to coding sequences. If more than one separate them by commas and not spaces. 
If not available enter NA.', default='') #tool specific optional parser.add_argument('--gene_annotations', type=str, help='Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \ please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"', default='') parser.add_argument('-p','--n_processes',type=int, help='Specify the number of processes to use for Bowtie2.\ Please use with caution since increasing this parameter will increase significantly the memory required to run CRISPResso.',default=1) parser.add_argument('-x','--bowtie2_index', type=str, help='Basename of Bowtie2 index for the reference genome', default='') parser.add_argument('--bowtie2_options_string', type=str, help='Override options for the Bowtie2 alignment command',default=' -k 1 --end-to-end -N 0 --np 0 ') parser.add_argument('--min_reads_to_use_region', type=float, help='Minimum number of reads that align to a region to perform the CRISPResso analysis', default=1000) parser.add_argument('--skip_failed', help='Continue with pooled analysis even if one sample fails',action='store_true') parser.add_argument('--crispresso_command', help='CRISPResso command to call',default='CRISPResso') args = parser.parse_args() crispresso_options = CRISPRessoShared.get_crispresso_options() options_to_ignore = set(['fastq_r1','fastq_r2','amplicon_seq','amplicon_name','output_folder','name']) crispresso_options_for_pooled = list(crispresso_options-options_to_ignore) info('Checking dependencies...') if check_samtools() and check_bowtie2(): info('All the required dependencies are present!') else: sys.exit(1) #check files check_file(args.fastq_r1) if args.fastq_r2: check_file(args.fastq_r2) if args.bowtie2_index: check_file(args.bowtie2_index+'.1.bt2') if args.amplicons_file: check_file(args.amplicons_file) if args.gene_annotations: check_file(args.gene_annotations) if args.amplicons_file and not 
args.bowtie2_index: RUNNING_MODE='ONLY_AMPLICONS' info('Only the Amplicon description file was provided. The analysis will be perfomed using only the provided amplicons sequences.') elif args.bowtie2_index and not args.amplicons_file: RUNNING_MODE='ONLY_GENOME' info('Only the bowtie2 reference genome index file was provided. The analysis will be perfomed using only genomic regions where enough reads align.') elif args.bowtie2_index and args.amplicons_file: RUNNING_MODE='AMPLICONS_AND_GENOME' info('Amplicon description file and bowtie2 reference genome index files provided. The analysis will be perfomed using the reads that are aligned ony to the amplicons provided and not to other genomic regions.') else: error('Please provide the amplicons description file (-f or --amplicons_file option) or the bowtie2 reference genome index file (-x or --bowtie2_index option) or both.') sys.exit(1) ####TRIMMING AND MERGING get_name_from_fasta=lambda x: os.path.basename(x).replace('.fastq','').replace('.gz','') if not args.name: if args.fastq_r2!='': database_id='%s_%s' % (get_name_from_fasta(args.fastq_r1),get_name_from_fasta(args.fastq_r2)) else: database_id='%s' % get_name_from_fasta(args.fastq_r1) else: database_id=args.name OUTPUT_DIRECTORY='CRISPRessoPooled_on_%s' % database_id if args.output_folder: OUTPUT_DIRECTORY=os.path.join(os.path.abspath(args.output_folder),OUTPUT_DIRECTORY) _jp=lambda filename: os.path.join(OUTPUT_DIRECTORY,filename) #handy function to put a file in the output directory try: info('Creating Folder %s' % OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY) info('Done!') except: warn('Folder %s already exists.' 
% OUTPUT_DIRECTORY) log_filename=_jp('CRISPRessoPooled_RUNNING_LOG.txt') logging.getLogger().addHandler(logging.FileHandler(log_filename)) crispresso2WGS_info_file = os.path.join(OUTPUT_DIRECTORY,'CRISPResso2Pooled_info.pickle') crispresso2_info = {} #keep track of all information for this run to be pickled and saved at the end of the run crispresso2_info['version'] = CRISPRessoShared.__version__ crispresso2_info['args'] = deepcopy(args) crispresso2_info['log_filename'] = os.path.basename(log_filename) with open(log_filename,'w+') as outfile: outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' % ' '.join(sys.argv)) if args.fastq_r2=='': #single end reads #check if we need to trim if not args.trim_sequences: #create a symbolic link symlink_filename=_jp(os.path.basename(args.fastq_r1)) force_symlink(os.path.abspath(args.fastq_r1),symlink_filename) output_forward_filename=symlink_filename else: output_forward_filename=_jp('reads.trimmed.fq.gz') #Trimming with trimmomatic cmd='%s SE -phred33 %s %s %s >>%s 2>&1'\ % (args.trimmomatic_command,args.fastq_r1, output_forward_filename, args.trimmomatic_options_string, log_filename) #print cmd TRIMMOMATIC_STATUS=sb.call(cmd,shell=True) if TRIMMOMATIC_STATUS: raise TrimmomaticException('TRIMMOMATIC failed to run, please check the log file.') processed_output_filename=output_forward_filename else:#paired end reads case if not args.trim_sequences: output_forward_paired_filename=args.fastq_r1 output_reverse_paired_filename=args.fastq_r2 else: info('Trimming sequences with Trimmomatic...') output_forward_paired_filename=_jp('output_forward_paired.fq.gz') output_forward_unpaired_filename=_jp('output_forward_unpaired.fq.gz') output_reverse_paired_filename=_jp('output_reverse_paired.fq.gz') output_reverse_unpaired_filename=_jp('output_reverse_unpaired.fq.gz') #Trimming with trimmomatic cmd='%s PE -phred33 %s %s %s %s %s %s %s >>%s 2>&1'\ % (args.trimmomatic_command, args.fastq_r1,args.fastq_r2,output_forward_paired_filename, 
output_forward_unpaired_filename,output_reverse_paired_filename, output_reverse_unpaired_filename,args.trimmomatic_options_string,log_filename) #print cmd TRIMMOMATIC_STATUS=sb.call(cmd,shell=True) if TRIMMOMATIC_STATUS: raise TrimmomaticException('TRIMMOMATIC failed to run, please check the log file.') info('Done!') max_overlap_string = "" min_overlap_string = "" if args.max_paired_end_reads_overlap: max_overlap_string = "--max-overlap " + str(args.max_paired_end_reads_overlap) if args.min_paired_end_reads_overlap: min_overlap_string = args.min_paired_end_reads_overlap #Merging with Flash info('Merging paired sequences with Flash...') cmd=args.flash_command+' --allow-outies %s %s %s %s -z -d %s >>%s 2>&1' %\ (output_forward_paired_filename, output_reverse_paired_filename, max_overlap_string, max_overlap_string, OUTPUT_DIRECTORY,log_filename) FLASH_STATUS=sb.call(cmd,shell=True) if FLASH_STATUS: raise FlashException('Flash failed to run, please check the log file.') info('Done!') flash_hist_filename=_jp('out.hist') flash_histogram_filename=_jp('out.histogram') flash_not_combined_1_filename=_jp('out.notCombined_1.fastq.gz') flash_not_combined_2_filename=_jp('out.notCombined_2.fastq.gz') processed_output_filename=_jp('out.extendedFrags.fastq.gz') #count reads N_READS_INPUT=get_n_reads_fastq(args.fastq_r1) N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename) #load gene annotation if args.gene_annotations: info('Loading gene coordinates from annotation file: %s...' 
% args.gene_annotations) try: df_genes=pd.read_table(args.gene_annotations,compression='gzip') df_genes.txEnd=df_genes.txEnd.astype(int) df_genes.txStart=df_genes.txStart.astype(int) df_genes.head() except: info('Failed to load the gene annotations file.') if RUNNING_MODE=='ONLY_AMPLICONS' or RUNNING_MODE=='AMPLICONS_AND_GENOME': #load and validate template file df_template=pd.read_csv(args.amplicons_file,names=[ 'Name','Amplicon_Sequence','sgRNA', 'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str}) if str(df_template.iloc[0,1]).lower() == "amplicon_sequence": df_template.drop(0,axis=0,inplace=True) info('Detected header in amplicon file.') #remove empty amplicons/lines df_template.dropna(subset=['Amplicon_Sequence'],inplace=True) df_template.dropna(subset=['Name'],inplace=True) df_template.Amplicon_Sequence=df_template.Amplicon_Sequence.apply(capitalize_sequence) df_template.Expected_HDR=df_template.Expected_HDR.apply(capitalize_sequence) df_template.sgRNA=df_template.sgRNA.apply(capitalize_sequence) df_template.Coding_sequence=df_template.Coding_sequence.apply(capitalize_sequence) if not len(df_template.Amplicon_Sequence.unique())==df_template.shape[0]: duplicated_entries = df_template.Amplicon_Sequence[df_template.Amplicon_Sequence.duplicated()] raise Exception('The amplicon sequences must be distinct! (Duplicated entries: ' + str(duplicated_entries.values) + ')') if not len(df_template.Name.unique())==df_template.shape[0]: duplicated_entries = df_template.Name[df_template.Name.duplicated()] raise Exception('The amplicon names must be distinct! 
(Duplicated names: ' + str(duplicated_entries.values) + ')') df_template=df_template.set_index('Name') df_template.index=df_template.index.to_series().str.replace(' ','_') for idx,row in df_template.iterrows(): wrong_nt=find_wrong_nt(row.Amplicon_Sequence) if wrong_nt: raise NTException('The amplicon sequence %s contains wrong characters:%s' % (idx,' '.join(wrong_nt))) if not pd.isnull(row.sgRNA): cut_points=[] for current_guide_seq in row.sgRNA.strip().upper().split(','): wrong_nt=find_wrong_nt(current_guide_seq) if wrong_nt: raise NTException('The sgRNA sequence %s contains wrong characters:%s' % (current_guide_seq, ' '.join(wrong_nt))) offset_fw=args.quantification_window_center+len(current_guide_seq)-1 offset_rc=(-args.quantification_window_center)-1 cut_points+=[m.start() + offset_fw for \ m in re.finditer(current_guide_seq, row.Amplicon_Sequence)]+[m.start() + offset_rc for m in re.finditer(reverse_complement(current_guide_seq), row.Amplicon_Sequence)] if not cut_points: warn('\nThe guide sequence/s provided: %s is(are) not present in the amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!' 
% (row.sgRNA,row.Amplicon_Sequence)) df_template.ix[idx,'sgRNA']='' if RUNNING_MODE=='ONLY_AMPLICONS': #create a fasta file with all the amplicons amplicon_fa_filename=_jp('AMPLICONS.fa') fastq_gz_amplicon_filenames=[] with open(amplicon_fa_filename,'w+') as outfile: for idx,row in df_template.iterrows(): if row['Amplicon_Sequence']: outfile.write('>%s\n%s\n' %(clean_filename('AMPL_'+idx),row['Amplicon_Sequence'])) #create place-holder fastq files fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % clean_filename('AMPL_'+idx))) open(fastq_gz_amplicon_filenames[-1], 'w+').close() df_template['Demultiplexed_fastq.gz_filename']=fastq_gz_amplicon_filenames info('Creating a custom index file with all the amplicons...') custom_index_filename=_jp('CUSTOM_BOWTIE2_INDEX') sb.call('bowtie2-build %s %s >>%s 2>&1' %(amplicon_fa_filename,custom_index_filename,log_filename), shell=True) #align the file to the amplicons (MODE 1) info('Align reads to the amplicons...') bam_filename_amplicons= _jp('CRISPResso_AMPLICONS_ALIGNED.bam') aligner_command= 'bowtie2 -x %s -p %s %s -U %s 2>>%s | samtools view -bS - > %s' %(custom_index_filename,args.n_processes,args.bowtie2_options_string,processed_output_filename,log_filename,bam_filename_amplicons) info('Alignment command: ' + aligner_command) sb.call(aligner_command,shell=True) N_READS_ALIGNED=get_n_aligned_bam(bam_filename_amplicons) s1=r"samtools view -F 4 %s 2>>%s | grep -v ^'@'" % (bam_filename_amplicons,log_filename) s2=r'''|awk '{ gzip_filename=sprintf("gzip >> OUTPUTPATH%s.fastq.gz",$3);\ print "@"$1"\n"$10"\n+\n"$11 | gzip_filename;}' ''' cmd=s1+s2.replace('OUTPUTPATH',_jp('')) sb.call(cmd,shell=True) info('Demultiplex reads and run CRISPResso on each amplicon...') n_reads_aligned_amplicons=[] crispresso_cmds = [] for idx,row in df_template.iterrows(): info('\n Processing:%s' %idx) n_reads_aligned_amplicons.append(get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])) crispresso_cmd= args.crispresso_command + ' -r1 %s -a 
%s -o %s --name %s' % (row['Demultiplexed_fastq.gz_filename'],row['Amplicon_Sequence'],OUTPUT_DIRECTORY,idx) if n_reads_aligned_amplicons[-1]>args.min_reads_to_use_region: if row['sgRNA'] and not pd.isnull(row['sgRNA']): crispresso_cmd+=' -g %s' % row['sgRNA'] if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']): crispresso_cmd+=' -e %s' % row['Expected_HDR'] if row['Coding_sequence'] and not pd.isnull(row['Coding_sequence']): crispresso_cmd+=' -c %s' % row['Coding_sequence'] crispresso_cmd=CRISPRessoShared.propagate_crispresso_options(crispresso_cmd,crispresso_options_for_pooled,args) crispresso_cmds.append(crispresso_cmd) else: warn('Skipping amplicon [%s] because no reads align to it\n'% idx) CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,args.n_processes,'amplicon',args.skip_failed) df_template['n_reads']=n_reads_aligned_amplicons df_template['n_reads_aligned_%']=df_template['n_reads']/float(N_READS_ALIGNED)*100 df_template.fillna('NA').to_csv(_jp('REPORT_READS_ALIGNED_TO_AMPLICONS.txt'),sep='\t') if RUNNING_MODE=='AMPLICONS_AND_GENOME': print 'Mapping amplicons to the reference genome...' #find the locations of the amplicons on the genome and their strand and check if there are mutations in the reference genome additional_columns=[] for idx,row in df_template.iterrows(): fields_to_append=list(np.take(get_align_sequence(row.Amplicon_Sequence, args.bowtie2_index).split('\t'),[0,1,2,3,5])) if fields_to_append[0]=='*': info('The amplicon [%s] is not mappable to the reference genome provided!' 
% idx ) additional_columns.append([idx,'NOT_ALIGNED',0,-1,'+','']) else: additional_columns.append([idx]+fields_to_append) info('The amplicon [%s] was mapped to: %s ' % (idx,' '.join(fields_to_append[:3]) )) df_template=df_template.join(pd.DataFrame(additional_columns,columns=['Name','chr_id','bpstart','bpend','strand','Reference_Sequence']).set_index('Name')) df_template.bpstart=df_template.bpstart.astype(int) df_template.bpend=df_template.bpend.astype(int) #Check reference is the same otherwise throw a warning for idx,row in df_template.iterrows(): if row.Amplicon_Sequence != row.Reference_Sequence and row.Amplicon_Sequence != reverse_complement(row.Reference_Sequence): warn('The amplicon sequence %s provided:\n%s\n\nis different from the reference sequence(both strand):\n\n%s\n\n%s\n' %(row.name,row.Amplicon_Sequence,row.Amplicon_Sequence,reverse_complement(row.Amplicon_Sequence))) if RUNNING_MODE=='ONLY_GENOME' or RUNNING_MODE=='AMPLICONS_AND_GENOME': ###HERE we recreate the uncompressed genome file if not available### #check you have all the files for the genome and create a fa idx for samtools uncompressed_reference=args.bowtie2_index+'.fa' #if not os.path.exists(GENOME_LOCAL_FOLDER): # os.mkdir(GENOME_LOCAL_FOLDER) if os.path.exists(uncompressed_reference): info('The uncompressed reference fasta file for %s is already present! Skipping generation.' % args.bowtie2_index) else: #uncompressed_reference=os.path.join(GENOME_LOCAL_FOLDER,'UNCOMPRESSED_REFERENCE_FROM_'+args.bowtie2_index.replace('/','_')+'.fa') info('Extracting uncompressed reference from the provided bowtie2 index since it is not available... 
Please be patient!') cmd_to_uncompress='bowtie2-inspect %s > %s 2>>%s' % (args.bowtie2_index,uncompressed_reference,log_filename) sb.call(cmd_to_uncompress,shell=True) info('Indexing fasta file with samtools...') #!samtools faidx {uncompressed_reference} sb.call('samtools faidx %s 2>>%s ' % (uncompressed_reference,log_filename),shell=True) #####CORRECT ONE#### #align in unbiased way the reads to the genome if RUNNING_MODE=='ONLY_GENOME' or RUNNING_MODE=='AMPLICONS_AND_GENOME': info('Aligning reads to the provided genome index...') bam_filename_genome = _jp('%s_GENOME_ALIGNED.bam' % database_id) aligner_command= 'bowtie2 -x %s -p %s %s -U %s 2>>%s| samtools view -bS - > %s' %(args.bowtie2_index,args.n_processes,args.bowtie2_options_string,processed_output_filename,log_filename,bam_filename_genome) info('aligning with command: ' + aligner_command) sb.call(aligner_command,shell=True) N_READS_ALIGNED=get_n_aligned_bam(bam_filename_genome) #REDISCOVER LOCATIONS and DEMULTIPLEX READS MAPPED_REGIONS=_jp('MAPPED_REGIONS/') if not os.path.exists(MAPPED_REGIONS): os.mkdir(MAPPED_REGIONS) s1=r'''samtools view -F 0x0004 %s 2>>%s |''' % (bam_filename_genome,log_filename)+\ r'''awk '{OFS="\t"; bpstart=$4; bpend=bpstart; split ($6,a,"[MIDNSHP]"); n=0;\ for (i=1; i in a; i++){\ n+=1+length(a[i]);\ if (substr($6,n,1)=="S"){\ if (bpend==$4)\ bpstart-=a[i];\ else bpend+=a[i]; }\ else if( (substr($6,n,1)!="I") && (substr($6,n,1)!="H") )\ bpend+=a[i];\ }\ if ( ($2 % 32)>=16)\ print $3,bpstart,bpend,"-",$1,$10,$11;\ else\ print $3,bpstart,bpend,"+",$1,$10,$11;}' | ''' s2=r''' sort -k1,1 -k2,2n | awk \ 'BEGIN{chr_id="NA";bpstart=-1;bpend=-1; fastq_filename="NA"}\ { if ( (chr_id!=$1) || (bpstart!=$2) || (bpend!=$3) )\ {\ if (fastq_filename!="NA") {close(fastq_filename); system("gzip -f "fastq_filename)}\ chr_id=$1; bpstart=$2; bpend=$3;\ fastq_filename=sprintf("__OUTPUTPATH__REGION_%s_%s_%s.fastq",$1,$2,$3);\ }\ print "@"$5"\n"$6"\n+\n"$7 >> fastq_filename;\ }' ''' 
cmd=s1+s2.replace('__OUTPUTPATH__',MAPPED_REGIONS) info('Demultiplexing reads by location...') sb.call(cmd,shell=True) #gzip the missing ones sb.call('gzip -f %s/*.fastq' % MAPPED_REGIONS,shell=True) ''' The most common use case, where many different target sites are pooled into a single high-throughput sequencing library for quantification, is not directly addressed by this implementation. Potential users of CRISPResso would need to write their own code to generate separate input files for processing. Importantly, this preprocessing code would need to remove any PCR amplification artifacts (such as amplification of sequences from a gene and a highly similar pseudogene ) which may confound the interpretation of results. This can be done by mapping of input sequences to a reference genome and removing those that do not map to the expected genomic location, but is non-trivial for an end-user to implement. ''' if RUNNING_MODE=='AMPLICONS_AND_GENOME': files_to_match=glob.glob(os.path.join(MAPPED_REGIONS,'REGION*')) n_reads_aligned_genome=[] fastq_region_filenames=[] crispresso_cmds = [] for idx,row in df_template.iterrows(): info('Processing amplicon: %s' % idx ) #check if we have reads fastq_filename_region=os.path.join(MAPPED_REGIONS,'REGION_%s_%s_%s.fastq.gz' % (row['chr_id'],row['bpstart'],row['bpend'])) if os.path.exists(fastq_filename_region): N_READS=get_n_reads_fastq(fastq_filename_region) n_reads_aligned_genome.append(N_READS) fastq_region_filenames.append(fastq_filename_region) files_to_match.remove(fastq_filename_region) if N_READS>=args.min_reads_to_use_region: info('\nThe amplicon [%s] has enough reads (%d) mapped to it! 
Running CRISPResso!\n' % (idx,N_READS)) crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (fastq_filename_region,row['Amplicon_Sequence'],OUTPUT_DIRECTORY,idx) if row['sgRNA'] and not pd.isnull(row['sgRNA']): crispresso_cmd+=' -g %s' % row['sgRNA'] if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']): crispresso_cmd+=' -e %s' % row['Expected_HDR'] if row['Coding_sequence'] and not pd.isnull(row['Coding_sequence']): crispresso_cmd+=' -c %s' % row['Coding_sequence'] crispresso_cmd=CRISPRessoShared.propagate_crispresso_options(crispresso_cmd,crispresso_options_for_pooled,args) info('Running CRISPResso:%s' % crispresso_cmd) crispresso_cmds.append(crispresso_cmd) else: warn('The amplicon [%s] has not enough reads (%d) mapped to it! Skipping the execution of CRISPResso!' % (idx,N_READS)) else: fastq_region_filenames.append('') n_reads_aligned_genome.append(0) warn("The amplicon %s doesn't have any read mapped to it!\n Please check your amplicon sequence." % idx) CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,args.n_processes,'amplicon',args.skip_failed) df_template['Amplicon_Specific_fastq.gz_filename']=fastq_region_filenames df_template['n_reads']=n_reads_aligned_genome df_template['n_reads_aligned_%']=df_template['n_reads']/float(N_READS_ALIGNED)*100 if args.gene_annotations: df_template=df_template.apply(lambda row: find_overlapping_genes(row, df_genes),axis=1) df_template.fillna('NA').to_csv(_jp('REPORT_READS_ALIGNED_TO_GENOME_AND_AMPLICONS.txt'),sep='\t') #write another file with the not amplicon regions info('Reporting problematic regions...') coordinates=[] for region in files_to_match: coordinates.append(os.path.basename(region).replace('.fastq.gz','').replace('.fastq','').split('_')[1:4]+[region,get_n_reads_fastq(region)]) df_regions=pd.DataFrame(coordinates,columns=['chr_id','bpstart','bpend','fastq_file','n_reads']) df_regions.dropna(inplace=True) #remove regions in chrUn df_regions['bpstart'] = 
pd.to_numeric(df_regions['bpstart']) df_regions['bpend'] = pd.to_numeric(df_regions['bpend']) df_regions['n_reads'] = pd.to_numeric(df_regions['n_reads']) df_regions.bpstart=df_regions.bpstart.astype(int) df_regions.bpend=df_regions.bpend.astype(int) df_regions['n_reads_aligned_%']=df_regions['n_reads']/float(N_READS_ALIGNED)*100 df_regions['Reference_sequence']=df_regions.apply(lambda row: get_region_from_fa(row.chr_id,row.bpstart,row.bpend,uncompressed_reference),axis=1) if args.gene_annotations: info('Checking overlapping genes...') df_regions=df_regions.apply(lambda row: find_overlapping_genes(row, df_genes),axis=1) if np.sum(np.array(map(int,pd.__version__.split('.')))*(100,10,1))< 170: df_regions.sort('n_reads',ascending=False,inplace=True) else: df_regions.sort_values(by='n_reads',ascending=False,inplace=True) df_regions.fillna('NA').to_csv(_jp('REPORTS_READS_ALIGNED_TO_GENOME_NOT_MATCHING_AMPLICONS.txt'),sep='\t',index=None) if RUNNING_MODE=='ONLY_GENOME' : #Load regions and build REFERENCE TABLES info('Parsing the demultiplexed files and extracting locations and reference sequences...') coordinates=[] for region in glob.glob(os.path.join(MAPPED_REGIONS,'REGION*.fastq.gz')): coord_from_filename = os.path.basename(region).replace('.fastq.gz','').split('_')[1:4] # print('ccord from filename: ' + str(coord_from_filename)) if not (coord_from_filename[1].isdigit() and coord_from_filename[2].isdigit()): warn('Skipping region [%s] because the region name cannot be parsed\n'% region) continue coordinates.append(coord_from_filename+[region,get_n_reads_fastq(region)]) df_regions=pd.DataFrame(coordinates,columns=['chr_id','bpstart','bpend','fastq_file','n_reads']) df_regions.dropna(inplace=True) #remove regions in chrUn df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart']) df_regions['bpend'] = pd.to_numeric(df_regions['bpend']) df_regions['n_reads'] = pd.to_numeric(df_regions['n_reads']) df_regions.bpstart=df_regions.bpstart.astype(int) 
df_regions.bpend=df_regions.bpend.astype(int) df_regions['sequence']=df_regions.apply(lambda row: get_region_from_fa(row.chr_id,row.bpstart,row.bpend,uncompressed_reference),axis=1) df_regions['n_reads_aligned_%']=df_regions['n_reads']/float(N_READS_ALIGNED)*100 if args.gene_annotations: info('Checking overlapping genes...') df_regions=df_regions.apply(lambda row: find_overlapping_genes(row, df_genes),axis=1) if np.sum(np.array(map(int,pd.__version__.split('.')))*(100,10,1))< 170: df_regions.sort('n_reads',ascending=False,inplace=True) else: df_regions.sort_values(by='n_reads',ascending=False,inplace=True) df_regions.fillna('NA').to_csv(_jp('REPORT_READS_ALIGNED_TO_GENOME_ONLY.txt'),sep='\t',index=None) #run CRISPResso #demultiplex reads in the amplicons and call crispresso! info('Running CRISPResso on the regions discovered...') crispresso_cmds = [] for idx,row in df_regions.iterrows(): if row.n_reads > args.min_reads_to_use_region: info('\nRunning CRISPResso on: %s-%d-%d...'%(row.chr_id,row.bpstart,row.bpend )) crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s' %(row.fastq_file,row.sequence,OUTPUT_DIRECTORY) crispresso_cmd=CRISPRessoShared.propagate_crispresso_options(crispresso_cmd,crispresso_options_for_pooled,args) crispresso_cmds.append(crispresso_cmd) else: info('Skipping region: %s-%d-%d , not enough reads (%d)' %(row.chr_id,row.bpstart,row.bpend, row.n_reads)) CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,args.n_processes,'region',args.skip_failed) #write alignment statistics with open(_jp('MAPPING_STATISTICS.txt'),'w+') as outfile: outfile.write('READS IN INPUTS:%d\nREADS AFTER PREPROCESSING:%d\nREADS ALIGNED:%d' % (N_READS_INPUT,N_READS_AFTER_PREPROCESSING,N_READS_ALIGNED)) quantification_summary=[] if RUNNING_MODE=='ONLY_AMPLICONS' or RUNNING_MODE=='AMPLICONS_AND_GENOME': df_final_data=df_template else: df_final_data=df_regions all_region_names = [] all_region_read_counts = {} good_region_names = [] good_region_folders = {} 
header = 'Name\tUnmodified%\tModified%\tReads_aligned\tReads_total\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions' header_els = header.split("\t") header_el_count = len(header_els) empty_line_els = [np.nan]*(header_el_count-1) n_reads_index = header_els.index('Reads_total') - 1 for idx,row in df_final_data.iterrows(): run_name = idx if RUNNING_MODE=='ONLY_AMPLICONS' or RUNNING_MODE=='AMPLICONS_AND_GENOME': run_name=idx else: run_name='REGION_%s_%d_%d' %(row.chr_id,row.bpstart,row.bpend ) folder_name = 'CRISPResso_on_%s'%run_name all_region_names.append(run_name) all_region_read_counts[run_name] = row.n_reads run_file = os.path.join(_jp(folder_name),'CRISPResso2_info.pickle') if not os.path.exists(run_file): warn('Skipping the folder %s: not enough reads, incomplete, or empty folder.'% folder_name) this_els = empty_line_els[:] this_els[n_reads_index] = row.n_reads to_add = [run_name] to_add.extend(this_els) quantification_summary.append(to_add) else: run_data = cp.load(open(run_file,'rb')) ref_name = run_data['ref_names'][0] #only expect one amplicon sequence n_tot = row.n_reads n_aligned = run_data['counts_total'][ref_name] n_unmod = run_data['counts_unmodified'][ref_name] n_mod = run_data['counts_modified'][ref_name] n_discarded = run_data['counts_discarded'][ref_name] n_insertion = run_data['counts_insertion'][ref_name] n_deletion = run_data['counts_deletion'][ref_name] n_substitution = run_data['counts_substitution'][ref_name] n_only_insertion = run_data['counts_only_insertion'][ref_name] n_only_deletion = run_data['counts_only_deletion'][ref_name] n_only_substitution = run_data['counts_only_substitution'][ref_name] n_insertion_and_deletion = run_data['counts_insertion_and_deletion'][ref_name] n_insertion_and_substitution = 
run_data['counts_insertion_and_substitution'][ref_name] n_deletion_and_substitution = run_data['counts_deletion_and_substitution'][ref_name] n_insertion_and_deletion_and_substitution = run_data['counts_insertion_and_deletion_and_substitution'][ref_name] unmod_pct = np.nan mod_pct = np.nan if n_aligned > 0: unmod_pct = 100*n_unmod/float(n_aligned) mod_pct = 100*n_mod/float(n_aligned) vals = [run_name] vals.extend([round(unmod_pct,8),round(mod_pct,8),n_aligned,n_tot,n_unmod,n_mod,n_discarded,n_insertion,n_deletion,n_substitution,n_only_insertion,n_only_deletion,n_only_substitution,n_insertion_and_deletion,n_insertion_and_substitution,n_deletion_and_substitution,n_insertion_and_deletion_and_substitution]) quantification_summary.append(vals) good_region_names.append(run_name) good_region_folders[idx] = folder_name samples_quantification_summary_filename = _jp('SAMPLES_QUANTIFICATION_SUMMARY.txt') df_summary_quantification=pd.DataFrame(quantification_summary,columns=header_els) if args.crispresso1_mode: crispresso1_columns=['Name','Unmodified%','Modified%','Reads_aligned','Reads_total'] df_summary_quantification.fillna('NA').to_csv(samples_quantification_summary_filename,sep='\t',index=None,columns=crispresso1_columns) else: df_summary_quantification.fillna('NA').to_csv(samples_quantification_summary_filename,sep='\t',index=None) crispresso2_info['samples_quantification_summary_filename'] = os.path.basename(samples_quantification_summary_filename) crispresso2_info['final_data'] = df_final_data crispresso2_info['all_region_names'] = all_region_names crispresso2_info['all_region_read_counts'] = all_region_read_counts crispresso2_info['good_region_names'] = good_region_names crispresso2_info['good_region_folders'] = good_region_folders crispresso2_info['running_mode'] = RUNNING_MODE crispresso2_info['summary_plot_names'] = [] crispresso2_info['summary_plot_titles'] = {} crispresso2_info['summary_plot_labels'] = {} crispresso2_info['summary_plot_datas'] = {} 
df_summary_quantification.set_index('Name') save_png = True if args.suppress_report: save_png = False plot_root = _jp("CRISPRessoPooled_reads_summary") CRISPRessoPlot.plot_reads_total(plot_root,df_summary_quantification,save_png,args.min_reads_to_use_region) plot_name = os.path.basename(plot_root) crispresso2_info['summary_plot_root'] = plot_name crispresso2_info['summary_plot_names'].append(plot_name) crispresso2_info['summary_plot_titles'][plot_name] = 'CRISPRessoPooled Read Allocation Summary' crispresso2_info['summary_plot_labels'][plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.' crispresso2_info['summary_plot_datas'][plot_name] = [('CRISPRessoPooled summary',os.path.basename(samples_quantification_summary_filename))] plot_root = _jp("CRISPRessoPooled_modification_summary") CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,df_summary_quantification,save_png,args.min_reads_to_use_region) plot_name = os.path.basename(plot_root) crispresso2_info['summary_plot_root'] = plot_name crispresso2_info['summary_plot_names'].append(plot_name) crispresso2_info['summary_plot_titles'][plot_name] = 'CRISPRessoPooled Modification Summary' crispresso2_info['summary_plot_labels'][plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.' 
crispresso2_info['summary_plot_datas'][plot_name] = [('CRISPRessoPooled summary',os.path.basename(samples_quantification_summary_filename))] #if many reads weren't aligned, print those out for the user if RUNNING_MODE != 'ONLY_GENOME': #N_READS_INPUT=get_n_reads_fastq(args.fastq_r1) #N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename) tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum() tot_reads = df_summary_quantification['Reads_total'].sum() if RUNNING_MODE=='AMPLICONS_AND_GENOME': this_bam_filename = bam_filename_genome if RUNNING_MODE=='ONLY_AMPLICONS': this_bam_filename = bam_filename_amplicons #if less than 1/2 of reads aligned, find most common unaligned reads and advise the user if N_READS_INPUT > 0 and tot_reads/float(N_READS_INPUT) < 0.5: warn('Less than half (%d/%d) of reads aligned. Finding most frequent unaligned reads.'%(tot_reads,N_READS_INPUT)) ### ###this results in the unpretty messages being printed: ### sort: write failed: standard output: Broken pipe ### sort: write error ### #cmd = "samtools view -f 4 %s | awk '{print $10}' | sort | uniq -c | sort -nr | head -n 10"%this_bam_filename import signal def default_sigpipe(): signal.signal(signal.SIGPIPE, signal.SIG_DFL) cmd = "samtools view -f 4 %s | head -n 10000 | awk '{print $10}' | sort | uniq -c | sort -nr | head -n 10 | awk '{print $2}'"%this_bam_filename # print("command is: "+cmd) # p = sb.Popen(cmd, shell=True,stdout=sb.PIPE) p = sb.Popen(cmd, shell=True,stdout=sb.PIPE,preexec_fn=default_sigpipe) top_unaligned = p.communicate()[0] top_unaligned_filename=_jp('CRISPRessoPooled_TOP_UNALIGNED.txt') with open(top_unaligned_filename,'w') as outfile: outfile.write(top_unaligned) warn('Perhaps one or more of the given amplicon sequences were incomplete or incorrect. Below is a list of the most frequent unaligned reads (in the first 10000 unaligned reads). 
Check this list to see if an amplicon is among these reads.\n%s'%top_unaligned) #cleaning up if not args.keep_intermediate: info('Removing Intermediate files...') if args.fastq_r2!='': files_to_remove=[processed_output_filename,flash_hist_filename,flash_histogram_filename,\ flash_not_combined_1_filename,flash_not_combined_2_filename] else: files_to_remove=[processed_output_filename] if args.trim_sequences and args.fastq_r2!='': files_to_remove+=[output_forward_paired_filename,output_reverse_paired_filename,\ output_forward_unpaired_filename,output_reverse_unpaired_filename] if RUNNING_MODE=='ONLY_GENOME' or RUNNING_MODE=='AMPLICONS_AND_GENOME': files_to_remove+=[bam_filename_genome] if RUNNING_MODE=='ONLY_AMPLICONS': files_to_remove+=[bam_filename_amplicons,amplicon_fa_filename] for bowtie2_file in glob.glob(_jp('CUSTOM_BOWTIE2_INDEX.*')): files_to_remove.append(bowtie2_file) for file_to_remove in files_to_remove: try: if os.path.islink(file_to_remove): #print 'LINK',file_to_remove os.unlink(file_to_remove) else: os.remove(file_to_remove) except: warn('Skipping:%s' %file_to_remove) if not args.suppress_report: if (args.place_report_in_output_folder): report_name = _jp("CRISPResso2Pooled_report.html") else: report_name = OUTPUT_DIRECTORY+'.html' CRISPRessoReport.make_pooled_report_from_folder(report_name,crispresso2_info,OUTPUT_DIRECTORY,_ROOT) crispresso2_info['report_location'] = report_name crispresso2_info['report_filename'] = os.path.basename(report_name) cp.dump(crispresso2_info, open(crispresso2WGS_info_file, 'wb' ) ) info('All Done!') print CRISPRessoShared.get_crispresso_footer() sys.exit(0) except Exception as e: debug_flag = False if 'args' in vars() and 'debug' in args: debug_flag = args.debug if debug_flag: traceback.print_exc(file=sys.stdout) error('\n\nERROR: %s' % e) sys.exit(-1)