Ejemplo n.º 1
0
def plot_alleles_table(
    reference_seq,
    df_alleles,
    fig_filename_root,
    MIN_FREQUENCY=0.5,
    MAX_N_ROWS=100,
    SAVE_ALSO_PNG=False,
    plot_cut_point=True,
    cut_point_ind=None,
    sgRNA_intervals=None,
    sgRNA_names=None,
    sgRNA_mismatches=None,
    custom_colors=None,
    annotate_wildtype_allele='****',
):
    """
    Plot an allele table for a dataframe of allele frequencies.

    The table contents are prepared by CRISPRessoPlot.prep_alleles_table and
    the drawing itself is delegated to plot_alleles_heatmap.

    input:
    reference_seq: the reference amplicon sequence to plot
    df_alleles: merged dataframe (should include columns '#Reads' and '%Reads')
    fig_filename_root: figure filename to plot (not including '.pdf' or '.png')
    MIN_FREQUENCY: sum of alleles % must add to this to be plotted
    MAX_N_ROWS: max rows to plot
    SAVE_ALSO_PNG: whether to write a png file as well
    plot_cut_point: if false, the 'predicted cleavage' line is not drawn
    cut_point_ind: index to plot the cut point at
    sgRNA_intervals: locations where the sgRNA is located
    sgRNA_names: array (for each sgRNA_interval) of names of sgRNAs (otherwise empty)
    sgRNA_mismatches: array (for each sgRNA_interval) of locations in the sgRNA where there are mismatches
    custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
    annotate_wildtype_allele: string appended to the label of each wildtype allele (e.g. ** or '' to disable)
    """
    prepared = CRISPRessoPlot.prep_alleles_table(
        df_alleles, reference_seq, MAX_N_ROWS, MIN_FREQUENCY)
    (X, annot, y_labels, insertion_dict, per_element_annot_kws,
     is_reference) = prepared

    # Tag the row label of every allele flagged as the reference sequence.
    if annotate_wildtype_allele != '':
        for row_number, row_is_wildtype in enumerate(is_reference):
            if not row_is_wildtype:
                continue
            y_labels[row_number] += annotate_wildtype_allele

    plot_alleles_heatmap(
        reference_seq,
        fig_filename_root,
        X,
        annot,
        y_labels,
        insertion_dict,
        per_element_annot_kws,
        SAVE_ALSO_PNG,
        plot_cut_point,
        cut_point_ind,
        sgRNA_intervals,
        sgRNA_names,
        sgRNA_mismatches,
        custom_colors,
    )
Ejemplo n.º 2
0
def main():
    """Run the CRISPRessoWGS command-line pipeline.

    Steps, in order:
      1. Parse command-line arguments (shared CRISPResso options plus the
         WGS-specific ones added below).
      2. Check that samtools/bowtie2 and all input files are available.
      3. Create the output folder and the running log.
      4. Make sure the input BAM and the reference FASTA are indexed.
      5. Load the region file, name every region and fetch its sequence.
      6. For each region, extract the aligned reads into a per-region BAM
         and a trimmed fastq.gz file.
      7. Run CRISPResso on every region with enough reads.
      8. Write the quantification summary table, the summary plots, the
         optional HTML report and the pickled run information.

    The function does not return normally: it ends the process through
    ``sys.exit`` with status 0 on success, 1 when a dependency is missing,
    and -1 when an exception is raised during the run.
    """
    # Bound before the try block so the debug helper can always inspect it,
    # even when the failure happens before the arguments are parsed.
    args = None

    def print_stacktrace_if_debug():
        """Print and log the current traceback when --debug was requested."""
        if getattr(args, 'debug', False):
            traceback.print_exc(file=sys.stdout)
            error(traceback.format_exc())

    try:
        description = [
            '~~~CRISPRessoWGS~~~',
            '-Analysis of CRISPR/Cas9 outcomes from WGS data-'
        ]
        wgs_string = r'''
 ____________
|     __  __ |
||  |/ _ (_  |
||/\|\__)__) |
|____________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, wgs_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoWGS Parameters', requiredParams={})

        # tool specific optional
        parser.add_argument('-b',
                            '--bam_file',
                            type=str,
                            help='WGS aligned bam file',
                            required=True,
                            default='bam filename')
        parser.add_argument(
            '-f',
            '--region_file',
            type=str,
            help=
            'Regions description file. A BED format  file containing the regions to analyze, one per line. The REQUIRED\
        columns are: chr_id(chromosome name), bpstart(start position), bpend(end position), the optional columns are:name (an unique indentifier for the region), guide_seq, expected_hdr_amplicon_seq,coding_seq, see CRISPResso help for more details on these last 3 parameters)',
            required=True)
        parser.add_argument(
            '-r',
            '--reference_file',
            type=str,
            help=
            'A FASTA format reference file (for example hg19.fa for the human genome)',
            default='',
            required=True)
        parser.add_argument(
            '--min_reads_to_use_region',
            type=float,
            help=
            'Minimum number of reads that align to a region to perform the CRISPResso analysis',
            default=10)
        parser.add_argument(
            '--skip_failed',
            help='Continue with pooled analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--gene_annotations',
            type=str,
            help=
            'Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \
        please select as table "knowGene", as output format "all fields from selected table" and as file returned "gzip compressed"',
            default='')
        parser.add_argument(
            '-p',
            '--n_processes',
            type=int,
            help='Specify the number of processes to use for the quantification.\
        Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.',
            default=1)
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        # Options that are set per region by this tool and therefore must not
        # be forwarded from the command line to the CRISPResso sub-commands.
        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set([
            'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name',
            'output_folder', 'name'
        ])
        crispresso_options_for_wgs = list(crispresso_options -
                                          options_to_ignore)

        info('Checking dependencies...')

        if check_samtools() and check_bowtie2():
            info('\n All the required dependencies are present!')
        else:
            sys.exit(1)

        # check files
        check_file(args.bam_file)
        check_file(args.reference_file)
        check_file(args.region_file)
        if args.gene_annotations:
            check_file(args.gene_annotations)

        # INIT
        def get_name_from_bam(bam_path):
            return os.path.basename(bam_path).replace('.bam', '')

        if not args.name:
            database_id = '%s' % get_name_from_bam(args.bam_file)
        else:
            database_id = args.name

        OUTPUT_DIRECTORY = 'CRISPRessoWGS_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        def _jp(filename):
            # handy function to put a file in the output directory
            return os.path.join(OUTPUT_DIRECTORY, filename)

        info('Creating Folder %s' % OUTPUT_DIRECTORY)
        try:
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except OSError:
            # Only an already-existing folder is tolerated; any other failure
            # (e.g. permissions) is propagated instead of being mislabeled.
            if not os.path.isdir(OUTPUT_DIRECTORY):
                raise
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoWGS_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2WGS_info_file = os.path.join(OUTPUT_DIRECTORY,
                                                'CRISPResso2WGS_info.pickle')
        # keep track of all information for this run to be pickled and saved
        # at the end of the run
        crispresso2_info = {}
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        def rreplace(s, old, new):
            """Replace only the right-most occurrence of `old` in `s`."""
            return new.join(s.rsplit(old, 1))

        bam_index = ''
        # check if bam has the index already
        # (either <name>.bai next to <name>.bam, or <name>.bam.bai)
        alt_bam_index = rreplace(args.bam_file, ".bam", ".bai")
        if alt_bam_index != args.bam_file and os.path.exists(alt_bam_index):
            info('Index file for input .bam file exists, skipping generation.')
            bam_index = alt_bam_index
        elif os.path.exists(args.bam_file + '.bai'):
            info('Index file for input .bam file exists, skipping generation.')
            bam_index = args.bam_file + '.bai'
        else:
            info('Creating index file for input .bam file...')
            sb.call('samtools index %s ' % (args.bam_file), shell=True)
            bam_index = args.bam_file + '.bai'

        # load gene annotation
        # df_genes stays None when no table was requested or loading failed,
        # in which case the gene-overlap annotation step below is skipped.
        df_genes = None
        if args.gene_annotations:
            info('Loading gene coordinates from annotation file: %s...' %
                 args.gene_annotations)
            try:
                df_genes = pd.read_table(args.gene_annotations,
                                         compression='gzip')
                df_genes.txEnd = df_genes.txEnd.astype(int)
                df_genes.txStart = df_genes.txStart.astype(int)
            except Exception:
                df_genes = None
                info('Failed to load the gene annotations file.')

        # Load and validate the REGION FILE
        df_regions = pd.read_csv(args.region_file,
                                 names=[
                                     'chr_id', 'bpstart', 'bpend', 'Name',
                                     'sgRNA', 'Expected_HDR', 'Coding_sequence'
                                 ],
                                 comment='#',
                                 sep='\t',
                                 dtype={'Name': str})

        # remove empty amplicons/lines
        df_regions.dropna(subset=['chr_id', 'bpstart', 'bpend'], inplace=True)

        df_regions.Expected_HDR = df_regions.Expected_HDR.apply(
            capitalize_sequence)
        df_regions.sgRNA = df_regions.sgRNA.apply(capitalize_sequence)
        df_regions.Coding_sequence = df_regions.Coding_sequence.apply(
            capitalize_sequence)

        # check or create names: unnamed regions get "<chr>_<start>_<end>"
        for idx, row in df_regions.iterrows():
            if pd.isnull(row.Name):
                df_regions.loc[idx, 'Name'] = '_'.join(
                    map(str, [row['chr_id'], row['bpstart'], row['bpend']]))

        if len(df_regions.Name.unique()) != df_regions.shape[0]:
            raise Exception('The amplicon names should be all distinct!')

        df_regions = df_regions.set_index('Name')
        df_regions.index = df_regions.index.to_series().str.replace(' ', '_')

        # extract sequence for each region
        uncompressed_reference = args.reference_file

        if os.path.exists(uncompressed_reference + '.fai'):
            info(
                'The index for the reference fasta file is already present! Skipping generation.'
            )
        else:
            info('Indexing reference file... Please be patient!')
            sb.call('samtools faidx %s >>%s 2>&1' %
                    (uncompressed_reference, log_filename),
                    shell=True)

        df_regions['sequence'] = df_regions.apply(
            lambda row: get_region_from_fa(row.chr_id, row.bpstart, row.bpend,
                                           uncompressed_reference),
            axis=1)

        # Validate the guides of each region; when none of them (nor their
        # reverse complement) occurs in the region sequence, the sgRNA entry
        # is blanked so that no guide is passed to CRISPResso for that region.
        for idx, row in df_regions.iterrows():

            if not pd.isnull(row.sgRNA):

                cut_points = []

                for current_guide_seq in row.sgRNA.strip().upper().split(','):

                    wrong_nt = find_wrong_nt(current_guide_seq)
                    if wrong_nt:
                        raise NTException(
                            'The sgRNA sequence %s contains wrong characters:%s'
                            % (current_guide_seq, ' '.join(wrong_nt)))

                    offset_fw = args.quantification_window_center + len(
                        current_guide_seq) - 1
                    offset_rc = (-args.quantification_window_center) - 1
                    cut_points += [
                        m.start() + offset_fw
                        for m in re.finditer(current_guide_seq, row.sequence)
                    ]
                    cut_points += [
                        m.start() + offset_rc for m in re.finditer(
                            CRISPRessoShared.reverse_complement(
                                current_guide_seq), row.sequence)
                    ]

                if not cut_points:
                    df_regions.loc[idx, 'sgRNA'] = ''

        df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
        df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])

        df_regions.bpstart = df_regions.bpstart.astype(int)
        df_regions.bpend = df_regions.bpend.astype(int)

        if args.gene_annotations and df_genes is not None:
            df_regions = df_regions.apply(
                lambda row: find_overlapping_genes(row, df_genes), axis=1)

        # extract reads with samtools in that region and create a bam
        # create a fasta file with all the trimmed reads
        info('\nProcessing each region...')

        ANALYZED_REGIONS = _jp('ANALYZED_REGIONS/')
        if not os.path.exists(ANALYZED_REGIONS):
            os.mkdir(ANALYZED_REGIONS)

        df_regions['n_reads'] = 0
        df_regions['bam_file_with_reads_in_region'] = ''
        df_regions['fastq.gz_file_trimmed_reads_in_region'] = ''

        for idx, row in df_regions.iterrows():

            if row['sequence']:

                region_file_root = clean_filename('REGION_' + str(idx))
                fastq_gz_filename = os.path.join(
                    ANALYZED_REGIONS, '%s.fastq.gz' % region_file_root)
                bam_region_filename = os.path.join(
                    ANALYZED_REGIONS, '%s.bam' % region_file_root)

                # create place-holder fastq files
                open(fastq_gz_filename, 'w+').close()

                region = '%s:%d-%d' % (row.chr_id, row.bpstart, row.bpend - 1)
                info('\nExtracting reads in:%s and create the .bam file: %s' %
                     (region, bam_region_filename))

                # extract reads in region
                cmd = r'''samtools view -b -F 4 %s %s > %s ''' % (
                    args.bam_file, region, bam_region_filename)
                sb.call(cmd, shell=True)

                # index bam file
                cmd = r'''samtools index %s ''' % (bam_region_filename)
                sb.call(cmd, shell=True)

                info('Trim reads and create a fastq.gz file in: %s' %
                     fastq_gz_filename)
                # trim reads in bam and convert in fastq
                n_reads = write_trimmed_fastq(bam_region_filename,
                                              row['bpstart'], row['bpend'],
                                              fastq_gz_filename)
                df_regions.loc[idx, 'n_reads'] = n_reads
                df_regions.loc[
                    idx, 'bam_file_with_reads_in_region'] = bam_region_filename
                df_regions.loc[
                    idx,
                    'fastq.gz_file_trimmed_reads_in_region'] = fastq_gz_filename

        df_regions.fillna('NA').to_csv(
            _jp('REPORT_READS_ALIGNED_TO_SELECTED_REGIONS_WGS.txt'), sep='\t')

        # Run Crispresso
        info('\nRunning CRISPResso on each region...')
        crispresso_cmds = []
        for idx, row in df_regions.iterrows():

            if row['n_reads'] >= args.min_reads_to_use_region:
                info('\nThe region [%s] has enough reads (%d) mapped to it!' %
                     (idx, row['n_reads']))

                crispresso_cmd = args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (
                    row['fastq.gz_file_trimmed_reads_in_region'],
                    row['sequence'], OUTPUT_DIRECTORY, idx)

                if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                    crispresso_cmd += ' -g %s' % row['sgRNA']

                if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                    crispresso_cmd += ' -e %s' % row['Expected_HDR']

                if row['Coding_sequence'] and not pd.isnull(
                        row['Coding_sequence']):
                    crispresso_cmd += ' -c %s' % row['Coding_sequence']

                crispresso_cmd = CRISPRessoShared.propagate_crispresso_options(
                    crispresso_cmd, crispresso_options_for_wgs, args)
                crispresso_cmds.append(crispresso_cmd)

            else:
                info(
                    '\nThe region [%s] has too few reads mapped to it (%d)! Not running CRISPResso!'
                    % (idx, row['n_reads']))

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      args.n_processes,
                                                      'region',
                                                      args.skip_failed)

        # Collect the per-region results into one summary table.
        quantification_summary = []
        all_region_names = []
        all_region_read_counts = {}
        good_region_names = []
        good_region_folders = {}
        header = 'Name\tUnmodified%\tModified%\tReads_aligned\tReads_total\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
        header_els = header.split("\t")
        header_el_count = len(header_els)
        # Template for regions without results: every column except 'Name'
        # is NaN (hence the "- 1" here and in n_reads_index).
        empty_line_els = [np.nan] * (header_el_count - 1)
        n_reads_index = header_els.index('Reads_total') - 1
        for idx, row in df_regions.iterrows():
            folder_name = 'CRISPResso_on_%s' % idx
            run_name = idx

            all_region_names.append(run_name)
            all_region_read_counts[run_name] = row.n_reads

            run_file = os.path.join(_jp(folder_name),
                                    'CRISPResso2_info.pickle')
            if not os.path.exists(run_file):
                warn(
                    'Skipping the folder %s: not enough reads, incomplete, or empty folder.'
                    % folder_name)
                this_els = empty_line_els[:]
                this_els[n_reads_index] = row.n_reads
                to_add = [run_name]
                to_add.extend(this_els)
                quantification_summary.append(to_add)
            else:
                # NOTE(review): the pickle is produced by the CRISPResso
                # sub-run of this pipeline; never load pickles from an
                # untrusted source.
                with open(run_file, 'rb') as run_fh:
                    run_data = cp.load(run_fh)
                ref_name = run_data['ref_names'][
                    0]  # only expect one amplicon sequence
                n_tot = row.n_reads
                n_aligned = run_data['counts_total'][ref_name]
                n_unmod = run_data['counts_unmodified'][ref_name]
                n_mod = run_data['counts_modified'][ref_name]
                n_discarded = run_data['counts_discarded'][ref_name]

                n_insertion = run_data['counts_insertion'][ref_name]
                n_deletion = run_data['counts_deletion'][ref_name]
                n_substitution = run_data['counts_substitution'][ref_name]
                n_only_insertion = run_data['counts_only_insertion'][ref_name]
                n_only_deletion = run_data['counts_only_deletion'][ref_name]
                n_only_substitution = run_data['counts_only_substitution'][
                    ref_name]
                n_insertion_and_deletion = run_data[
                    'counts_insertion_and_deletion'][ref_name]
                n_insertion_and_substitution = run_data[
                    'counts_insertion_and_substitution'][ref_name]
                n_deletion_and_substitution = run_data[
                    'counts_deletion_and_substitution'][ref_name]
                n_insertion_and_deletion_and_substitution = run_data[
                    'counts_insertion_and_deletion_and_substitution'][ref_name]

                # Percentages are undefined without aligned reads; NaN is
                # used (like for skipped regions) and is written out as 'NA'
                # by the fillna('NA') call when the table is saved.
                unmod_pct = np.nan
                mod_pct = np.nan
                if n_aligned > 0:
                    unmod_pct = round(100 * n_unmod / float(n_aligned), 8)
                    mod_pct = round(100 * n_mod / float(n_aligned), 8)

                vals = [run_name]
                vals.extend([
                    unmod_pct, mod_pct, n_aligned, n_tot, n_unmod, n_mod,
                    n_discarded, n_insertion, n_deletion, n_substitution,
                    n_only_insertion, n_only_deletion, n_only_substitution,
                    n_insertion_and_deletion, n_insertion_and_substitution,
                    n_deletion_and_substitution,
                    n_insertion_and_deletion_and_substitution
                ])
                quantification_summary.append(vals)

                good_region_names.append(idx)
                good_region_folders[idx] = folder_name
        samples_quantification_summary_filename = _jp(
            'SAMPLES_QUANTIFICATION_SUMMARY.txt')

        df_summary_quantification = pd.DataFrame(quantification_summary,
                                                 columns=header_els)
        if args.crispresso1_mode:
            crispresso1_columns = [
                'Name', 'Unmodified%', 'Modified%', 'Reads_aligned',
                'Reads_total'
            ]
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename,
                sep='\t',
                index=None,
                columns=crispresso1_columns)
        else:
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename, sep='\t', index=None)

        crispresso2_info[
            'samples_quantification_summary_filename'] = os.path.basename(
                samples_quantification_summary_filename)
        crispresso2_info['regions'] = df_regions
        crispresso2_info['all_region_names'] = all_region_names
        crispresso2_info['all_region_read_counts'] = all_region_read_counts
        crispresso2_info['good_region_names'] = good_region_names
        crispresso2_info['good_region_folders'] = good_region_folders

        crispresso2_info['summary_plot_names'] = []
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}

        # 'Name' stays a regular column of df_summary_quantification; the
        # frame is handed to the plotting helpers below as-is.
        save_png = not args.suppress_report

        plot_root = _jp("CRISPRessoWGS_modification_summary")
        CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,
                                           df_summary_quantification, save_png,
                                           args.min_reads_to_use_region)
        plot_name = os.path.basename(plot_root)
        crispresso2_info['summary_plot_root'] = plot_name
        crispresso2_info['summary_plot_names'].append(plot_name)
        crispresso2_info['summary_plot_titles'][
            plot_name] = 'CRISPRessoWGS Modification Summary'
        crispresso2_info['summary_plot_labels'][
            plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
        crispresso2_info['summary_plot_datas'][plot_name] = [
            ('CRISPRessoWGS summary',
             os.path.basename(samples_quantification_summary_filename))
        ]

        plot_root = _jp("CRISPRessoWGS_reads_summary")
        CRISPRessoPlot.plot_reads_total(plot_root, df_summary_quantification,
                                        save_png, args.min_reads_to_use_region)
        plot_name = os.path.basename(plot_root)
        crispresso2_info['summary_plot_root'] = plot_name
        crispresso2_info['summary_plot_names'].append(plot_name)
        crispresso2_info['summary_plot_titles'][
            plot_name] = 'CRISPRessoWGS Read Allocation Summary'
        crispresso2_info['summary_plot_labels'][
            plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
        crispresso2_info['summary_plot_datas'][plot_name] = [
            ('CRISPRessoWGS summary',
             os.path.basename(samples_quantification_summary_filename))
        ]

        if not args.suppress_report:
            report_name = _jp('CRISPResso2WGS_report.html')
            CRISPRessoReport.make_wgs_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)

        with open(crispresso2WGS_info_file, 'wb') as info_fh:
            cp.dump(crispresso2_info, info_fh)

        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exits above are not intercepted here.
        print_stacktrace_if_debug()
        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Ejemplo n.º 3
0
def main():
    try:
        description = [
            '~~~CRISPRessoBatch~~~',
            '-Analysis of CRISPR/Cas9 outcomes from batch deep sequencing data-'
        ]
        batch_string = r'''
 _________________
| __    ___ __    |
||__) /\ | /  |__||
||__)/--\| \__|  ||
|_________________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description,
                                                     batch_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoBatch Parameters')

        #batch specific params
        parser.add_argument(
            '-bs',
            '--batch_settings',
            type=str,
            help=
            'Settings file for batch. Must be tab-separated text file. The header row contains CRISPResso parameters (e.g., fastq_r1, fastq_r2, amplicon_seq, and other optional parameters). Each following row sets parameters for an additional batch.',
            required=True)
        parser.add_argument(
            '--skip_failed',
            help='Continue with batch analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--min_reads_for_inclusion',
            help=
            'Minimum number of reads for a batch to be included in the batch summary',
            type=int)
        parser.add_argument(
            '-p',
            '--n_processes',
            type=int,
            help='Specify the number of processes to use for quantification.\
        Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.',
            default=1)
        parser.add_argument(
            '-bo',
            '--batch_output_folder',
            help='Directory where batch analysis output will be stored')
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        debug_flag = args.debug

        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set(['name', 'output_folder'])
        crispresso_options_for_batch = list(crispresso_options -
                                            options_to_ignore)

        CRISPRessoShared.check_file(args.batch_settings)

        ##parse excel sheet
        batch_params = pd.read_csv(args.batch_settings, comment='#', sep='\t')
        #pandas either allows for auto-detect sep or for comment. not both
        #        batch_params=pd.read_csv(args.batch_settings,sep=None,engine='python',error_bad_lines=False)
        batch_params.columns = batch_params.columns.str.strip(' -\xd0')

        #rename column "a" to "amplicon_seq", etc
        batch_params.rename(
            index=str,
            columns=CRISPRessoShared.get_crispresso_options_lookup(),
            inplace=True)
        batch_count = batch_params.shape[0]
        batch_params.index = range(batch_count)

        if 'fastq_r1' not in batch_params and 'bam_input' not in batch_params:
            raise CRISPRessoShared.BadParameterException(
                "fastq_r1 must be specified in the batch settings file. Current headings are: "
                + str(batch_params.columns.values))

        #add args from the command line to batch_params_df
        for arg in vars(args):
            if arg not in batch_params:
                batch_params[arg] = getattr(args, arg)
            else:
                if (getattr(args, arg) is not None):
                    batch_params[arg].fillna(value=getattr(args, arg),
                                             inplace=True)

        #assert that all names are unique
        #and clean names

        for i in range(batch_count):
            if batch_params.loc[i, 'name'] == '':
                batch_params.at[i, 'name'] = i
            batch_params.at[i, 'name'] = CRISPRessoShared.clean_filename(
                batch_params.loc[i, 'name'])

        if batch_params.drop_duplicates(
                'name').shape[0] != batch_params.shape[0]:
            raise CRISPRessoShared.BadParameterException(
                'Batch input names must be unique. The given names are not unique: '
                + str(batch_params.loc[:, 'name']))

        #Check files
        batch_params[
            "sgRNA_intervals"] = ''  #create empty array for sgRNA intervals
        batch_params["sgRNA_intervals"] = batch_params[
            "sgRNA_intervals"].apply(list)
        batch_params[
            "cut_point_include_idx"] = ''  #create empty array for cut point intervals for each batch based on sgRNA
        batch_params["cut_point_include_idx"] = batch_params[
            "cut_point_include_idx"].apply(list)
        for idx, row in batch_params.iterrows():
            if 'fastq_r1' in row:
                if row.fastq_r1 is None:
                    raise CRISPRessoShared.BadParameterException(
                        "At least one fastq file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)"
                        % (int(idx) + 1, row.fastq_r1))
                else:
                    CRISPRessoShared.check_file(row.fastq_r1)

            if 'fastq_r2' in row and row.fastq_r2 != "":
                CRISPRessoShared.check_file(row.fastq_r2)

            if 'input_bam' in row:
                if row.input_bam is None:
                    raise CRISPRessoShared.BadParameterException(
                        "At least one input file must be given as a command line parameter or be specified in the batch settings file with the heading 'fastq_r1' or 'input_bam' (input_bam on row %s '%s' is invalid)"
                        % (int(idx) + 1, row.input_bam))
                else:
                    CRISPRessoShared.check_file(row.input_bam)

            if args.auto:
                continue

            curr_amplicon_seq_str = row.amplicon_seq
            if curr_amplicon_seq_str is None:
                raise CRISPRessoShared.BadParameterException(
                    "Amplicon sequence must be given as a command line parameter or be specified in the batch settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)"
                    % (int(idx) + 1, curr_amplicon_seq_str))

            guides_are_in_amplicon = {
            }  #dict of whether a guide is in at least one amplicon sequence
            #iterate through amplicons
            for curr_amplicon_seq in curr_amplicon_seq_str.split(','):
                this_include_idxs = [
                ]  #mask for bp to include for this amplicon seq, as specified by sgRNA cut points
                this_sgRNA_intervals = []
                wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq)
                if wrong_nt:
                    raise CRISPRessoShared.NTException(
                        'The amplicon sequence in row %d (%s) contains incorrect characters:%s'
                        % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt)))

                #iterate through guides
                curr_guide_seq_string = row.guide_seq
                if curr_guide_seq_string is not None and curr_guide_seq_string != "":
                    guides = curr_guide_seq_string.strip().upper().split(',')
                    for curr_guide_seq in guides:
                        wrong_nt = CRISPRessoShared.find_wrong_nt(
                            curr_guide_seq)
                        if wrong_nt:
                            raise CRISPRessoShared.NTException(
                                'The sgRNA sequence in row %d (%s) contains incorrect characters:%s'
                                %
                                (idx + 1, curr_guide_seq, ' '.join(wrong_nt)))
                    guide_mismatches = [[]] * len(guides)
                    guide_names = [""] * len(guides)
                    guide_qw_centers = CRISPRessoShared.set_guide_array(
                        row.quantification_window_center, guides,
                        'guide quantification center')
                    guide_qw_sizes = CRISPRessoShared.set_guide_array(
                        row.quantification_window_size, guides,
                        'guide quantification size')
                    guide_plot_cut_points = [1] * len(guides)
                    (this_sgRNA_sequences, this_sgRNA_intervals,
                     this_sgRNA_cut_points, this_sgRNA_plot_cut_points,
                     this_sgRNA_plot_idxs, this_sgRNA_mismatches,
                     this_sgRNA_names, this_include_idxs, this_exclude_idxs
                     ) = CRISPRessoShared.get_amplicon_info_for_guides(
                         curr_amplicon_seq, guides, guide_mismatches,
                         guide_names, guide_qw_centers, guide_qw_sizes,
                         row.quantification_window_coordinates,
                         row.exclude_bp_from_left, row.exclude_bp_from_right,
                         row.plot_window_size, guide_plot_cut_points)
                    for guide_seq in this_sgRNA_sequences:
                        guides_are_in_amplicon[guide_seq] = 1

                batch_params.ix[idx, "cut_point_include_idx"].append(
                    this_include_idxs)
                batch_params.ix[idx,
                                "sgRNA_intervals"].append(this_sgRNA_intervals)

            for guide_seq in guides_are_in_amplicon:
                if guides_are_in_amplicon[guide_seq] != 1:
                    warn(
                        '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!'
                        % (idx + 1, row.guide_seq, curr_amplicon_seq))

        batch_folder_name = os.path.splitext(
            os.path.basename(args.batch_settings))[0]
        if args.name and args.name != "":
            batch_folder_name = args.name

        output_folder_name = 'CRISPRessoBatch_on_%s' % batch_folder_name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        if args.batch_output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.batch_output_folder), output_folder_name)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
        except:
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoBatch_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Batch_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Batch_info.pickle')
        crispresso2_info = {
        }  #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        crispresso_cmds = []
        batch_names_arr = []
        batch_input_names = {}
        for idx, row in batch_params.iterrows():

            batchName = CRISPRessoShared.slugify(row["name"])
            batch_names_arr.append(batchName)
            batch_input_names[batchName] = row["name"]

            crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % (
                OUTPUT_DIRECTORY, batchName)
            crispresso_cmd = propagate_options(crispresso_cmd,
                                               crispresso_options_for_batch,
                                               batch_params, idx)
            crispresso_cmds.append(crispresso_cmd)

        crispresso2_info['batch_names_arr'] = batch_names_arr
        crispresso2_info['batch_input_names'] = batch_input_names

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      args.n_processes,
                                                      'batch',
                                                      args.skip_failed)

        run_datas = []  #crispresso2 info from each row

        all_amplicons = set()
        amplicon_names = {}
        amplicon_counts = {}
        completed_batch_arr = []
        for idx, row in batch_params.iterrows():
            batchName = CRISPRessoShared.slugify(row["name"])
            file_prefix = row['file_prefix']
            folder_name = os.path.join(OUTPUT_DIRECTORY,
                                       'CRISPResso_on_%s' % batchName)
            run_data_file = os.path.join(folder_name,
                                         'CRISPResso2_info.pickle')
            if os.path.isfile(run_data_file) is False:
                info("Skipping folder '%s'. Cannot find run data at '%s'." %
                     (folder_name, run_data_file))
                run_datas.append(None)
                continue

            run_data = cp.load(open(run_data_file, 'rb'))
            run_datas.append(run_data)
            for ref_name in run_data['ref_names']:
                ref_seq = run_data['refs'][ref_name]['sequence']
                all_amplicons.add(ref_seq)
                #if this amplicon is called something else in another sample, just call it the amplicon
                if ref_name in amplicon_names and amplicon_names[
                        ref_seq] != ref_name:
                    amplicon_names[ref_seq] = ref_seq
                else:
                    amplicon_names[ref_seq] = ref_name
                if ref_seq not in amplicon_counts:
                    amplicon_counts[ref_seq] = 0
                amplicon_counts[ref_seq] += 1

            completed_batch_arr.append(batchName)

        crispresso2_info['completed_batch_arr'] = completed_batch_arr

        #make sure amplicon names aren't super long
        for amplicon in all_amplicons:
            if len(amplicon_names[amplicon]) > 20:
                amplicon_names[amplicon] = amplicon_names[amplicon][0:20]

        #make sure no duplicate names (same name for the different amplicons)
        seen_names = {}
        for amplicon in all_amplicons:
            suffix_counter = 2
            orig_name = amplicon_names[amplicon]
            while amplicon_names[amplicon] in seen_names:
                amplicon_names[amplicon] = orig_name + "_" + str(
                    suffix_counter)
                suffix_counter += 1
            seen_names[amplicon_names[amplicon]] = 1

        save_png = True
        if args.suppress_report:
            save_png = False

        window_nuc_pct_quilt_plot_names = []
        nuc_pct_quilt_plot_names = []
        window_nuc_conv_plot_names = []
        nuc_conv_plot_names = []

        #report for amplicons that appear multiple times
        for amplicon_index, amplicon_seq in enumerate(all_amplicons):
            #only perform comparison if amplicon seen in more than one sample
            if amplicon_counts[amplicon_seq] < 2:
                continue

            amplicon_name = amplicon_names[amplicon_seq]
            info('Reporting summary for amplicon: "' + amplicon_name + '"')

            consensus_sequence = ""
            nucleotide_frequency_summary = []
            nucleotide_percentage_summary = []
            modification_frequency_summary = []
            modification_percentage_summary = []

            amp_found_count = 0  #how many folders had information for this amplicon
            consensus_guides = []
            consensus_include_idxs = []
            consensus_sgRNA_plot_idxs = []
            consensus_sgRNA_intervals = []
            guides_all_same = True
            batches_with_this_amplicon = []
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                file_prefix = row['file_prefix']
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue
                batch_has_amplicon = False
                batch_amplicon_name = ''
                for ref_name in run_data['ref_names']:
                    if amplicon_seq == run_data['refs'][ref_name]['sequence']:
                        batch_has_amplicon = True
                        batch_amplicon_name = ref_name
                if not batch_has_amplicon:
                    continue
                batches_with_this_amplicon.append(idx)

                if consensus_guides == []:
                    consensus_guides = run_data['refs'][batch_amplicon_name][
                        'sgRNA_sequences']
                    consensus_include_idxs = run_data['refs'][
                        batch_amplicon_name]['include_idxs']
                    consensus_sgRNA_intervals = run_data['refs'][
                        batch_amplicon_name]['sgRNA_intervals']
                    consensus_sgRNA_plot_idxs = run_data['refs'][
                        batch_amplicon_name]['sgRNA_plot_idxs']

                if run_data['refs'][batch_amplicon_name][
                        'sgRNA_sequences'] != consensus_guides:
                    guides_all_same = False
                if set(run_data['refs'][batch_amplicon_name]
                       ['include_idxs']) != set(consensus_include_idxs):
                    guides_all_same = False

                if 'nuc_freq_filename' not in run_data['refs'][
                        batch_amplicon_name]:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                        % (batch_amplicon_name, folder_name))
                    continue

                nucleotide_frequency_file = os.path.join(
                    folder_name,
                    run_data['refs'][batch_amplicon_name]['nuc_freq_filename'])
                ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
                    nucleotide_frequency_file)

                nucleotide_pct_file = os.path.join(
                    folder_name,
                    run_data['refs'][batch_amplicon_name]['nuc_pct_filename'])
                ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
                    nucleotide_pct_file)

                count_file = os.path.join(
                    folder_name, run_data['refs'][batch_amplicon_name]
                    ['mod_count_filename'])
                ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
                    count_file)

                if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Could not parse batch output."
                        % (batch_amplicon_name, folder_name))
                    info(
                        "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                        % (ampSeq_nf, ampSeq_np, ampSeq_cf))
                    continue
                if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
                    warn(
                        "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                        % (batch_amplicon_name, folder_name, ampSeq_nf,
                           ampSeq_np, ampSeq_cf, amplicon_seq))
                    continue
                if consensus_sequence == "":
                    consensus_sequence = ampSeq_nf
                if ampSeq_nf != consensus_sequence:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                        % (batch_amplicon_name, folder_name))
                    continue
                if 'Total' not in mod_freqs:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                        % (batch_amplicon_name, folder_name))
                    continue
                if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][0] == "0":
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                        % (batch_amplicon_name, folder_name))
                    continue
                if (args.min_reads_for_inclusion is not None) and (int(
                        mod_freqs['Total'][0]) < args.min_reads_for_inclusion):
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                        % (batch_amplicon_name, folder_name,
                           str(mod_freqs['Total'][0]),
                           args.min_reads_for_inclusion))
                    continue

                mod_pcts = {}
                for key in mod_freqs:
                    mod_pcts[key] = np.array(mod_freqs[key]).astype(
                        np.float) / float(mod_freqs['Total'][0])

                amp_found_count += 1

                for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
                    row = [batchName, nuc]
                    row.extend(nuc_freqs[nuc])
                    nucleotide_frequency_summary.append(row)

                    pct_row = [batchName, nuc]
                    pct_row.extend(nuc_pcts[nuc])
                    nucleotide_percentage_summary.append(pct_row)

                for mod in [
                        'Insertions', 'Insertions_Left', 'Deletions',
                        'Substitutions', 'All_modifications'
                ]:
                    row = [batchName, mod]
                    row.extend(mod_freqs[mod])
                    modification_frequency_summary.append(row)

                    pct_row = [batchName, mod]
                    pct_row.extend(mod_pcts[mod])
                    modification_percentage_summary.append(pct_row)

            if amp_found_count == 0:
                info(
                    "Couldn't find any data for amplicon '%s'. Not compiling results."
                    % amplicon_name)
            else:
                amplicon_plot_name = amplicon_name + "."
                if len(amplicon_names) == 1 and amplicon_name == "Reference":
                    amplicon_plot_name = ""

                colnames = ['Batch', 'Nucleotide']
                colnames.extend(list(consensus_sequence))
                nucleotide_frequency_summary_df = pd.DataFrame(
                    nucleotide_frequency_summary, columns=colnames)
                nucleotide_frequency_summary_df = pd.concat([
                    nucleotide_frequency_summary_df.iloc[:, 0:2],
                    nucleotide_frequency_summary_df.iloc[:, 2:].apply(
                        pd.to_numeric)
                ],
                                                            axis=1)
                nucleotide_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_frequency_summary.txt')
                nucleotide_frequency_summary_df.to_csv(
                    nucleotide_frequency_summary_filename,
                    sep='\t',
                    index=None)

                nucleotide_percentage_summary_df = pd.DataFrame(
                    nucleotide_percentage_summary, columns=colnames)
                nucleotide_percentage_summary_df = pd.concat([
                    nucleotide_percentage_summary_df.iloc[:, 0:2],
                    nucleotide_percentage_summary_df.iloc[:, 2:].apply(
                        pd.to_numeric)
                ],
                                                             axis=1)
                nucleotide_percentage_summary_filename = _jp(
                    amplicon_plot_name + 'Nucleotide_percentage_summary.txt')
                nucleotide_percentage_summary_df.to_csv(
                    nucleotide_percentage_summary_filename,
                    sep='\t',
                    index=None)

                colnames = ['Batch', 'Modification']
                colnames.extend(list(consensus_sequence))
                modification_frequency_summary_df = pd.DataFrame(
                    modification_frequency_summary, columns=colnames)
                modification_frequency_summary_df = pd.concat([
                    modification_frequency_summary_df.iloc[:, 0:2],
                    modification_frequency_summary_df.iloc[:, 2:].apply(
                        pd.to_numeric)
                ],
                                                              axis=1)
                modification_frequency_summary_filename = _jp(
                    amplicon_plot_name + 'MODIFICATION_FREQUENCY_SUMMARY.txt')
                modification_frequency_summary_df.to_csv(
                    modification_frequency_summary_filename,
                    sep='\t',
                    index=None)

                modification_percentage_summary_df = pd.DataFrame(
                    modification_percentage_summary, columns=colnames)
                modification_percentage_summary_df = pd.concat([
                    modification_percentage_summary_df.iloc[:, 0:2],
                    modification_percentage_summary_df.iloc[:, 2:].apply(
                        pd.to_numeric)
                ],
                                                               axis=1)
                modification_percentage_summary_filename = _jp(
                    amplicon_plot_name + 'MODIFICATION_PERCENTAGE_SUMMARY.txt')
                modification_percentage_summary_df.to_csv(
                    modification_percentage_summary_filename,
                    sep='\t',
                    index=None)

                crispresso2_info[
                    'nucleotide_frequency_summary_filename'] = os.path.basename(
                        nucleotide_frequency_summary_filename)
                crispresso2_info[
                    'nucleotide_percentage_summary_filename'] = os.path.basename(
                        nucleotide_percentage_summary_filename)

                crispresso2_info[
                    'modification_frequency_summary_filename'] = os.path.basename(
                        modification_frequency_summary_filename)
                crispresso2_info[
                    'modification_percentage_summary_filename'] = os.path.basename(
                        modification_percentage_summary_filename)

                crispresso2_info['summary_plot_titles'] = {}
                crispresso2_info['summary_plot_labels'] = {}
                crispresso2_info['summary_plot_datas'] = {}

                #if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window
                if guides_all_same and consensus_guides != []:
                    info(
                        "All guides are equal. Performing comparison of batches for amplicon '%s'"
                        % amplicon_name)
                    include_idxs = consensus_include_idxs  #include indexes are the same for all guides
                    for idx, sgRNA in enumerate(consensus_guides):
                        sgRNA_intervals = consensus_sgRNA_intervals[idx]
                        sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[idx]
                        plot_idxs_flat = [0, 1]  # guide, nucleotide
                        plot_idxs_flat.extend(
                            [plot_idx + 2 for plot_idx in sgRNA_plot_idxs])
                        sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:,
                                                                                                   plot_idxs_flat]
                        sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:,
                                                                                                     plot_idxs_flat]
                        sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:,
                                                                                                         plot_idxs_flat]

                        #show all sgRNA's on the plot
                        sub_sgRNA_intervals = []
                        for sgRNA_interval in consensus_sgRNA_intervals:
                            newstart = None
                            newend = None
                            for idx, i in enumerate(sgRNA_plot_idxs):
                                if i <= sgRNA_interval[0]:
                                    newstart = idx
                                if newend is None and i >= sgRNA_interval[1]:
                                    newend = idx

                            #if guide doesn't overlap with plot idxs
                            if newend == 0 or newstart == len(sgRNA_plot_idxs):
                                continue
                            #otherwise, correct partial overlaps
                            elif newstart == None and newend == None:
                                newstart = 0
                                newend = len(include_idxs) - 1
                            elif newstart == None:
                                newstart = 0
                            elif newend == None:
                                newend = len(include_idxs) - 1
                            #and add it to the list
                            sub_sgRNA_intervals.append((newstart, newend))

                        if not args.suppress_plots:
                            #plot for each guide
                            this_window_nuc_pct_quilt_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt_around_sgRNA_' +
                                sgRNA)
                            CRISPRessoPlot.plot_nucleotide_quilt(
                                sub_nucleotide_percentage_summary_df,
                                sub_modification_percentage_summary_df,
                                this_window_nuc_pct_quilt_plot_name,
                                save_png,
                                sgRNA_intervals=sub_sgRNA_intervals,
                                quantification_window_idxs=include_idxs)
                            plot_name = os.path.basename(
                                this_window_nuc_pct_quilt_plot_name)
                            window_nuc_pct_quilt_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name
                            if len(consensus_guides) == 1:
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = ''
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = 'Composition of each base around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

                            sub_nucleotide_frequency_summary_df = pd.concat(
                                [
                                    sub_nucleotide_frequency_summary_df.
                                    iloc[:, 0:2],
                                    sub_nucleotide_frequency_summary_df.
                                    iloc[:, 2:].apply(pd.to_numeric)
                                ],
                                axis=1)
                            sub_nucleotide_frequency_summary_filename = _jp(
                                amplicon_plot_name +
                                'Nucleotide_frequency_summary_around_sgRNA_' +
                                sgRNA + '.txt')
                            sub_nucleotide_frequency_summary_df.to_csv(
                                sub_nucleotide_frequency_summary_filename,
                                sep='\t',
                                index=None)

                            sub_nucleotide_percentage_summary_df = pd.concat(
                                [
                                    sub_nucleotide_percentage_summary_df.
                                    iloc[:, 0:2],
                                    sub_nucleotide_percentage_summary_df.
                                    iloc[:, 2:].apply(pd.to_numeric)
                                ],
                                axis=1)
                            sub_nucleotide_percentage_summary_filename = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_summary_around_sgRNA_' +
                                sgRNA + '.txt')
                            sub_nucleotide_percentage_summary_df.to_csv(
                                sub_nucleotide_percentage_summary_filename,
                                sep='\t',
                                index=None)

                            if args.base_editor_output:
                                this_window_nuc_conv_plot_name = _jp(
                                    amplicon_plot_name +
                                    'Nucleotide_conversion_map_around_sgRNA_' +
                                    sgRNA)
                                CRISPRessoPlot.plot_conversion_map(
                                    sub_nucleotide_percentage_summary_df,
                                    this_window_nuc_conv_plot_name,
                                    args.conversion_nuc_from,
                                    args.conversion_nuc_to,
                                    save_png,
                                    sgRNA_intervals=sub_sgRNA_intervals,
                                    quantification_window_idxs=include_idxs)
                                plot_name = os.path.basename(
                                    this_window_nuc_conv_plot_name)
                                window_nuc_conv_plot_names.append(plot_name)
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name
                                if len(consensus_guides) == 1:
                                    crispresso2_info['summary_plot_titles'][
                                        plot_name] = ''
                                crispresso2_info['summary_plot_labels'][
                                    plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name
                                crispresso2_info['summary_plot_datas'][
                                    plot_name] = [
                                        ('Nucleotide frequencies around sgRNA',
                                         os.path.basename(
                                             sub_nucleotide_frequency_summary_filename
                                         )),
                                        ('Nucleotide percentages around sgRNA',
                                         os.path.basename(
                                             sub_nucleotide_percentage_summary_filename
                                         ))
                                    ]

                    if not args.suppress_plots:  # plot the whole region
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name,
                            save_png,
                            sgRNA_intervals=consensus_sgRNA_intervals,
                            quantification_window_idxs=include_idxs)
                        plot_name = os.path.basename(
                            this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_titles'][
                            plot_name] = 'Amplicon: ' + amplicon_name
                        if len(amplicon_names) == 1:
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = ''
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Nucleotide frequencies',
                             os.path.basename(
                                 nucleotide_frequency_summary_filename)),
                            ('Modification frequencies',
                             os.path.basename(
                                 modification_frequency_summary_filename))
                        ]
                        if args.base_editor_output:
                            this_nuc_conv_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_conversion_map')
                            CRISPRessoPlot.plot_conversion_map(
                                nucleotide_percentage_summary_df,
                                this_nuc_conv_plot_name,
                                args.conversion_nuc_from,
                                args.conversion_nuc_to,
                                save_png,
                                sgRNA_intervals=consensus_sgRNA_intervals,
                                quantification_window_idxs=include_idxs)
                            plot_name = os.path.basename(
                                this_nuc_conv_plot_name)
                            nuc_conv_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = 'Amplicon: ' + amplicon_name
                            if len(amplicon_names) == 1:
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = ''
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = ''
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

                else:  #guides are not the same
                    if not args.suppress_plots:
                        this_nuc_pct_quilt_plot_name = _jp(
                            amplicon_plot_name + 'Nucleotide_percentage_quilt')
                        CRISPRessoPlot.plot_nucleotide_quilt(
                            nucleotide_percentage_summary_df,
                            modification_percentage_summary_df,
                            this_nuc_pct_quilt_plot_name, save_png)
                        plot_name = os.path.basename(
                            this_nuc_pct_quilt_plot_name)
                        nuc_pct_quilt_plot_names.append(plot_name)
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Nucleotide frequencies',
                             os.path.basename(
                                 nucleotide_frequency_summary_filename)),
                            ('Modification frequencies',
                             os.path.basename(
                                 modification_frequency_summary_filename))
                        ]
                        if args.base_editor_output:
                            this_nuc_conv_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt')
                            CRISPRessoPlot.plot_conversion_map(
                                nucleotide_percentage_summary_df,
                                this_nuc_conv_plot_name,
                                args.conversion_nuc_from,
                                args.conversion_nuc_to, save_png)
                            plot_name = os.path.basename(
                                this_nuc_conv_plot_name)
                            nuc_conv_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to + ' conversion rates for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                ('Nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                ('Modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

        crispresso2_info[
            'window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
        crispresso2_info['nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
        crispresso2_info[
            'window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
        crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names

        #summarize amplicon modifications
        with open(
                _jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'),
                'w') as outfile:
            wrote_header = False
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                file_prefix = row['file_prefix']
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                amplicon_modification_file = os.path.join(
                    folder_name, run_data['quant_of_editing_freq_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(batchName + "\t" + line)

        #summarize alignment
        with open(_jp('CRISPRessoBatch_mapping_statistics.txt'),
                  'w') as outfile:
            wrote_header = False
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)

                run_data = run_datas[idx]
                if run_data is None:
                    continue
                amplicon_modification_file = os.path.join(
                    folder_name, run_data['mapping_stats_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(batchName + "\t" + line)

        if not args.suppress_report:
            if (args.place_report_in_output_folder):
                report_name = _jp("CRISPResso2Batch_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_batch_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        cp.dump(crispresso2_info, open(crispresso2Batch_info_file, 'wb'))
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Ejemplo n.º 4
0
def plot_ambiguous_alleles_tables_from_folder(crispresso_output_folder,fig_filename_root,MIN_FREQUENCY=None,MAX_N_ROWS=None,SAVE_ALSO_PNG=False,custom_colors=None,plot_cut_point=True,sgRNA_intervals=None,sgRNA_names=None,sgRNA_mismatches=None):
    """
    Plots allele tables of the ambiguous alleles from a completed CRISPResso run.

    This function is only used for one-off plotting purposes and not for the general CRISPResso analysis.
    Important: the run must have been run with the --write_detailed_allele_table parameter.
    Ambiguous reads align to multiple reference amplicons with the same score.
    Rows of the allele table whose 'Reference_Name' is 'AMBIGUOUS_<ref_name>' are selected and,
    for every sgRNA of that reference, plotted in a window around the sgRNA cut point.
    Note that each ambiguous read is assigned to a reference (usually the first one) and
    mutations/indels are plotted in relation to this reference sequence.

    One figure is written per (reference, sgRNA) pair that has ambiguous alleles; its filename
    root is fig_filename_root + "_" + <ref_name> + "_" + <sgRNA name, or sgRNA sequence if unnamed>.

    input:
    crispresso_output_folder: completed analysis crispresso2 output folder
    fig_filename_root: figure filename root to plot (not including '.pdf' or '.png')
    MIN_FREQUENCY: sum of alleles % must add to this to be plotted (defaults to the value stored with the run)
    MAX_N_ROWS: max rows to plot (defaults to the value stored with the run)
    SAVE_ALSO_PNG: whether to write png file as well
    custom_colors: forwarded unchanged to CRISPRessoPlot.plot_alleles_table
    plot_cut_point: if false, won't draw 'predicted cleavage' line; if true, the per-sgRNA setting stored with the run is used
    sgRNA_intervals, sgRNA_names, sgRNA_mismatches: accepted for interface compatibility only;
        the values stored with the run for each reference are always used instead

    raises:
    Exception: if the run was not performed with --write_detailed_allele_table
    """
    crispresso2_info = CRISPRessoShared.load_crispresso_info(crispresso_output_folder)
    run_args = crispresso2_info['args']

    if not run_args.write_detailed_allele_table:
        raise Exception('CRISPResso run must be run with the parameter --write_detailed_allele_table')

    if MIN_FREQUENCY is None:
        MIN_FREQUENCY = run_args.min_frequency_alleles_around_cut_to_plot
    if MAX_N_ROWS is None:
        MAX_N_ROWS = run_args.max_rows_alleles_around_cut_to_plot

    # Read the allele table inside context managers so the archive and the
    # member handle are closed as soon as the dataframe has been loaded.
    zip_path = os.path.join(crispresso_output_folder, crispresso2_info['allele_frequency_table_zip_filename'])
    with zipfile.ZipFile(zip_path) as z:
        with z.open(crispresso2_info['allele_frequency_table_filename']) as zf:
            df_alleles = pd.read_csv(zf, sep="\t")

    full_len = df_alleles['#Reads'].sum()
    df_alleles['ref_positions'] = df_alleles['ref_positions'].apply(arrStr_to_arr)

    is_ambiguous = df_alleles['Reference_Name'].str.contains('AMBIGUOUS')
    ambig_len = df_alleles[is_ambiguous]['#Reads'].sum()

    print("Filtered to " + str(ambig_len) + "/" + str(full_len) + " ambiguous reads")

    ref_names = crispresso2_info['ref_names']
    refs = crispresso2_info['refs']
    print("Ambiguous alleles will be plotted against to the sequence of the first reference sequence ("+ref_names[0]+")")

    # The window half-width is the same for every reference and sgRNA.
    plot_half_window = max(1, run_args.plot_window_size)

    plot_count = 0
    for ref_name in ref_names:
        ref_info = refs[ref_name]
        ref_sgRNA_names = ref_info['sgRNA_names']
        ref_sgRNA_cut_points = ref_info['sgRNA_cut_points']
        ref_sgRNA_plot_cut_points = ref_info['sgRNA_plot_cut_points']
        ref_sgRNA_mismatches = ref_info['sgRNA_mismatches']
        reference_seq = ref_info['sequence']

        ambiguous_ref_name = "AMBIGUOUS_" + ref_name

        for ind, sgRNA in enumerate(ref_info['sgRNA_sequences']):
            # label used in file names: the sgRNA name if it has one, else its sequence
            sgRNA_label = ref_sgRNA_names[ind] if ref_sgRNA_names[ind] != "" else sgRNA

            cut_point = ref_sgRNA_cut_points[ind]
            # Honor the caller's request to suppress the cut point line; otherwise
            # fall back to the per-sgRNA setting recorded by the run.
            this_plot_cut_point = plot_cut_point and ref_sgRNA_plot_cut_points[ind]
            ref_seq_around_cut = reference_seq[cut_point-plot_half_window+1:cut_point+plot_half_window+1]

            df_alleles_around_cut = CRISPRessoShared.get_dataframe_around_cut(
                df_alleles.loc[df_alleles['Reference_Name'] == ambiguous_ref_name],
                cut_point,
                plot_half_window)
            if len(df_alleles_around_cut.index) < 1:
                print('No ambiguous reads found for ' + ref_name)
                continue
            this_ambig_count = df_alleles_around_cut['#Reads'].sum()
            print('Plotting ' + str(this_ambig_count) + ' ambiguous reads for ' + ref_name)

            # adjust coordinates of sgRNAs so they are relative to the plotted window
            new_sel_cols_start = cut_point - plot_half_window
            new_sgRNA_intervals = [
                (int_start - new_sel_cols_start - 1, int_end - new_sel_cols_start - 1)
                for (int_start, int_end) in ref_info['sgRNA_intervals']
            ]

            # Build the per-plot name from the caller's root each time; reassigning
            # fig_filename_root itself would pile up suffixes from earlier plots.
            this_fig_filename_root = fig_filename_root + "_" + ref_name + "_" + sgRNA_label
            CRISPRessoPlot.plot_alleles_table(
                ref_seq_around_cut,
                df_alleles=df_alleles_around_cut,
                fig_filename_root=this_fig_filename_root,
                MIN_FREQUENCY=MIN_FREQUENCY,
                MAX_N_ROWS=MAX_N_ROWS,
                SAVE_ALSO_PNG=SAVE_ALSO_PNG,
                plot_cut_point=this_plot_cut_point,
                sgRNA_intervals=new_sgRNA_intervals,
                sgRNA_names=ref_sgRNA_names,
                sgRNA_mismatches=ref_sgRNA_mismatches,
                custom_colors=custom_colors,
                annotate_wildtype_allele=run_args.annotate_wildtype_allele)

            plot_count += 1
    print('Plotted ' + str(plot_count) + ' plots')
Ejemplo n.º 5
0
def main():
    try:
        start_time = datetime.now()
        start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S')

        description = [
            '~~~CRISPRessoAggregate~~~', '-Aggregation of CRISPResso Run Data-'
        ]
        aggregate_string = r'''
___________________________________
|      __  __  _   _  __     ___ _ |
| /\  /__ /__ |_) |_ /__  /\  | |_ |
|/--\ \_| \_| | \ |_ \_| /--\ | |_ |
|__________________________________|
        '''
        print(
            CRISPRessoShared.get_crispresso_header(description,
                                                   aggregate_string))

        parser = argparse.ArgumentParser(
            description="Aggreate CRISPResso2 Runs")
        parser.add_argument(
            "-p",
            "--prefix",
            action='append',
            help=
            "Prefix for CRISPResso folders to aggregate (may be specified multiple times)",
            default=[])
        parser.add_argument("-s",
                            "--suffix",
                            type=str,
                            help="Suffix for CRISPResso folders to aggregate",
                            default="")

        parser.add_argument("-n",
                            "--name",
                            type=str,
                            help="Output name of the report",
                            required=True)
        parser.add_argument(
            '--min_reads_for_inclusion',
            help=
            'Minimum number of reads for a run to be included in the run summary',
            type=int,
            default=0)

        parser.add_argument(
            '--place_report_in_output_folder',
            help=
            'If true, report will be written inside the CRISPResso output folder. By default, the report will be written one directory up from the report output.',
            action='store_true')
        parser.add_argument('--suppress_report',
                            help='Suppress output report',
                            action='store_true')
        parser.add_argument('--suppress_plots',
                            help='Suppress output plots',
                            action='store_true')

        parser.add_argument('--debug',
                            help='Show debug messages',
                            action='store_true')

        args = parser.parse_args()

        output_folder_name = 'CRISPRessoAggregate_on_%s' % args.name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
        except:
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoAggregate_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Aggregate_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Aggregate_info.pickle')
        crispresso2_info = {
        }  #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        #glob returns paths including the original prefix
        all_files = []
        for prefix in args.prefix:
            all_files.extend(glob.glob(prefix + '*' + args.suffix))
            if args.prefix != "":
                all_files.extend(glob.glob(
                    prefix + '/*' +
                    args.suffix))  #if a folder is given, add all subfolders

        seen_folders = {}
        crispresso2_folder_infos = {
        }  #file_loc->crispresso_info; these are only CRISPResso runs -- this bit unrolls batch, pooled, and wgs runs
        successfully_imported_count = 0
        not_imported_count = 0
        for folder in all_files:
            if folder in seen_folders:  #skip if we've seen this folder (glob could have added it twice)
                continue
            seen_folders[folder] = 1
            if os.path.isdir(folder) and str(folder).endswith(args.suffix):
                #first, try to import a plain CRISPResso2 run
                crispresso_info_file = os.path.join(folder,
                                                    'CRISPResso2_info.pickle')
                if os.path.exists(crispresso_info_file):
                    try:
                        run_data = CRISPRessoShared.load_crispresso_info(
                            folder)
                        crispresso2_folder_infos[folder] = run_data
                        successfully_imported_count += 1
                    except Exception as e:
                        warn('Could not open CRISPResso2 info file in ' +
                             folder)
                        not_imported_count += 1
                #second, check pooled
                pooled_info_file = os.path.join(
                    folder, 'CRISPResso2Pooled_info.pickle')
                if os.path.exists(pooled_info_file):
                    pooled_data = cp.load(open(pooled_info_file, 'rb'))
                    if 'good_region_names' in pooled_data:
                        run_names = pooled_data['good_region_names']
                        for run_name in run_names:
                            run_folder_loc = os.path.join(
                                folder, 'CRISPResso_on_%s' % run_name)
                            try:
                                run_data = CRISPRessoShared.load_crispresso_info(
                                    run_folder_loc)
                                crispresso2_folder_infos[
                                    run_folder_loc] = run_data
                                successfully_imported_count += 1
                            except Exception as e:
                                warn('Could not open CRISPResso2 info file in '
                                     + run_folder_loc)
                                not_imported_count += 1
                    else:
                        warn('Could not process pooled folder ' + folder)
                        not_imported_count += 1
                #third, check batch
                batch_info_file = os.path.join(folder,
                                               'CRISPResso2Batch_info.pickle')
                if os.path.exists(batch_info_file):
                    batch_data = cp.load(open(batch_info_file, 'rb'))
                    if 'completed_batch_arr' in batch_data:
                        run_names = batch_data['completed_batch_arr']
                        for run_name in run_names:
                            run_folder_loc = os.path.join(
                                folder, 'CRISPResso_on_%s' % run_name)
                            try:
                                run_data = CRISPRessoShared.load_crispresso_info(
                                    run_folder_loc)
                                crispresso2_folder_infos[
                                    run_folder_loc] = run_data
                                successfully_imported_count += 1
                            except Exception as e:
                                warn('Could not open CRISPResso2 info file in '
                                     + run_folder_loc)
                                not_imported_count += 1
                    else:
                        warn('Could not process batch folder ' + folder)
                        not_imported_count += 1
                #fourth, check WGS
                wgs_info_file = os.path.join(folder,
                                             'CRISPResso2WGS_info.pickle')
                if os.path.exists(wgs_info_file):
                    wgs_data = cp.load(open(wgs_info_file, 'rb'))
                    if 'good_region_folders' in wgs_data:
                        run_names = wgs_data['good_region_folders']
                        for run_name in run_names:
                            run_folder_loc = os.path.join(
                                folder, 'CRISPResso_on_%s' % run_name)
                            try:
                                run_data = CRISPRessoShared.load_crispresso_info(
                                    run_folder_loc)
                                crispresso2_folder_infos[
                                    run_folder_loc] = run_data
                                successfully_imported_count += 1
                            except Exception as e:
                                warn('Could not open CRISPResso2 info file in '
                                     + run_folder_loc)
                                not_imported_count += 1
                    else:
                        warn('Could not process WGS folder ' + folder)
                        not_imported_count += 1

        info('Read ' + str(successfully_imported_count) + ' folders (' +
             str(not_imported_count) + ' not imported)')

        save_png = True
        if args.suppress_report:
            save_png = False

        if successfully_imported_count > 0:

            crispresso2_folders = crispresso2_folder_infos.keys()
            crispresso2_folder_names = {}
            crispresso2_folder_htmls = {}  #file_loc->html folder loc
            for crispresso2_folder in crispresso2_folders:
                crispresso2_folder_names[
                    crispresso2_folder] = CRISPRessoShared.slugify(
                        crispresso2_folder)
                this_sub_html_file = crispresso2_folder + ".html"
                if crispresso2_folder_infos[crispresso2_folder][
                        'args'].place_report_in_output_folder:
                    this_sub_html_file = os.path.join(
                        crispresso2_folder,
                        crispresso2_folder_infos[crispresso2_folder]
                        ['report_filename'])
                crispresso2_folder_htmls[crispresso2_folder] = os.path.abspath(
                    this_sub_html_file)

            all_amplicons = set()
            amplicon_names = {
            }  #sequence -> ref name (to check for amplicons with the same name but different sequences)
            amplicon_counts = {}
            amplicon_sources = {}
            completed_batch_arr = []
            for crispresso2_folder in crispresso2_folders:
                run_data = crispresso2_folder_infos[crispresso2_folder]
                for ref_name in run_data['ref_names']:
                    ref_seq = run_data['refs'][ref_name]['sequence']
                    all_amplicons.add(ref_seq)
                    #if this amplicon is called something else in another sample, just call it the amplicon
                    if ref_name in amplicon_names and amplicon_names[
                            ref_seq] != ref_name:
                        amplicon_names[ref_seq] = ref_seq
                    else:
                        amplicon_names[ref_seq] = ref_name
                    if ref_seq not in amplicon_counts:
                        amplicon_counts[ref_seq] = 0
                        amplicon_sources[ref_seq] = []
                    amplicon_counts[ref_seq] += 1
                    amplicon_sources[ref_seq].append(crispresso2_folder + '(' +
                                                     ref_name + ')')

            #make sure amplicon names aren't super long
            for amplicon in all_amplicons:
                if len(amplicon_names[amplicon]) > 21:
                    amplicon_names[amplicon] = amplicon_names[amplicon][0:21]

            #make sure no duplicate amplicon names (same name for the different amplicons)
            seen_names = []
            for amplicon in all_amplicons:
                suffix_counter = 2
                orig_name = amplicon_names[amplicon]
                while amplicon_names[amplicon] in seen_names:
                    amplicon_names[amplicon] = orig_name + "_" + str(
                        suffix_counter)
                    suffix_counter += 1
                seen_names.append(amplicon_names[amplicon])

            crispresso2_info['ref_names'] = seen_names
            crispresso2_info['refs'] = {}
            crispresso2_info['summary_plot_names'] = []
            crispresso2_info['summary_plot_titles'] = {}
            crispresso2_info['summary_plot_labels'] = {}
            crispresso2_info['summary_plot_datas'] = {}

            with open(_jp('CRISPRessoAggregate_amplicon_information.txt'),
                      'w') as outfile:
                outfile.write("\t".join([
                    'Amplicon Name', 'Number of sources', 'Amplicon sources',
                    'Amplicon sequence'
                ]) + "\n")
                for amplicon in all_amplicons:
                    outfile.write("\t".join([
                        amplicon_names[amplicon],
                        str(amplicon_counts[amplicon]), ';'.join(
                            amplicon_sources[amplicon]), amplicon
                    ]) + "\n")

            window_nuc_pct_quilt_plot_names = []
            nuc_pct_quilt_plot_names = []
            window_nuc_conv_plot_names = []
            nuc_conv_plot_names = []

            #report for amplicons that appear multiple times
            for amplicon_index, amplicon_seq in enumerate(all_amplicons):
                amplicon_name = amplicon_names[amplicon_seq]
                crispresso2_info['refs'][amplicon_name] = {}
                #only perform comparison if amplicon seen in more than one sample
                if amplicon_counts[amplicon_seq] < 2:
                    continue

                info('Reporting summary for amplicon: "' + amplicon_name + '"')

                consensus_sequence = ""
                nucleotide_frequency_summary = []
                nucleotide_percentage_summary = []
                modification_frequency_summary = []
                modification_percentage_summary = []

                amp_found_count = 0  #how many folders had information for this amplicon
                consensus_guides = []
                consensus_include_idxs = []
                consensus_sgRNA_plot_idxs = []
                consensus_sgRNA_intervals = []
                guides_all_same = True
                runs_with_this_amplicon = []
                for crispresso2_folder in crispresso2_folders:
                    run_data = crispresso2_folder_infos[crispresso2_folder]
                    run_has_amplicon = False
                    run_amplicon_name = ''
                    for ref_name in run_data['ref_names']:
                        if amplicon_seq == run_data['refs'][ref_name][
                                'sequence']:
                            run_has_amplicon = True
                            run_amplicon_name = ref_name
                    if not run_has_amplicon:
                        continue
                    runs_with_this_amplicon.append(crispresso2_folder)

                    if consensus_guides == []:
                        consensus_guides = run_data['refs'][run_amplicon_name][
                            'sgRNA_sequences']
                        consensus_include_idxs = run_data['refs'][
                            run_amplicon_name]['include_idxs']
                        consensus_sgRNA_intervals = run_data['refs'][
                            run_amplicon_name]['sgRNA_intervals']
                        consensus_sgRNA_plot_idxs = run_data['refs'][
                            run_amplicon_name]['sgRNA_plot_idxs']

                    if run_data['refs'][run_amplicon_name][
                            'sgRNA_sequences'] != consensus_guides:
                        guides_all_same = False
                    if set(run_data['refs'][run_amplicon_name]
                           ['include_idxs']) != set(consensus_include_idxs):
                        guides_all_same = False

                    if 'nuc_freq_filename' not in run_data['refs'][
                            run_amplicon_name]:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                            % (run_amplicon_name, crispresso2_folder))
                        continue

                    nucleotide_frequency_file = os.path.join(
                        crispresso2_folder, run_data['refs'][run_amplicon_name]
                        ['nuc_freq_filename'])
                    ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
                        nucleotide_frequency_file)

                    nucleotide_pct_file = os.path.join(
                        crispresso2_folder, run_data['refs'][run_amplicon_name]
                        ['nuc_pct_filename'])
                    ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
                        nucleotide_pct_file)

                    count_file = os.path.join(
                        crispresso2_folder, run_data['refs'][run_amplicon_name]
                        ['mod_count_filename'])
                    ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
                        count_file)

                    if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Could not parse run output."
                            % (run_amplicon_name, crispresso2_folder))
                        info(
                            "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                            % (ampSeq_nf, ampSeq_np, ampSeq_cf))
                        continue
                    if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
                        warn(
                            "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                            % (run_amplicon_name, crispresso2_folder,
                               ampSeq_nf, ampSeq_np, ampSeq_cf, amplicon_seq))
                        continue
                    if consensus_sequence == "":
                        consensus_sequence = ampSeq_nf
                    if ampSeq_nf != consensus_sequence:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                            % (run_amplicon_name, crispresso2_folder))
                        continue
                    if 'Total' not in mod_freqs:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                            % (run_amplicon_name, crispresso2_folder))
                        continue
                    if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][
                            0] == "0":
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                            % (run_amplicon_name, crispresso2_folder))
                        continue
                    this_amp_total_reads = run_data['counts_total'][
                        run_amplicon_name]
                    if this_amp_total_reads < args.min_reads_for_inclusion:
                        info(
                            "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                            % (run_amplicon_name, crispresso2_folder,
                               str(this_amp_total_reads),
                               args.min_reads_for_inclusion))
                        continue

                    mod_pcts = {}
                    for key in mod_freqs:
                        mod_pcts[key] = np.array(mod_freqs[key]).astype(
                            np.float) / float(this_amp_total_reads)

                    amp_found_count += 1

                    run_name = crispresso2_folder_names[crispresso2_folder]

                    for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
                        row = [run_name, nuc]
                        row.extend(nuc_freqs[nuc])
                        nucleotide_frequency_summary.append(row)

                        pct_row = [run_name, nuc]
                        pct_row.extend(nuc_pcts[nuc])
                        nucleotide_percentage_summary.append(pct_row)

                    for mod in [
                            'Insertions', 'Insertions_Left', 'Deletions',
                            'Substitutions', 'All_modifications'
                    ]:
                        row = [run_name, mod]
                        row.extend(mod_freqs[mod])
                        modification_frequency_summary.append(row)

                        pct_row = [run_name, mod]
                        pct_row.extend(mod_pcts[mod])
                        modification_percentage_summary.append(pct_row)

                if amp_found_count == 0:
                    info(
                        "Couldn't find any data for amplicon '%s'. Not compiling results."
                        % amplicon_name)
                else:
                    amplicon_plot_name = amplicon_name + "."
                    if len(amplicon_names
                           ) == 1 and amplicon_name == "Reference":
                        amplicon_plot_name = ""

                    colnames = ['Folder', 'Nucleotide']
                    colnames.extend(list(consensus_sequence))
                    nucleotide_frequency_summary_df = pd.DataFrame(
                        nucleotide_frequency_summary, columns=colnames)
                    nucleotide_frequency_summary_df = pd.concat([
                        nucleotide_frequency_summary_df.iloc[:, 0:2],
                        nucleotide_frequency_summary_df.iloc[:, 2:].apply(
                            pd.to_numeric)
                    ],
                                                                axis=1)
                    nucleotide_frequency_summary_filename = _jp(
                        amplicon_plot_name +
                        'Nucleotide_frequency_summary.txt')
                    nucleotide_frequency_summary_df.to_csv(
                        nucleotide_frequency_summary_filename,
                        sep='\t',
                        index=None)

                    nucleotide_percentage_summary_df = pd.DataFrame(
                        nucleotide_percentage_summary, columns=colnames)
                    nucleotide_percentage_summary_df = pd.concat([
                        nucleotide_percentage_summary_df.iloc[:, 0:2],
                        nucleotide_percentage_summary_df.iloc[:, 2:].apply(
                            pd.to_numeric)
                    ],
                                                                 axis=1)
                    nucleotide_percentage_summary_filename = _jp(
                        amplicon_plot_name +
                        'Nucleotide_percentage_summary.txt')
                    nucleotide_percentage_summary_df.to_csv(
                        nucleotide_percentage_summary_filename,
                        sep='\t',
                        index=None)

                    colnames = ['Folder', 'Modification']
                    colnames.extend(list(consensus_sequence))
                    modification_frequency_summary_df = pd.DataFrame(
                        modification_frequency_summary, columns=colnames)
                    modification_frequency_summary_df = pd.concat([
                        modification_frequency_summary_df.iloc[:, 0:2],
                        modification_frequency_summary_df.iloc[:, 2:].apply(
                            pd.to_numeric)
                    ],
                                                                  axis=1)
                    modification_frequency_summary_filename = _jp(
                        amplicon_plot_name +
                        'MODIFICATION_FREQUENCY_SUMMARY.txt')
                    modification_frequency_summary_df.to_csv(
                        modification_frequency_summary_filename,
                        sep='\t',
                        index=None)

                    modification_percentage_summary_df = pd.DataFrame(
                        modification_percentage_summary, columns=colnames)
                    modification_percentage_summary_df = pd.concat([
                        modification_percentage_summary_df.iloc[:, 0:2],
                        modification_percentage_summary_df.iloc[:, 2:].apply(
                            pd.to_numeric)
                    ],
                                                                   axis=1)
                    modification_percentage_summary_filename = _jp(
                        amplicon_plot_name +
                        'MODIFICATION_PERCENTAGE_SUMMARY.txt')
                    modification_percentage_summary_df.to_csv(
                        modification_percentage_summary_filename,
                        sep='\t',
                        index=None)

                    crispresso2_info['refs'][amplicon_name][
                        'nucleotide_frequency_summary_filename'] = os.path.basename(
                            nucleotide_frequency_summary_filename)
                    crispresso2_info['refs'][amplicon_name][
                        'nucleotide_percentage_summary_filename'] = os.path.basename(
                            nucleotide_percentage_summary_filename)

                    crispresso2_info['refs'][amplicon_name][
                        'modification_frequency_summary_filename'] = os.path.basename(
                            modification_frequency_summary_filename)
                    crispresso2_info['refs'][amplicon_name][
                        'modification_percentage_summary_filename'] = os.path.basename(
                            modification_percentage_summary_filename)

                    #if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window
                    if guides_all_same and consensus_guides != []:
                        info(
                            "All guides are equal. Performing comparison of runs for amplicon '%s'"
                            % amplicon_name)
                        include_idxs = consensus_include_idxs  #include indexes are the same for all guides
                        for idx, sgRNA in enumerate(consensus_guides):
                            sgRNA_intervals = consensus_sgRNA_intervals[idx]
                            sgRNA_plot_idxs = consensus_sgRNA_plot_idxs[idx]
                            plot_idxs_flat = [0, 1]  # guide, nucleotide
                            plot_idxs_flat.extend(
                                [plot_idx + 2 for plot_idx in sgRNA_plot_idxs])
                            sub_nucleotide_frequency_summary_df = nucleotide_frequency_summary_df.iloc[:,
                                                                                                       plot_idxs_flat]
                            sub_nucleotide_percentage_summary_df = nucleotide_percentage_summary_df.iloc[:,
                                                                                                         plot_idxs_flat]
                            sub_modification_percentage_summary_df = modification_percentage_summary_df.iloc[:,
                                                                                                             plot_idxs_flat]

                            #show all sgRNA's on the plot
                            sub_sgRNA_intervals = []
                            for sgRNA_interval in consensus_sgRNA_intervals:
                                newstart = None
                                newend = None
                                for idx, i in enumerate(sgRNA_plot_idxs):
                                    if i <= sgRNA_interval[0]:
                                        newstart = idx
                                    if newend is None and i >= sgRNA_interval[
                                            1]:
                                        newend = idx

                                #if guide doesn't overlap with plot idxs
                                if newend == 0 or newstart == len(
                                        sgRNA_plot_idxs):
                                    continue
                                #otherwise, correct partial overlaps
                                elif newstart == None and newend == None:
                                    newstart = 0
                                    newend = len(include_idxs) - 1
                                elif newstart == None:
                                    newstart = 0
                                elif newend == None:
                                    newend = len(include_idxs) - 1
                                #and add it to the list
                                sub_sgRNA_intervals.append((newstart, newend))

                            if not args.suppress_plots:
                                #plot for each guide
                                this_window_nuc_pct_quilt_plot_name = _jp(
                                    amplicon_plot_name +
                                    'Nucleotide_percentage_quilt_around_sgRNA_'
                                    + sgRNA)
                                CRISPRessoPlot.plot_nucleotide_quilt(
                                    sub_nucleotide_percentage_summary_df,
                                    sub_modification_percentage_summary_df,
                                    this_window_nuc_pct_quilt_plot_name,
                                    save_png,
                                    sgRNA_intervals=sub_sgRNA_intervals,
                                    quantification_window_idxs=include_idxs,
                                    group_column='Folder')
                                plot_name = os.path.basename(
                                    this_window_nuc_pct_quilt_plot_name)
                                window_nuc_pct_quilt_plot_names.append(
                                    plot_name)
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = 'sgRNA: ' + sgRNA + ' Amplicon: ' + amplicon_name
                                if len(consensus_guides) == 1:
                                    crispresso2_info['summary_plot_titles'][
                                        plot_name] = ''
                                crispresso2_info['summary_plot_labels'][
                                    plot_name] = 'Composition of each base around the guide ' + sgRNA + ' for the amplicon ' + amplicon_name
                                crispresso2_info['summary_plot_datas'][
                                    plot_name] = [
                                        (amplicon_name +
                                         ' nucleotide frequencies',
                                         os.path.basename(
                                             nucleotide_frequency_summary_filename
                                         )),
                                        (amplicon_name +
                                         ' modification frequencies',
                                         os.path.basename(
                                             modification_frequency_summary_filename
                                         ))
                                    ]

                                sub_nucleotide_frequency_summary_df = pd.concat(
                                    [
                                        sub_nucleotide_frequency_summary_df.
                                        iloc[:, 0:2],
                                        sub_nucleotide_frequency_summary_df.
                                        iloc[:, 2:].apply(pd.to_numeric)
                                    ],
                                    axis=1)
                                sub_nucleotide_frequency_summary_filename = _jp(
                                    amplicon_plot_name +
                                    'Nucleotide_frequency_summary_around_sgRNA_'
                                    + sgRNA + '.txt')
                                sub_nucleotide_frequency_summary_df.to_csv(
                                    sub_nucleotide_frequency_summary_filename,
                                    sep='\t',
                                    index=None)

                                sub_nucleotide_percentage_summary_df = pd.concat(
                                    [
                                        sub_nucleotide_percentage_summary_df.
                                        iloc[:, 0:2],
                                        sub_nucleotide_percentage_summary_df.
                                        iloc[:, 2:].apply(pd.to_numeric)
                                    ],
                                    axis=1)
                                sub_nucleotide_percentage_summary_filename = _jp(
                                    amplicon_plot_name +
                                    'Nucleotide_percentage_summary_around_sgRNA_'
                                    + sgRNA + '.txt')
                                sub_nucleotide_percentage_summary_df.to_csv(
                                    sub_nucleotide_percentage_summary_filename,
                                    sep='\t',
                                    index=None)

                        if not args.suppress_plots:  # plot the whole region
                            this_nuc_pct_quilt_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt')
                            CRISPRessoPlot.plot_nucleotide_quilt(
                                nucleotide_percentage_summary_df,
                                modification_percentage_summary_df,
                                this_nuc_pct_quilt_plot_name,
                                save_png,
                                sgRNA_intervals=consensus_sgRNA_intervals,
                                quantification_window_idxs=include_idxs,
                                group_column='Folder')
                            plot_name = os.path.basename(
                                this_nuc_pct_quilt_plot_name)
                            nuc_pct_quilt_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_titles'][
                                plot_name] = 'Amplicon: ' + amplicon_name
                            if len(amplicon_names) == 1:
                                crispresso2_info['summary_plot_titles'][
                                    plot_name] = ''
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                (amplicon_name + ' nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                (amplicon_name + ' modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

                    else:  #guides are not the same
                        if not args.suppress_plots:
                            this_nuc_pct_quilt_plot_name = _jp(
                                amplicon_plot_name +
                                'Nucleotide_percentage_quilt')
                            CRISPRessoPlot.plot_nucleotide_quilt(
                                nucleotide_percentage_summary_df,
                                modification_percentage_summary_df,
                                this_nuc_pct_quilt_plot_name,
                                save_png,
                                group_column='Folder')
                            plot_name = os.path.basename(
                                this_nuc_pct_quilt_plot_name)
                            nuc_pct_quilt_plot_names.append(plot_name)
                            crispresso2_info['summary_plot_labels'][
                                plot_name] = 'Composition of each base for the amplicon ' + amplicon_name
                            crispresso2_info['summary_plot_datas'][plot_name] = [
                                (amplicon_name + ' nucleotide frequencies',
                                 os.path.basename(
                                     nucleotide_frequency_summary_filename)),
                                (amplicon_name + ' modification frequencies',
                                 os.path.basename(
                                     modification_frequency_summary_filename))
                            ]

            crispresso2_info[
                'window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
            crispresso2_info[
                'nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
            crispresso2_info[
                'window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
            crispresso2_info['nuc_conv_plot_names'] = nuc_conv_plot_names

            quantification_summary = []
            #summarize amplicon modifications
            samples_quantification_summary_by_amplicon_filename = _jp(
                'CRISPRessoAggregate_quantification_of_editing_frequency_by_amplicon.txt'
            )  #this file has separate lines for each amplicon in each run
            with open(samples_quantification_summary_by_amplicon_filename,
                      'w') as outfile:
                wrote_header = False
                for crispresso2_folder in crispresso2_folders:
                    run_data = crispresso2_folder_infos[crispresso2_folder]
                    run_name = crispresso2_folder_names[crispresso2_folder]
                    amplicon_modification_file = os.path.join(
                        crispresso2_folder,
                        run_data['quant_of_editing_freq_filename'])
                    with open(amplicon_modification_file, 'r') as infile:
                        file_head = infile.readline()
                        if not wrote_header:
                            outfile.write('Folder\t' + file_head)
                            wrote_header = True
                        for line in infile:
                            outfile.write(crispresso2_folder + "\t" + line)

                    n_tot = run_data['aln_stats']['N_TOT_READS']
                    n_aligned = 0
                    n_unmod = 0
                    n_mod = 0
                    n_discarded = 0

                    n_insertion = 0
                    n_deletion = 0
                    n_substitution = 0
                    n_only_insertion = 0
                    n_only_deletion = 0
                    n_only_substitution = 0
                    n_insertion_and_deletion = 0
                    n_insertion_and_substitution = 0
                    n_deletion_and_substitution = 0
                    n_insertion_and_deletion_and_substitution = 0

                    for ref_name in run_data[
                            'ref_names']:  #multiple alleles could be provided
                        n_aligned += run_data['counts_total'][ref_name]
                        n_unmod += run_data['counts_unmodified'][ref_name]
                        n_mod += run_data['counts_modified'][ref_name]
                        n_discarded += run_data['counts_discarded'][ref_name]

                        n_insertion += run_data['counts_insertion'][ref_name]
                        n_deletion += run_data['counts_deletion'][ref_name]
                        n_substitution += run_data['counts_substitution'][
                            ref_name]
                        n_only_insertion += run_data['counts_only_insertion'][
                            ref_name]
                        n_only_deletion += run_data['counts_only_deletion'][
                            ref_name]
                        n_only_substitution += run_data[
                            'counts_only_substitution'][ref_name]
                        n_insertion_and_deletion += run_data[
                            'counts_insertion_and_deletion'][ref_name]
                        n_insertion_and_substitution += run_data[
                            'counts_insertion_and_substitution'][ref_name]
                        n_deletion_and_substitution += run_data[
                            'counts_deletion_and_substitution'][ref_name]
                        n_insertion_and_deletion_and_substitution += run_data[
                            'counts_insertion_and_deletion_and_substitution'][
                                ref_name]

                    unmod_pct = np.nan
                    mod_pct = np.nan
                    if n_aligned > 0:
                        unmod_pct = 100 * n_unmod / float(n_aligned)
                        mod_pct = 100 * n_mod / float(n_aligned)

                    vals = [run_name]
                    vals.extend([
                        round(unmod_pct, 8),
                        round(mod_pct, 8), n_aligned, n_tot, n_unmod, n_mod,
                        n_discarded, n_insertion, n_deletion, n_substitution,
                        n_only_insertion, n_only_deletion, n_only_substitution,
                        n_insertion_and_deletion, n_insertion_and_substitution,
                        n_deletion_and_substitution,
                        n_insertion_and_deletion_and_substitution
                    ])
                    quantification_summary.append(vals)

            header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
            header_els = header.split("\t")
            df_summary_quantification = pd.DataFrame(quantification_summary,
                                                     columns=header_els)
            samples_quantification_summary_filename = _jp(
                'CRISPRessoAggregate_quantification_of_editing_frequency.txt'
            )  #this file has one line for each run (sum of all amplicons)
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename, sep='\t', index=None)
            crispresso2_info[
                'samples_quantification_summary_filename'] = os.path.basename(
                    samples_quantification_summary_filename)
            crispresso2_info[
                'samples_quantification_summary_by_amplicon_filename'] = os.path.basename(
                    samples_quantification_summary_by_amplicon_filename)
            df_summary_quantification.set_index('Name')

            if not args.suppress_plots:
                plot_root = _jp("CRISPRessoAggregate_reads_summary")

                CRISPRessoPlot.plot_reads_total(plot_root,
                                                df_summary_quantification,
                                                save_png,
                                                args.min_reads_for_inclusion)
                plot_name = os.path.basename(plot_root)
                crispresso2_info['summary_plot_root'] = plot_name
                crispresso2_info['summary_plot_names'].append(plot_name)
                crispresso2_info['summary_plot_titles'][
                    plot_name] = 'CRISPRessoAggregate Mapping Statistics Summary'
                crispresso2_info['summary_plot_labels'][
                    plot_name] = 'Each bar shows the total number of reads in each sample. The vertical line shows the cutoff for analysis, set using the --min_reads_for_inclusion parameter.'
                crispresso2_info['summary_plot_datas'][plot_name] = [
                    ('CRISPRessoAggregate summary',
                     os.path.basename(samples_quantification_summary_filename)
                     ),
                    ('CRISPRessoAggregate summary by amplicon',
                     os.path.basename(
                         samples_quantification_summary_by_amplicon_filename))
                ]

                plot_root = _jp(
                    "CRISPRessoAggregate_quantification_of_editing_frequency")
                CRISPRessoPlot.plot_unmod_mod_pcts(
                    plot_root, df_summary_quantification, save_png,
                    args.min_reads_for_inclusion)
                plot_name = os.path.basename(plot_root)
                crispresso2_info['summary_plot_root'] = plot_name
                crispresso2_info['summary_plot_names'].append(plot_name)
                crispresso2_info['summary_plot_titles'][
                    plot_name] = 'CRISPRessoAggregate Modification Summary'
                crispresso2_info['summary_plot_labels'][
                    plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_for_inclusion parameter.'
                crispresso2_info['summary_plot_datas'][plot_name] = [
                    ('CRISPRessoAggregate summary',
                     os.path.basename(samples_quantification_summary_filename)
                     ),
                    ('CRISPRessoAggregate summary by amplicon',
                     os.path.basename(
                         samples_quantification_summary_by_amplicon_filename))
                ]

            #summarize alignment
            with open(_jp('CRISPRessoAggregate_mapping_statistics.txt'),
                      'w') as outfile:
                wrote_header = False
                for crispresso2_folder in crispresso2_folders:
                    run_data = crispresso2_folder_infos[crispresso2_folder]
                    run_name = crispresso2_folder_names[crispresso2_folder]
                    mapping_file = os.path.join(
                        crispresso2_folder, run_data['mapping_stats_filename'])
                    with open(mapping_file, 'r') as infile:
                        file_head = infile.readline()
                        if not wrote_header:
                            outfile.write('Folder\t' + file_head)
                            wrote_header = True
                        for line in infile:
                            outfile.write(crispresso2_folder + "\t" + line)

            if not args.suppress_report:
                report_filename = OUTPUT_DIRECTORY + '.html'
                if (args.place_report_in_output_folder):
                    report_filename = _jp("CRISPResso2Aggregate_report.html")
                CRISPRessoReport.make_aggregate_report(
                    crispresso2_info, args.name, report_filename,
                    OUTPUT_DIRECTORY, _ROOT, crispresso2_folders,
                    crispresso2_folder_htmls)
                crispresso2_info['report_location'] = report_filename
                crispresso2_info['report_filename'] = os.path.basename(
                    report_filename)

        end_time = datetime.now()
        end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S')
        running_time = end_time - start_time
        running_time_string = str(running_time)

        crispresso2_info['end_time'] = end_time
        crispresso2_info['end_time_string'] = end_time_string
        crispresso2_info['running_time'] = running_time
        crispresso2_info['running_time_string'] = running_time_string

        cp.dump(crispresso2_info, open(crispresso2Aggregate_info_file, 'wb'))
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Ejemplo n.º 6
0
def main():
    """
    Command-line entry point for CRISPRessoWGS.

    Steps performed, in order:
    1. Parse the command line (shared CRISPResso parser plus WGS options).
    2. Check dependencies and input files, create the output folder and log.
    3. If --no_rerun is set, compare against the info file of a previous run
       and either exit (run already complete) or reuse its finished steps.
    4. Index the bam/reference if needed, load the region file and fetch the
       reference sequence of every region.
    5. Extract the reads of every region, then run CRISPResso on each region
       that has at least --min_reads_to_use_region reads.
    6. Write SAMPLES_QUANTIFICATION_SUMMARY.txt, the summary plots and the
       html report, and save the run info.

    The function always terminates the process through sys.exit: 0 on
    success, 1 if a dependency is missing, -1 if an exception was raised.
    """
    def print_stacktrace_if_debug():
        # `args` is a variable of the enclosing main(); the membership test
        # guards against failures that occur before it has been assigned.
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)
            error(traceback.format_exc())

    try:
        start_time = datetime.now()
        start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S')

        description = [
            '~~~CRISPRessoWGS~~~',
            '-Analysis of CRISPR/Cas9 outcomes from WGS data-'
        ]
        wgs_string = r'''
 ____________
|     __  __ |
||  |/ _ (_  |
||/\|\__)__) |
|____________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, wgs_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoWGS Parameters', requiredParams={})

        # tool specific optional
        parser.add_argument('-b',
                            '--bam_file',
                            type=str,
                            help='WGS aligned bam file',
                            required=True,
                            default='bam filename')
        parser.add_argument(
            '-f',
            '--region_file',
            type=str,
            help=
            'Regions description file. A BED format  file containing the regions to analyze, one per line. The REQUIRED\
        columns are: chr_id(chromosome name), bpstart(start position), bpend(end position), the optional columns are:name (an unique indentifier for the region), guide_seq, expected_hdr_amplicon_seq,coding_seq, see CRISPResso help for more details on these last 3 parameters)',
            required=True)
        parser.add_argument(
            '-r',
            '--reference_file',
            type=str,
            help=
            'A FASTA format reference file (for example hg19.fa for the human genome)',
            default='',
            required=True)
        parser.add_argument(
            '--min_reads_to_use_region',
            type=float,
            help=
            'Minimum number of reads that align to a region to perform the CRISPResso analysis',
            default=10)
        parser.add_argument(
            '--skip_failed',
            help='Continue with pooled analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--gene_annotations',
            type=str,
            help=
            'Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \
        please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"',
            default='')
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        # options that are set per region by this tool and therefore must not
        # be propagated from the command line to the sub-CRISPResso runs
        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = {
            'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name',
            'output_folder', 'name'
        }
        crispresso_options_for_wgs = list(crispresso_options -
                                          options_to_ignore)

        info('Checking dependencies...')

        if check_samtools() and check_bowtie2():
            info('\n All the required dependencies are present!')
        else:
            sys.exit(1)

        # check files
        check_file(args.bam_file)
        check_file(args.reference_file)
        check_file(args.region_file)
        if args.gene_annotations:
            check_file(args.gene_annotations)

        # for computation performed in CRISPRessoWGS (e.g. bowtie alignment, etc) use n_processes_for_wgs
        if args.n_processes == "max":
            n_processes_for_wgs = CRISPRessoMultiProcessing.get_max_processes()
        else:
            n_processes_for_wgs = int(args.n_processes)

        # here, we set args.n_processes as 1 because this value is propagated to sub-CRISPResso runs (not for usage in CRISPRessoWGS)
        args.n_processes = 1

        # INIT
        get_name_from_bam = lambda x: os.path.basename(x).replace('.bam', '')

        if not args.name:
            database_id = '%s' % get_name_from_bam(args.bam_file)
        else:
            clean_name = CRISPRessoShared.slugify(args.name)
            if args.name != clean_name:
                warn(
                    'The specified name {0} contained invalid characters and was changed to: {1}'
                    .format(
                        args.name,
                        clean_name,
                    ), )
            database_id = clean_name

        OUTPUT_DIRECTORY = 'CRISPRessoWGS_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        # handy function to put a file in the output directory
        _jp = lambda filename: os.path.join(OUTPUT_DIRECTORY, filename)

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except OSError:
            # narrowed from a bare except so that e.g. KeyboardInterrupt is
            # no longer reported as an existing folder
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoWGS_RUNNING_LOG.txt')
        logger.addHandler(logging.FileHandler(log_filename))

        crispresso2_info_file = os.path.join(OUTPUT_DIRECTORY,
                                             'CRISPResso2WGS_info.json')
        # keep track of all information for this run to be saved at the end of the run
        crispresso2_info = {
            'running_info': {},
            'results': {
                'alignment_stats': {},
                'general_plots': {}
            }
        }
        running_info = crispresso2_info['running_info']
        running_info['version'] = CRISPRessoShared.__version__
        running_info['args'] = deepcopy(args)
        running_info['log_filename'] = os.path.basename(log_filename)
        running_info['finished_steps'] = {}

        crispresso_cmd_to_write = ' '.join(sys.argv)
        if args.write_cleaned_report:
            # clean command doesn't show the absolute path to the executable or other files
            cmd_copy = sys.argv[:]
            cmd_copy[0] = 'CRISPRessoWGS'
            cmd_copy = [
                os.path.basename(token) if os.sep in token else token
                for token in cmd_copy
            ]
            crispresso_cmd_to_write = ' '.join(cmd_copy)
        running_info['command_used'] = crispresso_cmd_to_write

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                'CRISPResso version %s\n[Command used]:\n%s\n\n[Execution log]:\n'
                % (CRISPRessoShared.__version__, crispresso_cmd_to_write))

        # keep track of args to see if it is possible to skip computation steps on rerun
        can_finish_incomplete_run = False
        if args.no_rerun and os.path.exists(crispresso2_info_file):
            previous_run_data = CRISPRessoShared.load_crispresso_info(
                OUTPUT_DIRECTORY)
            previous_info = previous_run_data['running_info']
            if previous_info['version'] == CRISPRessoShared.__version__:
                previous_args = previous_info['args']
                args_are_same = True
                for arg in vars(args):
                    if arg in ("no_rerun", "debug", "n_processes"):
                        continue
                    if arg not in vars(previous_args):
                        info(
                            'Comparing current run to previous run: old run had argument '
                            + str(arg) + ' \nRerunning.')
                        args_are_same = False
                    elif str(getattr(previous_args, arg)) != str(
                            getattr(args, arg)):
                        info(
                            'Comparing current run to previous run:\n\told argument '
                            + str(arg) + ' = ' +
                            str(getattr(previous_args, arg)) +
                            '\n\tnew argument: ' + str(arg) + ' = ' +
                            str(getattr(args, arg)) + '\nRerunning.')
                        args_are_same = False

                if args_are_same:
                    # end_time_string is stored under 'running_info' (see the
                    # end of this function), so it has to be looked up there;
                    # checking the top level never detected a completed run.
                    if 'end_time_string' in previous_info:
                        info('Analysis already completed on %s!' %
                             previous_info['end_time_string'])
                        sys.exit(0)
                    else:
                        can_finish_incomplete_run = True
                        if 'finished_steps' in previous_info:
                            for key in previous_info['finished_steps'].keys():
                                running_info['finished_steps'][
                                    key] = previous_info['finished_steps'][key]
                                if args.debug:
                                    info('finished: ' + key)
            else:
                info(
                    'The no_rerun flag is set, but this analysis will be rerun because the existing run was performed using an old version of CRISPResso ('
                    + str(previous_info['version']) + ').')

        # write this file early on so we can check the params if we have to rerun
        CRISPRessoShared.write_crispresso_info(
            crispresso2_info_file,
            crispresso2_info,
        )

        def rreplace(s, old, new):
            # replace only the right-most occurrence of `old`
            return new.join(s.rsplit(old, 1))

        # check if bam has the index already
        # (the sibling name is only meaningful if the path contains '.bam';
        # otherwise rreplace returns the bam path itself, which always exists)
        sibling_index = rreplace(args.bam_file, ".bam", ".bai")
        if sibling_index != args.bam_file and os.path.exists(sibling_index):
            info('Index file for input .bam file exists, skipping generation.')
        elif os.path.exists(args.bam_file + '.bai'):
            info('Index file for input .bam file exists, skipping generation.')
        else:
            info('Creating index file for input .bam file...')
            sb.call('samtools index %s ' % (args.bam_file), shell=True)

        # load gene annotation
        if args.gene_annotations:
            info('Loading gene coordinates from annotation file: %s...' %
                 args.gene_annotations)
            try:
                df_genes = pd.read_csv(args.gene_annotations,
                                       compression='gzip',
                                       sep="\t")
                df_genes.txEnd = df_genes.txEnd.astype(int)
                df_genes.txStart = df_genes.txStart.astype(int)
            except Exception:
                raise Exception('Failed to load the gene annotations file.')

        # Load and validate the REGION FILE
        df_regions = pd.read_csv(args.region_file,
                                 names=[
                                     'chr_id', 'bpstart', 'bpend', 'Name',
                                     'sgRNA', 'Expected_HDR', 'Coding_sequence'
                                 ],
                                 comment='#',
                                 sep='\t',
                                 dtype={
                                     'Name': str,
                                     'chr_id': str
                                 })

        # remove empty amplicons/lines
        df_regions.dropna(subset=['chr_id', 'bpstart', 'bpend'], inplace=True)

        df_regions.Expected_HDR = df_regions.Expected_HDR.apply(
            capitalize_sequence)
        df_regions.sgRNA = df_regions.sgRNA.apply(capitalize_sequence)
        df_regions.Coding_sequence = df_regions.Coding_sequence.apply(
            capitalize_sequence)

        # check or create names
        # .loc writes into the dataframe itself; the previous chained
        # `.iloc[idx, ][...] = ` assignment modified a temporary copy and
        # treated the index label as a position.
        for row_label, row in df_regions.iterrows():
            if pd.isnull(row.Name):
                df_regions.loc[row_label, 'Name'] = '_'.join(
                    map(str, [row['chr_id'], row['bpstart'], row['bpend']]))

        if not len(df_regions.Name.unique()) == df_regions.shape[0]:
            raise Exception('The amplicon names should be all distinct!')

        df_regions.set_index('Name', inplace=True)
        df_regions.index = df_regions.index.to_series().str.replace(' ', '_')

        # extract sequence for each region
        uncompressed_reference = args.reference_file

        if os.path.exists(uncompressed_reference + '.fai'):
            info(
                'The index for the reference fasta file is already present! Skipping generation.'
            )
        else:
            info('Indexing reference file... Please be patient!')
            sb.call('samtools faidx %s >>%s 2>&1' %
                    (uncompressed_reference, log_filename),
                    shell=True)

        info(
            'Retrieving reference sequences for amplicons and checking for sgRNAs'
        )
        df_regions['sequence'] = df_regions.apply(
            lambda row: get_region_from_fa(row.chr_id, row.bpstart, row.bpend,
                                           uncompressed_reference),
            axis=1)

        for region_name, row in df_regions.iterrows():
            if pd.isnull(row.sgRNA):
                continue

            cut_points = []
            guides = row.sgRNA.strip().upper().split(',')
            guide_qw_centers = CRISPRessoShared.set_guide_array(
                args.quantification_window_center, guides,
                'guide quantification center')
            # the guide counter has its own name so that it does not clobber
            # the region label used after the loop
            for guide_ind, current_guide_seq in enumerate(guides):
                wrong_nt = find_wrong_nt(current_guide_seq)
                if wrong_nt:
                    raise NTException(
                        'The sgRNA sequence %s contains wrong characters:%s'
                        % (current_guide_seq, ' '.join(wrong_nt)))

                offset_fw = guide_qw_centers[guide_ind] + len(
                    current_guide_seq) - 1
                offset_rc = (-guide_qw_centers[guide_ind]) - 1
                # matches of the guide on the forward strand ...
                cut_points += [
                    m.start() + offset_fw
                    for m in re.finditer(current_guide_seq, row.sequence)
                ]
                # ... and of its reverse complement
                cut_points += [
                    m.start() + offset_rc for m in re.finditer(
                        CRISPRessoShared.reverse_complement(
                            current_guide_seq), row.sequence)
                ]

            if not cut_points:
                # guide not found in this region: drop it so that it is not
                # passed to the CRISPResso run of the region
                df_regions.loc[region_name, 'sgRNA'] = ''
                info('Cannot find guide ' + str(row.sgRNA) +
                     ' in amplicon ' + str(region_name) + ' (' + str(row) +
                     ')')

        df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
        df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])

        df_regions.bpstart = df_regions.bpstart.astype(int)
        df_regions.bpend = df_regions.bpend.astype(int)

        if args.gene_annotations:
            df_regions = df_regions.apply(
                lambda row: find_overlapping_genes(row, df_genes), axis=1)

        # extract reads with samtools in that region and create a bam
        # create a fasta file with all the trimmed reads
        info('\nProcessing each region...')

        ANALYZED_REGIONS = _jp('ANALYZED_REGIONS/')
        if not os.path.exists(ANALYZED_REGIONS):
            os.mkdir(ANALYZED_REGIONS)

        df_regions['region_number'] = np.arange(len(df_regions))

        def set_filenames(row):
            # returns (bam filename, fastq.gz filename, whether the fastq.gz
            # file is already on disk) for one region
            region_root = clean_filename('REGION_' + str(row.region_number))
            fastq_gz_filename = os.path.join(ANALYZED_REGIONS,
                                             '%s.fastq.gz' % region_root)
            bam_region_filename = os.path.join(ANALYZED_REGIONS,
                                               '%s.bam' % region_root)
            row_fastq_exists = os.path.isfile(fastq_gz_filename)
            return bam_region_filename, fastq_gz_filename, row_fastq_exists

        df_regions['bam_file_with_reads_in_region'], df_regions[
            'fastq_file_trimmed_reads_in_region'], df_regions[
                'row_fastq_exists'] = zip(
                    *df_regions.apply(set_filenames, axis=1))
        df_regions['n_reads'] = 0
        # stick this in the df so we can parallelize the analysis and not pass params
        df_regions['original_bam'] = args.bam_file

        report_reads_aligned_filename = _jp(
            'REPORT_READS_ALIGNED_TO_SELECTED_REGIONS_WGS.txt')
        num_rows_without_fastq = len(
            df_regions[df_regions.row_fastq_exists == False])

        if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile(
                report_reads_aligned_filename
        ) and 'generation_of_fastq_files_for_each_amplicon' in running_info[
                'finished_steps']:
            info('Skipping generation of fastq files for each amplicon.')
            df_regions = pd.read_csv(report_reads_aligned_filename,
                                     comment='#',
                                     sep='\t',
                                     dtype={
                                         'Name': str,
                                         'chr_id': str
                                     })
            df_regions.set_index('Name', inplace=True)

        else:
            # run region extraction here
            df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(
                df_regions, extract_reads_chunk, n_processes_for_wgs)
            df_regions.sort_values('region_number', inplace=True)
            cols_to_print = [
                "chr_id", "bpstart", "bpend", "sgRNA", "Expected_HDR",
                "Coding_sequence", "sequence", "n_reads",
                "bam_file_with_reads_in_region",
                "fastq_file_trimmed_reads_in_region"
            ]
            if args.gene_annotations:
                cols_to_print.append('gene_overlapping')
            df_regions.fillna('NA').to_csv(report_reads_aligned_filename,
                                           sep='\t',
                                           columns=cols_to_print,
                                           index_label="Name")

            # save progress
            running_info['finished_steps'][
                'generation_of_fastq_files_for_each_amplicon'] = True
            CRISPRessoShared.write_crispresso_info(
                crispresso2_info_file,
                crispresso2_info,
            )

        # Run Crispresso
        info('Running CRISPResso on each region...')
        crispresso_cmds = []
        for idx, row in df_regions.iterrows():
            if row['n_reads'] >= args.min_reads_to_use_region:
                info('\nThe region [%s] has enough reads (%d) mapped to it!' %
                     (idx, row['n_reads']))

                crispresso_cmd = args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (
                    row['fastq_file_trimmed_reads_in_region'],
                    row['sequence'], OUTPUT_DIRECTORY, idx)

                if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                    crispresso_cmd += ' -g %s' % row['sgRNA']

                if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                    crispresso_cmd += ' -e %s' % row['Expected_HDR']

                if row['Coding_sequence'] and not pd.isnull(
                        row['Coding_sequence']):
                    crispresso_cmd += ' -c %s' % row['Coding_sequence']

                crispresso_cmd = CRISPRessoShared.propagate_crispresso_options(
                    crispresso_cmd, crispresso_options_for_wgs, args)

                # NOTE: output of the sub-runs is deliberately not redirected
                # to a per-region log; the original authors observed that
                # doing so made the multiprocessing step not block.
                crispresso_cmds.append(crispresso_cmd)

            else:
                info(
                    '\nThe region [%s] has too few reads mapped to it (%d)! Not running CRISPResso!'
                    % (idx, row['n_reads']))

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      n_processes_for_wgs,
                                                      'region',
                                                      args.skip_failed)

        quantification_summary = []
        all_region_names = []
        all_region_read_counts = {}
        good_region_names = []
        good_region_folders = {}
        header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
        header_els = header.split("\t")
        header_el_count = len(header_els)
        empty_line_els = [np.nan] * (header_el_count - 1)
        n_reads_index = header_els.index('Reads_total') - 1
        # alignment_stats keys, in the order of the header columns that
        # follow 'Reads_aligned'
        count_keys = [
            'counts_unmodified', 'counts_modified', 'counts_discarded',
            'counts_insertion', 'counts_deletion', 'counts_substitution',
            'counts_only_insertion', 'counts_only_deletion',
            'counts_only_substitution', 'counts_insertion_and_deletion',
            'counts_insertion_and_substitution',
            'counts_deletion_and_substitution',
            'counts_insertion_and_deletion_and_substitution'
        ]
        for idx, row in df_regions.iterrows():
            folder_name = 'CRISPResso_on_%s' % idx
            run_name = idx

            all_region_names.append(run_name)
            all_region_read_counts[run_name] = row.n_reads

            run_file = os.path.join(_jp(folder_name), 'CRISPResso2_info.json')
            if not os.path.exists(run_file):
                warn(
                    'Skipping the folder %s: not enough reads, incomplete, or empty folder.'
                    % folder_name)
                this_els = empty_line_els[:]
                this_els[n_reads_index] = row.n_reads
                to_add = [run_name]
                to_add.extend(this_els)
                quantification_summary.append(to_add)
            else:
                run_data = CRISPRessoShared.load_crispresso_info(
                    _jp(folder_name), )
                # only expect one amplicon sequence
                ref_name = run_data['results']['ref_names'][0]
                stats = run_data['results']['alignment_stats']
                n_tot = row.n_reads
                n_aligned = stats['counts_total'][ref_name]
                n_unmod = stats['counts_unmodified'][ref_name]
                n_mod = stats['counts_modified'][ref_name]

                # percentages stay "NA" when nothing aligned; rounding is
                # applied only to real numbers (round("NA", 8) raises)
                unmod_pct = "NA"
                mod_pct = "NA"
                if n_aligned > 0:
                    unmod_pct = round(100 * n_unmod / float(n_aligned), 8)
                    mod_pct = round(100 * n_mod / float(n_aligned), 8)

                # values follow the header order: Reads_total (reads
                # extracted for the region) comes before Reads_aligned
                vals = [run_name, unmod_pct, mod_pct, n_tot, n_aligned]
                vals.extend(stats[key][ref_name] for key in count_keys)
                quantification_summary.append(vals)

                good_region_names.append(idx)
                good_region_folders[idx] = folder_name
        samples_quantification_summary_filename = _jp(
            'SAMPLES_QUANTIFICATION_SUMMARY.txt')

        df_summary_quantification = pd.DataFrame(quantification_summary,
                                                 columns=header_els)
        if args.crispresso1_mode:
            crispresso1_columns = [
                'Name', 'Unmodified%', 'Modified%', 'Reads_aligned',
                'Reads_total'
            ]
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename,
                sep='\t',
                index=None,
                columns=crispresso1_columns)
        else:
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename, sep='\t', index=None)

        results = crispresso2_info['results']
        results['alignment_stats'][
            'samples_quantification_summary_filename'] = os.path.basename(
                samples_quantification_summary_filename)
        results['regions'] = df_regions
        results['all_region_names'] = all_region_names
        results['all_region_read_counts'] = all_region_read_counts
        results['good_region_names'] = good_region_names
        results['good_region_folders'] = good_region_folders

        general_plots = results['general_plots']
        general_plots['summary_plot_names'] = []
        general_plots['summary_plot_titles'] = {}
        general_plots['summary_plot_labels'] = {}
        general_plots['summary_plot_datas'] = {}

        # png copies of the plots are only needed for the html report
        save_png = True
        if args.suppress_report:
            save_png = False

        if not args.suppress_plots:
            plot_root = _jp("CRISPRessoWGS_reads_summary")
            CRISPRessoPlot.plot_reads_total(plot_root,
                                            df_summary_quantification,
                                            save_png,
                                            args.min_reads_to_use_region)
            plot_name = os.path.basename(plot_root)
            general_plots['reads_summary_plot'] = plot_name
            general_plots['summary_plot_names'].append(plot_name)
            general_plots['summary_plot_titles'][
                plot_name] = 'CRISPRessoWGS Read Allocation Summary'
            general_plots['summary_plot_labels'][
                plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
            general_plots['summary_plot_datas'][plot_name] = [
                ('CRISPRessoWGS summary',
                 os.path.basename(samples_quantification_summary_filename))
            ]

            plot_root = _jp("CRISPRessoWGS_modification_summary")
            CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,
                                               df_summary_quantification,
                                               save_png,
                                               args.min_reads_to_use_region)
            plot_name = os.path.basename(plot_root)
            general_plots['modification_summary_plot'] = plot_name
            general_plots['summary_plot_names'].append(plot_name)
            general_plots['summary_plot_titles'][
                plot_name] = 'CRISPRessoWGS Modification Summary'
            general_plots['summary_plot_labels'][
                plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
            general_plots['summary_plot_datas'][plot_name] = [
                ('CRISPRessoWGS summary',
                 os.path.basename(samples_quantification_summary_filename))
            ]

        if not args.suppress_report and not args.suppress_plots:
            if (args.place_report_in_output_folder):
                report_name = _jp("CRISPResso2WGS_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_wgs_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            running_info['report_location'] = report_name
            running_info['report_filename'] = os.path.basename(report_name)

        end_time = datetime.now()
        end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S')
        running_time = end_time - start_time
        running_time_string = str(running_time)

        running_info['end_time'] = end_time
        running_info['end_time_string'] = end_time_string
        running_info['running_time'] = running_time
        running_info['running_time_string'] = running_time_string

        CRISPRessoShared.write_crispresso_info(
            crispresso2_info_file,
            crispresso2_info,
        )

        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # sys.exit() calls above raise SystemExit, which is not an Exception
        # subclass and therefore is not intercepted here
        print_stacktrace_if_debug()
        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Ejemplo n.º 7
0
def plot_alleles_heatmap(reference_seq,
                         fig_filename_root,
                         X,
                         annot,
                         y_labels,
                         insertion_dict,
                         per_element_annot_kws,
                         SAVE_ALSO_PNG=False,
                         plot_cut_point=True,
                         cut_point_ind=None,
                         sgRNA_intervals=None,
                         sgRNA_names=None,
                         sgRNA_mismatches=None,
                         custom_colors=None):
    """
    Plots alleles in a heatmap (nucleotides color-coded for easy visualization)

    The figure is saved to fig_filename_root + '.pdf' (and to
    fig_filename_root + '.png' when SAVE_ALSO_PNG is set) and then closed;
    nothing is returned. If X is empty, a placeholder figure containing the
    text 'No Alleles' is saved instead.

    input:
    -reference_seq: sequence of reference allele to plot (its characters must be keys of the
        '-','A','T','C','G','N' lookup used below, otherwise a KeyError is raised)
    -fig_filename_root: figure filename to plot (not including '.pdf' or '.png')
    -X: list of numbers representing nucleotides of the allele (one row per allele)
    -annot: list of nucleotides (letters) of the allele (one row per allele)
    -y_labels: list of labels for each row/allele
    -insertion_dict: locations of insertions -- red squares will be drawn around these.
        Maps a row index to an iterable of items whose [0] and [1] entries are the
        start and end x-coordinates of the box
    -per_element_annot_kws: annotations for each cell (e.g. bold for substitutions, etc.)
    -SAVE_ALSO_PNG: whether to write png file as well
    -plot_cut_point: if false, won't draw 'predicted cleavage' line
    -cut_point_ind: index to plot cut point at (if None, the line is drawn at len(reference_seq) / 2)
    -sgRNA_intervals: locations where sgRNA is located
    -sgRNA_mismatches: array (for each sgRNA_interval) of locations in sgRNA where there are mismatches
    -sgRNA_names: array (for each sgRNA_interval) of names of sgRNAs (otherwise empty)
    -custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
    """
    plot_nuc_len = len(reference_seq)

    # make a color map of fixed colors
    alpha = 0.4
    A_color = CRISPRessoPlot.get_nuc_color('A', alpha)
    T_color = CRISPRessoPlot.get_nuc_color('T', alpha)
    C_color = CRISPRessoPlot.get_nuc_color('C', alpha)
    G_color = CRISPRessoPlot.get_nuc_color('G', alpha)
    INDEL_color = CRISPRessoPlot.get_nuc_color('N', alpha)

    # caller-supplied colors override the defaults, one nucleotide at a time
    if custom_colors is not None:
        if 'A' in custom_colors:
            A_color = custom_colors['A']
        if 'T' in custom_colors:
            T_color = custom_colors['T']
        if 'C' in custom_colors:
            C_color = custom_colors['C']
        if 'G' in custom_colors:
            G_color = custom_colors['G']
        if 'N' in custom_colors:
            INDEL_color = custom_colors['N']

    # numeric code of each character; the position of a color in the colormap
    # below matches this code ('-' and 'N' both use the indel color)
    dna_to_numbers = {'-': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4, 'N': 5}

    cmap = colors_mpl.ListedColormap(
        [INDEL_color, A_color, T_color, C_color, G_color, INDEL_color])

    # rows are plotted in reverse order, so the per-cell annotation kwargs are
    # reversed here and X, annot and y_labels are reversed further down
    if len(per_element_annot_kws) > 1:
        per_element_annot_kws = np.vstack(per_element_annot_kws[::-1])
    else:
        per_element_annot_kws = np.array(per_element_annot_kws)

    # single-row heatmap inputs for the reference sequence
    ref_seq_hm = np.expand_dims(
        [dna_to_numbers[nuc] for nuc in reference_seq], 1).T
    ref_seq_annot_hm = np.expand_dims(list(reference_seq), 1).T

    annot = annot[::-1]
    X = X[::-1]

    N_ROWS = len(X)
    N_COLUMNS = plot_nuc_len

    if N_ROWS < 1:
        # nothing to plot: save a placeholder figure and stop
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.text(0.5,
                 0.5,
                 'No Alleles',
                 horizontalalignment='center',
                 verticalalignment='center',
                 transform=ax.transAxes)
        ax.set_clip_on(False)

        plt.savefig(fig_filename_root + '.pdf', bbox_inches='tight')
        if SAVE_ALSO_PNG:
            plt.savefig(fig_filename_root + '.png', bbox_inches='tight')
        plt.close()
        return

    sgRNA_rows = []
    num_sgRNA_rows = 0

    if sgRNA_intervals and len(sgRNA_intervals) > 0:
        sgRNA_rows = CRISPRessoPlot.get_rows_for_sgRNA_annotation(
            sgRNA_intervals, plot_nuc_len)
        num_sgRNA_rows = max(sgRNA_rows) + 1
        fig = plt.figure(figsize=(plot_nuc_len * 0.3,
                                  (N_ROWS + 1 + num_sgRNA_rows) * 0.6))
        # one extra grid row compared with the no-sgRNA layout: the reference
        # takes row 0 and the alleles start at row 2
        gs1 = gridspec.GridSpec(N_ROWS + 2, N_COLUMNS)
        gs2 = gridspec.GridSpec(N_ROWS + 2, N_COLUMNS)
        # ax_hm_ref heatmap for the reference
        ax_hm_ref = plt.subplot(gs1[0:1, :])
        ax_hm = plt.subplot(gs2[2:, :])
    else:
        fig = plt.figure(figsize=(plot_nuc_len * 0.3, (N_ROWS + 1) * 0.6))
        gs1 = gridspec.GridSpec(N_ROWS + 1, N_COLUMNS)
        gs2 = gridspec.GridSpec(N_ROWS + 1, N_COLUMNS)
        # ax_hm_ref heatmap for the reference
        ax_hm_ref = plt.subplot(gs1[0, :])
        ax_hm = plt.subplot(gs2[1:, :])

    CRISPRessoPlot.custom_heatmap(ref_seq_hm,
                                  annot=ref_seq_annot_hm,
                                  annot_kws={'size': 16},
                                  cmap=cmap,
                                  fmt='s',
                                  ax=ax_hm_ref,
                                  vmin=0,
                                  vmax=5,
                                  square=True)
    CRISPRessoPlot.custom_heatmap(X,
                                  annot=np.array(annot),
                                  annot_kws={'size': 16},
                                  cmap=cmap,
                                  fmt='s',
                                  ax=ax_hm,
                                  vmin=0,
                                  vmax=5,
                                  square=True,
                                  per_element_annot_kws=per_element_annot_kws)

    ax_hm.yaxis.tick_right()
    ax_hm.yaxis.set_ticklabels(y_labels[::-1], rotation=True, va='center')
    ax_hm.xaxis.set_ticks([])

    if sgRNA_intervals and len(sgRNA_intervals) > 0:
        this_sgRNA_y_start = -1 * num_sgRNA_rows
        this_sgRNA_y_height = num_sgRNA_rows - 0.3
        CRISPRessoPlot.add_sgRNA_to_ax(ax_hm_ref,
                                       sgRNA_intervals,
                                       sgRNA_y_start=this_sgRNA_y_start,
                                       sgRNA_y_height=this_sgRNA_y_height,
                                       amp_len=plot_nuc_len,
                                       font_size='small',
                                       clip_on=False,
                                       sgRNA_names=sgRNA_names,
                                       sgRNA_mismatches=sgRNA_mismatches,
                                       x_offset=0,
                                       label_at_zero=True,
                                       sgRNA_rows=sgRNA_rows)

    # todo -- add sgRNAs below reference plot

    # create boxes for ins
    # .items() (rather than the Python-2-only .iteritems()) works on both
    # Python 2 and Python 3 dicts
    for idx, lss in insertion_dict.items():
        for ls in lss:
            # X was reversed above, so row idx sits at y = N_ROWS - idx - 1
            ax_hm.add_patch(
                patches.Rectangle((ls[0], N_ROWS - idx - 1),
                                  ls[1] - ls[0],
                                  1,
                                  linewidth=3,
                                  edgecolor='r',
                                  fill=False))

    # cut point vertical line
    if plot_cut_point:
        if cut_point_ind is None:
            ax_hm.vlines([plot_nuc_len / 2],
                         *ax_hm.get_ylim(),
                         linestyles='dashed')
        else:
            ax_hm.vlines(cut_point_ind, *ax_hm.get_ylim(), linestyles='dashed')

    ax_hm_ref.yaxis.tick_right()
    ax_hm_ref.xaxis.set_ticks([])
    ax_hm_ref.yaxis.set_ticklabels(['Reference'], rotation=True, va='center')

    gs2.update(left=0,
               right=1,
               hspace=0.05,
               wspace=0,
               top=1 * (((N_ROWS) * 1.13)) / (N_ROWS))
    gs1.update(
        left=0,
        right=1,
        hspace=0.05,
        wspace=0,
    )

    sns.set_context(
        rc={
            'axes.facecolor': 'white',
            'lines.markeredgewidth': 1,
            'mathtext.fontset': 'stix',
            'text.usetex': True,
            'text.latex.unicode': True
        })

    # legend entries: proxy artists paired, in order, with the descriptions
    proxies = [
        matplotlib.lines.Line2D([0], [0],
                                linestyle='none',
                                mfc='black',
                                mec='none',
                                marker=r'$\mathbf{{{}}}$'.format('bold'),
                                ms=18),
        matplotlib.lines.Line2D([0], [0],
                                linestyle='none',
                                mfc='none',
                                mec='r',
                                marker='s',
                                ms=8,
                                markeredgewidth=2.5),
        matplotlib.lines.Line2D(
            [0],
            [0],
            linestyle='none',
            mfc='none',
            mec='black',
            marker='_',
            ms=2,
        )
    ]
    descriptions = ['Substitutions', 'Insertions', 'Deletions']

    if plot_cut_point:
        proxies.append(
            matplotlib.lines.Line2D([0], [1], linestyle='--', c='black', ms=6))
        descriptions.append('Predicted cleavage position')

    lgd = ax_hm.legend(proxies,
                       descriptions,
                       numpoints=1,
                       markerscale=2,
                       loc='upper center',
                       bbox_to_anchor=(0.5, 0),
                       ncol=1,
                       fancybox=True,
                       shadow=False)

    # the legend is passed as an extra artist to the tight bounding box
    plt.savefig(fig_filename_root + '.pdf',
                bbox_inches='tight',
                bbox_extra_artists=(lgd, ))
    if SAVE_ALSO_PNG:
        plt.savefig(fig_filename_root + '.png',
                    bbox_inches='tight',
                    bbox_extra_artists=(lgd, ))
    plt.close()
Ejemplo n.º 8
0
def main():
    try:
        description = ['~~~CRISPRessoPooled~~~','-Analysis of CRISPR/Cas9 outcomes from POOLED deep sequencing data-']
        pooled_string = r'''
 _______________________
| __  __  __     __ __  |
||__)/  \/  \|  |_ |  \ |
||   \__/\__/|__|__|__/ |
|_______________________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description,pooled_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(parserTitle = 'CRISPRessoPooled Parameters',requiredParams={'fastq_r1':True})
        parser.add_argument('-f','--amplicons_file', type=str,  help='Amplicons description file. This file is a tab-delimited text file with up to 5 columns (2 required):\
        \nAMPLICON_NAME:  an identifier for the amplicon (must be unique)\nAMPLICON_SEQUENCE:  amplicon sequence used in the experiment\n\
        \nsgRNA_SEQUENCE (OPTIONAL):  sgRNA sequence used for this amplicon without the PAM sequence. Multiple guides can be given separated by commas and not spaces. If not available enter NA.\
        \nEXPECTED_AMPLICON_AFTER_HDR (OPTIONAL): expected amplicon sequence in case of HDR. If not available enter NA.\
        \nCODING_SEQUENCE (OPTIONAL): Subsequence(s) of the amplicon corresponding to coding sequences. If more than one separate them by commas and not spaces. If not available enter NA.', default='')

        #tool specific optional
        parser.add_argument('--gene_annotations', type=str, help='Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \
        please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"', default='')
        parser.add_argument('-p','--n_processes',type=int, help='Specify the number of processes to use for Bowtie2.\
        Please use with caution since increasing this parameter will increase significantly the memory required to run CRISPResso.',default=1)
        parser.add_argument('-x','--bowtie2_index', type=str, help='Basename of Bowtie2 index for the reference genome', default='')
        parser.add_argument('--bowtie2_options_string', type=str, help='Override options for the Bowtie2 alignment command',default=' -k 1 --end-to-end -N 0 --np 0 ')
        parser.add_argument('--min_reads_to_use_region',  type=float, help='Minimum number of reads that align to a region to perform the CRISPResso analysis', default=1000)
        parser.add_argument('--skip_failed',  help='Continue with pooled analysis even if one sample fails',action='store_true')
        parser.add_argument('--crispresso_command', help='CRISPResso command to call',default='CRISPResso')

        args = parser.parse_args()

        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = set(['fastq_r1','fastq_r2','amplicon_seq','amplicon_name','output_folder','name'])
        crispresso_options_for_pooled = list(crispresso_options-options_to_ignore)


        info('Checking dependencies...')

        if check_samtools() and check_bowtie2():
            info('All the required dependencies are present!')
        else:
            sys.exit(1)

        #check files
        check_file(args.fastq_r1)
        if args.fastq_r2:
            check_file(args.fastq_r2)

        if args.bowtie2_index:
            check_file(args.bowtie2_index+'.1.bt2')

        if args.amplicons_file:
            check_file(args.amplicons_file)

        if args.gene_annotations:
            check_file(args.gene_annotations)

        if args.amplicons_file and not args.bowtie2_index:
            RUNNING_MODE='ONLY_AMPLICONS'
            info('Only the Amplicon description file was provided. The analysis will be perfomed using only the provided amplicons sequences.')

        elif args.bowtie2_index and not args.amplicons_file:
            RUNNING_MODE='ONLY_GENOME'
            info('Only the bowtie2 reference genome index file was provided. The analysis will be perfomed using only genomic regions where enough reads align.')
        elif args.bowtie2_index and args.amplicons_file:
            RUNNING_MODE='AMPLICONS_AND_GENOME'
            info('Amplicon description file and bowtie2 reference genome index files provided. The analysis will be perfomed using the reads that are aligned ony to the amplicons provided and not to other genomic regions.')
        else:
            error('Please provide the amplicons description file (-f or --amplicons_file option) or the bowtie2 reference genome index file (-x or --bowtie2_index option) or both.')
            sys.exit(1)



        ####TRIMMING AND MERGING
        get_name_from_fasta=lambda  x: os.path.basename(x).replace('.fastq','').replace('.gz','')

        if not args.name:
                 if args.fastq_r2!='':
                         database_id='%s_%s' % (get_name_from_fasta(args.fastq_r1),get_name_from_fasta(args.fastq_r2))
                 else:
                         database_id='%s' % get_name_from_fasta(args.fastq_r1)

        else:
                 database_id=args.name



        OUTPUT_DIRECTORY='CRISPRessoPooled_on_%s' % database_id

        if args.output_folder:
                 OUTPUT_DIRECTORY=os.path.join(os.path.abspath(args.output_folder),OUTPUT_DIRECTORY)

        _jp=lambda filename: os.path.join(OUTPUT_DIRECTORY,filename) #handy function to put a file in the output directory

        try:
                 info('Creating Folder %s' % OUTPUT_DIRECTORY)
                 os.makedirs(OUTPUT_DIRECTORY)
                 info('Done!')
        except:
                 warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename=_jp('CRISPRessoPooled_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        crispresso2WGS_info_file = os.path.join(OUTPUT_DIRECTORY,'CRISPResso2Pooled_info.pickle')
        crispresso2_info = {} #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        with open(log_filename,'w+') as outfile:
                  outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' % ' '.join(sys.argv))

        if args.fastq_r2=='': #single end reads

             #check if we need to trim
             if not args.trim_sequences:
                 #create a symbolic link
                 symlink_filename=_jp(os.path.basename(args.fastq_r1))
                 force_symlink(os.path.abspath(args.fastq_r1),symlink_filename)
                 output_forward_filename=symlink_filename
             else:
                 output_forward_filename=_jp('reads.trimmed.fq.gz')
                 #Trimming with trimmomatic
                 cmd='%s SE -phred33 %s  %s %s >>%s 2>&1'\
                 % (args.trimmomatic_command,args.fastq_r1,
                    output_forward_filename,
                    args.trimmomatic_options_string,
                    log_filename)
                 #print cmd
                 TRIMMOMATIC_STATUS=sb.call(cmd,shell=True)

                 if TRIMMOMATIC_STATUS:
                         raise TrimmomaticException('TRIMMOMATIC failed to run, please check the log file.')


             processed_output_filename=output_forward_filename

        else:#paired end reads case

             if not args.trim_sequences:
                 output_forward_paired_filename=args.fastq_r1
                 output_reverse_paired_filename=args.fastq_r2
             else:
                 info('Trimming sequences with Trimmomatic...')
                 output_forward_paired_filename=_jp('output_forward_paired.fq.gz')
                 output_forward_unpaired_filename=_jp('output_forward_unpaired.fq.gz')
                 output_reverse_paired_filename=_jp('output_reverse_paired.fq.gz')
                 output_reverse_unpaired_filename=_jp('output_reverse_unpaired.fq.gz')

                 #Trimming with trimmomatic
                 cmd='%s PE -phred33 %s  %s %s  %s  %s  %s %s >>%s 2>&1'\
                 % (args.trimmomatic_command,
                         args.fastq_r1,args.fastq_r2,output_forward_paired_filename,
                         output_forward_unpaired_filename,output_reverse_paired_filename,
                         output_reverse_unpaired_filename,args.trimmomatic_options_string,log_filename)
                 #print cmd
                 TRIMMOMATIC_STATUS=sb.call(cmd,shell=True)
                 if TRIMMOMATIC_STATUS:
                         raise TrimmomaticException('TRIMMOMATIC failed to run, please check the log file.')

                 info('Done!')


             max_overlap_string = ""
             min_overlap_string = ""
             if args.max_paired_end_reads_overlap:
                 max_overlap_string = "--max-overlap " + str(args.max_paired_end_reads_overlap)
             if args.min_paired_end_reads_overlap:
                 min_overlap_string = args.min_paired_end_reads_overlap
             #Merging with Flash
             info('Merging paired sequences with Flash...')
             cmd=args.flash_command+' --allow-outies %s %s %s %s -z -d %s >>%s 2>&1' %\
             (output_forward_paired_filename,
              output_reverse_paired_filename,
              max_overlap_string,
              max_overlap_string,
              OUTPUT_DIRECTORY,log_filename)

             FLASH_STATUS=sb.call(cmd,shell=True)
             if FLASH_STATUS:
                 raise FlashException('Flash failed to run, please check the log file.')

             info('Done!')

             flash_hist_filename=_jp('out.hist')
             flash_histogram_filename=_jp('out.histogram')
             flash_not_combined_1_filename=_jp('out.notCombined_1.fastq.gz')
             flash_not_combined_2_filename=_jp('out.notCombined_2.fastq.gz')

             processed_output_filename=_jp('out.extendedFrags.fastq.gz')


        #count reads
        N_READS_INPUT=get_n_reads_fastq(args.fastq_r1)
        N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)


        #load gene annotation
        if args.gene_annotations:
            info('Loading gene coordinates from annotation file: %s...' % args.gene_annotations)
            try:
                df_genes=pd.read_table(args.gene_annotations,compression='gzip')
                df_genes.txEnd=df_genes.txEnd.astype(int)
                df_genes.txStart=df_genes.txStart.astype(int)
                df_genes.head()
            except:
               info('Failed to load the gene annotations file.')


        if RUNNING_MODE=='ONLY_AMPLICONS' or  RUNNING_MODE=='AMPLICONS_AND_GENOME':

            #load and validate template file
            df_template=pd.read_csv(args.amplicons_file,names=[
                    'Name','Amplicon_Sequence','sgRNA',
                    'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str})

            if str(df_template.iloc[0,1]).lower() == "amplicon_sequence":
                df_template.drop(0,axis=0,inplace=True)
                info('Detected header in amplicon file.')


            #remove empty amplicons/lines
            df_template.dropna(subset=['Amplicon_Sequence'],inplace=True)
            df_template.dropna(subset=['Name'],inplace=True)

            df_template.Amplicon_Sequence=df_template.Amplicon_Sequence.apply(capitalize_sequence)
            df_template.Expected_HDR=df_template.Expected_HDR.apply(capitalize_sequence)
            df_template.sgRNA=df_template.sgRNA.apply(capitalize_sequence)
            df_template.Coding_sequence=df_template.Coding_sequence.apply(capitalize_sequence)

            if not len(df_template.Amplicon_Sequence.unique())==df_template.shape[0]:
                duplicated_entries = df_template.Amplicon_Sequence[df_template.Amplicon_Sequence.duplicated()]
                raise Exception('The amplicon sequences must be distinct! (Duplicated entries: ' + str(duplicated_entries.values) + ')')

            if not len(df_template.Name.unique())==df_template.shape[0]:
                duplicated_entries = df_template.Name[df_template.Name.duplicated()]
                raise Exception('The amplicon names must be distinct! (Duplicated names: ' + str(duplicated_entries.values) + ')')

            df_template=df_template.set_index('Name')
            df_template.index=df_template.index.to_series().str.replace(' ','_')

            for idx,row in df_template.iterrows():

                wrong_nt=find_wrong_nt(row.Amplicon_Sequence)
                if wrong_nt:
                     raise NTException('The amplicon sequence %s contains wrong characters:%s' % (idx,' '.join(wrong_nt)))

                if not pd.isnull(row.sgRNA):

                    cut_points=[]

                    for current_guide_seq in row.sgRNA.strip().upper().split(','):

                        wrong_nt=find_wrong_nt(current_guide_seq)
                        if wrong_nt:
                            raise NTException('The sgRNA sequence %s contains wrong characters:%s'  % (current_guide_seq, ' '.join(wrong_nt)))

                        offset_fw=args.quantification_window_center+len(current_guide_seq)-1
                        offset_rc=(-args.quantification_window_center)-1
                        cut_points+=[m.start() + offset_fw for \
                                    m in re.finditer(current_guide_seq,  row.Amplicon_Sequence)]+[m.start() + offset_rc for m in re.finditer(reverse_complement(current_guide_seq),  row.Amplicon_Sequence)]

                    if not cut_points:
                        warn('\nThe guide sequence/s provided: %s is(are) not present in the amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!' % (row.sgRNA,row.Amplicon_Sequence))
                        df_template.ix[idx,'sgRNA']=''



        if RUNNING_MODE=='ONLY_AMPLICONS':
            #create a fasta file with all the amplicons
            amplicon_fa_filename=_jp('AMPLICONS.fa')
            fastq_gz_amplicon_filenames=[]
            with open(amplicon_fa_filename,'w+') as outfile:
                for idx,row in df_template.iterrows():
                    if row['Amplicon_Sequence']:
                        outfile.write('>%s\n%s\n' %(clean_filename('AMPL_'+idx),row['Amplicon_Sequence']))

                        #create place-holder fastq files
                        fastq_gz_amplicon_filenames.append(_jp('%s.fastq.gz' % clean_filename('AMPL_'+idx)))
                        open(fastq_gz_amplicon_filenames[-1], 'w+').close()

            df_template['Demultiplexed_fastq.gz_filename']=fastq_gz_amplicon_filenames
            info('Creating a custom index file with all the amplicons...')
            custom_index_filename=_jp('CUSTOM_BOWTIE2_INDEX')
            sb.call('bowtie2-build %s %s >>%s 2>&1' %(amplicon_fa_filename,custom_index_filename,log_filename), shell=True)


            #align the file to the amplicons (MODE 1)
            info('Align reads to the amplicons...')
            bam_filename_amplicons= _jp('CRISPResso_AMPLICONS_ALIGNED.bam')
            aligner_command= 'bowtie2 -x %s -p %s %s -U %s 2>>%s | samtools view -bS - > %s' %(custom_index_filename,args.n_processes,args.bowtie2_options_string,processed_output_filename,log_filename,bam_filename_amplicons)


            info('Alignment command: ' + aligner_command)
            sb.call(aligner_command,shell=True)

            N_READS_ALIGNED=get_n_aligned_bam(bam_filename_amplicons)

            s1=r"samtools view -F 4 %s 2>>%s | grep -v ^'@'" % (bam_filename_amplicons,log_filename)
            s2=r'''|awk '{ gzip_filename=sprintf("gzip >> OUTPUTPATH%s.fastq.gz",$3);\
            print "@"$1"\n"$10"\n+\n"$11  | gzip_filename;}' '''

            cmd=s1+s2.replace('OUTPUTPATH',_jp(''))
            sb.call(cmd,shell=True)

            info('Demultiplex reads and run CRISPResso on each amplicon...')
            n_reads_aligned_amplicons=[]
            crispresso_cmds = []
            for idx,row in df_template.iterrows():
                info('\n Processing:%s' %idx)
                n_reads_aligned_amplicons.append(get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename']))
                crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (row['Demultiplexed_fastq.gz_filename'],row['Amplicon_Sequence'],OUTPUT_DIRECTORY,idx)

                if n_reads_aligned_amplicons[-1]>args.min_reads_to_use_region:
                    if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                        crispresso_cmd+=' -g %s' % row['sgRNA']

                    if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                        crispresso_cmd+=' -e %s' % row['Expected_HDR']

                    if row['Coding_sequence'] and not pd.isnull(row['Coding_sequence']):
                        crispresso_cmd+=' -c %s' % row['Coding_sequence']

                    crispresso_cmd=CRISPRessoShared.propagate_crispresso_options(crispresso_cmd,crispresso_options_for_pooled,args)
                    crispresso_cmds.append(crispresso_cmd)

                else:
                    warn('Skipping amplicon [%s] because no reads align to it\n'% idx)

            CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,args.n_processes,'amplicon',args.skip_failed)

            df_template['n_reads']=n_reads_aligned_amplicons
            df_template['n_reads_aligned_%']=df_template['n_reads']/float(N_READS_ALIGNED)*100
            df_template.fillna('NA').to_csv(_jp('REPORT_READS_ALIGNED_TO_AMPLICONS.txt'),sep='\t')



        if RUNNING_MODE=='AMPLICONS_AND_GENOME':
            print 'Mapping amplicons to the reference genome...'
            #find the locations of the amplicons on the genome and their strand and check if there are mutations in the reference genome
            additional_columns=[]
            for idx,row in df_template.iterrows():
                fields_to_append=list(np.take(get_align_sequence(row.Amplicon_Sequence, args.bowtie2_index).split('\t'),[0,1,2,3,5]))
                if fields_to_append[0]=='*':
                    info('The amplicon [%s] is not mappable to the reference genome provided!' % idx )
                    additional_columns.append([idx,'NOT_ALIGNED',0,-1,'+',''])
                else:
                    additional_columns.append([idx]+fields_to_append)
                    info('The amplicon [%s] was mapped to: %s ' % (idx,' '.join(fields_to_append[:3]) ))


            df_template=df_template.join(pd.DataFrame(additional_columns,columns=['Name','chr_id','bpstart','bpend','strand','Reference_Sequence']).set_index('Name'))

            df_template.bpstart=df_template.bpstart.astype(int)
            df_template.bpend=df_template.bpend.astype(int)

            #Check reference is the same otherwise throw a warning
            for idx,row in df_template.iterrows():
                if row.Amplicon_Sequence != row.Reference_Sequence and row.Amplicon_Sequence != reverse_complement(row.Reference_Sequence):
                    warn('The amplicon sequence %s provided:\n%s\n\nis different from the reference sequence(both strand):\n\n%s\n\n%s\n' %(row.name,row.Amplicon_Sequence,row.Amplicon_Sequence,reverse_complement(row.Amplicon_Sequence)))


        if RUNNING_MODE=='ONLY_GENOME' or RUNNING_MODE=='AMPLICONS_AND_GENOME':

            ###HERE we recreate the uncompressed genome file if not available###

            #check you have all the files for the genome and create a fa idx for samtools

            uncompressed_reference=args.bowtie2_index+'.fa'

            #if not os.path.exists(GENOME_LOCAL_FOLDER):
            #    os.mkdir(GENOME_LOCAL_FOLDER)

            if os.path.exists(uncompressed_reference):
                info('The uncompressed reference fasta file for %s is already present! Skipping generation.' % args.bowtie2_index)
            else:
                #uncompressed_reference=os.path.join(GENOME_LOCAL_FOLDER,'UNCOMPRESSED_REFERENCE_FROM_'+args.bowtie2_index.replace('/','_')+'.fa')
                info('Extracting uncompressed reference from the provided bowtie2 index since it is not available... Please be patient!')

                cmd_to_uncompress='bowtie2-inspect %s > %s 2>>%s' % (args.bowtie2_index,uncompressed_reference,log_filename)
                sb.call(cmd_to_uncompress,shell=True)

                info('Indexing fasta file with samtools...')
                #!samtools faidx {uncompressed_reference}
                sb.call('samtools faidx %s 2>>%s ' % (uncompressed_reference,log_filename),shell=True)


        #####CORRECT ONE####
        #align in unbiased way the reads to the genome
        if RUNNING_MODE=='ONLY_GENOME' or RUNNING_MODE=='AMPLICONS_AND_GENOME':
            info('Aligning reads to the provided genome index...')
            bam_filename_genome = _jp('%s_GENOME_ALIGNED.bam' % database_id)
            aligner_command= 'bowtie2 -x %s -p %s %s -U %s 2>>%s| samtools view -bS - > %s' %(args.bowtie2_index,args.n_processes,args.bowtie2_options_string,processed_output_filename,log_filename,bam_filename_genome)
            info('aligning with command: ' + aligner_command)
            sb.call(aligner_command,shell=True)

            N_READS_ALIGNED=get_n_aligned_bam(bam_filename_genome)

            #REDISCOVER LOCATIONS and DEMULTIPLEX READS
            MAPPED_REGIONS=_jp('MAPPED_REGIONS/')
            if not os.path.exists(MAPPED_REGIONS):
                os.mkdir(MAPPED_REGIONS)

            s1=r'''samtools view -F 0x0004 %s 2>>%s |''' % (bam_filename_genome,log_filename)+\
            r'''awk '{OFS="\t"; bpstart=$4;  bpend=bpstart; split ($6,a,"[MIDNSHP]"); n=0;\
            for (i=1; i in a; i++){\
                n+=1+length(a[i]);\
                if (substr($6,n,1)=="S"){\
                    if (bpend==$4)\
                        bpstart-=a[i];\
                    else
                        bpend+=a[i];
                    }\
                else if( (substr($6,n,1)!="I")  && (substr($6,n,1)!="H") )\
                        bpend+=a[i];\
                }\
                if ( ($2 % 32)>=16)\
                    print $3,bpstart,bpend,"-",$1,$10,$11;\
                else\
                    print $3,bpstart,bpend,"+",$1,$10,$11;}' | '''

            s2=r'''  sort -k1,1 -k2,2n  | awk \
            'BEGIN{chr_id="NA";bpstart=-1;bpend=-1; fastq_filename="NA"}\
            { if ( (chr_id!=$1) || (bpstart!=$2) || (bpend!=$3) )\
                {\
                if (fastq_filename!="NA") {close(fastq_filename); system("gzip -f "fastq_filename)}\
                chr_id=$1; bpstart=$2; bpend=$3;\
                fastq_filename=sprintf("__OUTPUTPATH__REGION_%s_%s_%s.fastq",$1,$2,$3);\
                }\
            print "@"$5"\n"$6"\n+\n"$7 >> fastq_filename;\
            }' '''
            cmd=s1+s2.replace('__OUTPUTPATH__',MAPPED_REGIONS)

            info('Demultiplexing reads by location...')
            sb.call(cmd,shell=True)

            #gzip the missing ones
            sb.call('gzip -f %s/*.fastq' % MAPPED_REGIONS,shell=True)

        '''
        The most common use case, where many different target sites are pooled into a single
        high-throughput sequencing library for quantification, is not directly addressed by this implementation.
        Potential users of CRISPResso would need to write their own code to generate separate input files for processing.
        Importantly, this preprocessing code would need to remove any PCR amplification artifacts
        (such as amplification of sequences from a gene and a highly similar pseudogene )
        which may confound the interpretation of results.
        This can be done by mapping of input sequences to a reference genome and removing
        those that do not map to the expected genomic location, but is non-trivial for an end-user to implement.
        '''


        if RUNNING_MODE=='AMPLICONS_AND_GENOME':
            files_to_match=glob.glob(os.path.join(MAPPED_REGIONS,'REGION*'))
            n_reads_aligned_genome=[]
            fastq_region_filenames=[]

            crispresso_cmds = []
            for idx,row in df_template.iterrows():

                info('Processing amplicon: %s' % idx )

                #check if we have reads
                fastq_filename_region=os.path.join(MAPPED_REGIONS,'REGION_%s_%s_%s.fastq.gz' % (row['chr_id'],row['bpstart'],row['bpend']))

                if os.path.exists(fastq_filename_region):

                    N_READS=get_n_reads_fastq(fastq_filename_region)
                    n_reads_aligned_genome.append(N_READS)
                    fastq_region_filenames.append(fastq_filename_region)
                    files_to_match.remove(fastq_filename_region)
                    if N_READS>=args.min_reads_to_use_region:
                        info('\nThe amplicon [%s] has enough reads (%d) mapped to it! Running CRISPResso!\n' % (idx,N_READS))

                        crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' % (fastq_filename_region,row['Amplicon_Sequence'],OUTPUT_DIRECTORY,idx)

                        if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                            crispresso_cmd+=' -g %s' % row['sgRNA']

                        if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                            crispresso_cmd+=' -e %s' % row['Expected_HDR']

                        if row['Coding_sequence'] and not pd.isnull(row['Coding_sequence']):
                            crispresso_cmd+=' -c %s' % row['Coding_sequence']

                        crispresso_cmd=CRISPRessoShared.propagate_crispresso_options(crispresso_cmd,crispresso_options_for_pooled,args)
                        info('Running CRISPResso:%s' % crispresso_cmd)
                        crispresso_cmds.append(crispresso_cmd)

                    else:
                         warn('The amplicon [%s] has not enough reads (%d) mapped to it! Skipping the execution of CRISPResso!' % (idx,N_READS))
                else:
                    fastq_region_filenames.append('')
                    n_reads_aligned_genome.append(0)
                    warn("The amplicon %s doesn't have any read mapped to it!\n Please check your amplicon sequence." %  idx)

            CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,args.n_processes,'amplicon',args.skip_failed)

            df_template['Amplicon_Specific_fastq.gz_filename']=fastq_region_filenames
            df_template['n_reads']=n_reads_aligned_genome
            df_template['n_reads_aligned_%']=df_template['n_reads']/float(N_READS_ALIGNED)*100

            if args.gene_annotations:
                df_template=df_template.apply(lambda row: find_overlapping_genes(row, df_genes),axis=1)

            df_template.fillna('NA').to_csv(_jp('REPORT_READS_ALIGNED_TO_GENOME_AND_AMPLICONS.txt'),sep='\t')

            #write another file with the regions that do not match any amplicon

            info('Reporting problematic regions...')
            coordinates=[]
            for region in files_to_match:
                coordinates.append(os.path.basename(region).replace('.fastq.gz','').replace('.fastq','').split('_')[1:4]+[region,get_n_reads_fastq(region)])

            df_regions=pd.DataFrame(coordinates,columns=['chr_id','bpstart','bpend','fastq_file','n_reads'])

            df_regions.dropna(inplace=True) #remove regions in chrUn

            df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
            df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])
            df_regions['n_reads'] = pd.to_numeric(df_regions['n_reads'])

            df_regions.bpstart=df_regions.bpstart.astype(int)
            df_regions.bpend=df_regions.bpend.astype(int)

            df_regions['n_reads_aligned_%']=df_regions['n_reads']/float(N_READS_ALIGNED)*100

            df_regions['Reference_sequence']=df_regions.apply(lambda row: get_region_from_fa(row.chr_id,row.bpstart,row.bpend,uncompressed_reference),axis=1)


            if args.gene_annotations:
                info('Checking overlapping genes...')
                df_regions=df_regions.apply(lambda row: find_overlapping_genes(row, df_genes),axis=1)

            if np.sum(np.array(map(int,pd.__version__.split('.')))*(100,10,1))< 170:
                df_regions.sort('n_reads',ascending=False,inplace=True)
            else:
                df_regions.sort_values(by='n_reads',ascending=False,inplace=True)


            df_regions.fillna('NA').to_csv(_jp('REPORTS_READS_ALIGNED_TO_GENOME_NOT_MATCHING_AMPLICONS.txt'),sep='\t',index=None)


        if RUNNING_MODE=='ONLY_GENOME' :
            #Load regions and build REFERENCE TABLES
            info('Parsing the demultiplexed files and extracting locations and reference sequences...')
            coordinates=[]
            for region in glob.glob(os.path.join(MAPPED_REGIONS,'REGION*.fastq.gz')):
                coord_from_filename = os.path.basename(region).replace('.fastq.gz','').split('_')[1:4]
#                print('ccord from filename: ' + str(coord_from_filename))
                if not (coord_from_filename[1].isdigit() and coord_from_filename[2].isdigit()):
                    warn('Skipping region [%s] because the region name cannot be parsed\n'% region)
                    continue
                coordinates.append(coord_from_filename+[region,get_n_reads_fastq(region)])

            df_regions=pd.DataFrame(coordinates,columns=['chr_id','bpstart','bpend','fastq_file','n_reads'])

            df_regions.dropna(inplace=True) #remove regions in chrUn

            df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
            df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])
            df_regions['n_reads'] = pd.to_numeric(df_regions['n_reads'])

            df_regions.bpstart=df_regions.bpstart.astype(int)
            df_regions.bpend=df_regions.bpend.astype(int)
            df_regions['sequence']=df_regions.apply(lambda row: get_region_from_fa(row.chr_id,row.bpstart,row.bpend,uncompressed_reference),axis=1)

            df_regions['n_reads_aligned_%']=df_regions['n_reads']/float(N_READS_ALIGNED)*100

            if args.gene_annotations:
                info('Checking overlapping genes...')
                df_regions=df_regions.apply(lambda row: find_overlapping_genes(row, df_genes),axis=1)

            if np.sum(np.array(map(int,pd.__version__.split('.')))*(100,10,1))< 170:
                df_regions.sort('n_reads',ascending=False,inplace=True)
            else:
                df_regions.sort_values(by='n_reads',ascending=False,inplace=True)


            df_regions.fillna('NA').to_csv(_jp('REPORT_READS_ALIGNED_TO_GENOME_ONLY.txt'),sep='\t',index=None)


            #run CRISPResso
            #demultiplex reads in the amplicons and call crispresso!
            info('Running CRISPResso on the regions discovered...')
            crispresso_cmds = []
            for idx,row in df_regions.iterrows():

                if row.n_reads > args.min_reads_to_use_region:
                    info('\nRunning CRISPResso on: %s-%d-%d...'%(row.chr_id,row.bpstart,row.bpend ))
                    crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s' %(row.fastq_file,row.sequence,OUTPUT_DIRECTORY)
                    crispresso_cmd=CRISPRessoShared.propagate_crispresso_options(crispresso_cmd,crispresso_options_for_pooled,args)
                    crispresso_cmds.append(crispresso_cmd)
                else:
                    info('Skipping region: %s-%d-%d , not enough reads (%d)' %(row.chr_id,row.bpstart,row.bpend, row.n_reads))
            CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,args.n_processes,'region',args.skip_failed)

        #write alignment statistics
        with open(_jp('MAPPING_STATISTICS.txt'),'w+') as outfile:
            outfile.write('READS IN INPUTS:%d\nREADS AFTER PREPROCESSING:%d\nREADS ALIGNED:%d' % (N_READS_INPUT,N_READS_AFTER_PREPROCESSING,N_READS_ALIGNED))

        quantification_summary=[]

        if RUNNING_MODE=='ONLY_AMPLICONS' or RUNNING_MODE=='AMPLICONS_AND_GENOME':
            df_final_data=df_template
        else:
            df_final_data=df_regions

        all_region_names = []
        all_region_read_counts = {}
        good_region_names = []
        good_region_folders = {}
        header = 'Name\tUnmodified%\tModified%\tReads_aligned\tReads_total\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
        header_els = header.split("\t")
        header_el_count = len(header_els)
        empty_line_els = [np.nan]*(header_el_count-1)
        n_reads_index = header_els.index('Reads_total') - 1
        for idx,row in df_final_data.iterrows():
                run_name = idx
                if RUNNING_MODE=='ONLY_AMPLICONS' or RUNNING_MODE=='AMPLICONS_AND_GENOME':
                    run_name=idx
                else:
                    run_name='REGION_%s_%d_%d' %(row.chr_id,row.bpstart,row.bpend )
                folder_name = 'CRISPResso_on_%s'%run_name

                all_region_names.append(run_name)
                all_region_read_counts[run_name] = row.n_reads

                run_file = os.path.join(_jp(folder_name),'CRISPResso2_info.pickle')
                if not os.path.exists(run_file):
                    warn('Skipping the folder %s: not enough reads, incomplete, or empty folder.'% folder_name)
                    this_els = empty_line_els[:]
                    this_els[n_reads_index] = row.n_reads
                    to_add = [run_name]
                    to_add.extend(this_els)
                    quantification_summary.append(to_add)
                else:
                    run_data = cp.load(open(run_file,'rb'))
                    ref_name = run_data['ref_names'][0] #only expect one amplicon sequence
                    n_tot = row.n_reads
                    n_aligned = run_data['counts_total'][ref_name]
                    n_unmod = run_data['counts_unmodified'][ref_name]
                    n_mod = run_data['counts_modified'][ref_name]
                    n_discarded = run_data['counts_discarded'][ref_name]

                    n_insertion = run_data['counts_insertion'][ref_name]
                    n_deletion = run_data['counts_deletion'][ref_name]
                    n_substitution = run_data['counts_substitution'][ref_name]
                    n_only_insertion = run_data['counts_only_insertion'][ref_name]
                    n_only_deletion = run_data['counts_only_deletion'][ref_name]
                    n_only_substitution = run_data['counts_only_substitution'][ref_name]
                    n_insertion_and_deletion = run_data['counts_insertion_and_deletion'][ref_name]
                    n_insertion_and_substitution = run_data['counts_insertion_and_substitution'][ref_name]
                    n_deletion_and_substitution = run_data['counts_deletion_and_substitution'][ref_name]
                    n_insertion_and_deletion_and_substitution = run_data['counts_insertion_and_deletion_and_substitution'][ref_name]

                    unmod_pct = np.nan
                    mod_pct = np.nan
                    if n_aligned > 0:
                        unmod_pct = 100*n_unmod/float(n_aligned)
                        mod_pct = 100*n_mod/float(n_aligned)


                    vals = [run_name]
                    vals.extend([round(unmod_pct,8),round(mod_pct,8),n_aligned,n_tot,n_unmod,n_mod,n_discarded,n_insertion,n_deletion,n_substitution,n_only_insertion,n_only_deletion,n_only_substitution,n_insertion_and_deletion,n_insertion_and_substitution,n_deletion_and_substitution,n_insertion_and_deletion_and_substitution])
                    quantification_summary.append(vals)

                    good_region_names.append(run_name)
                    good_region_folders[idx] = folder_name


        samples_quantification_summary_filename = _jp('SAMPLES_QUANTIFICATION_SUMMARY.txt')

        df_summary_quantification=pd.DataFrame(quantification_summary,columns=header_els)
        if args.crispresso1_mode:
            crispresso1_columns=['Name','Unmodified%','Modified%','Reads_aligned','Reads_total']
            df_summary_quantification.fillna('NA').to_csv(samples_quantification_summary_filename,sep='\t',index=None,columns=crispresso1_columns)
        else:

            df_summary_quantification.fillna('NA').to_csv(samples_quantification_summary_filename,sep='\t',index=None)

        crispresso2_info['samples_quantification_summary_filename'] = os.path.basename(samples_quantification_summary_filename)
        crispresso2_info['final_data'] = df_final_data
        crispresso2_info['all_region_names'] = all_region_names
        crispresso2_info['all_region_read_counts'] = all_region_read_counts
        crispresso2_info['good_region_names'] = good_region_names
        crispresso2_info['good_region_folders'] = good_region_folders
        crispresso2_info['running_mode'] = RUNNING_MODE

        crispresso2_info['summary_plot_names'] = []
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}


        df_summary_quantification.set_index('Name')

        save_png = True
        if args.suppress_report:
            save_png = False

        plot_root = _jp("CRISPRessoPooled_reads_summary")
        CRISPRessoPlot.plot_reads_total(plot_root,df_summary_quantification,save_png,args.min_reads_to_use_region)
        plot_name = os.path.basename(plot_root)
        crispresso2_info['summary_plot_root'] = plot_name
        crispresso2_info['summary_plot_names'].append(plot_name)
        crispresso2_info['summary_plot_titles'][plot_name] = 'CRISPRessoPooled Read Allocation Summary'
        crispresso2_info['summary_plot_labels'][plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
        crispresso2_info['summary_plot_datas'][plot_name] = [('CRISPRessoPooled summary',os.path.basename(samples_quantification_summary_filename))]

        plot_root = _jp("CRISPRessoPooled_modification_summary")
        CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,df_summary_quantification,save_png,args.min_reads_to_use_region)
        plot_name = os.path.basename(plot_root)
        crispresso2_info['summary_plot_root'] = plot_name
        crispresso2_info['summary_plot_names'].append(plot_name)
        crispresso2_info['summary_plot_titles'][plot_name] = 'CRISPRessoPooled Modification Summary'
        crispresso2_info['summary_plot_labels'][plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
        crispresso2_info['summary_plot_datas'][plot_name] = [('CRISPRessoPooled summary',os.path.basename(samples_quantification_summary_filename))]




        #if many reads weren't aligned, print those out for the user
        if RUNNING_MODE != 'ONLY_GENOME':
            #N_READS_INPUT=get_n_reads_fastq(args.fastq_r1)
            #N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
    		tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum()
    		tot_reads = df_summary_quantification['Reads_total'].sum()

    		if RUNNING_MODE=='AMPLICONS_AND_GENOME':
    			this_bam_filename = bam_filename_genome
    		if RUNNING_MODE=='ONLY_AMPLICONS':
    			this_bam_filename = bam_filename_amplicons
    		#if less than 1/2 of reads aligned, find most common unaligned reads and advise the user
    		if N_READS_INPUT > 0 and tot_reads/float(N_READS_INPUT) < 0.5:
    			warn('Less than half (%d/%d) of reads aligned. Finding most frequent unaligned reads.'%(tot_reads,N_READS_INPUT))
    			###
    			###this results in the unpretty messages being printed:
    			### sort: write failed: standard output: Broken pipe
    			### sort: write error
    			###
    			#cmd = "samtools view -f 4 %s | awk '{print $10}' | sort | uniq -c | sort -nr | head -n 10"%this_bam_filename
    			import signal
    			def default_sigpipe():
    				    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    			cmd = "samtools view -f 4 %s | head -n 10000 | awk '{print $10}' | sort | uniq -c | sort -nr | head -n 10 | awk '{print $2}'"%this_bam_filename
#    			print("command is: "+cmd)
#    		    p = sb.Popen(cmd, shell=True,stdout=sb.PIPE)
		    	p = sb.Popen(cmd, shell=True,stdout=sb.PIPE,preexec_fn=default_sigpipe)
    			top_unaligned = p.communicate()[0]
    			top_unaligned_filename=_jp('CRISPRessoPooled_TOP_UNALIGNED.txt')

    			with open(top_unaligned_filename,'w') as outfile:
    				outfile.write(top_unaligned)
    			warn('Perhaps one or more of the given amplicon sequences were incomplete or incorrect. Below is a list of the most frequent unaligned reads (in the first 10000 unaligned reads). Check this list to see if an amplicon is among these reads.\n%s'%top_unaligned)


        #cleaning up
        if not args.keep_intermediate:
             info('Removing Intermediate files...')

             if args.fastq_r2!='':
                 files_to_remove=[processed_output_filename,flash_hist_filename,flash_histogram_filename,\
                              flash_not_combined_1_filename,flash_not_combined_2_filename]
             else:
                 files_to_remove=[processed_output_filename]

             if args.trim_sequences and args.fastq_r2!='':
                 files_to_remove+=[output_forward_paired_filename,output_reverse_paired_filename,\
                                                   output_forward_unpaired_filename,output_reverse_unpaired_filename]

             if RUNNING_MODE=='ONLY_GENOME' or RUNNING_MODE=='AMPLICONS_AND_GENOME':
                     files_to_remove+=[bam_filename_genome]

             if RUNNING_MODE=='ONLY_AMPLICONS':
                files_to_remove+=[bam_filename_amplicons,amplicon_fa_filename]
                for bowtie2_file in glob.glob(_jp('CUSTOM_BOWTIE2_INDEX.*')):
                    files_to_remove.append(bowtie2_file)

             for file_to_remove in files_to_remove:
                 try:
                         if os.path.islink(file_to_remove):
                             #print 'LINK',file_to_remove
                             os.unlink(file_to_remove)
                         else:
                             os.remove(file_to_remove)
                 except:
                         warn('Skipping:%s' %file_to_remove)

        if not args.suppress_report:
            if (args.place_report_in_output_folder):
                report_name = _jp("CRISPResso2Pooled_report.html")
            else:
                report_name = OUTPUT_DIRECTORY+'.html'
            CRISPRessoReport.make_pooled_report_from_folder(report_name,crispresso2_info,OUTPUT_DIRECTORY,_ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        cp.dump(crispresso2_info, open(crispresso2WGS_info_file, 'wb' ) )

        info('All Done!')
        print CRISPRessoShared.get_crispresso_footer()
        sys.exit(0)

    except Exception as e:
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Ejemplo n.º 9
0
def main():
    try:
        description = [
            '~~~CRISPRessoCompare~~~',
            '-Comparison of two CRISPResso analyses-'
        ]
        compare_header = r'''
 ___________________________
| __ __      __      __  __ |
|/  /  \|\/||__) /\ |__)|_  |
|\__\__/|  ||   /--\| \ |__ |
|___________________________|
        '''
        compare_header = CRISPRessoShared.get_crispresso_header(
            description, compare_header)
        print(compare_header)

        parser = argparse.ArgumentParser(
            description='CRISPRessoCompare Parameters',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument(
            'crispresso_output_folder_1',
            type=str,
            help='First output folder with CRISPResso analysis')
        parser.add_argument(
            'crispresso_output_folder_2',
            type=str,
            help='Second output folder with CRISPResso analysis')

        #OPTIONALS
        parser.add_argument('-n', '--name', help='Output name', default='')
        parser.add_argument('-n1',
                            '--sample_1_name',
                            help='Sample 1 name',
                            default='Sample_1')
        parser.add_argument('-n2',
                            '--sample_2_name',
                            help='Sample 2 name',
                            default='Sample_2')
        parser.add_argument('-o', '--output_folder', help='', default='')
        parser.add_argument(
            '--min_frequency_alleles_around_cut_to_plot',
            type=float,
            help=
            'Minimum %% reads required to report an allele in the alleles table plot.',
            default=0.2)
        parser.add_argument(
            '--max_rows_alleles_around_cut_to_plot',
            type=int,
            help='Maximum number of rows to report in the alleles table plot. ',
            default=50)
        parser.add_argument(
            '--save_also_png',
            help='Save also .png images additionally to .pdf files',
            action='store_true')
        parser.add_argument('--debug',
                            help='Show debug messages',
                            action='store_true')

        args = parser.parse_args()
        debug_flag = args.debug

        #check that the CRISPResso output is present and fill amplicon_info
        quantification_file_1, amplicon_names_1, amplicon_info_1 = CRISPRessoShared.check_output_folder(
            args.crispresso_output_folder_1)
        quantification_file_2, amplicon_names_2, amplicon_info_2 = CRISPRessoShared.check_output_folder(
            args.crispresso_output_folder_2)

        get_name_from_folder = lambda x: os.path.basename(os.path.abspath(
            x)).replace('CRISPResso_on_', '')

        if not args.name:
            database_id = '%s_VS_%s' % (
                get_name_from_folder(args.crispresso_output_folder_1),
                get_name_from_folder(args.crispresso_output_folder_2))
        else:
            database_id = args.name

        OUTPUT_DIRECTORY = 'CRISPRessoCompare_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory
        log_filename = _jp('CRISPRessoCompare_RUNNING_LOG.txt')

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except:
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoCompare_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                '[Command used]:\nCRISPRessoCompare %s\n\n[Execution log]:\n' %
                ' '.join(sys.argv))

        #LOAD DATA
        amplicon_names_in_both = [
            amplicon_name for amplicon_name in amplicon_names_1
            if amplicon_name in amplicon_names_2
        ]
        n_refs = len(amplicon_names_in_both)

        def get_plot_title_with_ref_name(plotTitle, refName):
            if n_refs > 1:
                return (plotTitle + ": " + refName)
            return plotTitle

        for amplicon_name in amplicon_names_in_both:
            profile_1 = parse_profile(
                amplicon_info_1[amplicon_name]['quantification_file'])
            profile_2 = parse_profile(
                amplicon_info_2[amplicon_name]['quantification_file'])

            try:
                assert np.all(profile_1[:, 0] == profile_2[:, 0])
            except:
                raise DifferentAmpliconLengthException(
                    'Different amplicon lengths for the two amplicons.')
            len_amplicon = profile_1.shape[0]
            effect_vector_any_1 = profile_1[:, 1]
            effect_vector_any_2 = profile_2[:, 1]
            cut_points, sgRNA_intervals = load_cut_points_sgRNA_intervals(
                args.crispresso_output_folder_1, amplicon_name)

            #Quantification comparison barchart
            fig = plt.figure(figsize=(30, 15))
            n_groups = 2

            N_TOTAL_1 = float(amplicon_info_1[amplicon_name]['Total'])
            N_UNMODIFIED_1 = float(
                amplicon_info_1[amplicon_name]['Unmodified'])
            N_MODIFIED_1 = float(amplicon_info_1[amplicon_name]['Modified'])

            N_TOTAL_2 = float(amplicon_info_2[amplicon_name]['Total'])
            N_UNMODIFIED_2 = float(
                amplicon_info_2[amplicon_name]['Unmodified'])
            N_MODIFIED_2 = float(amplicon_info_2[amplicon_name]['Modified'])

            means_sample_1 = np.array([N_UNMODIFIED_1, N_MODIFIED_1
                                       ]) / N_TOTAL_1 * 100
            means_sample_2 = np.array([N_UNMODIFIED_2, N_MODIFIED_2
                                       ]) / N_TOTAL_2 * 100

            ax1 = fig.add_subplot(1, 2, 1)

            index = np.arange(n_groups)
            bar_width = 0.35

            opacity = 0.4
            error_config = {'ecolor': '0.3'}

            rects1 = ax1.bar(index,
                             means_sample_1,
                             bar_width,
                             alpha=opacity,
                             color=(0, 0, 1, 0.4),
                             label=args.sample_1_name)

            rects2 = ax1.bar(index + bar_width,
                             means_sample_2,
                             bar_width,
                             alpha=opacity,
                             color=(1, 0, 0, 0.4),
                             label=args.sample_2_name)

            plt.ylabel('% Sequences')
            plt.title(
                get_plot_title_with_ref_name(
                    '%s VS %s' % (args.sample_1_name, args.sample_2_name),
                    amplicon_name))
            plt.xticks(index + bar_width / 2.0, ('Unmodified', 'Modified'))
            plt.legend()
            #            plt.xlim(index[0]-0.2,(index + bar_width)[-1]+bar_width+0.2)
            plt.tight_layout()

            ax2 = fig.add_subplot(1, 2, 2)
            ax2.bar(index,
                    means_sample_1 - means_sample_2,
                    bar_width + 0.35,
                    alpha=opacity,
                    color=(0, 1, 1, 0.4),
                    label='')

            plt.ylabel('% Sequences Difference')
            plt.title(
                get_plot_title_with_ref_name(
                    '%s - %s' % (args.sample_1_name, args.sample_2_name),
                    amplicon_name))
            plt.xticks(index, ['Unmodified', 'Modified'])

            #            plt.xlim(index[0]-bar_width/2, (index+bar_width)[-1]+2*bar_width)
            plt.tight_layout()
            plt.savefig(_jp('1.' + amplicon_name +
                            '.Comparison_Efficiency.pdf'),
                        bbox_inches='tight')
            if args.save_also_png:
                plt.savefig(_jp('1.' + amplicon_name +
                                '.Comparison_Efficiency.png'),
                            bbox_inches='tight')

            #profile comparison
            fig = plt.figure(figsize=(20, 10))

            ax1 = fig.add_subplot(1, 2, 1)
            plt.title(
                get_plot_title_with_ref_name('Mutation position distribution',
                                             amplicon_name))
            y_max = max(effect_vector_any_1.max(),
                        effect_vector_any_2.max()) * 1.2

            plt.plot(effect_vector_any_1,
                     color=(0, 0, 1, 0.3),
                     lw=4,
                     label='%s combined mutations' % args.sample_1_name)
            #            plt.hold(True)
            plt.plot(effect_vector_any_2,
                     color=(1, 0, 0, 0.3),
                     lw=4,
                     label='%s combined mutations' % args.sample_2_name)

            if cut_points:
                for idx, cut_point in enumerate(cut_points):
                    if idx == 0:
                        plt.plot([cut_point, cut_point], [0, y_max],
                                 '--k',
                                 lw=2,
                                 label='Predicted cleavage position')
                    else:
                        plt.plot([cut_point, cut_point], [0, y_max],
                                 '--k',
                                 lw=2,
                                 label='_nolegend_')

                for idx, sgRNA_int in enumerate(sgRNA_intervals):
                    if idx == 0:
                        plt.plot([sgRNA_int[0], sgRNA_int[1]], [0, 0],
                                 lw=10,
                                 c=(0, 0, 0, 0.15),
                                 label='sgRNA')
                    else:
                        plt.plot([sgRNA_int[0], sgRNA_int[1]], [0, 0],
                                 lw=10,
                                 c=(0, 0, 0, 0.15),
                                 label='_nolegend_')

            lgd = plt.legend(loc='center',
                             bbox_to_anchor=(0.5, -0.3),
                             ncol=1,
                             fancybox=True,
                             shadow=False)

            plt.xticks(
                np.arange(0, len_amplicon,
                          max(3, (len_amplicon / 6) -
                              (len_amplicon / 6) % 5)).astype(int))
            plt.xlabel('Reference amplicon position (bp)')
            plt.ylabel('Sequences %')
            plt.ylim(0, max(1, y_max))
            plt.xlim(xmax=len_amplicon - 1)

            ax2 = fig.add_subplot(1, 2, 2)

            effect_vector_any_diff = effect_vector_any_1 - effect_vector_any_2

            y_max = effect_vector_any_diff.max() * 1.2
            y_min = effect_vector_any_diff.min() * 1.2

            plt.title(
                get_plot_title_with_ref_name(
                    '%s - %s' % (args.sample_1_name, args.sample_2_name),
                    amplicon_name))
            plt.plot(effect_vector_any_diff,
                     color=(0, 1, 0, 0.4),
                     lw=3,
                     label='Difference')

            if cut_points:
                for idx, cut_point in enumerate(cut_points):
                    if idx == 0:
                        plt.plot(
                            [cut_point, cut_point],
                            [min(-1, y_min), max(1, y_max)],
                            '--k',
                            lw=2,
                            label='Predicted cleavage position')
                    else:
                        plt.plot(
                            [cut_point, cut_point],
                            [min(-1, y_min), max(1, y_max)],
                            '--k',
                            lw=2,
                            label='_nolegend_')

                for idx, sgRNA_int in enumerate(sgRNA_intervals):
                    if idx == 0:
                        plt.plot(
                            [sgRNA_int[0], sgRNA_int[1]],
                            [min(-1, y_min), min(-1, y_min)],
                            lw=10,
                            c=(0, 0, 0, 0.15),
                            label='sgRNA')
                    else:
                        plt.plot(
                            [sgRNA_int[0], sgRNA_int[1]],
                            [min(-1, y_min), min(-1, y_min)],
                            lw=10,
                            c=(0, 0, 0, 0.15),
                            label='_nolegend_')

            lgd2 = plt.legend(loc='center',
                              bbox_to_anchor=(0.5, -0.2),
                              ncol=1,
                              fancybox=True,
                              shadow=False)
            plt.xticks(
                np.arange(0, len_amplicon,
                          max(3, (len_amplicon / 6) -
                              (len_amplicon / 6) % 5)).astype(int))
            plt.xlabel('Reference amplicon position (bp)')
            plt.ylabel('Sequences Difference %')
            plt.xlim(xmax=len_amplicon - 1)

            plt.ylim(min(-1, y_min), max(1, y_max))

            plt.savefig(_jp(
                '2.' + amplicon_name +
                '.Comparison_Combined_Insertion_Deletion_Substitution_Locations.pdf'
            ),
                        bbox_extra_artists=(lgd, ),
                        bbox_inches='tight')
            if args.save_also_png:
                plt.savefig(_jp(
                    '2.' + amplicon_name +
                    '.Comparison_Insertion_Deletion_Substitution_Locations.png'
                ),
                            bbox_extra_artists=(lgd, ),
                            bbox_inches='tight')

            mod_file_1 = amplicon_info_1[amplicon_name][
                'modification_count_file']
            amp_seq_1, mod_freqs_1 = CRISPRessoShared.parse_count_file(
                mod_file_1)
            mod_file_2 = amplicon_info_2[amplicon_name][
                'modification_count_file']
            amp_seq_2, mod_freqs_2 = CRISPRessoShared.parse_count_file(
                mod_file_2)
            consensus_sequence = amp_seq_1
            if amp_seq_2 != consensus_sequence:
                raise DifferentAmpliconLengthException(
                    'Different amplicon lengths for the two amplicons.')

            for mod in [
                    'Insertions', 'Deletions', 'Substitutions',
                    'All_modifications'
            ]:

                mod_counts_1 = np.array(mod_freqs_1[mod], dtype=float)
                tot_counts_1 = np.array(mod_freqs_1['Total'], dtype=float)
                unmod_counts_1 = tot_counts_1 - mod_counts_1

                mod_counts_2 = np.array(mod_freqs_2[mod], dtype=float)
                tot_counts_2 = np.array(mod_freqs_2['Total'], dtype=float)
                unmod_counts_2 = tot_counts_2 - mod_counts_2

                fisher_results = [
                    stats.fisher_exact([[z[0], z[1]], [z[2], z[3]]])
                    if max(z) > 0 else [nan, 1.0]
                    for z in zip(mod_counts_1, unmod_counts_1, mod_counts_2,
                                 unmod_counts_2)
                ]
                oddsratios, pvalues = [a for a, b in fisher_results
                                       ], [b for a, b in fisher_results]

                mod_df = []
                row = [args.sample_1_name + '_' + mod]
                row.extend(mod_counts_1)
                mod_df.append(row)

                row = [args.sample_1_name + '_total']
                row.extend(tot_counts_1)
                mod_df.append(row)

                row = [args.sample_2_name + '_' + mod]
                row.extend(mod_counts_2)
                mod_df.append(row)

                row = [args.sample_2_name + '_total']
                row.extend(tot_counts_2)
                mod_df.append(row)

                row = ['odds_ratios']
                row.extend(oddsratios)
                mod_df.append(row)

                row = ['pvalues']
                row.extend(pvalues)
                mod_df.append(row)

                colnames = ['Reference']
                colnames.extend(list(consensus_sequence))
                mod_df = pd.DataFrame(mod_df, columns=colnames)
                #                mod_df = pd.concat([mod_df.iloc[:,0:2], mod_df.iloc[:,2:].apply(pd.to_numeric)],axis=1)
                #write to file
                mod_df.to_csv(_jp(amplicon_name + '.' + mod +
                                  '_quantification.txt'),
                              sep='\t',
                              index=None)

                #plot
                fig = plt.figure(figsize=(20, 10))
                ax1 = fig.add_subplot(2, 1, 1)

                diff = np.divide(mod_counts_1, tot_counts_1) - np.divide(
                    mod_counts_2, tot_counts_2)
                diff_plot = ax1.plot(diff,
                                     color=(0, 1, 0, 0.4),
                                     lw=3,
                                     label='Difference')
                ax1.set_title(
                    get_plot_title_with_ref_name(
                        '%s: %s - %s' %
                        (mod, args.sample_1_name, args.sample_2_name),
                        amplicon_name))
                ax1.set_xticks(
                    np.arange(
                        0, len_amplicon,
                        max(3, (len_amplicon / 6) -
                            (len_amplicon / 6) % 5)).astype(int))
                ax1.set_ylabel('Sequences Difference %')
                ax1.set_xlim(xmin=0, xmax=len_amplicon - 1)

                pvalues = np.array(pvalues)
                min_nonzero = np.min(pvalues[np.nonzero(pvalues)])
                pvalues[pvalues == 0] = min_nonzero
                #ax2 = ax1.twinx()
                ax2 = fig.add_subplot(2, 1, 2)
                pval_plot = ax2.plot(-1 * np.log10(pvalues),
                                     color=(1, 0, 0, 0.4),
                                     lw=2,
                                     label='-log10 P-value')
                ax2.set_ylabel('-log10 P-value')
                ax2.set_xlim(xmin=0, xmax=len_amplicon - 1)
                ax2.set_xticks(
                    np.arange(
                        0, len_amplicon,
                        max(3, (len_amplicon / 6) -
                            (len_amplicon / 6) % 5)).astype(int))
                ax2.set_xlabel('Reference amplicon position (bp)')

                #bonferroni correction
                corrected_p = -1 * np.log10(
                    0.01 / float(len(consensus_sequence)))
                cutoff_plot = ax2.plot([0, len(consensus_sequence)],
                                       [corrected_p, corrected_p],
                                       color='k',
                                       dashes=(5, 10),
                                       label='Bonferronni corrected cutoff')

                plots = diff_plot + pval_plot + cutoff_plot

                diff_y_min, diff_y_max = ax1.get_ylim()
                p_y_min, p_y_max = ax2.get_ylim()
                if cut_points:
                    for idx, cut_point in enumerate(cut_points):
                        if idx == 0:
                            plot_cleavage = ax1.plot(
                                [cut_point, cut_point],
                                [diff_y_min, diff_y_max],
                                '--k',
                                lw=2,
                                label='Predicted cleavage position')
                            ax2.plot([cut_point, cut_point],
                                     [p_y_min, p_y_max],
                                     '--k',
                                     lw=2,
                                     label='Predicted cleavage position')
                            plots = plots + plot_cleavage
                        else:
                            ax1.plot([cut_point, cut_point],
                                     [diff_y_min, diff_y_max],
                                     '--k',
                                     lw=2,
                                     label='_nolegend_')
                            ax2.plot([cut_point, cut_point],
                                     [diff_y_min, diff_y_max],
                                     '--k',
                                     lw=2,
                                     label='_nolegend_')

                    for idx, sgRNA_int in enumerate(sgRNA_intervals):
                        if idx == 0:
                            p2 = ax1.plot([sgRNA_int[0], sgRNA_int[1]],
                                          [diff_y_min, diff_y_min],
                                          lw=10,
                                          c=(0, 0, 0, 0.15),
                                          label='sgRNA')
                            ax2.plot([sgRNA_int[0], sgRNA_int[1]],
                                     [p_y_min, p_y_min],
                                     lw=10,
                                     c=(0, 0, 0, 0.15),
                                     label='sgRNA')
                            plots = plots + p2
                        else:
                            ax1.plot([sgRNA_int[0], sgRNA_int[1]],
                                     [diff_y_min, diff_y_min],
                                     lw=10,
                                     c=(0, 0, 0, 0.15),
                                     label='_nolegend_')
                            ax2.plot([sgRNA_int[0], sgRNA_int[1]],
                                     [p_y_min, p_y_min],
                                     lw=10,
                                     c=(0, 0, 0, 0.15),
                                     label='_nolegend_')

                labs = [p.get_label() for p in plots]
                lgd = plt.legend(plots,
                                 labs,
                                 loc='upper center',
                                 bbox_to_anchor=(0.5, -0.2),
                                 ncol=1,
                                 fancybox=True,
                                 shadow=False)

                plt.savefig(_jp('2.' + amplicon_name + '.' + mod +
                                '.quantification.pdf'),
                            bbox_extra_artists=(lgd, ),
                            bbox_inches='tight')
                if args.save_also_png:
                    plt.savefig(_jp('2.' + amplicon_name + '.' + mod +
                                    '.quantification.png'),
                                bbox_extra_artists=(lgd, ),
                                bbox_inches='tight')

            #create merged heatmaps for each cut site
            allele_files_1 = amplicon_info_1[amplicon_name]['allele_files']
            allele_files_2 = amplicon_info_2[amplicon_name]['allele_files']
            for allele_file_1 in allele_files_1:
                allele_file_1_name = os.path.split(allele_file_1)[
                    1]  #get file part of path
                for allele_file_2 in allele_files_2:
                    allele_file_2_name = os.path.split(allele_file_2)[
                        1]  #get file part of path
                    #if files are the same (same amplicon, cut site, guide), run comparison
                    if allele_file_1_name == allele_file_2_name:
                        df1 = pd.read_csv(allele_file_1, sep="\t")
                        df2 = pd.read_csv(allele_file_2, sep="\t")

                        #find unmodified reference for comparison (if it exists)
                        ref_seq_around_cut = ""
                        if len(df1.loc[df1['Reference_Sequence'].str.contains(
                                '-') == False]) > 0:
                            ref_seq_around_cut = df1.loc[
                                df1['Reference_Sequence'].str.contains('-') ==
                                False]['Reference_Sequence'].iloc[0]
                        #otherwise figure out which sgRNA was used for this comparison
                        elif len(df2.loc[df2['Reference_Sequence'].str.
                                         contains('-') == False]) > 0:
                            ref_seq_around_cut = df2.loc[
                                df2['Reference_Sequence'].str.contains('-') ==
                                False]['Reference_Sequence'].iloc[0]
                        else:
                            seq_len = df2[df2['Unedited'] ==
                                          True]['Reference_Sequence'].iloc[0]
                            for sgRNA_interval, cut_point in zip(
                                    sgRNA_intervals, cut_points):
                                sgRNA_seq = consensus_sequence[
                                    sgRNA_interval[0]:sgRNA_interval[1]]
                                if sgRNA_seq in allele_file_1_name:
                                    this_sgRNA_seq = sgRNA_seq
                                    this_cut_point = cut_point
                                    ref_seq_around_cut = consensus_sequence[max(
                                        0, this_cut_point -
                                        args.offset_around_cut_to_plot +
                                        1):min(
                                            len(reference_seq), cut_point +
                                            args.offset_around_cut_to_plot +
                                            1)]
                                    break

                        merged = pd.merge(df1,
                                          df2,
                                          on=[
                                              'Aligned_Sequence',
                                              'Reference_Sequence', 'Unedited',
                                              'n_deleted', 'n_inserted',
                                              'n_mutated'
                                          ],
                                          suffixes=('_' + args.sample_1_name,
                                                    '_' + args.sample_2_name),
                                          how='outer')
                        quant_cols = [
                            '#Reads_' + args.sample_1_name,
                            '%Reads_' + args.sample_1_name,
                            '#Reads_' + args.sample_2_name,
                            '%Reads_' + args.sample_2_name
                        ]
                        merged[quant_cols] = merged[quant_cols].fillna(0)
                        lfc_error = 0.1
                        merged['each_LFC'] = np.log2(
                            ((merged['%Reads_' + args.sample_1_name] +
                              lfc_error) /
                             (merged['%Reads_' + args.sample_2_name] +
                              lfc_error)).astype(float)).replace(
                                  [np.inf, np.NaN], 0)
                        merged = merged.reset_index().set_index(
                            'Aligned_Sequence')
                        output_root = allele_file_1_name.replace(".txt", "")
                        merged.to_csv(_jp(output_root + ".txt"),
                                      sep="\t",
                                      index=None)
                        CRISPRessoPlot.plot_alleles_table_compare(
                            ref_seq_around_cut,
                            merged.sort_values(['each_LFC'], ascending=True),
                            args.sample_1_name,
                            args.sample_2_name,
                            _jp('3.' + output_root + "_top"),
                            MIN_FREQUENCY=args.
                            min_frequency_alleles_around_cut_to_plot,
                            MAX_N_ROWS=args.
                            max_rows_alleles_around_cut_to_plot,
                            SAVE_ALSO_PNG=args.save_also_png)
                        CRISPRessoPlot.plot_alleles_table_compare(
                            ref_seq_around_cut,
                            merged.sort_values(['each_LFC'], ascending=False),
                            args.sample_1_name,
                            args.sample_2_name,
                            _jp('3.' + output_root + "_bottom"),
                            MIN_FREQUENCY=args.
                            min_frequency_alleles_around_cut_to_plot,
                            MAX_N_ROWS=args.
                            max_rows_alleles_around_cut_to_plot,
                            SAVE_ALSO_PNG=args.save_also_png)

        info('All Done!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Ejemplo n.º 10
0
###EXCEPTIONS############################


class MixedRunningModeException(Exception):
    """Error type for a run that mixes incompatible running modes.

    NOTE(review): the meaning is inferred from the class name only; no raise
    site is visible next to this definition -- confirm against the callers.
    """


class DifferentAmpliconLengthException(Exception):
    """Error type for two compared amplicons that do not match.

    NOTE(review): presumably raised by the comparison code when the amplicon
    sequences of the two samples differ -- confirm against the raise sites.
    """


############################

# Third-party libraries are loaded through check_library (defined outside this
# excerpt); presumably it imports the named module and reports a missing
# dependency cleanly -- TODO confirm.
matplotlib = check_library('matplotlib')
# CRISPRessoPlot's matplotlib defaults are applied before pylab is loaded below.
# NOTE(review): the ordering relative to the pylab import may matter -- keep it.
CRISPRessoPlot.setMatplotlibDefaults()

plt = check_library('pylab')
np = check_library('numpy')
pd = check_library('pandas')
# scipy.stats is imported directly instead of going through check_library.
#scipy=check_library('scipy.stats')
import scipy.stats as stats

# Absolute path of the directory that contains this module.
_ROOT = os.path.abspath(os.path.dirname(__file__))


def main():
    try:
        description = [
            '~~~CRISPRessoCompare~~~',
            '-Comparison of two CRISPResso analyses-'
Ejemplo n.º 11
0
        def report_nucleotide_summary(amplicon_seq, amplicon_name,
                                      amplicon_index):
            """Aggregate per-batch nucleotide/modification counts for one amplicon.

            Walks every row of ``batch_params``, keeps the batches whose run
            data contain a reference whose sequence equals ``amplicon_seq``,
            and stacks their nucleotide and modification count vectors into
            four tab-separated summary files (frequency and percentage, for
            nucleotides and for modifications).  Unless ``args.suppress_plots``
            is set it then draws nucleotide quilts, plus conversion maps when
            ``args.base_editor_output`` is set.

            Args:
                amplicon_seq: reference sequence used to find the matching
                    reference in each batch's run data.
                amplicon_name: name used in log messages and as the prefix of
                    every output file name.
                amplicon_index: not used by this function; kept so existing
                    callers keep working.

            Returns:
                An empty tuple when no batch contributed data for this
                amplicon; otherwise ``None``.
            """
            consensus_sequence = ""
            nucleotide_frequency_summary = []
            nucleotide_percentage_summary = []
            modification_frequency_summary = []
            modification_percentage_summary = []

            amp_found_count = 0  # how many folders had information for this amplicon
            consensus_guides = []
            consensus_include_idxs = []
            consensus_sgRNA_intervals = []
            guides_all_same = True
            for idx, row in batch_params.iterrows():
                batchName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % batchName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                # The loop is not cut short, so when several references share
                # this sequence the last one is the one used.
                batch_has_amplicon = False
                batch_amplicon_name = ''
                for ref_name in run_data['ref_names']:
                    if amplicon_seq == run_data['refs'][ref_name]['sequence']:
                        batch_has_amplicon = True
                        batch_amplicon_name = ref_name
                if not batch_has_amplicon:
                    continue
                ref_info = run_data['refs'][batch_amplicon_name]

                # Until a non-empty guide list has been seen, adopt this
                # batch's guides (and its window/intervals) as the consensus.
                if consensus_guides == []:
                    consensus_guides = ref_info['sgRNA_sequences']
                    consensus_include_idxs = ref_info['include_idxs']
                    consensus_sgRNA_intervals = ref_info['sgRNA_intervals']

                if ref_info['sgRNA_sequences'] != consensus_guides:
                    guides_all_same = False

                if 'nuc_freq_filename' not in ref_info:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Cannot find nucleotide information."
                        % (batch_amplicon_name, folder_name))
                    continue

                ampSeq_nf, nuc_freqs = CRISPRessoShared.parse_count_file(
                    ref_info['nuc_freq_filename'])
                ampSeq_np, nuc_pcts = CRISPRessoShared.parse_count_file(
                    ref_info['nuc_pct_filename'])
                ampSeq_cf, mod_freqs = CRISPRessoShared.parse_count_file(
                    ref_info['mod_count_filename'])

                if ampSeq_nf is None or ampSeq_np is None or ampSeq_cf is None:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Could not parse batch output."
                        % (batch_amplicon_name, folder_name))
                    info(
                        "Nucleotide frequency amplicon: '%s', Nucleotide percentage amplicon: '%s', Count vectors amplicon: '%s'"
                        % (ampSeq_nf, ampSeq_np, ampSeq_cf))
                    continue
                if ampSeq_nf != ampSeq_np or ampSeq_np != ampSeq_cf:
                    warn(
                        "Skipping the amplicon '%s' in folder '%s'. Parsed amplicon sequences do not match\nnf:%s\nnp:%s\ncf:%s\nrf:%s"
                        % (batch_amplicon_name, folder_name, ampSeq_nf,
                           ampSeq_np, ampSeq_cf, amplicon_seq))
                    continue
                # The first batch that parses cleanly fixes the sequence used
                # for the column headers; later batches must agree with it.
                if consensus_sequence == "":
                    consensus_sequence = ampSeq_nf
                if ampSeq_nf != consensus_sequence:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Amplicon sequences do not match."
                        % (batch_amplicon_name, folder_name))
                    continue
                if 'Total' not in mod_freqs:
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Processing did not complete."
                        % (batch_amplicon_name, folder_name))
                    continue
                if mod_freqs['Total'][0] == 0 or mod_freqs['Total'][0] == "0":
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got no reads for amplicon."
                        % (batch_amplicon_name, folder_name))
                    continue
                if (args.min_reads_for_inclusion is not None) and (int(
                        mod_freqs['Total'][0]) < args.min_reads_for_inclusion):
                    info(
                        "Skipping the amplicon '%s' in folder '%s'. Got %s reads (min_reads_for_inclusion is %d)."
                        % (batch_amplicon_name, folder_name,
                           str(mod_freqs['Total'][0]),
                           args.min_reads_for_inclusion))
                    continue

                # Normalise every count vector by the first 'Total' entry.
                # The builtin float is used as dtype because the np.float
                # alias no longer exists in current NumPy releases.
                total_reads = float(mod_freqs['Total'][0])
                mod_pcts = {
                    key: np.array(mod_freqs[key]).astype(float) / total_reads
                    for key in mod_freqs
                }

                amp_found_count += 1

                for nuc in ['A', 'T', 'C', 'G', 'N', '-']:
                    freq_row = [batchName, nuc]
                    freq_row.extend(nuc_freqs[nuc])
                    nucleotide_frequency_summary.append(freq_row)

                    pct_row = [batchName, nuc]
                    pct_row.extend(nuc_pcts[nuc])
                    nucleotide_percentage_summary.append(pct_row)

                for mod in [
                        'Insertions', 'Insertions_Left', 'Deletions',
                        'Substitutions', 'All_modifications'
                ]:
                    freq_row = [batchName, mod]
                    freq_row.extend(mod_freqs[mod])
                    modification_frequency_summary.append(freq_row)

                    pct_row = [batchName, mod]
                    pct_row.extend(mod_pcts[mod])
                    modification_percentage_summary.append(pct_row)

            if amp_found_count == 0:
                info(
                    "Couldn't find any data for amplicon '%s'. Not compiling results."
                    % amplicon_name)
                return ()

            def _write_summary_table(rows, colnames, file_suffix):
                """Build a summary table, write it as TSV and return it."""
                summary_df = pd.DataFrame(rows, columns=colnames)
                # The first two columns are labels; everything after them is
                # coerced to numeric.
                summary_df = pd.concat([
                    summary_df.iloc[:, 0:2],
                    summary_df.iloc[:, 2:].apply(pd.to_numeric)
                ],
                                       axis=1)
                summary_df.to_csv(_jp(amplicon_name + file_suffix),
                                  sep='\t',
                                  index=None)
                return summary_df

            colnames = ['Batch', 'Nucleotide']
            colnames.extend(list(consensus_sequence))
            _write_summary_table(nucleotide_frequency_summary, colnames,
                                 '.NUCLEOTIDE_FREQUENCY_SUMMARY.txt')
            nucleotide_percentage_summary_df = _write_summary_table(
                nucleotide_percentage_summary, colnames,
                '.NUCLEOTIDE_PERCENTAGE_SUMMARY.txt')

            colnames = ['Batch', 'Modification']
            colnames.extend(list(consensus_sequence))
            _write_summary_table(modification_frequency_summary, colnames,
                                 '.MODIFICATION_FREQUENCY_SUMMARY.txt')
            modification_percentage_summary_df = _write_summary_table(
                modification_percentage_summary, colnames,
                '.MODIFICATION_PERCENTAGE_SUMMARY.txt')

            #if guides are all the same, merge substitutions and perform base editor comparison at guide quantification window
            if guides_all_same and consensus_guides != []:
                include_idxs = consensus_include_idxs
                sgRNA_intervals = consensus_sgRNA_intervals
                info(
                    "All guides are equal. Performing comparison of batches for amplicon '%s'"
                    % amplicon_name)
                # Keep the two label columns, then the window positions
                # shifted by the two label columns that precede them.
                include_idxs_flat = [0, 1]
                include_idxs_flat.extend(
                    [cutidx + 2 for cutidx in include_idxs])
                sub_nucleotide_percentage_summary_df = (
                    nucleotide_percentage_summary_df.iloc[:,
                                                          include_idxs_flat])
                sub_modification_percentage_summary_df = (
                    modification_percentage_summary_df.iloc[:,
                                                            include_idxs_flat])

                # Re-express each sgRNA interval as positions within the
                # quantification window (indices into include_idxs).
                sub_sgRNA_intervals = []
                for sgRNA_interval in sgRNA_intervals:
                    newstart = None
                    newend = None
                    for window_pos, ref_pos in enumerate(include_idxs):
                        # last window position at or before the guide start
                        if ref_pos <= sgRNA_interval[0]:
                            newstart = window_pos
                        # first window position at or after the guide end
                        if newend is None and ref_pos >= sgRNA_interval[1]:
                            newend = window_pos

                    #if guide doesn't overlap with include indexes
                    # NOTE(review): newstart is an enumerate index, so it can
                    # never equal len(include_idxs); only the newend == 0 test
                    # can trigger this skip -- confirm the intended bound.
                    if newend == 0 or newstart == len(include_idxs):
                        continue
                    #otherwise, correct partial overlaps
                    elif newstart is None and newend is None:
                        newstart = 0
                        newend = len(include_idxs) - 1
                    elif newstart is None:
                        newstart = 0
                    elif newend is None:
                        newend = len(include_idxs) - 1
                    #and add it to the list
                    sub_sgRNA_intervals.append((newstart, newend))

                if not args.suppress_plots:
                    CRISPRessoPlot.plot_nucleotide_quilt(
                        sub_nucleotide_percentage_summary_df,
                        sub_modification_percentage_summary_df,
                        _jp(amplicon_name +
                            '.Quantification_Window_Nucleotide_Percentage_Quilt'
                            ),
                        save_png,
                        sgRNA_intervals=sub_sgRNA_intervals)
                    if args.base_editor_output:
                        CRISPRessoPlot.plot_conversion_map(
                            sub_nucleotide_percentage_summary_df,
                            _jp(amplicon_name +
                                '.Quantification_Window_Nucleotide_Conversion'
                                ),
                            args.conversion_nuc_from,
                            args.conversion_nuc_to,
                            save_png,
                            sgRNA_intervals=sub_sgRNA_intervals)

                    CRISPRessoPlot.plot_nucleotide_quilt(
                        nucleotide_percentage_summary_df,
                        modification_percentage_summary_df,
                        _jp(amplicon_name + '.Nucleotide_Percentage_Quilt'),
                        save_png,
                        sgRNA_intervals=sgRNA_intervals,
                        quantification_window_idxs=include_idxs)
                    if args.base_editor_output:
                        CRISPRessoPlot.plot_conversion_map(
                            nucleotide_percentage_summary_df,
                            _jp(amplicon_name + '.Nucleotide_Conversion'),
                            args.conversion_nuc_from,
                            args.conversion_nuc_to,
                            save_png,
                            sgRNA_intervals=sgRNA_intervals)
            else:  #guides are not the same
                if not args.suppress_plots:
                    CRISPRessoPlot.plot_nucleotide_quilt(
                        nucleotide_percentage_summary_df,
                        modification_percentage_summary_df,
                        _jp(amplicon_name + '.Nucleotide_Percentage_Quilt'),
                        save_png)
                    if args.base_editor_output:
                        CRISPRessoPlot.plot_conversion_map(
                            nucleotide_percentage_summary_df,
                            _jp(amplicon_name + '.Nucleotide_Conversion'),
                            args.conversion_nuc_from, args.conversion_nuc_to,
                            save_png)
Ejemplo n.º 12
0
def main():
    """Command-line entry point for CRISPRessoCompare.

    Parses ``sys.argv``, checks that both input folders are completed
    CRISPResso2 runs, and for every amplicon present in both runs writes into
    the ``CRISPRessoCompare_on_<name>`` output directory:

    * ``1.*Editing_comparison`` - bar charts of the percentage of modified and
      unmodified reads in each sample and their difference;
    * ``*<mod>_quantification.txt`` tables and ``2.*<mod>_quantification``
      plots - per-position counts, Fisher exact odds ratios and p-values for
      insertions, deletions, substitutions and all modifications;
    * for each allele table file name shared by the two runs, a merged table
      plus ``3.*_top`` / ``3.*_bottom`` allele plots ordered by the log2 fold
      change of the read percentages.

    Unless ``--suppress_report`` is given an HTML report is generated.  The
    collected run information is always pickled to
    ``CRISPResso2Compare_info.pickle``.

    The function does not return: it calls ``sys.exit(0)`` on success and
    ``sys.exit(-1)`` after logging the message of any ``Exception``.
    """
    try:
        description = [
            '~~~CRISPRessoCompare~~~',
            '-Comparison of two CRISPResso analyses-'
        ]
        compare_header = r'''
 ___________________________
| __ __      __      __  __ |
|/  /  \|\/||__) /\ |__)|_  |
|\__\__/|  ||   /--\| \ |__ |
|___________________________|
        '''
        compare_header = CRISPRessoShared.get_crispresso_header(
            description, compare_header)
        print(compare_header)

        parser = argparse.ArgumentParser(
            description='CRISPRessoCompare Parameters',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument(
            'crispresso_output_folder_1',
            type=str,
            help='First output folder with CRISPResso analysis')
        parser.add_argument(
            'crispresso_output_folder_2',
            type=str,
            help='Second output folder with CRISPResso analysis')

        #OPTIONALS
        parser.add_argument('-n', '--name', help='Output name', default='')
        parser.add_argument('-n1', '--sample_1_name', help='Sample 1 name')
        parser.add_argument('-n2', '--sample_2_name', help='Sample 2 name')
        parser.add_argument('-o', '--output_folder', help='', default='')
        parser.add_argument(
            '--min_frequency_alleles_around_cut_to_plot',
            type=float,
            help=
            'Minimum %% reads required to report an allele in the alleles table plot.',
            default=0.2)
        parser.add_argument(
            '--max_rows_alleles_around_cut_to_plot',
            type=int,
            help='Maximum number of rows to report in the alleles table plot. ',
            default=50)
        parser.add_argument('--suppress_report',
                            help='Suppress output report',
                            action='store_true')
        parser.add_argument(
            '--place_report_in_output_folder',
            help=
            'If true, report will be written inside the CRISPResso output folder. By default, the report will be written one directory up from the report output.',
            action='store_true')
        parser.add_argument('--debug',
                            help='Show debug messages',
                            action='store_true')

        args = parser.parse_args()

        #check that the CRISPResso output is present and fill amplicon_info
        quantification_file_1, amplicon_names_1, amplicon_info_1 = CRISPRessoShared.check_output_folder(
            args.crispresso_output_folder_1)
        quantification_file_2, amplicon_names_2, amplicon_info_2 = CRISPRessoShared.check_output_folder(
            args.crispresso_output_folder_2)

        # NOTE: unpickling can execute arbitrary code; only point this tool at
        # CRISPResso2 output folders you trust.
        run_info_1_file = os.path.join(args.crispresso_output_folder_1,
                                       'CRISPResso2_info.pickle')
        if not os.path.isfile(run_info_1_file):
            raise CRISPRessoShared.OutputFolderIncompleteException(
                'The folder %s is not a valid CRISPResso2 output folder. Cannot find run data at %s'
                % (args.crispresso_output_folder_1, run_info_1_file))
        with open(run_info_1_file, 'rb') as run_info_fh:
            run_info_1 = cp.load(run_info_fh)

        run_info_2_file = os.path.join(args.crispresso_output_folder_2,
                                       'CRISPResso2_info.pickle')
        if not os.path.isfile(run_info_2_file):
            raise CRISPRessoShared.OutputFolderIncompleteException(
                'The folder %s is not a valid CRISPResso2 output folder. Cannot find run data at %s'
                % (args.crispresso_output_folder_2, run_info_2_file))
        with open(run_info_2_file, 'rb') as run_info_fh:
            run_info_2 = cp.load(run_info_fh)

        # Sample names: command line first, then the name stored in the run,
        # then a generic placeholder.
        sample_1_name = args.sample_1_name
        if args.sample_1_name is None:
            sample_1_name = "Sample 1"
            if 'name' in run_info_1 and run_info_1['name'] != '':
                sample_1_name = run_info_1['name']

        sample_2_name = args.sample_2_name
        if args.sample_2_name is None:
            sample_2_name = "Sample 2"
            if 'name' in run_info_2 and run_info_2['name'] != '':
                sample_2_name = run_info_2['name']

        def get_name_from_folder(folder):
            """Return the folder's base name without the 'CRISPResso_on_' tag."""
            return os.path.basename(os.path.abspath(folder)).replace(
                'CRISPResso_on_', '')

        if not args.name:
            database_id = '%s_VS_%s' % (
                get_name_from_folder(args.crispresso_output_folder_1),
                get_name_from_folder(args.crispresso_output_folder_2))
        else:
            database_id = args.name

        OUTPUT_DIRECTORY = 'CRISPRessoCompare_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        def _jp(filename):
            """Return the path of `filename` inside the output directory."""
            return os.path.join(OUTPUT_DIRECTORY, filename)

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except OSError:
            # Best effort: an existing folder is reused.
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoCompare_RUNNING_LOG.txt')
        logging.getLogger().addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                '[Command used]:\nCRISPRessoCompare %s\n\n[Execution log]:\n' %
                ' '.join(sys.argv))

        crispresso2Compare_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2Compare_info.pickle')
        #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info = {}
        crispresso2_info['version'] = CRISPRessoShared.__version__
        crispresso2_info['args'] = deepcopy(args)

        crispresso2_info['log_filename'] = os.path.basename(log_filename)

        crispresso2_info['summary_plot_names'] = []
        crispresso2_info['summary_plot_titles'] = {}
        crispresso2_info['summary_plot_labels'] = {}
        crispresso2_info['summary_plot_datas'] = {}

        # PNG copies are only written when a report will be generated.
        save_png = not args.suppress_report

        #LOAD DATA
        amplicon_names_in_both = [
            amplicon_name for amplicon_name in amplicon_names_1
            if amplicon_name in amplicon_names_2
        ]
        n_refs = len(amplicon_names_in_both)

        def get_plot_title_with_ref_name(plotTitle, refName):
            """Append the amplicon name to the title when there are several."""
            if n_refs > 1:
                return (plotTitle + ": " + refName)
            return plotTitle

        for amplicon_name in amplicon_names_in_both:
            profile_1 = parse_profile(
                amplicon_info_1[amplicon_name]['quantification_file'])
            profile_2 = parse_profile(
                amplicon_info_2[amplicon_name]['quantification_file'])

            # A single amplicon called "Reference" gets no file name prefix.
            amplicon_plot_name = amplicon_name + "."
            if len(amplicon_names_in_both
                   ) == 1 and amplicon_name == "Reference":
                amplicon_plot_name = ""

            # Explicit check instead of `assert`, which is stripped under -O.
            # Any failure to compare the first columns (e.g. mismatching
            # shapes) counts as a mismatch.
            try:
                same_positions = bool(
                    np.all(profile_1[:, 0] == profile_2[:, 0]))
            except Exception:
                same_positions = False
            if not same_positions:
                raise DifferentAmpliconLengthException(
                    'Different amplicon lengths for the two amplicons.')
            len_amplicon = profile_1.shape[0]
            # Guide annotations are taken from the first run only.
            cut_points = run_info_1['refs'][amplicon_name]['sgRNA_cut_points']
            sgRNA_intervals = run_info_1['refs'][amplicon_name][
                'sgRNA_intervals']

            #Quantification comparison barchart
            fig = plt.figure(figsize=(30, 15))
            n_groups = 2

            N_TOTAL_1 = float(amplicon_info_1[amplicon_name]['Reads_aligned'])
            N_UNMODIFIED_1 = float(
                amplicon_info_1[amplicon_name]['Unmodified'])
            N_MODIFIED_1 = float(amplicon_info_1[amplicon_name]['Modified'])

            N_TOTAL_2 = float(amplicon_info_2[amplicon_name]['Reads_aligned'])
            N_UNMODIFIED_2 = float(
                amplicon_info_2[amplicon_name]['Unmodified'])
            N_MODIFIED_2 = float(amplicon_info_2[amplicon_name]['Modified'])

            means_sample_1 = np.array([N_UNMODIFIED_1, N_MODIFIED_1
                                       ]) / N_TOTAL_1 * 100
            means_sample_2 = np.array([N_UNMODIFIED_2, N_MODIFIED_2
                                       ]) / N_TOTAL_2 * 100

            ax1 = fig.add_subplot(1, 2, 1)

            index = np.arange(n_groups)
            bar_width = 0.35
            opacity = 0.4

            ax1.bar(index,
                    means_sample_1,
                    bar_width,
                    alpha=opacity,
                    color=(0, 0, 1, 0.4),
                    label=sample_1_name)

            ax1.bar(index + bar_width,
                    means_sample_2,
                    bar_width,
                    alpha=opacity,
                    color=(1, 0, 0, 0.4),
                    label=sample_2_name)

            plt.ylabel('% Sequences')
            plt.title(
                get_plot_title_with_ref_name(
                    '%s VS %s' % (sample_1_name, sample_2_name),
                    amplicon_name))
            plt.xticks(index + bar_width / 2.0, ('Unmodified', 'Modified'))
            plt.legend()
            plt.tight_layout()

            ax2 = fig.add_subplot(1, 2, 2)
            ax2.bar(index,
                    means_sample_1 - means_sample_2,
                    bar_width + 0.35,
                    alpha=opacity,
                    color=(0, 1, 1, 0.4),
                    label='')

            plt.ylabel('% Sequences Difference')
            plt.title(
                get_plot_title_with_ref_name(
                    '%s - %s' % (sample_1_name, sample_2_name), amplicon_name))
            plt.xticks(index, ['Unmodified', 'Modified'])

            plt.tight_layout()
            plot_name = '1.' + amplicon_plot_name + 'Editing_comparison'
            plt.savefig(_jp(plot_name) + '.pdf', bbox_inches='tight')
            if save_png:
                plt.savefig(_jp(plot_name) + '.png', bbox_inches='tight')
            # Release the figure; one is created per amplicon.
            plt.close(fig)

            crispresso2_info['summary_plot_names'].append(plot_name)
            crispresso2_info['summary_plot_titles'][
                plot_name] = 'Editing efficiency comparison'
            crispresso2_info['summary_plot_labels'][
                plot_name] = 'Figure 1: Comparison for amplicon ' + amplicon_name + '; Left: Percentage of modified and unmodified reads in each sample; Right: relative percentage of modified and unmodified reads'
            output_1 = os.path.join(args.crispresso_output_folder_1,
                                    run_info_1['report_filename'])
            # The second sample's report lives in the second output folder.
            output_2 = os.path.join(args.crispresso_output_folder_2,
                                    run_info_2['report_filename'])
            crispresso2_info['summary_plot_datas'][plot_name] = []
            if os.path.isfile(output_1):
                crispresso2_info['summary_plot_datas'][plot_name].append(
                    (sample_1_name + ' output',
                     os.path.relpath(output_1, OUTPUT_DIRECTORY)))
            if os.path.isfile(output_2):
                crispresso2_info['summary_plot_datas'][plot_name].append(
                    (sample_2_name + ' output',
                     os.path.relpath(output_2, OUTPUT_DIRECTORY)))

            mod_file_1 = amplicon_info_1[amplicon_name][
                'modification_count_file']
            amp_seq_1, mod_freqs_1 = CRISPRessoShared.parse_count_file(
                mod_file_1)
            mod_file_2 = amplicon_info_2[amplicon_name][
                'modification_count_file']
            amp_seq_2, mod_freqs_2 = CRISPRessoShared.parse_count_file(
                mod_file_2)
            consensus_sequence = amp_seq_1
            if amp_seq_2 != consensus_sequence:
                raise DifferentAmpliconLengthException(
                    'Different amplicon lengths for the two amplicons.')

            # x tick positions shared by every per-position plot of this
            # amplicon: roughly six ticks, step rounded down to a multiple of
            # five and never below three.
            tick_step = max(3, (len_amplicon / 6) - (len_amplicon / 6) % 5)
            position_ticks = np.arange(0, len_amplicon,
                                       tick_step).astype(int)

            for mod in [
                    'Insertions', 'Deletions', 'Substitutions',
                    'All_modifications'
            ]:
                mod_name = mod
                if mod == "All_modifications":
                    mod_name = "Combined modifications (insertions, deletions and substitutions)"

                mod_counts_1 = np.array(mod_freqs_1[mod], dtype=float)
                tot_counts_1 = np.array(mod_freqs_1['Total'], dtype=float)
                unmod_counts_1 = tot_counts_1 - mod_counts_1

                mod_counts_2 = np.array(mod_freqs_2[mod], dtype=float)
                tot_counts_2 = np.array(mod_freqs_2['Total'], dtype=float)
                unmod_counts_2 = tot_counts_2 - mod_counts_2

                # One 2x2 Fisher exact test per position; positions where all
                # four counts are zero get a neutral result.
                fisher_results = [
                    stats.fisher_exact([[z[0], z[1]], [z[2], z[3]]])
                    if max(z) > 0 else [np.nan, 1.0]
                    for z in zip(mod_counts_1, unmod_counts_1, mod_counts_2,
                                 unmod_counts_2)
                ]
                oddsratios = [a for a, b in fisher_results]
                pvalues = [b for a, b in fisher_results]

                # One row per statistic, one column per amplicon position.
                mod_df = []
                for row_label, row_values in (
                        (sample_1_name + '_' + mod, mod_counts_1),
                        (sample_1_name + '_total', tot_counts_1),
                        (sample_2_name + '_' + mod, mod_counts_2),
                        (sample_2_name + '_total', tot_counts_2),
                        ('odds_ratios', oddsratios),
                        ('pvalues', pvalues)):
                    row = [row_label]
                    row.extend(row_values)
                    mod_df.append(row)

                colnames = ['Reference']
                colnames.extend(list(consensus_sequence))
                mod_df = pd.DataFrame(mod_df, columns=colnames)
                #write to file
                mod_filename = _jp(amplicon_plot_name + mod +
                                   "_quantification.txt")
                mod_df.to_csv(mod_filename, sep='\t', index=None)

                #plot
                fig = plt.figure(figsize=(20, 10))
                ax1 = fig.add_subplot(2, 1, 1)

                diff = np.divide(mod_counts_1, tot_counts_1) - np.divide(
                    mod_counts_2, tot_counts_2)
                diff_plot = ax1.plot(diff,
                                     color=(0, 1, 0, 0.4),
                                     lw=3,
                                     label='Difference')
                ax1.set_title(
                    get_plot_title_with_ref_name(
                        '%s: %s - %s' % (mod, sample_1_name, sample_2_name),
                        amplicon_name))
                ax1.set_xticks(position_ticks)
                ax1.set_ylabel('Sequences Difference %')
                ax1.set_xlim(xmin=0, xmax=len_amplicon - 1)

                # Zero p-values cannot be log-transformed: clamp them to the
                # smallest non-zero p-value, or to the smallest positive float
                # when there is no non-zero value to borrow.
                pvalues = np.array(pvalues)
                nonzero_pvalues = pvalues[np.nonzero(pvalues)]
                if nonzero_pvalues.size > 0:
                    min_nonzero = np.min(nonzero_pvalues)
                else:
                    min_nonzero = np.finfo(float).tiny
                pvalues[pvalues == 0] = min_nonzero
                ax2 = fig.add_subplot(2, 1, 2)
                pval_plot = ax2.plot(-1 * np.log10(pvalues),
                                     color=(1, 0, 0, 0.4),
                                     lw=2,
                                     label='-log10 P-value')
                ax2.set_ylabel('-log10 P-value')
                ax2.set_xlim(xmin=0, xmax=len_amplicon - 1)
                ax2.set_xticks(position_ticks)
                ax2.set_xlabel('Reference amplicon position (bp)')

                #bonferroni correction
                corrected_p = -1 * np.log10(
                    0.01 / float(len(consensus_sequence)))
                cutoff_plot = ax2.plot([0, len(consensus_sequence)],
                                       [corrected_p, corrected_p],
                                       color='k',
                                       dashes=(5, 10),
                                       label='Bonferronni corrected cutoff')

                # Line handles that make up the shared legend.
                plots = diff_plot + pval_plot + cutoff_plot

                diff_y_min, diff_y_max = ax1.get_ylim()
                p_y_min, p_y_max = ax2.get_ylim()
                if cut_points:
                    for idx, cut_point in enumerate(cut_points):
                        # Only the first cut line gets a legend entry.
                        if idx == 0:
                            cut_label = 'Predicted cleavage position'
                        else:
                            cut_label = '_nolegend_'
                        plot_cleavage = ax1.plot([cut_point, cut_point],
                                                 [diff_y_min, diff_y_max],
                                                 '--k',
                                                 lw=2,
                                                 label=cut_label)
                        # Each axis is spanned using its own y-range.
                        ax2.plot([cut_point, cut_point],
                                 [p_y_min, p_y_max],
                                 '--k',
                                 lw=2,
                                 label=cut_label)
                        if idx == 0:
                            plots = plots + plot_cleavage

                    for idx, sgRNA_int in enumerate(sgRNA_intervals):
                        # Only the first sgRNA bar gets a legend entry.
                        if idx == 0:
                            sgRNA_label = 'sgRNA'
                        else:
                            sgRNA_label = '_nolegend_'
                        p2 = ax1.plot([sgRNA_int[0], sgRNA_int[1]],
                                      [diff_y_min, diff_y_min],
                                      lw=10,
                                      c=(0, 0, 0, 0.15),
                                      label=sgRNA_label)
                        ax2.plot([sgRNA_int[0], sgRNA_int[1]],
                                 [p_y_min, p_y_min],
                                 lw=10,
                                 c=(0, 0, 0, 0.15),
                                 label=sgRNA_label)
                        if idx == 0:
                            plots = plots + p2

                labs = [p.get_label() for p in plots]
                lgd = plt.legend(plots,
                                 labs,
                                 loc='upper center',
                                 bbox_to_anchor=(0.5, -0.2),
                                 ncol=1,
                                 fancybox=True,
                                 shadow=False)

                plot_name = '2.' + amplicon_plot_name + mod + '_quantification'
                plt.savefig(_jp(plot_name + '.pdf'),
                            bbox_inches='tight',
                            bbox_extra_artists=(lgd, ))
                if save_png:
                    plt.savefig(_jp(plot_name + '.png'),
                                bbox_inches='tight',
                                bbox_extra_artists=(lgd, ))
                # Release the figure; one is created per modification type.
                plt.close(fig)
                crispresso2_info['summary_plot_names'].append(plot_name)
                crispresso2_info['summary_plot_titles'][
                    plot_name] = mod_name + ' locations'
                crispresso2_info['summary_plot_labels'][
                    plot_name] = mod_name + ' location comparison for amplicon ' + amplicon_name + '; Top: percent difference; Bottom: p-value.'
                crispresso2_info['summary_plot_datas'][plot_name] = [
                    (mod_name + ' quantification',
                     os.path.basename(mod_filename))
                ]

            #create merged heatmaps for each cut site
            allele_files_1 = amplicon_info_1[amplicon_name]['allele_files']
            allele_files_2 = amplicon_info_2[amplicon_name]['allele_files']
            for allele_file_1 in allele_files_1:
                #get file part of path
                allele_file_1_name = os.path.split(allele_file_1)[1]
                for allele_file_2 in allele_files_2:
                    #get file part of path
                    allele_file_2_name = os.path.split(allele_file_2)[1]
                    #if files are the same (same amplicon, cut site, guide), run comparison
                    if allele_file_1_name != allele_file_2_name:
                        continue

                    df1 = pd.read_csv(allele_file_1, sep="\t")
                    df2 = pd.read_csv(allele_file_2, sep="\t")

                    #find unmodified reference for comparison (if it exists)
                    ref_seq_around_cut = ""
                    ungapped_1 = df1.loc[
                        df1['Reference_Sequence'].str.contains('-') == False]
                    ungapped_2 = df2.loc[
                        df2['Reference_Sequence'].str.contains('-') == False]
                    if len(ungapped_1) > 0:
                        ref_seq_around_cut = ungapped_1[
                            'Reference_Sequence'].iloc[0]
                    elif len(ungapped_2) > 0:
                        ref_seq_around_cut = ungapped_2[
                            'Reference_Sequence'].iloc[0]
                    else:
                        #otherwise figure out which sgRNA was used for this comparison
                        # and rebuild the window from the amplicon sequence.
                        # NOTE(review): the half-width is taken as half the
                        # length of a reference window in the allele tables,
                        # assuming those windows are symmetric around the cut
                        # - confirm against the code that writes them.
                        table_refs = pd.concat([
                            df1['Reference_Sequence'],
                            df2['Reference_Sequence']
                        ]).dropna()
                        if len(table_refs) > 0:
                            offset_around_cut = len(table_refs.iloc[0]) // 2
                        else:
                            offset_around_cut = 0
                        for sgRNA_interval, cut_point in zip(
                                sgRNA_intervals, cut_points):
                            sgRNA_seq = consensus_sequence[
                                sgRNA_interval[0]:sgRNA_interval[1]]
                            if sgRNA_seq in allele_file_1_name:
                                window_start = max(
                                    0, cut_point - offset_around_cut + 1)
                                window_end = min(
                                    len(consensus_sequence),
                                    cut_point + offset_around_cut + 1)
                                ref_seq_around_cut = consensus_sequence[
                                    window_start:window_end]
                                break

                    # Outer join keeps alleles seen in only one sample; their
                    # missing read columns are filled with 0 below.
                    merged = pd.merge(df1,
                                      df2,
                                      on=[
                                          'Aligned_Sequence',
                                          'Reference_Sequence', 'Unedited',
                                          'n_deleted', 'n_inserted',
                                          'n_mutated'
                                      ],
                                      suffixes=('_' + sample_1_name,
                                                '_' + sample_2_name),
                                      how='outer')
                    quant_cols = [
                        '#Reads_' + sample_1_name,
                        '%Reads_' + sample_1_name,
                        '#Reads_' + sample_2_name,
                        '%Reads_' + sample_2_name
                    ]
                    merged[quant_cols] = merged[quant_cols].fillna(0)
                    # Pseudo-count so alleles absent from one sample still get
                    # a finite log fold change.
                    lfc_error = 0.1
                    # np.nan rather than np.NaN: the alias was removed in
                    # NumPy 2.0.
                    merged['each_LFC'] = np.log2(
                        ((merged['%Reads_' + sample_1_name] + lfc_error) /
                         (merged['%Reads_' + sample_2_name] + lfc_error)
                         ).astype(float)).replace([np.inf, np.nan], 0)
                    merged = merged.reset_index().set_index(
                        'Aligned_Sequence')
                    output_root = allele_file_1_name.replace(".txt", "")
                    allele_comparison_file = _jp(output_root + '.txt')
                    merged.to_csv(allele_comparison_file,
                                  sep="\t",
                                  index=None)

                    allele_label_prefix = (
                        'Distribution comparison of alleles. Nucleotides are indicated by unique colors (A = green; C = red; G = yellow; T = purple). Substitutions are shown in bold font. Red rectangles highlight inserted sequences. Horizontal dashed lines indicate deleted sequences. The vertical dashed line indicates the predicted cleavage site. '
                        +
                        'The proportion and number of reads is shown for each sample on the right, with the values for '
                        + sample_1_name + ' followed by the values for ' +
                        sample_2_name +
                        '. Alleles are sorted for enrichment in ')

                    # Two plots of the same merged table, sorted by log fold
                    # change in opposite directions.
                    for suffix, ascending, enriched_name in (
                            ('_top', True, sample_1_name),
                            ('_bottom', False, sample_2_name)):
                        plot_name = '3.' + output_root + suffix
                        CRISPRessoPlot.plot_alleles_table_compare(
                            ref_seq_around_cut,
                            merged.sort_values(['each_LFC'],
                                               ascending=ascending),
                            sample_1_name,
                            sample_2_name,
                            _jp(plot_name),
                            MIN_FREQUENCY=args.
                            min_frequency_alleles_around_cut_to_plot,
                            MAX_N_ROWS=args.
                            max_rows_alleles_around_cut_to_plot,
                            SAVE_ALSO_PNG=save_png)
                        crispresso2_info['summary_plot_names'].append(
                            plot_name)
                        crispresso2_info['summary_plot_titles'][
                            plot_name] = 'Alleles enriched in ' + enriched_name
                        crispresso2_info['summary_plot_labels'][
                            plot_name] = allele_label_prefix + enriched_name + '.'
                        crispresso2_info['summary_plot_datas'][plot_name] = [
                            ('Allele comparison table',
                             os.path.basename(allele_comparison_file))
                        ]

        if not args.suppress_report:
            if (args.place_report_in_output_folder):
                # NOTE(review): the file name says "Batch" although this is
                # the Compare tool; kept unchanged so existing paths still
                # resolve - confirm whether it should be renamed.
                report_name = _jp("CRISPResso2Batch_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_compare_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['report_location'] = report_name
            crispresso2_info['report_filename'] = os.path.basename(report_name)

        with open(crispresso2Compare_info_file, 'wb') as info_fh:
            cp.dump(crispresso2_info, info_fh)

        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        # SystemExit is not an Exception subclass, so the handler below does
        # not intercept this.
        sys.exit(0)

    except Exception as e:
        # `args` only exists if argument parsing got that far.
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)