def grab_flanking_regions_from_mutantfile(
    mutant_dataset_infile,
    genome,
    flanksize=200,
    padding_char=".",
    min_readcount=0,
    chromosome_check_function=lambda x: True,
    ignore_both_strand_mutants=False,
):
    """ Return a list of (flanking_seq, readcount) tuples with both-side genomic flanking sequences for insertional mutants.

    Read the mutant dataset from mutant_dataset_infile (with mutant_analysis_classes.read_mutant_file), 
     go over the mutants sorted by position, and for each one that passes the filters below 
     call flanking_region_from_pos with the mutant position/chromosome/strand, genome (a chrom_name:seq dict), 
     flanksize and padding_char to get the flanking sequence on both sides of the insertion
      (per the original documentation: padded with padding_char if the end of the chromosome is too close, 
       and reverse-complemented if strand=='-' so that it's in the same orientation as the insertion).

    Filter the mutants: 
     - by chromosome - ignore mutants in chromosomes for which chromosome_check_function returns False
     - by readcount - ignore mutants with total readcount below min_readcount (default 0)
     - by strand - both-strand (merged tandem) mutants will be ignored if ignore_both_strand_mutants is True, 
                otherwise ValueError will be raised; ValueError will be raised for any other strand value 
                that isn't exactly '+' or '-'.
     - cassette tandems - mutants on cassette chromosomes (per mutant_analysis_classes.is_cassette_chromosome) 
                whose min_position is 0 or the chromosome length are ignored.

    For all remaining mutants, append (flanking region seq, total_readcount) to the output list, and return that list.
    """
    dataset = mutant_analysis_classes.read_mutant_file(mutant_dataset_infile)
    flanking_region_count_list = []
    for mutant in sorted(dataset, key=lambda m: m.position):
        chromosome = mutant.position.chromosome
        strand = mutant.position.strand
        # filter out mutants in wrong chromosomes or with too-low readcounts
        if not chromosome_check_function(chromosome):
            continue
        if mutant.total_read_count < min_readcount:
            continue
        # filter out both-stranded mutants if desired, complain about any other unexpected strand.
        #  (Compare against a tuple, NOT the string "+-": string membership is a substring test, 
        #   so '' and '+-' would wrongly count as valid strands.)
        if strand not in ("+", "-"):
            if strand == "both" and ignore_both_strand_mutants:
                continue
            # 1-tuple so that the formatting works even if the position object happens to be tuple-like
            raise ValueError("Unexpected mutant strand! %s" % (mutant.position,))
        # grab mutant position
        position_before_insertion = mutant.position.min_position
        # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
        if mutant_analysis_classes.is_cassette_chromosome(chromosome):
            if position_before_insertion in (0, len(genome[chromosome])):
                continue
        # grab the actual flanking sequence, with padding, correct orientation etc
        full_flanking_seq = flanking_region_from_pos(
            position_before_insertion,
            chromosome,
            strand,
            genome,
            flanksize,
            padding_char,
        )
        # append the sequence and readcount to output data
        flanking_region_count_list.append((full_flanking_seq, mutant.total_read_count))
    return flanking_region_count_list
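# Example no. 2  (separator left over from the code-listing page this snippet was copied from; not part of the program)
# 0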
def merge_dataset_files(file_list=None, file_glob_pattern=None):
    """ Return single mutant dataset from adding all the input ones together (input can be filename list or glob pattern).

    Exactly one of file_list (a list of dataset filenames) and file_glob_pattern (a pattern expanded with glob.glob) 
     must be provided and non-empty - otherwise ValueError is raised.

    Each file is read with mutant_analysis_classes.read_mutant_file and merged (with merge_other_dataset) into 
     a newly created mutant_analysis_classes.Insertional_mutant_pool_dataset, which is returned.
    NOTE(review): a glob pattern that matches no files gives an empty file list, so an empty dataset is returned 
     without any warning - confirm that's the desired behavior.
    """
    # exactly one of the two inputs must be given, i.e. their truth values must differ
    if bool(file_list) == bool(file_glob_pattern):
        raise ValueError(
            "Must provide exactly one of file_list and file_glob_pattern!")
    if file_glob_pattern:
        file_list = glob.glob(file_glob_pattern)
    # make empty dataset
    full_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset()
    # read each listed dataset file, merge into the full dataset, then delete single dataset
    #  (explicit del so the old dataset isn't kept around while the next one is being read)
    for infile in file_list:
        curr_dataset = mutant_analysis_classes.read_mutant_file(infile)
        full_dataset.merge_other_dataset(curr_dataset)
        del curr_dataset
    return full_dataset
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing. 

    Read each infile into a dataset (unpickling *.pickle files, parsing anything else as a mutant file), 
     combine them into one multi-dataset, optionally remove mutants present in another file and add gene annotation.
    Print final dataset to outfile (if given); return final multi-dataset object and the list of dataset names in order.

    Dataset names come from options.dataset_names (comma-separated, must be one per infile, otherwise ValueError 
     is raised), or else from the infile basenames without extensions.
    The amount of progress/summary info printed to stdout depends on options.verbosity_level.
    The options argument should be generated by an optparse parser.
    """
    # figure out the dataset names - either given explicitly, or derived from the infile names
    if options.dataset_names:
        dataset_names = options.dataset_names.split(',')
        if len(dataset_names) != len(infiles):
            raise ValueError("If dataset names are provided via -D option, you must provide the same number of names "
                             "as the total number of infiles! We have %s names and %s infiles."
                             % (len(dataset_names), len(infiles)))
    else:
        dataset_names = [os.path.splitext(os.path.basename(infile))[0] for infile in infiles]
    # NOTE(review): duplicate dataset names (e.g. same-named infiles in different folders) would silently 
    #  overwrite each other in the all_datasets dictionary below - confirm whether that needs a check.

    # parse all infiles, print summaries to stdout if requested
    #  (single-argument print(...) calls behave the same as the print statement in python2)
    all_datasets = {}
    for dataset_name, infile in zip(dataset_names, infiles):
        if options.verbosity_level > 1:
            print("parsing input file %s - time %s." % (infile, time.ctime()))
        if infile.endswith('.pickle'):
            current_dataset = unpickle(infile)
        else:
            current_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(infile=infile)
            current_dataset.count_adjacent_mutants(OUTPUT=None)
            # note - read_data_from_file doesn't deal with merging/counting info, so that will be wrong/missing
        all_datasets[dataset_name] = current_dataset
        # these are two separate ifs on purpose: with if/elif the >1 branch could never run, since >1 implies >0
        if options.verbosity_level > 0:
            print("%s mutants in file %s" % (len(current_dataset), infile))
        if options.verbosity_level > 1:
            current_dataset.print_summary()

    # merge datasets into one multi-dataset object
    if options.verbosity_level > 1:
        print("merging the mutant data into combined dataset - time %s." % (time.ctime()))
    multi_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(multi_dataset=True)
    multi_dataset.populate_multi_dataset(all_datasets, overwrite=False, check_gene_data=True)
    # make sure the datasets are in the same order as they were given on the command-line
    #  (using all_datasets to initialize multi_dataset didn't give an order, since all_datasets is a dictionary)
    multi_dataset.dataset_order = dataset_names
    # print varying amounts of summary data to stdout
    #  (the summary used to be in an unreachable elif branch with the same >0 condition; 
    #   it's now printed at verbosity >1, same as the per-file summaries above)
    if options.verbosity_level > 0:
        print("total %s mutants present in combined dataset" % (len(multi_dataset)))
    if options.verbosity_level > 1:
        multi_dataset.print_summary()

    ### optionally remove mutants based on another dataset
    if options.remove_mutants_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_from_file)
        old_N = len(multi_dataset)
        multi_dataset.remove_mutants_based_on_other_dataset(other_dataset,
                                                            readcount_min=options.remove_mutants_readcount_min,
                                                            perfect_reads=options.remove_mutants_min_is_perfect)
        if options.verbosity_level > 0:
            new_N = len(multi_dataset)
            print("removed %s mutants based on %s - %s mutants remaining in combined dataset"
                  % (old_N - new_N, options.remove_mutants_from_file, new_N))

    # if requested, add gene annotation info from separate file
    if options.gene_annotation_file:
        if options.verbosity_level > 1:
            print("adding gene annotation from file %s - time %s." % (options.gene_annotation_file, time.ctime()))
        multi_dataset.add_gene_annotation(options.gene_annotation_file,
                                          if_standard_Cre_file=options.annotation_file_is_standard)

    # print full data to outfile, unless there is no outfile name given
    if outfile:
        if options.verbosity_level > 1:
            print("printing combined dataset output to file %s - time %s." % (outfile, time.ctime()))
        with open(outfile, 'w') as OUTFILE:
            write_header_data(OUTFILE, options)
            OUTFILE.write("### DATASET SUMMARIES:\n")
            multi_dataset.print_summary(OUTPUT=OUTFILE, line_prefix="#  ", header_prefix="## ")
            OUTFILE.write("### HEADER AND DATA:\n")
            multi_dataset.print_data(OUTPUT=OUTFILE, sort_data_by=options.sort_data_key, header_line=True)
        # TODO make a *.pickle outfile as well?

    return multi_dataset, dataset_names
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Steps, in order: 
     - derive a few options and the extra output filenames (*_merging-info.txt, *_cassette.txt, 
        *_cassette_merging-info.txt) from outfile;
     - make an empty dataset (plus a separate cassette dataset if options.separate_cassette);
     - add discarded/non-aligned read counts from the metadata files to the dataset summaries;
     - parse each SAM infile (and its matching *_cassette.sam file, if present) into the dataset(s);
     - optionally remove mutants that are / are not present in other mutant files;
     - optionally merge adjacent and opposite-tandem mutants, then count adjacent mutants, 
        writing details to the merging-info file(s);
     - optionally look up genes and gene annotation for the mutants;
     - write the dataset(s) with save_dataset_files.
    There is no return value - all the results go to the output files.

    The options argument should be generated by an optparse parser.
    """
    ### parse/process/reformat some options
    # putting cassette mutants in a separate dataset implies ignoring them in the main one
    options.ignore_cassette |= options.separate_cassette
    # the command-line options are negative (dont_*), so convert them to positive versions
    options.count_cassette = not options.dont_count_cassette
    options.count_other = not options.dont_count_other
    options.merge_boundary_features = not options.dont_merge_boundary_features
    # MAYBE-TODO change outfile to a folder?  Since it'll have three things in it now...
    # all the extra output filenames are based on outfile with its extension removed
    outfile_basename = os.path.splitext(outfile)[0]
    mutant_merging_outfile = outfile_basename + '_merging-info.txt'
    # MAYBE-TODO let -C take an optional argument to put the cassette files elsewhere?
    cassette_outfile = outfile_basename + '_cassette.txt'
    cassette_merging_outfile = outfile_basename + '_cassette_merging-info.txt'

    ### generate empty alignment set object with basic read position/orientation properties defined by options
    if options.Carette:
        dataset_class = mutant_Carette.Insertional_mutant_pool_dataset_Carette
    else:
        dataset_class = mutant_analysis_classes.Insertional_mutant_pool_dataset
    all_alignment_data = dataset_class(options.read_cassette_end,
                                       options.read_direction == 'reverse')
    # cassette_alignment_data only exists if options.separate_cassette is true, 
    #  so every later use of it has to be inside an "if options.separate_cassette:" clause
    if options.separate_cassette:
        cassette_alignment_data = dataset_class(
            options.read_cassette_end, options.read_direction == 'reverse')
    # MAYBE-TODO refactor the whole bunch of "if options.separate_cassette:" clauses to avoid code duplication?

    ### parse preprocessing/alignment metadata file to get discarded/not-aligned/etc readcounts, pass to all_alignment_data
    #   (all_alignment_data initializes them to 'unknown', so if file is not given or can't be found/parsed, do nothing)
    N_discarded, N_wrong_start, N_no_cassette, N_other_end, N_non_aligned, N_unaligned, N_multiple = \
            get_info_from_metadata_files(infiles, options.input_metadata_file, options.read_cassette_end, options.verbosity_level)
    # sanity-check that the discarded subtotals add up, but only if the first two subtotals are known.
    # NOTE(review): N_other_end isn't checked for 'unknown' here - presumably it's always a number; confirm.
    # NOTE(review): this is an assert, so the check is skipped when python is run with -O.
    if 'unknown' not in (N_wrong_start, N_no_cassette):
        assert N_discarded == N_wrong_start+N_no_cassette+N_other_end,\
                "Discarded subtotals don't add up to discarded total! %s+%s+%s != %s"%(N_wrong_start,N_no_cassette,
                                                                                       N_other_end,N_discarded)
    # the same read counts are added to both the main and the cassette dataset summaries
    all_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start,
                                                   N_no_cassette, N_other_end)
    if options.separate_cassette:
        cassette_alignment_data.summary.add_discarded_reads(
            N_discarded, N_wrong_start, N_no_cassette, N_other_end)
    all_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned,
                                                    N_multiple)
    if options.separate_cassette:
        cassette_alignment_data.summary.add_nonaligned_reads(
            N_non_aligned, N_unaligned, N_multiple)
    # MAYBE-TODO also get the final total number of reads from the metadata infile and make sure it's the same
    #   as the number of processed reads I get from all_alignment_data.print_summary()?

    ### parse input file and store data - the add_alignment_reader_to_data function here does pretty much all the work!
    for infile in infiles:
        # if this is a new-style *_genomic-unique.sam file and has a matching *_cassette.sam file, parse that file too
        part_infiles = [infile]
        if infile.endswith('_genomic-unique.sam'):
            cassette_file = infile[:-len('_genomic-unique.sam'
                                         )] + '_cassette.sam'
            if os.path.exists(cassette_file):
                part_infiles.append(cassette_file)
        for part_infile in part_infiles:
            # initialize a parser for the SAM infile
            if options.verbosity_level > 1:
                print "parsing input file %s - time %s." % (part_infile,
                                                            time.ctime())
            infile_reader = HTSeq.SAM_Reader(part_infile)
            # fill the new alignment set object with data from the infile parser
            all_alignment_data.add_alignment_reader_to_data(
                infile_reader,
                uncollapse_read_counts=options.input_collapsed_to_unique,
                ignore_cassette=options.ignore_cassette,
                cassette_only=False,
                treat_unknown_as_match=options.treat_unknown_as_match)
            # the cassette dataset gets the same reader, but only takes the cassette alignments from it
            # NOTE(review): this assumes the reader object can be iterated over a second time - confirm.
            if options.separate_cassette:
                cassette_alignment_data.add_alignment_reader_to_data(
                    infile_reader,
                    uncollapse_read_counts=options.input_collapsed_to_unique,
                    ignore_cassette=False,
                    cassette_only=True,
                    treat_unknown_as_match=options.treat_unknown_as_match)

    ### optionally remove mutants based on another dataset - BEFORE adjacent mutant counting/merging
    # remove mutants that ARE present in another file (also do it for cassette mutants if those are separate)
    #  (the option value can be a comma-separated list of files - each one is applied in turn)
    if options.remove_mutants_from_file:
        for other_file in options.remove_mutants_from_file.split(','):
            other_dataset = mutant_analysis_classes.read_mutant_file(
                other_file)
            all_alignment_data.remove_mutants_in_other_dataset(
                other_dataset,
                readcount_min=options.remove_from_file_readcount_min,
                perfect_reads=options.remove_from_file_min_is_perfect)
            if options.separate_cassette:
                cassette_alignment_data.remove_mutants_in_other_dataset(
                    other_dataset,
                    readcount_min=options.remove_from_file_readcount_min,
                    perfect_reads=options.remove_from_file_min_is_perfect)
    # remove mutants that are NOT present in another file (also do it for cassette mutants if those are separate)
    #  (unlike the case above, only a single file is supported here)
    # TODO should I implement using multiple files here too?
    if options.remove_mutants_not_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(
            options.remove_mutants_not_from_file)
        all_alignment_data.remove_mutants_not_in_other_dataset(
            other_dataset,
            readcount_min=options.remove_not_from_file_readcount_min,
            perfect_reads=options.remove_not_from_file_min_is_perfect)
        if options.separate_cassette:
            cassette_alignment_data.remove_mutants_not_in_other_dataset(
                other_dataset,
                readcount_min=options.remove_not_from_file_readcount_min,
                perfect_reads=options.remove_not_from_file_min_is_perfect)

    ### optionally merge some mutant categories
    # both merging-info files are always opened (and so created) here; 
    #  the cassette one is deleted again afterwards if it wasn't wanted
    with open(mutant_merging_outfile, 'w') as MERGEFILE:
        with open(cassette_merging_outfile, 'w') as CASSETTE_MERGEFILE:
            # 1) adjacent same-strand mutants (since they're probably just artifacts of indels during deepseq/PCR)
            if options.merge_adjacent_mutants:
                # the option is used as an integer if it can be converted to one, otherwise passed on as it is; 
                #  if a count ratio is given, that overrides it with the special 'use_ratio' value
                try:
                    leave_N_mutants = int(options.merge_adjacent_leave_mutants)
                except ValueError:
                    leave_N_mutants = options.merge_adjacent_leave_mutants
                if options.merge_adjacent_count_ratio is not None:
                    leave_N_mutants = 'use_ratio'
                all_alignment_data.merge_adjacent_mutants(
                    merge_max_distance=options.adjacent_max_distance,
                    leave_N_mutants=leave_N_mutants,
                    min_count_ratio=options.merge_adjacent_count_ratio,
                    leave_method=options.merge_mutant_choice_method,
                    merge_cassette_chromosomes=options.merge_in_cassette,
                    merge_other_chromosomes=options.merge_in_other_chrom,
                    OUTPUT=MERGEFILE)
                # the separate cassette dataset is only merged if cassette merging was requested
                if options.merge_in_cassette and options.separate_cassette:
                    cassette_alignment_data.merge_adjacent_mutants(
                        merge_max_distance=options.adjacent_max_distance,
                        leave_N_mutants=leave_N_mutants,
                        min_count_ratio=options.merge_adjacent_count_ratio,
                        leave_method=options.merge_mutant_choice_method,
                        merge_cassette_chromosomes=True,
                        merge_other_chromosomes=False,
                        OUTPUT=CASSETTE_MERGEFILE)
            # 2) opposite-strand same-position mutants (since they're probably just tail-to-tail cassette tandems)
            if options.merge_opposite_tandem_mutants:
                # same leave_N_mutants logic as for adjacent mutants above, but using the merge_opposite_* options
                try:
                    leave_N_mutants = int(options.merge_opposite_leave_mutants)
                except ValueError:
                    leave_N_mutants = options.merge_opposite_leave_mutants
                if options.merge_opposite_count_ratio is not None:
                    leave_N_mutants = 'use_ratio'
                all_alignment_data.merge_opposite_tandem_mutants(
                    leave_N_mutants=leave_N_mutants,
                    max_count_ratio=options.merge_opposite_count_ratio,
                    leave_method=options.merge_mutant_choice_method,
                    merge_cassette_chromosomes=options.merge_in_cassette,
                    merge_other_chromosomes=options.merge_in_other_chrom,
                    OUTPUT=MERGEFILE)
                if options.merge_in_cassette and options.separate_cassette:
                    cassette_alignment_data.merge_opposite_tandem_mutants(
                        leave_N_mutants=leave_N_mutants,
                        max_count_ratio=options.merge_opposite_count_ratio,
                        leave_method=options.merge_mutant_choice_method,
                        merge_cassette_chromosomes=True,
                        merge_other_chromosomes=False,
                        OUTPUT=CASSETTE_MERGEFILE)
            # 3) count adjacent mutants, even if not doing any merging
            # Actually there's always a count done after each merge, but that count doesn't print the details to anything,
            #  so just run it again here with detail-printing - MAYBE-TODO find a more efficient way instead of counting twice?
            all_alignment_data.count_adjacent_mutants(
                max_distance_to_print=options.adjacent_max_distance,
                max_distance_to_count=10000,
                count_cassette_chromosomes=options.merge_in_cassette,
                count_other_chromosomes=options.merge_in_other_chrom,
                OUTPUT=MERGEFILE)
            if options.separate_cassette:
                cassette_alignment_data.count_adjacent_mutants(
                    max_distance_to_print=options.adjacent_max_distance,
                    max_distance_to_count=10,
                    count_cassette_chromosomes=True,
                    count_other_chromosomes=False,
                    OUTPUT=CASSETTE_MERGEFILE)
            # MAYBE-TODO make an option for max_distance_to_count?  I'm using a much lower one for cassette because it's so dense.
    # since there's no optional as/with statement, just remove the cassette_merging_outfile if unwanted
    if not options.separate_cassette:
        os.remove(cassette_merging_outfile)

    ### optionally parse gene position/info files and look up the genes for each mutant in the data
    #  (only done for the main dataset, not for the separate cassette one)
    if options.gene_position_reference_file is not None:
        genefile = options.gene_position_reference_file
        if options.verbosity_level > 1:
            print "adding genes from file %s to mutant data - time %s." % (
                genefile, time.ctime())
        all_alignment_data.find_genes_for_mutants(
            genefile,
            detailed_features=options.detailed_gene_features,
            N_run_groups=options.N_detail_run_groups,
            verbosity_level=options.verbosity_level)

        # if we have gene info, optionally also add annotation
        #  (so the annotation file is ignored if no gene position file was given)
        if options.gene_annotation_file:
            if options.verbosity_level > 1:
                print "adding gene annotation from file %s - time %s." % (
                    options.gene_annotation_file, time.ctime())
            all_alignment_data.add_gene_annotation(
                options.gene_annotation_file,
                if_standard_Phytozome_file=options.
                annotation_file_standard_type,
                print_info=(options.verbosity_level >= 2))

    ### output data to files
    save_dataset_files(all_alignment_data, outfile, options.verbosity_level,
                       True, options.count_cassette, options.count_other,
                       options.merge_boundary_features, options.sort_data_key,
                       options.N_sequences_per_group, options)
    # TODO write some info about all the other files that go with this one (pickle, merging-info, *cassette*)

    # the cassette dataset is saved with 0 in place of the verbosity level and False for both count arguments
    if options.separate_cassette:
        save_dataset_files(cassette_alignment_data, cassette_outfile, 0, True,
                           False, False, options.merge_boundary_features,
                           options.sort_data_key,
                           options.N_sequences_per_group, options)
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Steps, in order: 
     - derive a few options and the extra output filenames (*_merging-info.txt, *_cassette.txt, 
        *_cassette_merging-info.txt) from outfile;
     - make an empty dataset (plus a separate cassette dataset if options.separate_cassette);
     - add discarded/non-aligned read counts from the metadata files to the dataset summaries;
     - parse each SAM infile (and its matching *_cassette.sam file, if present) into the dataset(s);
     - optionally remove mutants that are / are not present in other mutant files;
     - optionally merge adjacent and opposite-tandem mutants, then count adjacent mutants, 
        writing details to the merging-info file(s);
     - optionally look up genes and gene annotation for the mutants;
     - write the dataset(s) with save_dataset_files.
    There is no return value - all the results go to the output files.

    The options argument should be generated by an optparse parser.
    """
    ### parse/process/reformat some options
    # putting cassette mutants in a separate dataset implies ignoring them in the main one
    options.ignore_cassette |= options.separate_cassette
    # the command-line options are negative (dont_*), so convert them to positive versions
    options.count_cassette = not options.dont_count_cassette
    options.count_other = not options.dont_count_other
    options.merge_boundary_features = not options.dont_merge_boundary_features
    # MAYBE-TODO change outfile to a folder?  Since it'll have three things in it now...
    # all the extra output filenames are based on outfile with its extension removed
    outfile_basename = os.path.splitext(outfile)[0]
    mutant_merging_outfile = outfile_basename + '_merging-info.txt'
    # MAYBE-TODO let -C take an optional argument to put the cassette files elsewhere?
    cassette_outfile = outfile_basename + '_cassette.txt'
    cassette_merging_outfile = outfile_basename + '_cassette_merging-info.txt'

    ### generate empty alignment set object with basic read position/orientation properties defined by options
    if options.Carette:     dataset_class = mutant_Carette.Insertional_mutant_pool_dataset_Carette
    else:                   dataset_class = mutant_analysis_classes.Insertional_mutant_pool_dataset
    all_alignment_data = dataset_class(options.read_cassette_end, options.read_direction=='reverse')
    # cassette_alignment_data only exists if options.separate_cassette is true, 
    #  so every later use of it has to be inside an "if options.separate_cassette:" clause
    if options.separate_cassette:
        cassette_alignment_data = dataset_class(options.read_cassette_end, options.read_direction=='reverse')
    # MAYBE-TODO refactor the whole bunch of "if options.separate_cassette:" clauses to avoid code duplication?

    ### parse preprocessing/alignment metadata file to get discarded/not-aligned/etc readcounts, pass to all_alignment_data
    #   (all_alignment_data initializes them to 'unknown', so if file is not given or can't be found/parsed, do nothing)
    N_discarded, N_wrong_start, N_no_cassette, N_other_end, N_non_aligned, N_unaligned, N_multiple = \
            get_info_from_metadata_files(infiles, options.input_metadata_file, options.read_cassette_end, options.verbosity_level)
    # sanity-check that the discarded subtotals add up, but only if the first two subtotals are known.
    # NOTE(review): N_other_end isn't checked for 'unknown' here - presumably it's always a number; confirm.
    # NOTE(review): this is an assert, so the check is skipped when python is run with -O.
    if 'unknown' not in (N_wrong_start, N_no_cassette):
        assert N_discarded == N_wrong_start+N_no_cassette+N_other_end,\
                "Discarded subtotals don't add up to discarded total! %s+%s+%s != %s"%(N_wrong_start,N_no_cassette,
                                                                                       N_other_end,N_discarded)
    # the same read counts are added to both the main and the cassette dataset summaries
    all_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
    if options.separate_cassette:
        cassette_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
    all_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
    if options.separate_cassette:
        cassette_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
    # MAYBE-TODO also get the final total number of reads from the metadata infile and make sure it's the same 
    #   as the number of processed reads I get from all_alignment_data.print_summary()?

    ### parse input file and store data - the add_alignment_reader_to_data function here does pretty much all the work!
    for infile in infiles:
        # if this is a new-style *_genomic-unique.sam file and has a matching *_cassette.sam file, parse that file too
        part_infiles = [infile]
        if infile.endswith('_genomic-unique.sam'):
            cassette_file = infile[:-len('_genomic-unique.sam')] + '_cassette.sam'
            if os.path.exists(cassette_file):
                part_infiles.append(cassette_file)
        for part_infile in part_infiles:
            # initialize a parser for the SAM infile
            if options.verbosity_level>1: 
                print "parsing input file %s - time %s."%(part_infile, time.ctime())
            infile_reader = HTSeq.SAM_Reader(part_infile)
            # fill the new alignment set object with data from the infile parser
            all_alignment_data.add_alignment_reader_to_data(infile_reader, 
                                        uncollapse_read_counts = options.input_collapsed_to_unique, 
                                        ignore_cassette = options.ignore_cassette, cassette_only = False, 
                                        treat_unknown_as_match = options.treat_unknown_as_match)
            # the cassette dataset gets the same reader, but only takes the cassette alignments from it
            # NOTE(review): this assumes the reader object can be iterated over a second time - confirm.
            if options.separate_cassette:
                cassette_alignment_data.add_alignment_reader_to_data(infile_reader, 
                                        uncollapse_read_counts = options.input_collapsed_to_unique, 
                                        ignore_cassette = False, cassette_only = True, 
                                        treat_unknown_as_match = options.treat_unknown_as_match)

    ### optionally remove mutants based on another dataset - BEFORE adjacent mutant counting/merging
    # remove mutants that ARE present in another file (also do it for cassette mutants if those are separate)
    #  (the option value can be a comma-separated list of files - each one is applied in turn)
    if options.remove_mutants_from_file:
        for other_file in options.remove_mutants_from_file.split(','):
            other_dataset = mutant_analysis_classes.read_mutant_file(other_file)
            all_alignment_data.remove_mutants_in_other_dataset(other_dataset, 
                     readcount_min=options.remove_from_file_readcount_min, perfect_reads=options.remove_from_file_min_is_perfect)
            if options.separate_cassette:
                cassette_alignment_data.remove_mutants_in_other_dataset(other_dataset, 
                     readcount_min=options.remove_from_file_readcount_min, perfect_reads=options.remove_from_file_min_is_perfect)
    # remove mutants that are NOT present in another file (also do it for cassette mutants if those are separate)
    #  (unlike the case above, only a single file is supported here)
    # TODO should I implement using multiple files here too?
    if options.remove_mutants_not_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_not_from_file)
        all_alignment_data.remove_mutants_not_in_other_dataset(other_dataset, 
                 readcount_min=options.remove_not_from_file_readcount_min, perfect_reads=options.remove_not_from_file_min_is_perfect)
        if options.separate_cassette:
            cassette_alignment_data.remove_mutants_not_in_other_dataset(other_dataset, 
                 readcount_min=options.remove_not_from_file_readcount_min, perfect_reads=options.remove_not_from_file_min_is_perfect)

    ### optionally merge some mutant categories
    # both merging-info files are always opened (and so created) here; 
    #  the cassette one is deleted again afterwards if it wasn't wanted
    with open(mutant_merging_outfile, 'w') as MERGEFILE:
      with open(cassette_merging_outfile, 'w') as CASSETTE_MERGEFILE:
        # 1) adjacent same-strand mutants (since they're probably just artifacts of indels during deepseq/PCR)
        #  (leave_N_mutants is used as an integer if the option can be converted to one, otherwise passed on as it is; 
        #   if a count ratio is given, that overrides it with the special 'use_ratio' value)
        if options.merge_adjacent_mutants: 
            try:                                                leave_N_mutants = int(options.merge_adjacent_leave_mutants)
            except ValueError:                                  leave_N_mutants = options.merge_adjacent_leave_mutants
            if options.merge_adjacent_count_ratio is not None:  leave_N_mutants = 'use_ratio'
            all_alignment_data.merge_adjacent_mutants(merge_max_distance=options.adjacent_max_distance, 
                      leave_N_mutants=leave_N_mutants, min_count_ratio = options.merge_adjacent_count_ratio, 
                      leave_method=options.merge_mutant_choice_method, merge_cassette_chromosomes = options.merge_in_cassette, 
                      merge_other_chromosomes = options.merge_in_other_chrom, OUTPUT = MERGEFILE)
            # the separate cassette dataset is only merged if cassette merging was requested
            if options.merge_in_cassette and options.separate_cassette:
                cassette_alignment_data.merge_adjacent_mutants(merge_max_distance = options.adjacent_max_distance, 
                          leave_N_mutants=leave_N_mutants, min_count_ratio = options.merge_adjacent_count_ratio, 
                          leave_method=options.merge_mutant_choice_method, merge_cassette_chromosomes = True, 
                          merge_other_chromosomes = False, OUTPUT = CASSETTE_MERGEFILE)
        # 2) opposite-strand same-position mutants (since they're probably just tail-to-tail cassette tandems)
        #  (same leave_N_mutants logic as for adjacent mutants above, but using the merge_opposite_* options)
        if options.merge_opposite_tandem_mutants: 
            try:                                                leave_N_mutants = int(options.merge_opposite_leave_mutants)
            except ValueError:                                  leave_N_mutants = options.merge_opposite_leave_mutants
            if options.merge_opposite_count_ratio is not None:  leave_N_mutants = 'use_ratio'
            all_alignment_data.merge_opposite_tandem_mutants(leave_N_mutants=leave_N_mutants, 
                          max_count_ratio = options.merge_opposite_count_ratio, leave_method=options.merge_mutant_choice_method, 
                          merge_cassette_chromosomes = options.merge_in_cassette, 
                          merge_other_chromosomes = options.merge_in_other_chrom, OUTPUT = MERGEFILE)
            if options.merge_in_cassette and options.separate_cassette:
                cassette_alignment_data.merge_opposite_tandem_mutants(leave_N_mutants=leave_N_mutants, 
                              max_count_ratio = options.merge_opposite_count_ratio, leave_method=options.merge_mutant_choice_method, 
                              merge_cassette_chromosomes = True, merge_other_chromosomes = False, OUTPUT = CASSETTE_MERGEFILE)
        # 3) count adjacent mutants, even if not doing any merging
        # Actually there's always a count done after each merge, but that count doesn't print the details to anything, 
        #  so just run it again here with detail-printing - MAYBE-TODO find a more efficient way instead of counting twice?
        all_alignment_data.count_adjacent_mutants(max_distance_to_print = options.adjacent_max_distance, 
                                  max_distance_to_count = 10000, count_cassette_chromosomes = options.merge_in_cassette, 
                                  count_other_chromosomes = options.merge_in_other_chrom, OUTPUT = MERGEFILE)
        if options.separate_cassette:
            cassette_alignment_data.count_adjacent_mutants(max_distance_to_print = options.adjacent_max_distance, 
                                  max_distance_to_count = 10, count_cassette_chromosomes = True, 
                                  count_other_chromosomes = False, OUTPUT = CASSETTE_MERGEFILE)
        # MAYBE-TODO make an option for max_distance_to_count?  I'm using a much lower one for cassette because it's so dense.
    # since there's no optional as/with statement, just remove the cassette_merging_outfile if unwanted
    if not options.separate_cassette:
        os.remove(cassette_merging_outfile)

    ### optionally parse gene position/info files and look up the genes for each mutant in the data
    #  (only done for the main dataset, not for the separate cassette one)
    if options.gene_position_reference_file is not None:
        genefile = options.gene_position_reference_file
        if options.verbosity_level>1: print "adding genes from file %s to mutant data - time %s."%(genefile, time.ctime())
        all_alignment_data.find_genes_for_mutants(genefile, detailed_features=options.detailed_gene_features, 
                                                  N_run_groups=options.N_detail_run_groups, 
                                                  verbosity_level=options.verbosity_level)

        # if we have gene info, optionally also add annotation
        #  (so the annotation file is ignored if no gene position file was given)
        if options.gene_annotation_file:
            if options.verbosity_level>1: 
                print "adding gene annotation from file %s - time %s."%(options.gene_annotation_file, time.ctime())
            all_alignment_data.add_gene_annotation(options.gene_annotation_file, 
                       if_standard_Phytozome_file=options.annotation_file_standard_type, print_info=(options.verbosity_level >= 2))

    ### output data to files
    save_dataset_files(all_alignment_data, outfile, options.verbosity_level, True, options.count_cassette, options.count_other, 
                         options.merge_boundary_features, options.sort_data_key, options.N_sequences_per_group, options)
    # TODO write some info about all the other files that go with this one (pickle, merging-info, *cassette*)

    # the cassette dataset is saved with 0 in place of the verbosity level and False for both count arguments
    if options.separate_cassette:
        save_dataset_files(cassette_alignment_data, cassette_outfile, 0, True, False, False, 
                             options.merge_boundary_features,  options.sort_data_key, options.N_sequences_per_group, options)