def grab_flanking_regions_from_mutantfile(
        mutant_dataset_infile,
        genome,
        flanksize=200,
        padding_char=".",
        min_readcount=0,
        chromosome_check_function=lambda x: True,
        ignore_both_strand_mutants=False):
    """ Return list of (flanking_seq, readcount) tuples with both-side genomic flanking sequences for insertional mutants.

    Read the mutants from mutant_dataset_infile (using mutant_analysis_classes.read_mutant_file), go over them
     sorted by position, and for each one that passes the filters use flanking_region_from_pos to get
     the flanking sequence from genome (a chrom_name:seq dict), passing it the mutant position, chromosome and strand
     along with flanksize and padding_char.

    Filter the mutants:
     - by chromosome - ignore mutants in chromosomes for which chromosome_check_function returns False
     - by readcount - ignore mutants with total readcount below min_readcount
     - by strand - the strand must be exactly "+" or "-"; both-strand (merged tandem) mutants (strand "both")
        will be ignored if ignore_both_strand_mutants is True, otherwise ValueError will be raised;
        ValueError will be raised for any other unexpected strand value.
     - cassette tandems - ignore mutants on cassette chromosomes (per mutant_analysis_classes.is_cassette_chromosome)
        whose position is 0 or the chromosome length, i.e. insertions that map to the start or end of the cassette.

    For all remaining mutants, append (flanking region seq, total_readcount) to the output list, and return it.
    """
    dataset = mutant_analysis_classes.read_mutant_file(mutant_dataset_infile)
    flanking_region_count_list = []
    for mutant in sorted(dataset, key=lambda m: m.position):
        position = mutant.position
        # filter out mutants in wrong chromosomes or with wrong readcounts
        if not chromosome_check_function(position.chromosome):
            continue
        if mutant.total_read_count < min_readcount:
            continue
        # filter out both-stranded mutants if desired.
        #  Compare against a tuple rather than the string "+-": string membership is a substring test,
        #  so "" and "+-" would wrongly be accepted as valid strands (and a non-string strand would raise TypeError).
        if position.strand not in ("+", "-"):
            if position.strand == "both" and ignore_both_strand_mutants:
                continue
            raise ValueError("Unexpected mutant strand! %s" % mutant.position)
        # grab mutant position/chromosome
        position_before_insertion = position.min_position
        # ignore cassette tandems (i.e. insertions that map to start or end of cassette)
        if mutant_analysis_classes.is_cassette_chromosome(position.chromosome):
            if position_before_insertion in (0, len(genome[position.chromosome])):
                continue
        # grab the actual flanking sequence, with padding, correct orientation etc
        full_flanking_seq = flanking_region_from_pos(
            position_before_insertion,
            position.chromosome,
            position.strand,
            genome,
            flanksize,
            padding_char)
        # append the sequence and readcount to output data
        flanking_region_count_list.append((full_flanking_seq, mutant.total_read_count))
    return flanking_region_count_list
def merge_dataset_files(file_list=None, file_glob_pattern=None):
    """ Return single mutant dataset from adding all the input ones together.

    Exactly one of the two inputs must be provided:
     - file_list - a list of dataset filenames
     - file_glob_pattern - a glob pattern, expanded to a filename list with glob.glob

    Each file is read with mutant_analysis_classes.read_mutant_file and merged (with merge_other_dataset)
     into a new mutant_analysis_classes.Insertional_mutant_pool_dataset, which is returned.

    Raise ValueError if both or neither of file_list and file_glob_pattern are provided
     (an empty list counts as not provided).
    """
    # the default is None rather than a shared mutable [] - None and [] are treated the same here
    if bool(file_list) == bool(file_glob_pattern):
        raise ValueError("Must provide exactly one of file_list and file_glob_pattern!")
    if file_glob_pattern:
        file_list = glob.glob(file_glob_pattern)
    # make empty dataset
    full_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset()
    # read each listed dataset file, merge into the full dataset, then delete single dataset
    #  (so that only one single dataset at a time is held alongside the full one)
    for infile in file_list:
        curr_dataset = mutant_analysis_classes.read_mutant_file(infile)
        full_dataset.merge_other_dataset(curr_dataset)
        del curr_dataset
    return full_dataset
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Read each infile into a dataset (unpickling it if the name ends in '.pickle', otherwise parsing it
     with mutant_analysis_classes.Insertional_mutant_pool_dataset), join them into one multi-dataset object,
     optionally remove mutants based on another dataset and add gene annotation (depending on options).
    Print final dataset to outfile (if given); return final multi-dataset object and the list of dataset names in order.

    Dataset names are taken from options.dataset_names (comma-separated) if given, otherwise from the infile basenames.
    Raise ValueError if the number of provided dataset names doesn't match the number of infiles,
     or if the dataset names aren't unique (the datasets are stored by name, so a repeated name
     would silently replace an earlier dataset).

    Amount of information printed to stdout depends on options.verbosity_level: at level 1 just the mutant counts,
     at levels above 1 progress messages and the full dataset summaries instead of the counts.

    The options argument should be generated by an optparse parser.
    """
    verbosity = options.verbosity_level

    # figure out the dataset names - either given explicitly, or based on the infile names
    if options.dataset_names:
        dataset_names = options.dataset_names.split(',')
        if len(dataset_names) != len(infiles):
            raise ValueError("If dataset names are provided via -D option, you must provide the same number of names "
                             + "as the total number of infiles! We have %s names and %s infiles."
                             % (len(dataset_names), len(infiles)))
    else:
        dataset_names = [os.path.splitext(os.path.basename(infile))[0] for infile in infiles]
    if len(set(dataset_names)) != len(dataset_names):
        raise ValueError("Dataset names must be unique, otherwise some datasets would be silently dropped! "
                         + "Names are %s" % ', '.join(dataset_names))

    # parse all infiles, print summaries to stdout if requested
    all_datasets = {}
    for dataset_name, infile in zip(dataset_names, infiles):
        if verbosity > 1:
            print("parsing input file %s - time %s." % (infile, time.ctime()))
        if infile.endswith('.pickle'):
            current_dataset = unpickle(infile)
        else:
            current_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(infile=infile)
            # NOTE(review): the adjacent-mutant count is assumed to be needed only for datasets parsed
            #  from non-pickle files - confirm that pickled datasets shouldn't be re-counted too.
            current_dataset.count_adjacent_mutants(OUTPUT=None)
            # note - read_data_from_file doesn't deal with merging/counting info, so that will be wrong/missing
        all_datasets[dataset_name] = current_dataset
        # check the higher verbosity level first - in the opposite order the summary branch can never run
        if verbosity > 1:
            current_dataset.print_summary()
        elif verbosity > 0:
            print("%s mutants in file %s" % (len(current_dataset), infile))

    # merge datasets into one multi-dataset object
    if verbosity > 1:
        print("merging the mutant data into combined dataset - time %s." % (time.ctime()))
    multi_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(multi_dataset=True)
    multi_dataset.populate_multi_dataset(all_datasets, overwrite=False, check_gene_data=True)
    # make sure the datasets are in the same order as they were given on the command-line
    #  (using all_datasets to initialize multi_dataset didn't give an order, since all_datasets is a dictionary)
    multi_dataset.dataset_order = dataset_names
    # print varying amounts of summary data to stdout (again, higher verbosity level checked first)
    if verbosity > 1:
        multi_dataset.print_summary()
    elif verbosity > 0:
        print("total %s mutants present in combined dataset" % (len(multi_dataset)))

    ### optionally remove mutants based on another dataset
    if options.remove_mutants_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_from_file)
        old_N = len(multi_dataset)
        multi_dataset.remove_mutants_based_on_other_dataset(other_dataset,
                                                            readcount_min=options.remove_mutants_readcount_min,
                                                            perfect_reads=options.remove_mutants_min_is_perfect)
        if verbosity > 0:
            new_N = len(multi_dataset)
            print("removed %s mutants based on %s - %s mutants remaining in combined dataset"
                  % (old_N - new_N, options.remove_mutants_from_file, new_N))

    # if requested, add gene annotation info from separate file
    if options.gene_annotation_file:
        if verbosity > 1:
            print("adding gene annotation from file %s - time %s." % (options.gene_annotation_file, time.ctime()))
        multi_dataset.add_gene_annotation(options.gene_annotation_file,
                                          if_standard_Cre_file=options.annotation_file_is_standard)

    # print full data to outfile, unless there is no outfile name given
    if outfile:
        if verbosity > 1:
            print("printing combined dataset output to file %s - time %s." % (outfile, time.ctime()))
        with open(outfile, 'w') as OUTFILE:
            write_header_data(OUTFILE, options)
            OUTFILE.write("### DATASET SUMMARIES:\n")
            multi_dataset.print_summary(OUTPUT=OUTFILE, line_prefix="#  ", header_prefix="## ")
            OUTFILE.write("### HEADER AND DATA:\n")
            multi_dataset.print_data(OUTPUT=OUTFILE, sort_data_by=options.sort_data_key, header_line=True)
    # TODO make a *.pickle outfile as well?
    return multi_dataset, dataset_names
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Build a mutant dataset from the SAM alignment files in infiles, optionally remove/merge some mutants
     and add gene information (all depending on options), and write the results with save_dataset_files.

    Arguments:
     - infiles - list of SAM alignment filenames; for each one ending in '_genomic-unique.sam',
        the matching '*_cassette.sam' file is parsed as well if it exists.
     - outfile - main output filename; the same name with the extension stripped is also used as the base for
        the '_merging-info.txt', '_cassette.txt' and '_cassette_merging-info.txt' files.
     - options - should be generated by an optparse parser.  Note that it's modified here: ignore_cassette,
        count_cassette, count_other and merge_boundary_features are (re)set based on other options.

    If options.separate_cassette is true, a second dataset is filled with cassette_only=True
     and processed/saved in parallel with the main one.

    There is no explicit return value.
    """
    ### parse/process/reformat some options
    # separating the cassette mutants implies ignoring them in the main dataset
    options.ignore_cassette |= options.separate_cassette
    options.count_cassette = not options.dont_count_cassette
    options.count_other = not options.dont_count_other
    options.merge_boundary_features = not options.dont_merge_boundary_features
    # MAYBE-TODO change outfile to a folder?  Since it'll have three things in it now...
    outfile_basename = os.path.splitext(outfile)[0]
    mutant_merging_outfile = outfile_basename + '_merging-info.txt'
    # MAYBE-TODO let -C take an optional argument to put the cassette files elsewhere?
    cassette_outfile = outfile_basename + '_cassette.txt'
    cassette_merging_outfile = outfile_basename + '_cassette_merging-info.txt'

    ### generate empty alignment set object with basic read position/orientation properties defined by options
    if options.Carette:
        dataset_class = mutant_Carette.Insertional_mutant_pool_dataset_Carette
    else:
        dataset_class = mutant_analysis_classes.Insertional_mutant_pool_dataset
    all_alignment_data = dataset_class(options.read_cassette_end, options.read_direction == 'reverse')
    # cassette_alignment_data only exists if options.separate_cassette is true - all later uses are guarded by it
    if options.separate_cassette:
        cassette_alignment_data = dataset_class(options.read_cassette_end, options.read_direction == 'reverse')
    # MAYBE-TODO refactor the whole bunch of "if options.separate_cassette:" clauses to avoid code duplication?

    ### parse preprocessing/alignment metadata file to get discarded/not-aligned/etc readcounts, pass to all_alignment_data
    # (all_alignment_data initializes them to 'unknown', so if file is not given or can't be found/parsed, do nothing)
    N_discarded, N_wrong_start, N_no_cassette, N_other_end, N_non_aligned, N_unaligned, N_multiple = \
        get_info_from_metadata_files(infiles, options.input_metadata_file, options.read_cassette_end, options.verbosity_level)
    # sanity-check the subtotals, but only if both of these counts are actual values rather than 'unknown'
    if 'unknown' not in (N_wrong_start, N_no_cassette):
        assert N_discarded == N_wrong_start+N_no_cassette+N_other_end,\
            "Discarded subtotals don't add up to discarded total! %s+%s+%s != %s"%(N_wrong_start,N_no_cassette, N_other_end,N_discarded)
    # the same read counts are added to both the main and the cassette dataset summaries
    all_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
    if options.separate_cassette:
        cassette_alignment_data.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
    all_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
    if options.separate_cassette:
        cassette_alignment_data.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
    # MAYBE-TODO also get the final total number of reads from the metadata infile and make sure it's the same
    #   as the number of processed reads I get from all_alignment_data.print_summary()?

    ### parse input file and store data - the add_alignment_reader_to_data function here does pretty much all the work!
    for infile in infiles:
        # if this is a new-style *_genomic-unique.sam file and has a matching *_cassette.sam file, parse that file too
        part_infiles = [infile]
        if infile.endswith('_genomic-unique.sam'):
            cassette_file = infile[:-len('_genomic-unique.sam')] + '_cassette.sam'
            if os.path.exists(cassette_file):
                part_infiles.append(cassette_file)
        for part_infile in part_infiles:
            # initialize a parser for the SAM infile
            if options.verbosity_level > 1:
                print "parsing input file %s - time %s." % (part_infile, time.ctime())
            infile_reader = HTSeq.SAM_Reader(part_infile)
            # fill the new alignment set object with data from the infile parser
            all_alignment_data.add_alignment_reader_to_data(
                infile_reader,
                uncollapse_read_counts=options.input_collapsed_to_unique,
                ignore_cassette=options.ignore_cassette,
                cassette_only=False,
                treat_unknown_as_match=options.treat_unknown_as_match)
            # the same reader object is passed again for the cassette-only dataset
            # NOTE(review): this relies on the reader being iterable more than once - confirm against HTSeq
            if options.separate_cassette:
                cassette_alignment_data.add_alignment_reader_to_data(
                    infile_reader,
                    uncollapse_read_counts=options.input_collapsed_to_unique,
                    ignore_cassette=False,
                    cassette_only=True,
                    treat_unknown_as_match=options.treat_unknown_as_match)

    ### optionally remove mutants based on another dataset - BEFORE adjacent mutant counting/merging
    # remove mutants that ARE present in another file (also do it for cassette mutants if those are separate)
    #  (the option value is a comma-separated list of filenames, each one applied in turn)
    if options.remove_mutants_from_file:
        for other_file in options.remove_mutants_from_file.split(','):
            other_dataset = mutant_analysis_classes.read_mutant_file(other_file)
            all_alignment_data.remove_mutants_in_other_dataset(
                other_dataset,
                readcount_min=options.remove_from_file_readcount_min,
                perfect_reads=options.remove_from_file_min_is_perfect)
            if options.separate_cassette:
                cassette_alignment_data.remove_mutants_in_other_dataset(
                    other_dataset,
                    readcount_min=options.remove_from_file_readcount_min,
                    perfect_reads=options.remove_from_file_min_is_perfect)
    # remove mutants that are NOT present in another file (also do it for cassette mutants if those are separate)
    #  (unlike above, the option value is used as a single filename here)
    # TODO should I implement using multiple files here too?
    if options.remove_mutants_not_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_not_from_file)
        all_alignment_data.remove_mutants_not_in_other_dataset(
            other_dataset,
            readcount_min=options.remove_not_from_file_readcount_min,
            perfect_reads=options.remove_not_from_file_min_is_perfect)
        if options.separate_cassette:
            cassette_alignment_data.remove_mutants_not_in_other_dataset(
                other_dataset,
                readcount_min=options.remove_not_from_file_readcount_min,
                perfect_reads=options.remove_not_from_file_min_is_perfect)

    ### optionally merge some mutant categories
    # both merging-info files are always opened for writing here (so any existing files with those names
    #  are overwritten); the cassette one is removed again below if options.separate_cassette is false
    with open(mutant_merging_outfile, 'w') as MERGEFILE:
        with open(cassette_merging_outfile, 'w') as CASSETTE_MERGEFILE:
            # 1) adjacent same-strand mutants (since they're probably just artifacts of indels during deepseq/PCR)
            if options.merge_adjacent_mutants:
                # use the option value as an int if it converts to one, otherwise as given;
                #  if a count ratio is given, that overrides it
                try:
                    leave_N_mutants = int(options.merge_adjacent_leave_mutants)
                except ValueError:
                    leave_N_mutants = options.merge_adjacent_leave_mutants
                if options.merge_adjacent_count_ratio is not None:
                    leave_N_mutants = 'use_ratio'
                all_alignment_data.merge_adjacent_mutants(
                    merge_max_distance=options.adjacent_max_distance,
                    leave_N_mutants=leave_N_mutants,
                    min_count_ratio=options.merge_adjacent_count_ratio,
                    leave_method=options.merge_mutant_choice_method,
                    merge_cassette_chromosomes=options.merge_in_cassette,
                    merge_other_chromosomes=options.merge_in_other_chrom,
                    OUTPUT=MERGEFILE)
                if options.merge_in_cassette and options.separate_cassette:
                    cassette_alignment_data.merge_adjacent_mutants(
                        merge_max_distance=options.adjacent_max_distance,
                        leave_N_mutants=leave_N_mutants,
                        min_count_ratio=options.merge_adjacent_count_ratio,
                        leave_method=options.merge_mutant_choice_method,
                        merge_cassette_chromosomes=True,
                        merge_other_chromosomes=False,
                        OUTPUT=CASSETTE_MERGEFILE)
            # 2) opposite-strand same-position mutants (since they're probably just tail-to-tail cassette tandems)
            if options.merge_opposite_tandem_mutants:
                # same int-or-as-given/ratio-override logic as for adjacent mutants, with the opposite-tandem options
                try:
                    leave_N_mutants = int(options.merge_opposite_leave_mutants)
                except ValueError:
                    leave_N_mutants = options.merge_opposite_leave_mutants
                if options.merge_opposite_count_ratio is not None:
                    leave_N_mutants = 'use_ratio'
                all_alignment_data.merge_opposite_tandem_mutants(
                    leave_N_mutants=leave_N_mutants,
                    max_count_ratio=options.merge_opposite_count_ratio,
                    leave_method=options.merge_mutant_choice_method,
                    merge_cassette_chromosomes=options.merge_in_cassette,
                    merge_other_chromosomes=options.merge_in_other_chrom,
                    OUTPUT=MERGEFILE)
                if options.merge_in_cassette and options.separate_cassette:
                    cassette_alignment_data.merge_opposite_tandem_mutants(
                        leave_N_mutants=leave_N_mutants,
                        max_count_ratio=options.merge_opposite_count_ratio,
                        leave_method=options.merge_mutant_choice_method,
                        merge_cassette_chromosomes=True,
                        merge_other_chromosomes=False,
                        OUTPUT=CASSETTE_MERGEFILE)
            # 3) count adjacent mutants, even if not doing any merging
            # Actually there's always a count done after each merge, but that count doesn't print the details to anything,
            #  so just run it again here with detail-printing - MAYBE-TODO find a more efficient way instead of counting twice?
            all_alignment_data.count_adjacent_mutants(
                max_distance_to_print=options.adjacent_max_distance,
                max_distance_to_count=10000,
                count_cassette_chromosomes=options.merge_in_cassette,
                count_other_chromosomes=options.merge_in_other_chrom,
                OUTPUT=MERGEFILE)
            if options.separate_cassette:
                cassette_alignment_data.count_adjacent_mutants(
                    max_distance_to_print=options.adjacent_max_distance,
                    max_distance_to_count=10,
                    count_cassette_chromosomes=True,
                    count_other_chromosomes=False,
                    OUTPUT=CASSETTE_MERGEFILE)
            # MAYBE-TODO make an option for max_distance_to_count?  I'm using a much lower one for cassette because it's so dense.
    # since there's no optional as/with statement, just remove the cassette_merging_outfile if unwanted
    if not options.separate_cassette:
        os.remove(cassette_merging_outfile)

    ### optionally parse gene position/info files and look up the genes for each mutant in the data
    #  (only done for the main dataset, not for the separate cassette one)
    if options.gene_position_reference_file is not None:
        genefile = options.gene_position_reference_file
        if options.verbosity_level > 1:
            print "adding genes from file %s to mutant data - time %s." % (genefile, time.ctime())
        all_alignment_data.find_genes_for_mutants(
            genefile,
            detailed_features=options.detailed_gene_features,
            N_run_groups=options.N_detail_run_groups,
            verbosity_level=options.verbosity_level)
        # if we have gene info, optionally also add annotation
        if options.gene_annotation_file:
            if options.verbosity_level > 1:
                print "adding gene annotation from file %s - time %s." % (options.gene_annotation_file, time.ctime())
            all_alignment_data.add_gene_annotation(
                options.gene_annotation_file,
                if_standard_Phytozome_file=options.annotation_file_standard_type,
                print_info=(options.verbosity_level >= 2))

    ### output data to files
    save_dataset_files(all_alignment_data, outfile, options.verbosity_level, True, options.count_cassette,
                       options.count_other, options.merge_boundary_features, options.sort_data_key,
                       options.N_sequences_per_group, options)
    # TODO write some info about all the other files that go with this one (pickle, merging-info, *cassette*)
    # the cassette dataset is saved with 0 in place of the verbosity level and False in place of both count_* values
    if options.separate_cassette:
        save_dataset_files(cassette_alignment_data, cassette_outfile, 0, True, False, False,
                           options.merge_boundary_features, options.sort_data_key,
                           options.N_sequences_per_group, options)
def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Fill a mutant dataset from the SAM files in infiles, optionally remove and merge mutants and look up
     gene information (as requested in options), then write everything out with save_dataset_files.
    When options.separate_cassette is set, a second cassette-only dataset goes through the same steps
     and is saved to its own '_cassette' files (named based on outfile).

    The options argument should be generated by an optparse parser (some of its attributes are reset here).
    """

    def _leave_setting(raw_setting, count_ratio):
        """ Return raw_setting as an int if it converts to one, else unchanged; 'use_ratio' if count_ratio is given. """
        try:
            setting = int(raw_setting)
        except ValueError:
            setting = raw_setting
        if count_ratio is not None:
            setting = 'use_ratio'
        return setting

    ### parse/process/reformat some options
    options.ignore_cassette |= options.separate_cassette
    options.count_cassette = not options.dont_count_cassette
    options.count_other = not options.dont_count_other
    options.merge_boundary_features = not options.dont_merge_boundary_features
    separate_cassette = options.separate_cassette
    show_progress = options.verbosity_level > 1

    ### output file names, all based on outfile without its extension
    # MAYBE-TODO change outfile to a folder?  Since it'll have three things in it now...
    # MAYBE-TODO let -C take an optional argument to put the cassette files elsewhere?
    out_prefix = os.path.splitext(outfile)[0]
    merge_info_path = out_prefix + '_merging-info.txt'
    cassette_outfile = out_prefix + '_cassette.txt'
    cassette_merge_info_path = out_prefix + '_cassette_merging-info.txt'

    ### make the empty dataset(s), with read position/orientation properties defined by options
    dataset_class = (mutant_Carette.Insertional_mutant_pool_dataset_Carette if options.Carette
                     else mutant_analysis_classes.Insertional_mutant_pool_dataset)
    all_alignment_data = dataset_class(options.read_cassette_end, options.read_direction == 'reverse')
    # tracked_datasets holds the main dataset, followed by the cassette one if there is one,
    #  for the steps that treat both identically
    tracked_datasets = [all_alignment_data]
    if separate_cassette:
        cassette_alignment_data = dataset_class(options.read_cassette_end, options.read_direction == 'reverse')
        tracked_datasets.append(cassette_alignment_data)

    ### get discarded/not-aligned/etc readcounts from the preprocessing/alignment metadata, add to dataset summaries
    # (the datasets initialize them to 'unknown', so if file is not given or can't be found/parsed, do nothing)
    (N_discarded, N_wrong_start, N_no_cassette, N_other_end,
     N_non_aligned, N_unaligned, N_multiple) = get_info_from_metadata_files(
         infiles, options.input_metadata_file, options.read_cassette_end, options.verbosity_level)
    if 'unknown' not in (N_wrong_start, N_no_cassette):
        assert N_discarded == N_wrong_start + N_no_cassette + N_other_end, \
            "Discarded subtotals don't add up to discarded total! %s+%s+%s != %s" % (
                N_wrong_start, N_no_cassette, N_other_end, N_discarded)
    for dataset in tracked_datasets:
        dataset.summary.add_discarded_reads(N_discarded, N_wrong_start, N_no_cassette, N_other_end)
    for dataset in tracked_datasets:
        dataset.summary.add_nonaligned_reads(N_non_aligned, N_unaligned, N_multiple)
    # MAYBE-TODO also get the final total number of reads from the metadata infile and make sure it's the same
    #   as the number of processed reads I get from all_alignment_data.print_summary()?

    ### parse the input files and store the data - add_alignment_reader_to_data does pretty much all the work!
    genomic_suffix = '_genomic-unique.sam'
    for infile in infiles:
        # a new-style *_genomic-unique.sam file may come with a matching *_cassette.sam file - parse that too
        part_infiles = [infile]
        if infile.endswith(genomic_suffix):
            cassette_file = infile[:-len(genomic_suffix)] + '_cassette.sam'
            if os.path.exists(cassette_file):
                part_infiles.append(cassette_file)
        for part_infile in part_infiles:
            if show_progress:
                print("parsing input file %s - time %s." % (part_infile, time.ctime()))
            infile_reader = HTSeq.SAM_Reader(part_infile)
            all_alignment_data.add_alignment_reader_to_data(
                infile_reader,
                uncollapse_read_counts=options.input_collapsed_to_unique,
                ignore_cassette=options.ignore_cassette,
                cassette_only=False,
                treat_unknown_as_match=options.treat_unknown_as_match)
            if separate_cassette:
                cassette_alignment_data.add_alignment_reader_to_data(
                    infile_reader,
                    uncollapse_read_counts=options.input_collapsed_to_unique,
                    ignore_cassette=False,
                    cassette_only=True,
                    treat_unknown_as_match=options.treat_unknown_as_match)

    ### optionally remove mutants based on other datasets - BEFORE adjacent mutant counting/merging
    # mutants that ARE present in any of the (comma-separated) other files
    if options.remove_mutants_from_file:
        for other_file in options.remove_mutants_from_file.split(','):
            other_dataset = mutant_analysis_classes.read_mutant_file(other_file)
            for dataset in tracked_datasets:
                dataset.remove_mutants_in_other_dataset(
                    other_dataset,
                    readcount_min=options.remove_from_file_readcount_min,
                    perfect_reads=options.remove_from_file_min_is_perfect)
    # mutants that are NOT present in the (single) other file
    # TODO should I implement using multiple files here too?
    if options.remove_mutants_not_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_not_from_file)
        for dataset in tracked_datasets:
            dataset.remove_mutants_not_in_other_dataset(
                other_dataset,
                readcount_min=options.remove_not_from_file_readcount_min,
                perfect_reads=options.remove_not_from_file_min_is_perfect)

    ### optionally merge some mutant categories, writing the details to the merging-info file(s)
    merge_cassette_separately = options.merge_in_cassette and separate_cassette
    with open(merge_info_path, 'w') as MERGEFILE:
        with open(cassette_merge_info_path, 'w') as CASSETTE_MERGEFILE:
            # 1) adjacent same-strand mutants (since they're probably just artifacts of indels during deepseq/PCR)
            if options.merge_adjacent_mutants:
                leave_N_mutants = _leave_setting(options.merge_adjacent_leave_mutants,
                                                 options.merge_adjacent_count_ratio)
                all_alignment_data.merge_adjacent_mutants(
                    merge_max_distance=options.adjacent_max_distance,
                    leave_N_mutants=leave_N_mutants,
                    min_count_ratio=options.merge_adjacent_count_ratio,
                    leave_method=options.merge_mutant_choice_method,
                    merge_cassette_chromosomes=options.merge_in_cassette,
                    merge_other_chromosomes=options.merge_in_other_chrom,
                    OUTPUT=MERGEFILE)
                if merge_cassette_separately:
                    cassette_alignment_data.merge_adjacent_mutants(
                        merge_max_distance=options.adjacent_max_distance,
                        leave_N_mutants=leave_N_mutants,
                        min_count_ratio=options.merge_adjacent_count_ratio,
                        leave_method=options.merge_mutant_choice_method,
                        merge_cassette_chromosomes=True,
                        merge_other_chromosomes=False,
                        OUTPUT=CASSETTE_MERGEFILE)
            # 2) opposite-strand same-position mutants (since they're probably just tail-to-tail cassette tandems)
            if options.merge_opposite_tandem_mutants:
                leave_N_mutants = _leave_setting(options.merge_opposite_leave_mutants,
                                                 options.merge_opposite_count_ratio)
                all_alignment_data.merge_opposite_tandem_mutants(
                    leave_N_mutants=leave_N_mutants,
                    max_count_ratio=options.merge_opposite_count_ratio,
                    leave_method=options.merge_mutant_choice_method,
                    merge_cassette_chromosomes=options.merge_in_cassette,
                    merge_other_chromosomes=options.merge_in_other_chrom,
                    OUTPUT=MERGEFILE)
                if merge_cassette_separately:
                    cassette_alignment_data.merge_opposite_tandem_mutants(
                        leave_N_mutants=leave_N_mutants,
                        max_count_ratio=options.merge_opposite_count_ratio,
                        leave_method=options.merge_mutant_choice_method,
                        merge_cassette_chromosomes=True,
                        merge_other_chromosomes=False,
                        OUTPUT=CASSETTE_MERGEFILE)
            # 3) count adjacent mutants, even if not doing any merging
            # There's always a count done after each merge, but that one doesn't print the details to anything,
            #  so run it again here with detail-printing - MAYBE-TODO find a more efficient way than counting twice?
            all_alignment_data.count_adjacent_mutants(
                max_distance_to_print=options.adjacent_max_distance,
                max_distance_to_count=10000,
                count_cassette_chromosomes=options.merge_in_cassette,
                count_other_chromosomes=options.merge_in_other_chrom,
                OUTPUT=MERGEFILE)
            if separate_cassette:
                # MAYBE-TODO make an option for max_distance_to_count?  Using a much lower one for cassette
                #  because it's so dense.
                cassette_alignment_data.count_adjacent_mutants(
                    max_distance_to_print=options.adjacent_max_distance,
                    max_distance_to_count=10,
                    count_cassette_chromosomes=True,
                    count_other_chromosomes=False,
                    OUTPUT=CASSETTE_MERGEFILE)
    # the cassette merging-info file was opened unconditionally above, so get rid of it if it's unwanted
    if not separate_cassette:
        os.remove(cassette_merge_info_path)

    ### optionally parse gene position/info files and look up the genes for each mutant in the data
    genefile = options.gene_position_reference_file
    if genefile is not None:
        if show_progress:
            print("adding genes from file %s to mutant data - time %s." % (genefile, time.ctime()))
        all_alignment_data.find_genes_for_mutants(
            genefile,
            detailed_features=options.detailed_gene_features,
            N_run_groups=options.N_detail_run_groups,
            verbosity_level=options.verbosity_level)
        # annotation is only added if we have gene info
        if options.gene_annotation_file:
            if show_progress:
                print("adding gene annotation from file %s - time %s." % (options.gene_annotation_file, time.ctime()))
            all_alignment_data.add_gene_annotation(
                options.gene_annotation_file,
                if_standard_Phytozome_file=options.annotation_file_standard_type,
                print_info=(options.verbosity_level >= 2))

    ### output data to files
    save_dataset_files(all_alignment_data, outfile, options.verbosity_level, True,
                       options.count_cassette, options.count_other, options.merge_boundary_features,
                       options.sort_data_key, options.N_sequences_per_group, options)
    # TODO write some info about all the other files that go with this one (pickle, merging-info, *cassette*)
    if separate_cassette:
        save_dataset_files(cassette_alignment_data, cassette_outfile, 0, True,
                           False, False, options.merge_boundary_features,
                           options.sort_data_key, options.N_sequences_per_group, options)