def trim_prefix(prefix_bases, infile, trimmed_outfile, wrong_prefix_outfile=os.devnull, 
               INFOFILE=None, verbosity=1):
    """ Trim prefix_bases from seqs in infile and print to trimmed_outfile; print other seqs to wrong_prefix_outfile.

    Reads fasta or fastq files; outputs fasta files only.
    For each seq in infile, if seq starts with prefix_bases, trim them and print result to trimmed_outfile; 
     otherwise print full seq to wrong_prefix_outfile (default is os.devnull). 
    INFOFILE should be an open file handle to print summary info to, or None; 
     verbosity governs how much is printed to stdout.
    """
    text = "### Trimming %s from start of each sequence in %s (output to %s, untrimmed to %s)\n"%(prefix_bases, infile, 
                                                                                trimmed_outfile, wrong_prefix_outfile)
    # MAYBE-TODO modify so it can output fastq too?
    if INFOFILE is not None:    INFOFILE.write(text+'\n')
    if verbosity>0:             print text
    N_trimmed, N_untrimmed = 0, 0
    with open(trimmed_outfile, 'w') as TRIMMED_OUTFILE:
        with open(wrong_prefix_outfile, 'w') as WRONG_PREFIX_OUTFILE:
            # MAYBE-TODO right now if wrong_prefix_outfile==None, /dev/null is used - it would be faster with a custom file-like object that doesn't touch the OS, but I'm not sure how to write one so it can be opened!  See general_utilities.FAKE_OUTFILE for an already open one.
            name_seq_generator = name_seq_generator_from_fasta_fastq(infile, verbosity>2)
            for name,seq in name_seq_generator:
                if_trimmed = _trim_prefix_single(name, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE)
                if if_trimmed:  N_trimmed += 1
                else:           N_untrimmed += 1

    N_total = N_trimmed + N_untrimmed
    text = "Trimmed sequences: %s\nUntrimmed sequences: %s\n"%(value_and_percentages(N_trimmed, [N_total]), 
                                                               value_and_percentages(N_untrimmed, [N_total]))
    if INFOFILE is not None:    INFOFILE.write(text+'\n')
    if verbosity>1:             print text
    return N_trimmed, N_untrimmed
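
# Usage sketch for trim_prefix (hypothetical filenames, not from the original script):
# reads starting with ACTA get that prefix removed and are written as fasta to the trimmed file,
# all other reads go to the wrong-prefix file, and a short summary goes to the info file.
with open('reads_info.txt', 'w') as EXAMPLE_INFOFILE:
    example_trimmed, example_untrimmed = trim_prefix('ACTA', 'reads.fq', 'reads_trimmed.fa',
                                                     'reads_wrong-start.fa', EXAMPLE_INFOFILE, verbosity=1)
print "trimmed %s reads, %s had a different start"%(example_trimmed, example_untrimmed)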
def name_to_ID_dicts(infile_GFF2, convert_to_singles=True):
    """ Given a JGI-style GFF2 file, return a set of all gene names, and name:transcriptID and name:proteinID dictionaries.

    The input should be the filename.

    Assumes the file has a 'name' annotation on each line (prints an error if not found), 
     and 'transcriptId' and 'proteinId' on some lines; ignores other annotations.

    If a single name can have more than one transcript/protein ID, both returned dictionaries have sets as values; 
     if every name has exactly one ID and convert_to_singles is True, they're converted to plain name:ID dictionaries.
    """
    # go over the file, parse each line and make name:proteinID and name:transcriptID dictionaries
    all_names = set()
    name_to_transcriptIDs = defaultdict(set)
    name_to_proteinIDs = defaultdict(set)
    for fields, annotations in parse_JGI_GFF2_file(infile_GFF2):
        if 'name' not in annotations:
            print "ERROR: line has no 'name' tag! %s"%line
            continue
        name = annotations['name'].strip('"')
        all_names.add(name)
        if 'transcriptId' in annotations:
            name_to_transcriptIDs[name].add(annotations['transcriptId'])
        if 'proteinId' in annotations:
            name_to_proteinIDs[name].add(annotations['proteinId'])
    # print some summaries
    print "Total %s gene names in file; %s have transcript IDs, %s have protein IDs."%(len(all_names), 
                           value_and_percentages(len(name_to_transcriptIDs), [len(all_names)]), 
                           value_and_percentages(len(name_to_proteinIDs), [len(all_names)]))
    genes_with_both_IDs = set(name_to_transcriptIDs) & set(name_to_proteinIDs)
    N_same = sum(name_to_proteinIDs[name]==name_to_transcriptIDs[name] for name in genes_with_both_IDs)
    print "Out of genes that have both (%s), the protein ID and transcript ID are the same for %s."%(len(genes_with_both_IDs), 
                     value_and_percentages(N_same, [len(genes_with_both_IDs)]))
    # See how many unique protein/transcript IDs each name has
    N_transcript_IDs = Counter(len(x) for x in name_to_transcriptIDs.values()) 
    print "Number of unique transcript IDs per gene name, and how many genes have that number: %s"%(
        ', '.join('%s - %s'%(N_IDs,N_genes) for N_IDs,N_genes in sorted(N_transcript_IDs.items())))
    N_protein_IDs = Counter(len(x) for x in name_to_proteinIDs.values())    
    print "Number of unique protein IDs per gene name, and how many genes have that number: %s"%(
        ', '.join('%s - %s'%(N_IDs,N_genes) for N_IDs,N_genes in sorted(N_protein_IDs.items())))
    # Make new dictionaries that just have a single protein/transcript ID per name, instead of a set
    if convert_to_singles and list(N_transcript_IDs.keys()) == [1]:
        name_to_transcriptIDs = {name: transcriptIDs.pop() for name, transcriptIDs in name_to_transcriptIDs.items()}
    if convert_to_singles and list(N_protein_IDs.keys()) == [1]:
        name_to_proteinIDs = {name: proteinIDs.pop() for name, proteinIDs in name_to_proteinIDs.items()}
    return all_names, name_to_transcriptIDs, name_to_proteinIDs
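
# Usage sketch for name_to_ID_dicts (the GFF2 filename here is hypothetical):
# with convert_to_singles=True and one ID per name, the returned dictionaries map each gene name
# directly to its transcript/protein ID, so names can be translated in a single lookup.
example_names, example_name_to_tID, example_name_to_pID = name_to_ID_dicts('JGI_annotation.gff2')
for example_name in sorted(example_names)[:5]:
    print example_name, example_name_to_tID.get(example_name, '-'), example_name_to_pID.get(example_name, '-')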
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        infile, outfile_base = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("\nError: one infile and one outfile base name are required! Got %s"%args)

    indexes = options.index_list.split(',')
    index_OUTFILES = {index: open("%s_%s.fq"%(outfile_base, index), 'w') for index in indexes}
    unmatched_OUTFILE = open("%s_unmatched.fq"%outfile_base, 'w')
    index_counts = {index: 0 for index in indexes}
    unmatched_count = 0
    if options.index_in_sequence:
        index_len = set(len(x) for x in indexes)
        if len(index_len) > 1:
            raise Exception("Indexes need to all have the same lengths!  Found lengths %s from indexes %s"%(index_len, indexes))
        index_len = index_len.pop()
        get_index = lambda name,seq:  seq[:index_len]
        make_output = lambda name, seq, qual, index: ("%s:%s"%(name,index), seq[index_len:], qual[index_len:])
    else:
        get_index = lambda name, seq:  name.split(':')[-1]
        make_output = lambda name, seq, qual, index: (name, seq, qual)
    for (name,seq,qual) in basic_seq_utilities.parse_fastq(infile):
        index = get_index(name, seq) 
        try:
            OUTFILE = index_OUTFILES[index]
            index_counts[index] += 1
        except KeyError:
            OUTFILE = unmatched_OUTFILE
            unmatched_count += 1
        basic_seq_utilities.write_fastq_line(*make_output(name, seq, qual, index), OUTFILE=OUTFILE)
    if not options.quiet:
        total = unmatched_count + sum(index_counts.values())
        print "%s reads:\n%s unmatched\n%s"%(total, value_and_percentages(unmatched_count, [total]), 
                 '\n'.join("%s %s"%(value_and_percentages(count, [total]), index) for (index,count) in index_counts.items()) )
def replicate_reproducibility_info(dataset1,
                                   dataset2,
                                   readcount_cutoffs=[1, 10, 100, 1000],
                                   ratio_cutoffs=[1.2, 1.5, 2, 5, 10],
                                   both_ways=True,
                                   higher_only=False,
                                   real_min_reads_1=None,
                                   real_min_reads_2=None,
                                   quiet=False):
    """ Return/print reproducibility information between two replicates: % of readcounts within 2x of each other, etc. 

    Answers two basic questions, for each N in readcount_cutoffs:
        1) what percent of flanking sequences with N+ reads in one replicate were also observed in the other replicate?  
            (observed at all, not with N+ reads)
        2) what % of the mutants with N+ reads in one replicate have the two replicate readcounts within 2x/etc of each other? 
            Using each value in ratio_cutoffs for the ratio; using real or normalized readcount 
            (normalized is so that if one replicate has 2x more reads total than the other, the 2x ratio is counted as 1x etc; 
             still using raw readcounts for the cutoffs, because using normalized ones would probably be too hard to explain...)
           If higher_only is True, this only excludes readcounts that are 2x/etc HIGHER in dataset1 than in dataset2 (normalized), 
             rather than either higher or lower. 
        Also reports the total number of mutants with N+ reads: total, as a percentage of all mutants, and optionally 
         as a percentage of all "real" mutants, i.e. ones with at least real_min_reads_1 in dataset1 or real_min_reads_2 in dataset2 
          (if real_min_reads_* are both not None).
        Note that we're looking at mutants with >=X reads in one dataset, and putting no cutoff on the other; 
        If both_ways is False, we just do it one way (looking at mutants with N+ counts in A and checking presence/ratio in B); 
         if True, we do it both ways (A to B and B to A) and add the results together.
        Both_ways and higher_only cannot both be True, since the first one implies we're treating the two samples the same 
         and the second that we're treating them differently.

    For example, if we have two replicates A and B, with readcounts of (0,1,3,4) and (1,0,2,5) for the same 4 mutants:
        * looking at mutants with 1+ read (3 in A, 3 in B):
          - out of mutants with 1+ read in A, 2 are present in B; same the other way around - so total 67% present. 
          - out of mutants with 1+ read in A, the A:B ratios are 1:0, 3:2, 4:5 - so 1/3 within 1.3x, 2/3 within 2x (and 5x etc) 
             (and the reverse for mutants with 1+ read in B, leading to the same final counts)
            if we're looking higher_only, then 1/3 ratios are below 1.3x (and even 1x), 2/3 below 2x (and 5x etc)
            (or if we were looking at mutants with 1+ read in B, the B:A ratios are 1:0, 2:3, 5:4 - same except 2 below 1.3x.)
        * looking at mutants with 3+ reads (2 in A, 1 in B):
          - out of mutants with 3+ reads in A, 2 are present in B; out of 1 in B, 1 present in A - so total 100% present.
          - the ratios are 2:3 and 5:4 for the two in A, and 4:5 for the one in B, so 2/3 within 1.3x, 3/3 within 2x.
        (the ratios are the same normalized and unnormalized here; if A was (0,2,6,8) instead, then the normalized ratios from 
         the last section would be the same as above, but the raw ratios would be 2:6, 5:8, 8:5 - 0/3 within 1.3x, 2/3 within 2x.)
    
    The first two inputs should be either lists of integer readcounts (must be for the same list of mutants, in same order!), 
     or mutant_analysis_classes.Insertional_mutant_pool_dataset instances.

    Outputs, five total, in order:
        1-4: four dictionaries with all the readcount cutoffs as keys (readcount cutoff is N), and various values:
          1: number of mutants with N+ reads in one dataset
          2: number of mutants with N+ reads in one dataset AND present in the other dataset
          3-4: two dictionaries with ratio cutoffs as keys, and the values being the number of mutants 
             with N+ reads in one dataset that had a readcount within that ratio cutoff to the other dataset: 
              first dictionary uses raw readcounts to check ratios, the second uses readcounts normalized to total per dataset.
        5: a mutant_totals list: first element is the total number of mutants with non-zero readcounts, 
            second element is the total number of "real" mutants (i.e. ones with at least real_min_reads_1 in dataset1 
             or real_min_reads_2 in dataset2) - second element only present if real_min_reads_* are both not None.

    If quiet is True, don't print the data, just return it.  Note: printing is nicely formatted and has percentages.
    """
    # MAYBE-TODO add example with a lot more mutants in A than B, or such, to illustrate difference between A/B, B/A, and both_ways?
    # check if the arguments are consistent
    if both_ways and higher_only:
        raise ValueError(
            "Both_ways and higher_only arguments cannot both be True!")
    # if the inputs are datasets instead of readcount lists, convert to readcount lists
    #   (join the two datasets first so that both readcount lists are for the same mutant list)
    if isinstance(dataset1[0], int):
        readcounts1, readcounts2 = dataset1, dataset2
        if not len(readcounts1) == len(readcounts2):
            raise ValueError(
                "First two arguments to replicate_reproducibility_info must be same length!"
            )
    else:
        joint_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(
            multi_dataset=True)
        joint_dataset.populate_multi_dataset({'1': dataset1, '2': dataset2})
        readcounts1, readcounts2 = zip(
            *[[mutant.by_dataset[D].total_read_count for D in '12']
              for mutant in joint_dataset])
    # make normalized data for the normalized ratios
    total_1, total_2 = sum(readcounts1), sum(readcounts2)
    norm_readcounts1 = [x / total_1 for x in readcounts1]
    norm_readcounts2 = [x / total_2 for x in readcounts2]
    # we're adding together the comparisons both ways, then redefine the two readcount lists appropriately
    if both_ways:
        readcountsA, readcountsB = readcounts1 + readcounts2, readcounts2 + readcounts1
    else:
        readcountsA, readcountsB = readcounts1, readcounts2
    if both_ways:
        norm_readcountsA, norm_readcountsB = norm_readcounts1 + norm_readcounts2, norm_readcounts2 + norm_readcounts1
    else:
        norm_readcountsA, norm_readcountsB = norm_readcounts1, norm_readcounts2
    # calculate the total number of mutants that are present in dataset A
    total_N_mutants = sum(x > 0 for x in readcountsA)
    # if we got real_min_reads_* data, use that to calculate a separate "real mutant" total and use it
    if real_min_reads_1 is not None and real_min_reads_2 is not None:
        if both_ways:
            total_N_real_mutants = (sum(x >= real_min_reads_1 for x in readcounts1)
                                    + sum(x >= real_min_reads_2 for x in readcounts2))
        else:
            total_N_real_mutants = sum(x >= real_min_reads_1 for x in readcounts1)
        mutant_count_totals = [total_N_mutants, total_N_real_mutants]
        total_descriptions = ['of all', 'of after-standard-cutoffs']
    else:
        mutant_count_totals = [total_N_mutants]
        total_descriptions = ['of all']
    # make a few dictionaries to return the data in
    N_filtered_dict, N_present_in_B_dict, N_within_ratio_raw_dict, N_within_ratio_norm_dict = {}, {}, {}, {}
    # for each readcount cutoff, filter the data by the cutoff in datasetA, then look at presence and ratio in datasetB:
    for readcount_cutoff in readcount_cutoffs:
        # filter data by minimum readcount in A
        filtered_readcount_pairs = filter_data_by_min_reference(
            readcountsA, [readcountsA, readcountsB], readcount_cutoff)
        filtered_norm_readcount_pairs = filter_data_by_min_reference(
            readcountsA, [norm_readcountsA, norm_readcountsB],
            readcount_cutoff)
        N_filtered_in_A = len(filtered_readcount_pairs)
        N_filtered_dict[readcount_cutoff] = N_filtered_in_A
        if not quiet:
            print " * looking at mutants with at least %s reads in at least one dataset - %s" % (
                readcount_cutoff,
                general_utilities.value_and_percentages(
                    N_filtered_in_A,
                    mutant_count_totals,
                    percentage_format_str='%.0f',
                    words_for_percentages=total_descriptions))
        # see how many are present in B
        N_present_in_B = sum(b > 0 for (a, b) in filtered_readcount_pairs)
        if not quiet:
            print "   - out of those, %s are present in the other." % (
                general_utilities.value_and_percentages(
                    N_present_in_B, [N_filtered_in_A]))
        N_present_in_B_dict[readcount_cutoff] = N_present_in_B
        # look at how many mutants are within given B:A readcount ratios (raw and normalized readcounts)
        if not quiet:
            if higher_only:
                print '   - how many mutants have a readcount replicate1:2 ratio exceeding the given value:'
            else:
                print '   - how many mutants are within a given readcount ratio between the replicates:'
        info_strings_raw, info_strings_norm = [], []
        N_within_ratio_raw_dict[readcount_cutoff] = {}
        N_within_ratio_norm_dict[readcount_cutoff] = {}
        for ratio_cutoff in ratio_cutoffs:
            # note that we're always taking a b/a ratio, not a/b, because we know a is non-zero, but we don't know that about b!
            if higher_only:
                # for the higher-only option we want to test a/b <= ratio_cutoff, i.e. 1/ratio_cutoff <= b/a
                N_within_ratio_raw = sum((1 / ratio_cutoff <= b / a)
                                         for a, b in filtered_readcount_pairs)
                N_within_ratio_norm = sum(
                    (1 / ratio_cutoff <= b / a)
                    for a, b in filtered_norm_readcount_pairs)
            else:
                N_within_ratio_raw = sum(
                    (1 / ratio_cutoff <= b / a <= ratio_cutoff)
                    for a, b in filtered_readcount_pairs)
                N_within_ratio_norm = sum(
                    (1 / ratio_cutoff <= b / a <= ratio_cutoff)
                    for a, b in filtered_norm_readcount_pairs)
            N_within_ratio_raw_dict[readcount_cutoff][ratio_cutoff] = N_within_ratio_raw
            N_within_ratio_norm_dict[readcount_cutoff][ratio_cutoff] = N_within_ratio_norm
            info_strings_raw.append(
                '%sx - %s' %
                (ratio_cutoff, 'N/A' if N_filtered_in_A == 0 else '%.0f%%' %
                 (N_within_ratio_raw / N_filtered_in_A * 100)))
            info_strings_norm.append(
                '%sx - %s' %
                (ratio_cutoff, 'N/A' if N_filtered_in_A == 0 else '%.0f%%' %
                 (N_within_ratio_norm / N_filtered_in_A * 100)))
        if not quiet:
            print '       raw readcount ratios       : ' + ', '.join(
                info_strings_raw)
            print '       normalized readcount ratios: ' + ', '.join(
                info_strings_norm)
    return N_filtered_dict, N_present_in_B_dict, N_within_ratio_raw_dict, N_within_ratio_norm_dict, mutant_count_totals
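
# Worked example matching the docstring above, using plain readcount lists for the two replicates
# ((0,1,3,4) and (1,0,2,5) for the same four mutants); this assumes float division is in effect
# for the ratio checks (e.g. via a module-level "from __future__ import division").
example_results = replicate_reproducibility_info([0, 1, 3, 4], [1, 0, 2, 5],
                                                 readcount_cutoffs=[1, 3], ratio_cutoffs=[1.3, 2],
                                                 both_ways=True, quiet=True)
example_N_filtered, example_N_present, example_raw_ratios, example_norm_ratios, example_totals = example_results
# With both_ways=True the counts are summed over both directions, so example_N_filtered[1] is 6
# (3 mutants with 1+ reads in each replicate) and example_N_present[3] is 3 out of example_N_filtered[3]==3.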
def filter_flanking_regions_by_pattern(
    flanking_region_count_list,
    pattern,
    either_orientation=True,
    print_info=True,
    category=None,
    meaning_of_seqs="positions",
    meaning_of_counts="counts",
):
    """ Return separate lists of flanking regions that do and don't match given sequence pattern.
    
    flanking_region_count_list should be a list of (flanking_region, count) pairs (like from grab_flanking_regions_from_mutantfile); 
     the two return values (flanking regions that match and don't match the pattern) are the same format.

    The pattern should be a sequence string (allowed letters are ACTGN). It's considered to be centered around the cut site; 
     the flanking regions likewise.  E.g. if pattern is GNAN, a fl.region of GCAC or TTGCACTT would match, but TTTTGCAC would not. 
    If either_orientation is True, each flanking region will be tried against the pattern in both the forward and the reverse
     orientation, and the returned flanking region will be in the orientation that matched - e.g. if pattern is GNAN, 
     a flanking region of either TTGCACTT or TTCTCCTT would match (forward and rev-compl respectively), 
      and the latter would be returned as rev-compl, AAGGAGAA.

    If print_info is True, some information will be printed about what number/percentage matched and didn't: it'll be given two ways:
     - by flanking region, counting each once, if meaning_of_seqs is not None, and meaning_of_seqs will be used as the description
     - by count, if some counts are not 1 and meaning_of_counts is not None, and meaning_of_counts will be used as the description.
    """
    if not flanking_region_count_list:
        return [], []
    flanking_region_length = get_all_seq_length(zip(*flanking_region_count_list)[0])
    if flanking_region_length % 2:
        raise ValueError("Flanking region length must be an even number!")
    if len(pattern) % 2:
        raise ValueError("Pattern length must be an even number!")
    if len(pattern) > flanking_region_length:
        raise ValueError("Pattern cannot be longer than flanking regions!")
    # pad the pattern to match the flanking region length
    orig_pattern = pattern
    if len(pattern) < flanking_region_length:
        padding_len = int((flanking_region_length - len(pattern)) / 2)
        pattern = "N" * padding_len + pattern + "N" * padding_len
    # go over all the flanking regions:
    flanking_region_count_list_match, flanking_region_count_list_nomatch = [], []
    for (flanking_region, count) in flanking_region_count_list:
        # if the flanking region is padded with .'s, change them to N's to make check_seq_against_pattern take it
        flanking_region = flanking_region.replace(".", "N")
        # if we're looking at both orientations, then first randomize the orientation to avoid bias
        if either_orientation and random.random() < 0.5:
            flanking_region = reverse_complement(flanking_region)
        # if it matches the pattern, save it as a match and go to the next one
        if check_seq_against_pattern(flanking_region, pattern):
            flanking_region_count_list_match.append((flanking_region, count))
            continue
        # or if its rev-compl matches the pattern and either_orientation is True, save it as a match and go on to the next one;
        if either_orientation:
            flanking_region = reverse_complement(flanking_region)
            if check_seq_against_pattern(flanking_region, pattern):
                flanking_region_count_list_match.append((flanking_region, count))
                continue
        # if it didn't match anywhere, save it as a no-match.
        flanking_region_count_list_nomatch.append((flanking_region, count))
    if print_info:
        if meaning_of_seqs is None and meaning_of_counts is None:
            raise ValueError("To get info printed, at least one of meaning_of_seqs/meaning_of_counts must be not None!")
        print_data = "%smatched %s:  " % ("" if category is None else category + " ", orig_pattern)
        if meaning_of_seqs is not None:
            positions_matched, positions_unmatched = (
                len(flanking_region_count_list_match),
                len(flanking_region_count_list_nomatch),
            )
            positions_all = positions_matched + positions_unmatched
            print_data += "%s, unmatched %s/%s" % (
                general_utilities.value_and_percentages(
                    positions_matched, [positions_all], insert_word=meaning_of_seqs
                ),
                positions_unmatched,
                positions_all,
            )
        if meaning_of_counts is not None:
            counts_matched, counts_unmatched = [
                sum(zip(*data)[1]) for data in (flanking_region_count_list_match, flanking_region_count_list_nomatch)
            ]
            counts_all = counts_matched + counts_unmatched
            print_data += ";  %s, unmatched %s/%s." % (
                general_utilities.value_and_percentages(counts_matched, [counts_all], insert_word=meaning_of_counts),
                counts_unmatched,
                counts_all,
            )
        print print_data
    return flanking_region_count_list_match, flanking_region_count_list_nomatch
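
# Usage sketch following the docstring example above (sequences and counts are made up):
# the pattern GNAN is centered on the cut site, so TTGCACTT matches directly, TTCTCCTT matches
# after reverse-complementing (and is returned as AAGGAGAA), and TTTTTTTT matches in neither
# orientation, so it ends up in the no-match list.
example_regions = [('TTGCACTT', 10), ('TTCTCCTT', 5), ('TTTTTTTT', 3)]
example_match, example_nomatch = filter_flanking_regions_by_pattern(example_regions, 'GNAN',
                                                                    either_orientation=True, print_info=False)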
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"%adapter_options
                 +" - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    infile_suffix = os.path.splitext(infile)[1]
    outfile_suffix = '.fa'
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa'%end.replace("'","prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
    cutadapt_tmpfiles_original = cutadapt_tmpfiles
    
    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #       (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity>1), "original input", 
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function... 
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl), 
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity>1), 
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
            # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version), 
            #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the 
            #  standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue) 
            #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it 
        if not if_running_cutadapt:
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            cutadapt_tmpfile = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"','').replace("'",'').replace(' ',''):  continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s'%(adapter_seq, options.other_cutadapt_options)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s'%(extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s"%(full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True, 
                                              program_name="cutadapt for %s"%end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity>1), 
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! "%end_type\
                        +"(%s+%s != %s)"%(cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:                no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:     pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file,'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                            "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                            +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):     os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):     os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because 
            #    fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):     continue
                command = "fastx_collapser -v %s -i %s -o %s"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], 
                                                               cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True, 
                                              program_name="fastx_collapser for %s"%end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile,INFOFILE,bool(options.verbosity>1),
                                    "fastx_collapser output", options.total_read_number_only, input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True, input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity>1: print text
                INFOFILE.write(text+'\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text+'\n')
            extra_collapsed_readcounts = {}    
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], 
                                                                   extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity-1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True, 
                                                                             input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n"%(infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n"%starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'%
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('#  "bad" read count (wrong-start) (%% of total):\t%s\n'%
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'%
                        (end_type, value_and_percentages(cutadapt_readcount[end_type], [starting_readcount, trimmed_readcount])))
            final_output.append('#  "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'%
                                value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'%(end_type, 
                                value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'%
                            value_and_percentages(starting_readcount-sum(cutadapt_readcount.values()), [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '%end_type
                                    +'(%% of read count):\t%s\n'%value_and_percentages(collapsed_readcount[end_type], 
                                                                                       [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                        %value_and_percentages(extra_collapsed_readcounts[wrong_start_file], [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                        %value_and_percentages(extra_collapsed_readcounts[no_cassette_file], [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity>0:  print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps, 
    #   and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):     os.remove(tmpfile)
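
# Sketch of the cutadapt_mod command that step 2 of main assembles for the 5' end, with made-up
# values (the adapter sequence, other_cutadapt_options and the 'X' outfile basename are hypothetical):
example_adapter_seq = 'GACTT'
example_other_options = '-e 0.1 -n 1 -m 20 -M 21'
example_full_options = '-a %s %s'%(example_adapter_seq, example_other_options)
for example_category in ('untrimmed', 'too-short', 'too-long'):
    if example_category not in example_full_options:
        example_full_options += ' --%s-output %s'%(example_category, 'X_no-cassette-tmpfile_5prime.fa')
example_command = "cutadapt_mod %s -o %s %s"%(example_full_options, 'X_cutadapt-tmpfile_5prime.fa', 'X_trimmed-tmpfile.fa')
print example_command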
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any(
        [x in options.other_cutadapt_options
         for x in adapter_options.split()]):
        sys.exit(
            "Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
            % adapter_options +
            " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now, even when not forced to do that by collapsing to unique! MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (
            options.adapter_5prime or options.adapter_3prime):
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {
            '': options.outfile_basename + '_no-cassette-tmpfile.fa'
        }
        cutadapt_tmpfiles = {
            '': options.outfile_basename + '_cutadapt-tmpfile.fa'
        }
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        ends = "5' 3'".split()
        outfiles = {
            end:
            options.outfile_basename + '_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        no_cassette_tmpfiles = {
            end: options.outfile_basename +
            '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        cutadapt_tmpfiles = {
            end: options.outfile_basename +
            '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #       (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE,
                                             bool(options.verbosity > 1),
                                             "original input",
                                             options.total_read_number_only,
                                             False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile,
                        wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE,
                                                bool(options.verbosity > 1),
                                                "first-base-trimming output",
                                                options.total_read_number_only,
                                                False)
            untrimmed_readcount = check_readcount(wrong_start_file, None,
                                                  False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the
        #  standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
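        # (Illustrative sketch of the command assembled below, for the 5' end with a hypothetical adapter GGCA
        #  and other cutadapt options "-e 0.1":
        #    cutadapt_mod -a GGCA -e 0.1 --untrimmed-output <no-cassette-tmpfile_5prime>
        #                 --too-short-output <no-cassette-tmpfile_5prime> --too-long-output <no-cassette-tmpfile_5prime>
        #                 -o <cutadapt-tmpfile_5prime> <trimmed-tmpfile> )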
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime),
                                             ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                adapter_seqs = adapter_seqs.replace('"', '').replace(
                    "'", '').replace(' ', '')
                if not adapter_seqs: continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                all_adapter_options = ' '.join(
                    ['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                for extra_seq_category in ('untrimmed', 'too-short',
                                           'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (
                            extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (
                    full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command,
                                              INFOFILE,
                                              options.verbosity,
                                              shell=True,
                                              program_name="cutadapt for %s" %
                                              end_type)
                cutadapt_readcount[end_type] = check_readcount(
                    cutadapt_tmpfile, INFOFILE, bool(options.verbosity > 1),
                    "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(
                    no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! "\
                        "(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(
                        dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
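            #  (illustrative example: if the 5' run left reads r1,r2,r3 untrimmed and the 3' run left r2,r3,r4,
            #   only r2 and r3 are counted and written out as true no-cassette reads)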
            overlapping_no_cassette_headers = set.intersection(
                *[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header,
                                     no_cassette_seqs[0][header].upper(),
                                     NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                            "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                            +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #    fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                    cutadapt_tmpfile, outfile)
                run_command_print_info_output(
                    command,
                    INFOFILE,
                    options.verbosity,
                    shell=True,
                    program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(
                    outfile,
                    INFOFILE,
                    bool(options.verbosity > 1),
                    "fastx_collapser output",
                    options.total_read_number_only,
                    input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(
                    outfile,
                    None,
                    False,
                    "",
                    True,
                    input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[
                        end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1: print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                    extra_file)
                retcode = run_command_print_info_output(command,
                                                        None,
                                                        options.verbosity - 1,
                                                        shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(
                    extra_file,
                    None,
                    False,
                    "",
                    True,
                    input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = [
            "### Final read count info for %s (main output files %s)\n" %
            (infile, ', '.join(outfiles.values()))
        ]
        final_output.append("# starting total read count:\t%s\n" %
                            starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append(
                '# "good" read count after start trimming (%% of total):\t%s\n'
                %
                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append(
                '#  "bad" read count (wrong-start) (%% of total):\t%s\n' %
                value_and_percentages(untrimmed_readcount,
                                      [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append(
                    '# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                    % (end_type,
                       value_and_percentages(
                           cutadapt_readcount[end_type],
                           [starting_readcount, trimmed_readcount])))
            final_output.append(
                '#  "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                %
                value_and_percentages(no_cassette_readcount,
                                      [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append(
                '## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                % (end_type,
                   value_and_percentages(cutadapt_readcount[end_type],
                                         [starting_readcount])))
        final_output.append(
            '## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
            % value_and_percentages(
                starting_readcount - sum(cutadapt_readcount.values()),
                [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append(
                    '# "good" %s unique sequence count after collapsing reads to unique sequences '
                    % end_type + '(%% of read count):\t%s\n' %
                    value_and_percentages(collapsed_readcount[end_type],
                                          [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append(
                    '# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                    % value_and_percentages(
                        extra_collapsed_readcounts[wrong_start_file],
                        [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append(
                    '# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                    % value_and_percentages(
                        extra_collapsed_readcounts[no_cassette_file],
                        [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #   and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original
                        ] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)
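
# A minimal driver sketch for the preprocessing example above (not part of the original listing); it assumes
# the module's define_option_parser() already used inside main() is the optparse parser factory for these options.
if __name__ == '__main__':
    parser = define_option_parser()
    (options, args) = parser.parse_args()
    main(args, options)
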
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """

    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit(
            "Error: exactly one infile required! %s infiles provided: %s" %
            (len(args), args))
        # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([
            x in other_bowtie_options_split
            for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))
    ]):
        raise Exception(
            "Cannot include -v/-n/-e and related bowtie options in -B!  Use separate -e option for that; "
            "note that this program allows -v bowtie mode only.")
    if any([
            x in other_bowtie_options_split
            for x in ('-m -k -a --all'.split(' '))
    ]):
        raise Exception(
            "Cannot include -m/-a bowtie options in -B!  Use separate -m option for that."
        )

    specific_bowtie_options = '-v %s' % options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format == 'fasta': specific_bowtie_options += ' -f'
        elif infile_format == 'fastq': specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!" %
                            infile_format)

    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1: multiple_bowtie_option = '-a'
    else: multiple_bowtie_option = '-k %s' % max(options.multiple_to_show, 2)
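    # (e.g. options.multiple_to_show==1 still becomes '-k 2' here: getting at least two reported hits per read
    #  is what lets the categorizing step below tell multiple-genomic from genomic-unique alignments)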

    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any(
        [x in options.other_bowtie_options
         for x in ['-S', '--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                                   + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version",
                                      INFOFILE,
                                      printing_level=0,
                                      shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        #   (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        #    an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #    based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s" % (
            specific_bowtie_options, multiple_bowtie_option,
            options.other_bowtie_options, options.genome_bowtie_index, infile,
            tmpfile_genome)

        if options.bowtie_aln_file_genome is None:
            run_command_print_info_output(command,
                                          INFOFILE,
                                          printing_level=(not options.quiet),
                                          shell=True)
        else:
            options.keep_tmpfiles = True
            if not os.access(options.bowtie_aln_file_genome, os.R_OK):
                raise Exception(
                    "Can't read provided options.bowtie_aln_file_genome %s!" %
                    options.bowtie_aln_file_genome)
            text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (
                command, options.bowtie_aln_file_genome)
            print text
            INFOFILE.write('\n' + text + '\n')
            tmpfile_genome = options.bowtie_aln_file_genome

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s" % (
                specific_bowtie_options, '--all', options.other_bowtie_options,
                options.cassette_bowtie_index, infile, tmpfile_cassette)
            if options.bowtie_aln_file_cassette is None:
                run_command_print_info_output(
                    command,
                    INFOFILE,
                    printing_level=(not options.quiet),
                    shell=True)
            else:
                options.keep_tmpfiles = True
                if not os.access(options.bowtie_aln_file_cassette, os.R_OK):
                    raise Exception(
                        "Can't read provided options.bowtie_aln_file_cassette %s!"
                        % options.bowtie_aln_file_cassette)
                text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (
                    command, options.bowtie_aln_file_cassette)
                print text
                INFOFILE.write('\n' + text + '\n')
                tmpfile_cassette = options.bowtie_aln_file_cassette

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text %
                     (options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(
                tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text %
                     (options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately ANNOYING
        #  and uses stderr both for normal output and for errors, AND gives no returncode.

        ### Parse the two alignment files in parallel, and merge them together (remove sub-optimal alignments,
        #    (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        #  Do all this WITHOUT reading the entire files into memory!  A bit tricky.
        if options.cassette_bowtie_index != 'NONE':
            aln_list_generator = aln_generator_from_two_samfiles_parallel(
                tmpfile_genome, tmpfile_cassette)
        else:
            aln_list_generator = aln_generator_from_single_samfile(
                tmpfile_genome)
        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            GENOMIC_UNIQUE_FILE = MULTIPLE_GENOMIC_FILE = CASSETTE_FILE = UNALIGNED_FILE = open(
                outfile_all, 'w')
            unaligned_as_fasta = False
        else:
            UNALIGNED_FILE = open(outfile_unaligned, 'w')
            CASSETTE_FILE = open(outfile_cassette, 'w')
            MULTIPLE_GENOMIC_FILE = open(outfile_multiple_genomic, 'w')
            GENOMIC_UNIQUE_FILE = open(outfile_genomic_unique, 'w')
            unaligned_as_fasta = True
        category_readcounts = {
            'unaligned': 0,
            'cassette': 0,
            'multiple-genomic': 0,
            'genomic-unique': 0,
            'cassette-multiple': 0
        }
        for (readname, full_aln_list) in aln_list_generator:
            reduced_aln_list = reduce_alignment_list(full_aln_list)
            final_aln_list = prioritize_cassette_reads(
                reduced_aln_list, if_cassette_function=is_cassette_chromosome)
            categorize_reads_print_to_files(
                readname,
                final_aln_list,
                category_readcounts,
                UNALIGNED_FILE,
                CASSETTE_FILE,
                MULTIPLE_GENOMIC_FILE,
                GENOMIC_UNIQUE_FILE,
                unaligned_as_fasta=unaligned_as_fasta,
                multiple_to_write=options.multiple_to_show,
                input_collapsed_to_unique=options.input_collapsed_to_unique,
                no_multi_cassette_warnings=options.no_multi_cassette_warnings)
        if options.dont_split_by_category:
            # all files are actually the same pointer, so only close once
            GENOMIC_UNIQUE_FILE.close()
        else:
            UNALIGNED_FILE.close()
            CASSETTE_FILE.close()
            MULTIPLE_GENOMIC_FILE.close()
            GENOMIC_UNIQUE_FILE.close()

        # delete alignment tmpfiles now that they've been parsed
        if not options.keep_tmpfiles:
            os.remove(tmpfile_genome)
            if options.cassette_bowtie_index != 'NONE':
                os.remove(tmpfile_cassette)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        cassette_multiple = category_readcounts.pop('cassette-multiple')
        total_reads = sum(category_readcounts.values())
        text2 = "# total reads:  %s" % total_reads
        if options.input_collapsed_to_unique:
            text2 += " (uncollapsed readcounts)"
        lines = [text1, text2]
        for category, count in sorted(category_readcounts.items()):
            text = "# %s:  %s" % (category,
                                  value_and_percentages(count, [total_reads]))
            if category == 'cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)' % cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet: print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write(
            "\n\n################## Metadata from input preprocessing ##################\n\n"
        )
        if options.input_metadata_file == 'NONE':
            INFOFILE.write(
                'Not looking for a metadata input file, as specified by options\n'
            )
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.fa is X_info.txt, but for X_5prime.fa it can be either X_5prime_info.txt or X_info.txt, so try both.
                #  (in the new preprocessing version all read files are X_*prime.fa and the info files are X_info.txt;
                #   in the old version it was just X.fa and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    if metafile_basename.endswith(
                            '_3prime') or metafile_basename.endswith(
                                '_5prime'):
                        options.input_metadata_file = metafile_basename[:-len(
                            '_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n' % options.input_metadata_file
            INFOFILE.write(text + '\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file,
                                     INFOFILE,
                                     printing=False)
            else:
                text = 'Metadata input file %s not found!\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)
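
# A minimal sketch (not the module's actual aln_generator_from_single_samfile) of how (readname, alignment_list)
# pairs can be yielded without reading the whole alignment file into memory, assuming SAM output (-S/--sam passed
# to bowtie) and the HTSeq library; bowtie reports all alignments for a given read consecutively, so a simple
# groupby over the alignment stream is enough:
from itertools import groupby
import HTSeq

def _aln_list_generator_sketch(samfile):
    """ Yield (readname, list_of_alignments) for consecutive alignments sharing a read name. """
    for readname, aln_group in groupby(HTSeq.SAM_Reader(samfile), key=lambda aln: aln.read.name):
        yield readname, list(aln_group)
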
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """

    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s"%(len(args), args))
        # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B!  Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in ('-m -k -a --all'.split(' '))]):
        raise Exception("Cannot include -m/-a bowtie options in -B!  Use separate -m option for that.")

    specific_bowtie_options = '-v %s'%options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format=='fasta':      specific_bowtie_options += ' -f'
        elif infile_format=='fastq':    specific_bowtie_options += ' -q'
        else:                           raise Exception("Cannot process auto-detected infile format %s!"%infile_format)

    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:  multiple_bowtie_option = '-a' 
    else:                               multiple_bowtie_option = '-k %s'%max(options.multiple_to_show, 2)

    # output file names: temporary for alignments, final (split or all), metadata info file. 
    outfile_suffix = '.sam' if any([x in options.other_bowtie_options for x in ['-S','--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                                   + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        #   (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's 
        #    an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #    based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, multiple_bowtie_option, 
                                      options.other_bowtie_options, options.genome_bowtie_index, infile, tmpfile_genome)
        run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, '--all', options.other_bowtie_options, 
                                                  options.cassette_bowtie_index, infile, tmpfile_cassette)
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text%(options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text%(options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately ANNOYING 
        #  and uses stderr both for normal output and for errors, AND gives no returncode. 

        ### Parse the two alignment files, and merge them together (remove sub-optimal alignments,
        #    (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_cassette, starting_dict=readname_to_aln_list)
        # MAYBE-TODO right now I'm reading the entire files into memory before merging and processing them, 
        #  which takes a fair amount of memory - could instead write something that would read both alignment files
        #  in parallel and do the merging and output-writing read-by-read.  Do that if I start getting memory issues.
        reduce_alignment_dict(readname_to_aln_list)
        prioritize_cassette_reads(readname_to_aln_list, if_cassette_function=is_cassette_chromosome)
        # delete alignment tmpfiles now that they've been parsed
        os.remove(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            os.remove(tmpfile_cassette)

        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            with open(outfile_all,'w') as ALL_FILE:
                category_counts = categorize_reads_print_to_files(readname_to_aln_list, ALL_FILE, ALL_FILE, ALL_FILE, 
                                          ALL_FILE, unaligned_as_fasta=False, multiple_to_write=options.multiple_to_show, 
                                          input_collapsed_to_unique=options.input_collapsed_to_unique, 
                                          no_warnings=options.quiet)
        else:
            with open(outfile_unaligned, 'w') as UNALIGNED_FILE:
                with open(outfile_cassette, 'w') as CASSETTE_FILE:
                    with open(outfile_multiple_genomic, 'w') as MULTIPLE_GENOMIC_FILE:
                        with open(outfile_genomic_unique, 'w') as GENOMIC_UNIQUE_FILE:
                            category_counts = categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, 
                                                      CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, 
                                                      unaligned_as_fasta=True, multiple_to_write=options.multiple_to_show, 
                                                      input_collapsed_to_unique=options.input_collapsed_to_unique, 
                                                      no_warnings=options.quiet)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        cassette_multiple = category_counts.pop('cassette-multiple')
        total_reads = sum(category_counts.values())
        text2 = "# total reads:  %s"%total_reads
        if options.input_collapsed_to_unique: text2 +=" (uncollapsed readcounts)"
        lines = [text1, text2]
        for category,count in sorted(category_counts.items()):
            text = "# %s:  %s"%(category, value_and_percentages(count, [total_reads]))
            if category=='cassette' and cassette_multiple:  
                text += ' (Warning: %s multiple!!)'%cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet: print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.fa is X_info.txt, but for X_5prime.fa it can be either X_5prime_info.txt or X_info.txt, so try both.
                #  (in the new preprocessing version all read files are X_*prime.fa and the info files are X_info.txt;
                #   in the old version it was just X.fa and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0] 
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    if metafile_basename.endswith('_3prime') or metafile_basename.endswith('_5prime'):
                        options.input_metadata_file = metafile_basename[:-len('_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n'%options.input_metadata_file
            INFOFILE.write(text+'\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)