def print_flanking_regions_to_fasta(flanking_region_count_list, outfile, convert_counts=lambda x: x):
    """ Given a (seq,count) list, make fasta file with the each seq present convert_counts(count) times. """
    with open(outfile, "w") as OUTFILE:
        for N, (seq, count) in enumerate(flanking_region_count_list):
            seqname = "%s (%s reads)" % (N, count)
            for _ in range(convert_counts(count)):
                write_fasta_line(seqname, seq, OUTFILE)
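A minimal usage sketch (hypothetical data and filenames): the convert_counts callable controls how many records get written per sequence, so the same function can emit one record per distinct sequence, or a log-scaled number of repeats instead of the full read count.

from math import log

flanking_regions = [("ACGTACGT", 100), ("TTGGCCAA", 3)]
# one record per distinct sequence, ignoring counts:
print_flanking_regions_to_fasta(flanking_regions, "unique.fa", convert_counts=lambda x: 1)
# log-scaled repetition - 1 + floor(log2(count)) records per sequence:
print_flanking_regions_to_fasta(flanking_regions, "log-scaled.fa",
                                convert_counts=lambda x: 1 + int(log(x, 2)))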
def categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, 
                                    GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True, multiple_to_write=-1, 
                                    input_collapsed_to_unique=False, no_warnings=False):
    """ Decide the proper category for each read, write to appropriate output file; return category counts. 
    
    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).
    If input_collapsed_to_unique, for the purpose of category counts each read will be counted as N reads, 
     with N determined from readname using the fastx-collapser encoding.
    In the output category counts, cassette-multiple is a special subcategory - anything in it is also counted in cassette.

    Each read is printed to the appropriate outfile (all outfiles should be open file handles); 
     for multiple-genomic, multiple_to_write lines will be written; if unaligned_as_fasta, unaligned reads
     will be written as fasta instead of SAM format (and so will multiple-genomic if multiple_to_write is 0).
    """
    category_readcounts = {'unaligned':0, 'cassette':0, 'multiple-genomic':0, 'genomic-unique':0, 'cassette-multiple':0}

    for readname,aln_list in sorted(readname_to_aln_list.items()):
        readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
        # if there's a single alignment, it's unaligned, cassette or genomic-unique
        if len(aln_list) == 1:
            aln = aln_list[0]
            if not aln.aligned:
                category_readcounts['unaligned'] += readcount
                if unaligned_as_fasta:  write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
                else:                   write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
            elif is_cassette_chromosome(aln.iv.chrom):
                category_readcounts['cassette'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            else:
                category_readcounts['genomic-unique'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
        # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
        else:
            assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
            # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
            # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
            if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
                assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
                category_readcounts['cassette'] += readcount
                if not no_warnings:
                    print "Warning: multiple cassette alignments! Printing all to cassette file.\n\t%s"%(aln_list)
                category_readcounts['cassette-multiple'] += readcount
                for aln in aln_list:
                    write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            # multiple genomic alignments - how many get written depends on multiple_to_write;
            #  if it's 0, the outfile should be fasta, since writing a single multiple read as an
            #   unaligned SAM line is not implemented here (MAYBE-TODO) - the Exception below is raised instead
            else:
                category_readcounts['multiple-genomic'] += readcount
                if multiple_to_write == 0:
                    if unaligned_as_fasta:
                        write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                    else:
                        raise Exception("Writing 0 multiple alignments in SAM format NOT IMPLEMENTED!")
                else:
                    for aln in aln_list[:multiple_to_write]:
                        write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    return category_readcounts
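For reference, the "fastx-collapser encoding" mentioned in the docstring packs the read count into the read name: fastx_collapser names its output records <rank>-<count> (e.g. 1-150 for the most abundant sequence, seen 150 times). A sketch of what get_seq_count_from_collapsed_header presumably does - illustrative only, not the module's actual implementation:

def _get_seq_count_from_collapsed_header_sketch(readname, default=1):
    """ Return the read count N from a fastx_collapser-style '<rank>-<N>' header; default if not parseable. """
    try:
        return int(readname.split('-')[-1])
    except ValueError:
        return default

assert _get_seq_count_from_collapsed_header_sketch("1-150") == 150
assert _get_seq_count_from_collapsed_header_sketch("some_read") == 1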
def _trim_prefix_single(seqname, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE=None):
    """ If prefix_bases is a prefix of seq, trim it off, print to TRIMMED_OUTFILE, return 1; 
    otherwise print to WRONG_PREFIX_OUTFILE if not None, return 0. 
    """
    if seq.upper().startswith(prefix_bases.upper()):
        seq_trimmed = seq[len(prefix_bases):]
        # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
        write_fasta_line(seqname, seq_trimmed.upper(), TRIMMED_OUTFILE)
        return 1
    else:
        if WRONG_PREFIX_OUTFILE is not None:
            # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
            write_fasta_line(seqname, seq.upper(), WRONG_PREFIX_OUTFILE)
        return 0
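A short usage sketch (hypothetical reads and filenames): the prefix match is case-insensitive, and the 0/1 return value makes it easy to tally how many reads carried the expected prefix.

n_trimmed = 0
with open("trimmed.fa", "w") as TRIMMED, open("wrong_prefix.fa", "w") as WRONG:
    for seqname, seq in [("read1", "acgtTTTT"), ("read2", "GGGGTTTT")]:
        n_trimmed += _trim_prefix_single(seqname, seq, "ACGT", TRIMMED, WRONG)
# read1 matches ACGT case-insensitively and is written as TTTT; read2 goes to wrong_prefix.fa
assert n_trimmed == 1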
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"%adapter_options
                 +" - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always .fa, since quality info is always discarded at this stage,
    #  even when not forced to by collapsing to unique - MAYBE-TODO change that?
    outfile_suffix = '.fa'
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa'%end.replace("'","prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa'%end.replace("'","prime") for end in ends}
    cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)    # a real copy, so the original tmpfile names survive for cleanup
    
    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #       (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity>1), "original input", 
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function... 
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl), 
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity>1), 
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
            # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version), 
            #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the 
            #  standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue) 
            #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it 
        if not if_running_cutadapt:
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            cutadapt_tmpfile = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"','').replace("'",'').replace(' ',''):  continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s'%(adapter_seq, options.other_cutadapt_options)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s'%(extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s"%(full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True, 
                                              program_name="cutadapt for %s"%end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity>1), 
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:                no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:     pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file,'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                            "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                            +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):     os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity>0:   print text
            INFOFILE.write(text+'\n')
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):     os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note - this applies to fastx_collapser but also to the other tools: NONE is required as the "off" value;
            #    '' can't be used to turn it off, since fastx_collapser runs fine with no options, making '' a sensible real input.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):     continue
                command = "fastx_collapser -v %s -i %s -o %s"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], 
                                                               cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True, 
                                              program_name="fastx_collapser for %s"%end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile,INFOFILE,bool(options.verbosity>1),
                                    "fastx_collapser output", options.total_read_number_only, input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True, input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity>1: print text
                INFOFILE.write(text+'\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text+'\n')
            extra_collapsed_readcounts = {}    
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], 
                                                                   extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity-1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True, 
                                                                             input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n"%(infile, ', '.join(outfiles.values()))]
        final_output.append("# starting total read count:\t%s\n"%starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'%
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('#  "bad" read count (wrong-start) (%% of total):\t%s\n'%
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'%
                        (end_type, value_and_percentages(cutadapt_readcount[end_type], [starting_readcount, trimmed_readcount])))
            final_output.append('#  "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'%
                                value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'%(end_type, 
                                value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'%
                            value_and_percentages(starting_readcount-sum(cutadapt_readcount.values()), [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '%end_type
                                    +'(%% of read count):\t%s\n'%value_and_percentages(collapsed_readcount[end_type], 
                                                                                       [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                        %value_and_percentages(extra_collapsed_readcounts[wrong_start_file], [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                        %value_and_percentages(extra_collapsed_readcounts[no_cassette_file], [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity>0:  print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps, 
    #   and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):     os.remove(tmpfile)
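The no-cassette merge in step 2 above boils down to a set intersection over fasta headers: a read only counts as no-cassette if cutadapt left it untrimmed in both the 5' and the 3' run. The same logic in isolation (a sketch, using the module's parse_fasta, which yields (header, seq) pairs):

def _merge_untrimmed_sketch(fasta_5prime, fasta_3prime):
    """ Return {header: seq} for reads left untrimmed by BOTH cutadapt runs. """
    seqs_by_end = [dict(parse_fasta(f)) for f in (fasta_5prime, fasta_3prime)]
    shared_headers = set.intersection(*[set(d) for d in seqs_by_end])
    # take the sequence from the 5' file - both copies should be identical
    return dict((header, seqs_by_end[0][header]) for header in shared_headers)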
Example #6
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any(
        [x in options.other_cutadapt_options
         for x in adapter_options.split()]):
        sys.exit(
            "Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
            % adapter_options +
            " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now, even when not forced to do that by collapsing to unique! MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (
            options.adapter_5prime or options.adapter_3prime):
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {
            '': options.outfile_basename + '_no-cassette-tmpfile.fa'
        }
        cutadapt_tmpfiles = {
            '': options.outfile_basename + '_cutadapt-tmpfile.fa'
        }
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        ends = "5' 3'".split()
        outfiles = {
            end:
            options.outfile_basename + '_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        no_cassette_tmpfiles = {
            end: options.outfile_basename +
            '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        cutadapt_tmpfiles = {
            end: options.outfile_basename +
            '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
            for end in ends
        }
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #       (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE,
                                             bool(options.verbosity > 1),
                                             "original input",
                                             options.total_read_number_only,
                                             False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile,
                        wrong_start_file, INFOFILE, options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE,
                                                bool(options.verbosity > 1),
                                                "first-base-trimming output",
                                                options.total_read_number_only,
                                                False)
            untrimmed_readcount = check_readcount(wrong_start_file, None,
                                                  False, True, False)
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it in the
        #  standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            # key is '' to match the un-split outfile/tmpfile naming above (avoids a KeyError in step 3)
            cutadapt_readcount = {'': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime),
                                             ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                adapter_seqs = adapter_seqs.replace('"', '').replace(
                    "'", '').replace(' ', '')
                if not adapter_seqs: continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                all_adapter_options = ' '.join(
                    ['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                for extra_seq_category in ('untrimmed', 'too-short',
                                           'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (
                            extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (
                    full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command,
                                              INFOFILE,
                                              options.verbosity,
                                              shell=True,
                                              program_name="cutadapt for %s" %
                                              end_type)
                cutadapt_readcount[end_type] = check_readcount(
                    cutadapt_tmpfile, INFOFILE, bool(options.verbosity > 1),
                    "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(
                    no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(
                        dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(
                *[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header,
                                     no_cassette_seqs[0][header].upper(),
                                     NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                            "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                            +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note - this applies to fastx_collapser but also to the other tools: NONE is required as the "off" value;
            #    '' can't be used to turn it off, since fastx_collapser runs fine with no options, making '' a sensible real input.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                    cutadapt_tmpfile, outfile)
                run_command_print_info_output(
                    command,
                    INFOFILE,
                    options.verbosity,
                    shell=True,
                    program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(
                    outfile,
                    INFOFILE,
                    bool(options.verbosity > 1),
                    "fastx_collapser output",
                    options.total_read_number_only,
                    input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(
                    outfile,
                    None,
                    False,
                    "",
                    True,
                    input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[
                        end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count!  Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1: print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                    extra_file)
                retcode = run_command_print_info_output(command,
                                                        None,
                                                        options.verbosity - 1,
                                                        shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(
                    extra_file,
                    None,
                    False,
                    "",
                    True,
                    input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = [
            "### Final read count info for %s (main output files %s)\n" %
            (infile, ', '.join(outfiles.values()))
        ]
        final_output.append("# starting total read count:\t%s\n" %
                            starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append(
                '# "good" read count after start trimming (%% of total):\t%s\n'
                %
                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append(
                '#  "bad" read count (wrong-start) (%% of total):\t%s\n' %
                value_and_percentages(untrimmed_readcount,
                                      [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append(
                    '# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                    % (end_type,
                       value_and_percentages(
                           cutadapt_readcount[end_type],
                           [starting_readcount, trimmed_readcount])))
            final_output.append(
                '#  "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                %
                value_and_percentages(no_cassette_readcount,
                                      [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append(
                '## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                % (end_type,
                   value_and_percentages(cutadapt_readcount[end_type],
                                         [starting_readcount])))
        final_output.append(
            '## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
            % value_and_percentages(
                starting_readcount - sum(cutadapt_readcount.values()),
                [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append(
                    '# "good" %s unique sequence count after collapsing reads to unique sequences '
                    % end_type + '(%% of read count):\t%s\n' %
                    value_and_percentages(collapsed_readcount[end_type],
                                          [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append(
                    '# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                    % value_and_percentages(
                        extra_collapsed_readcounts[wrong_start_file],
                        [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append(
                    '# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                    % value_and_percentages(
                        extra_collapsed_readcounts[no_cassette_file],
                        [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #   and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original
                        ] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)
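One detail specific to this revision: each end's adapter option may be a comma-separated list, which is stripped of quotes and whitespace and expanded into one -a option per adapter on the cutadapt_mod command line. The expansion in isolation (hypothetical input):

adapter_option_value = ' "GACTT,gactAA" '
adapter_seqs = adapter_option_value.replace('"', '').replace("'", '').replace(' ', '')
all_adapter_options = ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
assert all_adapter_options == '-a GACTT -a gactAA'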
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE, 
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False, multiple_to_write=-1, 
                                    input_collapsed_to_unique=False, no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts. 
    
    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple), 
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments). 

    The reads will be categorized, and printed to the appropriate file (all the uppercase arguments should be open file objects; 
     they can all be the SAME file object if desired.)

    If input_collapsed_to_unique, for the purpose of category counts each read will be counted as N reads, 
     with N determined from readname using the fastx-collapser encoding.

    In the output category counts, cassette-multiple is a special subcategory - anything in it is also counted in cassette.

    The read is printed to the appropriate outfile (all outfiles should be open file handles); 
     for multiple-genomic, only N=multiple_to_write lines will be written; if N=0, one line will be written that treats 
      the read as unaligned, but with XM:i:M optional tag field added, where M is the number of multiple alignments.
      
    If unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format, 
     and so will multiple if multiple_to_write is 0.
    """
    readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:  write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:                   write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
            assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                print ("Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "%aln_list[0].read.seq, 
                       "first 3 positions %s"%', '.join(["%s %s %s"%(a.iv.chrom, a.iv.strand, a.iv.start) for a in aln_list[:3]]))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = sorted(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))[0]
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #   but that would be tricky, need to strip matching prefixes from them, 
            #   what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments: 
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line, 
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - if multiple_to_write>0, print that many normal SAM lines for N alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    aln = aln_list[0]
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n'%(aln.read.name, aln.read.seq, 
                                                                                                 aln.read.qualstr, len(aln_list)))
            else:
                for aln in aln_list[:multiple_to_write]:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category
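For reference, the multiple_to_write=0 SAM branch above emits a single unmapped-style record: FLAG 4 with null position fields, the read's sequence and qualities, and the alignment count in an XM:i tag (this module's own convention - note that other aligners use the XM tag for their own purposes). Building that line in isolation (hypothetical read values):

read_name, read_seq, read_quals, n_alignments = "read7", "ACGTACGT", "IIIIIIII", 5
# fields: QNAME, FLAG=4 (unmapped), RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, XM tag
sam_line = '%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n' % (read_name, read_seq, read_quals, n_alignments)
assert sam_line.split('\t')[1] == '4' and sam_line.rstrip().endswith('XM:i:5')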