def get_chromosome_lengths(genome_file=None):
    """ Return chromosome:length dictionary based on reading a genome fasta file.

    If genome_file is None, fall back to DEFAULT_GENOME_CASSETTE_FILE.
    Raises ValueError (with a filename hint) if the fasta file cannot be opened.
    """
    caller_gave_file = genome_file is not None
    if not caller_gave_file:
        genome_file = DEFAULT_GENOME_CASSETTE_FILE
    try:
        # a plain dict comprehension is enough: each header is assigned exactly once
        return {header: len(seq) for header, seq in basic_seq_utilities.parse_fasta(genome_file)}
    except IOError:
        # mention that the *default* file was the problem when the caller passed nothing
        file_info = "" if caller_gave_file else "default "
        raise ValueError("%sgenome fasta file %s not found! Provide filename."%(file_info, genome_file))
def get_chromosome_lengths(genome_file=None):
    """ Return chromosome:length dictionary based on reading a genome fasta file.

    If genome_file is None, DEFAULT_GENOME_CASSETTE_FILE is used instead.
    Raises ValueError if the fasta file cannot be opened (IOError from the parser).
    """
    # remember whether the caller supplied a file, so the error message can say "default"
    original_input = genome_file
    if genome_file is None:
        genome_file = DEFAULT_GENOME_CASSETTE_FILE
    chromosome_lengths = defaultdict(int)
    try:
        for header, seq in basic_seq_utilities.parse_fasta(genome_file):
            chromosome_lengths[header] = len(seq)
        # convert back to a plain dict so missing keys raise KeyError for callers
        return dict(chromosome_lengths)
    except IOError:
        file_info = "default " if original_input is None else ""
        raise ValueError(
            "%sgenome fasta file %s not found! Provide filename." %
            (file_info, genome_file))
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Pipeline: 0) count reads in the single infile; 1) optionally trim constant first bases (-F);
    2) optionally strip cassette/adapter sequence with cutadapt_mod (-5/-3/-A), merging the per-end
    "no cassette" leftovers; 3) optionally collapse to unique sequences with fastx_collapser (-C);
    then write readcount summaries to the info file and clean up tmpfiles.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    # adapter sequences must come from -5/-3, not be smuggled in through the pass-through cutadapt options
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"%adapter_options
                 +" - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    infile_suffix = os.path.splitext(infile)[1]
    outfile_suffix = '.fa'
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    # all per-end files are keyed by end label; "'" can't go in filenames, so it becomes "prime"
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa'%end.replace("'","prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'%end.replace("'","prime")
                            for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa'%end.replace("'","prime")
                         for end in ends}
    # NOTE(review): this is an alias, not a copy - later "cutadapt_tmpfiles[...] = ..." mutations (if any)
    #  would show through to the "_original" name used for cleanup; verify that is intended
    cutadapt_tmpfiles_original = cutadapt_tmpfiles

    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        # (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity>1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            # skipping the step: downstream reads straight from the infile
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity>1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            # NOTE(review): this call passes one fewer argument than the other check_readcount calls - presumably
            #  relying on a default for the last parameter; confirm against check_readcount's signature
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            # sanity check: no reads should be lost or duplicated by the trimming split
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  in the standard install! Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            # NOTE(review): this assigns a *local* cutadapt_tmpfile that nothing downstream reads; step 3 iterates
            #  cutadapt_tmpfiles (the per-end dict, whose files were never created), so when cutadapt is skipped the
            #  trimmed reads appear never to be moved/collapsed into an outfile - verify whether this is a bug
            cutadapt_tmpfile = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"','').replace("'",'').replace(' ',''): continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s'%(adapter_seq, options.other_cutadapt_options)
                # send untrimmed/too-short/too-long reads to the no-cassette tmpfile, unless the caller already
                #  routed them somewhere via --other_cutadapt_options (substring check on the option string)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s'%(extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s"%(full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s"%end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity>1),
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # NOTE(review): the "%s"s in the first string half are never filled in - the %-formatting only applies
                #  to the second (concatenated) half, so end_type fills the first count slot; message looks garbled
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount,
                                          trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                # a missing tmpfile just means that end was skipped above - ignore it
                try: no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError: pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file,'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            # no collapsing: cutadapt outputs become the final outfiles directly
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile): os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                               cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s"%end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile,INFOFILE,bool(options.verbosity>1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity>1: print text
                INFOFILE.write(text+'\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text+'\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                   extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity-1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases! That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        # NOTE(review): ', '.join(outfiles) joins the dict *keys* (the end labels), not the output filenames -
        #  presumably the filenames were intended here; verify
        final_output = ["### Final read count info for %s (main output files %s)\n"%(infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n"%starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'%
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'%
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'%
                                    (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                     [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'%
                                value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'%(end_type,
                                value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'%
                            value_and_percentages(starting_readcount-sum(cutadapt_readcount.values()),
                                                  [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '%end_type
                                    +'(%% of read count):\t%s\n'%value_and_percentages(collapsed_readcount[end_type],
                                                                                       [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    %value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                           [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    %value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                           [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity>0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Pipeline: 0) count reads in the single infile; 1) optionally trim constant first bases (-F);
    2) optionally strip cassette/adapter sequence with cutadapt_mod (-5/-3/-A) - output files are split
    per end (5'/3') only when cutadapt actually runs; 3) optionally collapse to unique sequences with
    fastx_collapser (-C); then write readcount summaries to the info file and clean up tmpfiles.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    # adapter sequences must come from -5/-3, not be smuggled in through the pass-through cutadapt options
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
                 % adapter_options + " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now, even when not forced to do
    #  that by collapsing to unique! MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    # NOTE(review): outfile_suffix is never read below (filenames hard-code '.fa') - dead assignment, presumably
    #  kept for the commented-out alternative above
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (options.adapter_5prime or options.adapter_3prime):
        # cutadapt will be skipped: single unsplit output, keyed by the empty string
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {'': options.outfile_basename + '_no-cassette-tmpfile.fa'}
        cutadapt_tmpfiles = {'': options.outfile_basename + '_cutadapt-tmpfile.fa'}
        # real copy (not alias) so the later cutadapt_tmpfiles[''] = trimmed_tmpfile reassignment
        #  doesn't make the cleanup step delete the wrong file
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        # cutadapt will run: one output/tmpfile per end, keyed "5'"/"3'" ("'" -> "prime" in filenames)
        ends = "5' 3'".split()
        outfiles = {end: options.outfile_basename + '_%s.fa' % end.replace("'", "prime")
                    for end in ends}
        no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
                                for end in ends}
        cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
                             for end in ends}
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        # (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity > 1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            # skipping the step: downstream reads straight from the infile
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            # NOTE(review): this call passes one fewer argument than the other check_readcount calls - presumably
            #  relying on a default for the last parameter; confirm against check_readcount's signature
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            # sanity check: no reads should be lost or duplicated by the trimming split
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  in the standard install! Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            # point the single ('') cutadapt slot at the trimmed file so step 3 picks it up unchanged
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                # (strip quotes/spaces first so e.g. '" "' also counts as empty)
                adapter_seqs = adapter_seqs.replace('"', '').replace("'", '').replace(' ', '')
                if not adapter_seqs: continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                # multiple comma-separated adapters per end each get their own -a option
                all_adapter_options = ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                # send untrimmed/too-short/too-long reads to the no-cassette tmpfile, unless the caller already
                #  routed them somewhere via --other_cutadapt_options (substring check on the option string)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (extra_seq_category,
                                                                      no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s" % end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE,
                                                               bool(options.verbosity > 1), "cutadapt output",
                                                               options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # NOTE(review): the "%s"s in the first string half are never filled in - the %-formatting only applies
                #  to the second (concatenated) half, so end_type fills the first count slot; message looks garbled
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount,
                                          trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                # a missing tmpfile just means that end was skipped above - ignore it
                try: no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError: pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            # no collapsing: cutadapt outputs (or the trimmed file, via the '' slot) become the final outfiles
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile): os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile, INFOFILE, bool(options.verbosity > 1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1: print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity - 1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases! That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        # NOTE(review): ', '.join(outfiles) joins the dict *keys* (the end labels, possibly ''), not the output
        #  filenames - presumably the filenames were intended here; verify
        final_output = ["### Final read count info for %s (main output files %s)\n" % (infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n" % starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n' %
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n' %
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n' %
                                    (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                     [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n' %
                                value_and_percentages(no_cassette_readcount,
                                                      [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n' %
                                (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                 [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n' %
                            value_and_percentages(starting_readcount - sum(cutadapt_readcount.values()),
                                                  [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '
                                    % end_type + '(%% of read count):\t%s\n' %
                                    value_and_percentages(collapsed_readcount[end_type],
                                                          [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                            [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                            [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)