def get_chromosome_lengths(genome_file=None):
    """ Return chromosome:length dictionary based on reading a genome fasta file.

    If genome_file is None, fall back to DEFAULT_GENOME_CASSETTE_FILE.
    Raises ValueError (with a filename hint) if the fasta file cannot be opened.
    """
    caller_gave_file = genome_file is not None
    if not caller_gave_file:
        genome_file = DEFAULT_GENOME_CASSETTE_FILE
    try:
        # a plain dict comprehension is enough: each header is assigned exactly once
        return {header: len(seq) for header, seq in basic_seq_utilities.parse_fasta(genome_file)}
    except IOError:
        # mention that the *default* file was the problem when the caller passed nothing
        file_info = "" if caller_gave_file else "default "
        raise ValueError("%sgenome fasta file %s not found! Provide filename."%(file_info, genome_file))
def get_chromosome_lengths(genome_file=None):
    """ Return chromosome:length dictionary based on reading a genome fasta file.

    If genome_file is None, DEFAULT_GENOME_CASSETTE_FILE is used instead.
    Raises ValueError if the fasta file cannot be opened (IOError from the parser).
    """
    # remember whether the caller supplied a file, so the error message can say "default"
    original_input = genome_file
    if genome_file is None:
        genome_file = DEFAULT_GENOME_CASSETTE_FILE
    chromosome_lengths = defaultdict(int)
    try:
        for header, seq in basic_seq_utilities.parse_fasta(genome_file):
            chromosome_lengths[header] = len(seq)
        # convert back to a plain dict so missing keys raise KeyError for callers
        return dict(chromosome_lengths)
    except IOError:
        file_info = "default " if original_input is None else ""
        raise ValueError(
            "%sgenome fasta file %s not found! Provide filename." %
            (file_info, genome_file))
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Pipeline: 0) count reads in the single infile; 1) optionally trim constant first bases (-F);
    2) optionally strip cassette/adapter sequence with cutadapt_mod (-5/-3/-A), merging the per-end
    "no cassette" leftovers; 3) optionally collapse to unique sequences with fastx_collapser (-C);
    then write readcount summaries to the info file and clean up tmpfiles.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    # adapter sequences must come from -5/-3, not be smuggled in through the pass-through cutadapt options
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"%adapter_options
                 +" - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    infile_suffix = os.path.splitext(infile)[1]
    outfile_suffix = '.fa'
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    # all per-end files are keyed by end label; "'" can't go in filenames, so it becomes "prime"
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa'%end.replace("'","prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'%end.replace("'","prime")
                            for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa'%end.replace("'","prime")
                         for end in ends}
    # NOTE(review): this is an alias, not a copy - later "cutadapt_tmpfiles[...] = ..." mutations (if any)
    #  would show through to the "_original" name used for cleanup; verify that is intended
    cutadapt_tmpfiles_original = cutadapt_tmpfiles

    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        # (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity>1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            # skipping the step: downstream reads straight from the infile
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity>1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            # NOTE(review): this call passes one fewer argument than the other check_readcount calls - presumably
            #  relying on a default for the last parameter; confirm against check_readcount's signature
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            # sanity check: no reads should be lost or duplicated by the trimming split
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  in the standard install! Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            # NOTE(review): this assigns a *local* cutadapt_tmpfile that nothing downstream reads; step 3 iterates
            #  cutadapt_tmpfiles (the per-end dict, whose files were never created), so when cutadapt is skipped the
            #  trimmed reads appear never to be moved/collapsed into an outfile - verify whether this is a bug
            cutadapt_tmpfile = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"','').replace("'",'').replace(' ',''): continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s'%(adapter_seq, options.other_cutadapt_options)
                # send untrimmed/too-short/too-long reads to the no-cassette tmpfile, unless the caller already
                #  routed them somewhere via --other_cutadapt_options (substring check on the option string)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s'%(extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s"%(full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s"%end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity>1),
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # NOTE(review): the "%s"s in the first string half are never filled in - the %-formatting only applies
                #  to the second (concatenated) half, so end_type fills the first count slot; message looks garbled
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount,
                                          trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                # a missing tmpfile just means that end was skipped above - ignore it
                try: no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError: pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file,'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity>0: print text
            INFOFILE.write(text+'\n')
            # no collapsing: cutadapt outputs become the final outfiles directly
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile): os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type,cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                               cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s"%end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile,INFOFILE,bool(options.verbosity>1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity>1: print text
                INFOFILE.write(text+'\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text+'\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa"%(FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                   extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity-1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases! That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        # NOTE(review): ', '.join(outfiles) joins the dict *keys* (the end labels), not the output filenames -
        #  presumably the filenames were intended here; verify
        final_output = ["### Final read count info for %s (main output files %s)\n"%(infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n"%starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'%
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'%
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'%
                                    (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                     [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'%
                                value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'%(end_type,
                                value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'%
                            value_and_percentages(starting_readcount-sum(cutadapt_readcount.values()),
                                                  [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '%end_type
                                    +'(%% of read count):\t%s\n'%value_and_percentages(collapsed_readcount[end_type],
                                                                                       [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    %value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                           [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    %value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                           [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity>0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Pipeline: 0) count reads in the single infile; 1) optionally trim constant first bases (-F);
    2) optionally strip cassette/adapter sequence with cutadapt_mod (-5/-3/-A) - output files are split
    per end (5'/3') only when cutadapt actually runs; 3) optionally collapse to unique sequences with
    fastx_collapser (-C); then write readcount summaries to the info file and clean up tmpfiles.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    # adapter sequences must come from -5/-3, not be smuggled in through the pass-through cutadapt options
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
                 % adapter_options + " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now, even when not forced to do
    #  that by collapsing to unique! MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    # NOTE(review): outfile_suffix is never read below (filenames hard-code '.fa') - dead assignment, presumably
    #  kept for the commented-out alternative above
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (options.adapter_5prime or options.adapter_3prime):
        # cutadapt will be skipped: single unsplit output, keyed by the empty string
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {'': options.outfile_basename + '_no-cassette-tmpfile.fa'}
        cutadapt_tmpfiles = {'': options.outfile_basename + '_cutadapt-tmpfile.fa'}
        # real copy (not alias) so the later cutadapt_tmpfiles[''] = trimmed_tmpfile reassignment
        #  doesn't make the cleanup step delete the wrong file
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        # cutadapt will run: one output/tmpfile per end, keyed "5'"/"3'" ("'" -> "prime" in filenames)
        ends = "5' 3'".split()
        outfiles = {end: options.outfile_basename + '_%s.fa' % end.replace("'", "prime")
                    for end in ends}
        no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
                                for end in ends}
        cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
                             for end in ends}
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        # (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity > 1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            # skipping the step: downstream reads straight from the infile
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            # NOTE(review): this call passes one fewer argument than the other check_readcount calls - presumably
            #  relying on a default for the last parameter; confirm against check_readcount's signature
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            # sanity check: no reads should be lost or duplicated by the trimming split
            assert trimmed_readcount+untrimmed_readcount==starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  in the standard install! Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            # point the single ('') cutadapt slot at the trimmed file so step 3 picks it up unchanged
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                # (strip quotes/spaces first so e.g. '" "' also counts as empty)
                adapter_seqs = adapter_seqs.replace('"', '').replace("'", '').replace(' ', '')
                if not adapter_seqs: continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                # multiple comma-separated adapters per end each get their own -a option
                all_adapter_options = ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                # send untrimmed/too-short/too-long reads to the no-cassette tmpfile, unless the caller already
                #  routed them somewhere via --other_cutadapt_options (substring check on the option string)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (extra_seq_category,
                                                                      no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s" % end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE,
                                                               bool(options.verbosity > 1), "cutadapt output",
                                                               options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # NOTE(review): the "%s"s in the first string half are never filled in - the %-formatting only applies
                #  to the second (concatenated) half, so end_type fills the first count slot; message looks garbled
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        +"(%s+%s != %s)"%(end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount,
                                          trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                # a missing tmpfile just means that end was skipped above - ignore it
                try: no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError: pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    +"(%s+%s != %s)"%(sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile): os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0: print text
            INFOFILE.write(text + '\n')
            # no collapsing: cutadapt outputs (or the trimmed file, via the '' slot) become the final outfiles
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile): os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile): continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile, INFOFILE, bool(options.verbosity > 1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1: print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity: print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                    FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity - 1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases! That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        # NOTE(review): ', '.join(outfiles) joins the dict *keys* (the end labels, possibly ''), not the output
        #  filenames - presumably the filenames were intended here; verify
        final_output = ["### Final read count info for %s (main output files %s)\n" % (infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n" % starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n' %
                                value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n' %
                                value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n' %
                                    (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                     [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n' %
                                value_and_percentages(no_cassette_readcount,
                                                      [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n' %
                                (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                 [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n' %
                            value_and_percentages(starting_readcount - sum(cutadapt_readcount.values()),
                                                  [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '
                                    % end_type + '(%% of read count):\t%s\n' %
                                    value_and_percentages(collapsed_readcount[end_type],
                                                          [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                            [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                            [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0: print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile): os.remove(tmpfile)