def print_flanking_regions_to_fasta(flanking_region_count_list, outfile, convert_counts=lambda x: x):
    """ Given a (seq,count) list, make fasta file with each seq present convert_counts(count) times. """
    with open(outfile, "w") as OUTFILE:
        for N, (seq, count) in enumerate(flanking_region_count_list):
            seqname = "%s (%s reads)" % (N, count)
            for _ in range(convert_counts(count)):
                write_fasta_line(seqname, seq, OUTFILE)
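# Example usage sketch (hypothetical data; write_fasta_line is assumed to write one standard
#  ">name"/seq fasta record, as elsewhere in this module).  A log-scaled convert_counts keeps
#  the outfile small when a few sequences dominate the counts:
#
#     import math
#     flanking_regions = [("ACGTACGT", 1000), ("TTTTCCCC", 3)]
#     print_flanking_regions_to_fasta(flanking_regions, "flanking.fa",
#                                     convert_counts=lambda count: int(math.log(count, 2)) + 1)
#     # "ACGTACGT" is written 10 times instead of 1000, "TTTTCCCC" twice instead of 3 times.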
def categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True, multiple_to_write=-1,
                                    input_collapsed_to_unique=False, no_warnings=False):
    """ Decide the proper category for each read, write to appropriate output file; return category counts.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
    genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).

    If input_collapsed_to_unique, for the purpose of category counts each read will be counted as N reads,
    with N determined from readname using the fastx-collapser encoding.
    In the output category counts, cassette-multiple is a special subcategory - anything in it is also counted in cassette.

    Each read is printed to the appropriate outfile (all outfiles should be open file handles);
    for multiple-genomic, multiple_to_write lines will be written (a negative value means all);
    if unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format
    (and so will multiple-genomic if multiple_to_write is 0).
    """
    category_readcounts = {'unaligned':0, 'cassette':0, 'multiple-genomic':0, 'genomic-unique':0,
                           'cassette-multiple':0}
    for readname, aln_list in sorted(readname_to_aln_list.items()):
        readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
        # if there's a single alignment, it's unaligned, cassette or genomic-unique
        if len(aln_list) == 1:
            aln = aln_list[0]
            if not aln.aligned:
                category_readcounts['unaligned'] += readcount
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
                else:
                    write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
            elif is_cassette_chromosome(aln.iv.chrom):
                category_readcounts['cassette'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            else:
                category_readcounts['genomic-unique'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
        # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
        else:
            assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
            # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
            # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
            if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
                assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
                category_readcounts['cassette'] += readcount
                if not no_warnings:
                    print "Warning: multiple cassette alignments! Printing all to cassette file.\n\t%s" % (aln_list)
                category_readcounts['cassette-multiple'] += readcount
                for aln in aln_list:
                    write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            # multiple genomic alignments - how many get written depends on multiple_to_write;
            #  if it's 0, the outfile should be fasta, or else I guess it should be written as unaligned?
            #  (MAYBE-TODO writing single multiple as unaligned not implemented!)
            else:
                category_readcounts['multiple-genomic'] += readcount
                if multiple_to_write == 0:
                    if unaligned_as_fasta:
                        write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                    else:
                        raise Exception("Writing 0 multiple alignments in SAM format NOT IMPLEMENTED!")
                else:
                    # negative multiple_to_write means write all the alignments
                    #  (a plain aln_list[:multiple_to_write] slice would silently drop the last one when it's -1)
                    alns_to_write = aln_list if multiple_to_write < 0 else aln_list[:multiple_to_write]
                    for aln in alns_to_write:
                        write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    return category_readcounts
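# For reference, a minimal sketch of the fastx-collapser readname encoding that the
#  input_collapsed_to_unique option relies on (via get_seq_count_from_collapsed_header, defined
#  elsewhere): fastx_collapser names each unique sequence "<rank>-<count>", e.g. "1-253" for the
#  most common sequence, seen in 253 reads.  This is an illustrative re-implementation under that
#  assumption, not necessarily identical to the module's real helper:
def _seq_count_from_collapsed_header_sketch(readname, default_count=1):
    # take the last dash-separated field of the header; fall back if it isn't a number
    last_field = readname.split('-')[-1]
    try:
        return int(last_field)
    except ValueError:
        # header doesn't follow the collapsed encoding - count it as a single read
        return default_count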
def _trim_prefix_single(seqname, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE=None):
    """ If prefix_bases is a prefix of seq, trim it off, print to TRIMMED_OUTFILE, return 1;
    otherwise print to WRONG_PREFIX_OUTFILE if not None, return 0. """
    if seq.upper().startswith(prefix_bases.upper()):
        seq_trimmed = seq[len(prefix_bases):]
        # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
        write_fasta_line(seqname, seq_trimmed.upper(), TRIMMED_OUTFILE)
        return 1
    else:
        if WRONG_PREFIX_OUTFILE is not None:
            # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
            write_fasta_line(seqname, seq.upper(), WRONG_PREFIX_OUTFILE)
        return 0
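# A minimal sketch of how a trim_prefix wrapper (called by main below, defined elsewhere in the
#  module) might drive _trim_prefix_single over a whole infile - assuming parse_fasta yields
#  (header, seq) pairs as in the cutadapt-merging step of main; the real trim_prefix also takes
#  an INFOFILE handle and a verbosity level for progress reporting:
def _trim_prefix_sketch(prefix_bases, infile, trimmed_outfile, wrong_prefix_outfile):
    trimmed_count, wrong_prefix_count = 0, 0
    with open(trimmed_outfile, 'w') as TRIMMED_OUTFILE:
        with open(wrong_prefix_outfile, 'w') as WRONG_PREFIX_OUTFILE:
            for seqname, seq in parse_fasta(infile):
                if _trim_prefix_single(seqname, seq, prefix_bases, TRIMMED_OUTFILE, WRONG_PREFIX_OUTFILE):
                    trimmed_count += 1
                else:
                    wrong_prefix_count += 1
    return trimmed_count, wrong_prefix_count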
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)" % adapter_options
                 + " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    infile_suffix = os.path.splitext(infile)[1]
    outfile_suffix = '.fa'
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa' % end.replace("'", "prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
                            for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
                         for end in ends}
    cutadapt_tmpfiles_original = cutadapt_tmpfiles

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #     (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity > 1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount + untrimmed_readcount == starting_readcount,\
                "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile! (%s+%s != %s)"\
                % (trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  in the standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfile = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"', '').replace("'", '').replace(' ', ''):
                    continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s' % (adapter_seq, options.other_cutadapt_options)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s" % end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # note: the message below used to be two concatenated strings with the %-format applied only to the
                #  second one (4 values into 3 slots), which would itself raise an error when the assert failed
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                    "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! (%s+%s != %s)"\
                    % (end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! (%s+%s != %s)"\
                % (sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):
                    continue
                command = "fastx_collapser -v %s -i %s -o %s" % (FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                 cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile, INFOFILE, bool(options.verbosity > 1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1:   print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity:   print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                     extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity - 1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n" % (infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n" % starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'
                                % value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'
                                % value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                                    % (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                       [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                                % value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                                % (end_type, value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
                            % value_and_percentages(starting_readcount - sum(cutadapt_readcount.values()),
                                                    [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '
                                    % end_type
                                    + '(%% of read count):\t%s\n'
                                    % value_and_percentages(collapsed_readcount[end_type],
                                                            [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                            [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                            [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0:   print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
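# The summary lines above lean heavily on value_and_percentages (defined elsewhere); a minimal
#  sketch of the assumed behavior - the value followed by its percentage of each given total
#  (the real formatting may differ in precision and layout):
def _value_and_percentages_sketch(value, totals):
    percentages = ', '.join(['%.0f%%' % (100.0 * value / total) if total else '-' for total in totals])
    return '%s (%s)' % (value, percentages)
# e.g. _value_and_percentages_sketch(50, [200, 100]) returns '50 (25%, 50%)'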
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
        # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles? Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)" % adapter_options
                 + " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now,
    #  even when not forced to do that by collapsing to unique!  MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (options.adapter_5prime or options.adapter_3prime):
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {'': options.outfile_basename + '_no-cassette-tmpfile.fa'}
        cutadapt_tmpfiles = {'': options.outfile_basename + '_cutadapt-tmpfile.fa'}
    else:
        ends = "5' 3'".split()
        outfiles = {end: options.outfile_basename + '_%s.fa' % end.replace("'", "prime") for end in ends}
        no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
                                for end in ends}
        cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
                             for end in ends}
    # take a copy, since cutadapt_tmpfiles may be modified below but the originals are needed for cleanup
    cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #     (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity > 1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount + untrimmed_readcount == starting_readcount,\
                "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile! (%s+%s != %s)"\
                % (trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  in the standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                adapter_seqs = adapter_seqs.replace('"', '').replace("'", '').replace(' ', '')
                if not adapter_seqs:
                    continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                all_adapter_options = ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (extra_seq_category, no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s" % end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                               "cutadapt output", options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                # note: the message below used to be two concatenated strings with the %-format applied only to the
                #  second one (4 values into 3 slots), which would itself raise an error when the assert failed
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                    "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! (%s+%s != %s)"\
                    % (end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount, trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile! (%s+%s != %s)"\
                % (sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0:   print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):
                    continue
                command = "fastx_collapser -v %s -i %s -o %s" % (FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                 cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile, INFOFILE, bool(options.verbosity > 1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1:   print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity:   print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding],
                                                                     extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity - 1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason! ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n" % (infile, ', '.join(outfiles))]
        final_output.append("# starting total read count:\t%s\n" % starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'
                                % value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'
                                % value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                                    % (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                       [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                                % value_and_percentages(no_cassette_readcount, [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                                % (end_type, value_and_percentages(cutadapt_readcount[end_type], [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
                            % value_and_percentages(starting_readcount - sum(cutadapt_readcount.values()),
                                                    [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '
                                    % end_type
                                    + '(%% of read count):\t%s\n'
                                    % value_and_percentages(collapsed_readcount[end_type],
                                                            [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                            [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                            [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0:   print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
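# Note on the adapter handling above: unlike the earlier version of main, this version accepts
#  comma-separated adapter lists per end, expanding each sequence into its own cutadapt -a option:
#
#     adapter_seqs = 'GTTGGAACCAAT,ATTGGTTCCAAC'   # hypothetical -5/-3 option value
#     ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
#     #  -> '-a GTTGGAACCAAT -a ATTGGTTCCAAC'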
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False,
                                    multiple_to_write=-1, input_collapsed_to_unique=False,
                                    no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple),
    genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments).
    The reads will be categorized, and printed to the appropriate file (all the uppercase arguments
    should be open file objects; they can all be the SAME file object if desired.)

    If input_collapsed_to_unique, for the purpose of category counts each read will be counted as N reads,
    with N determined from readname using the fastx-collapser encoding.
    In the output category counts, cassette-multiple is a special subcategory - anything in it is also counted in cassette.

    The read is printed to the appropriate outfile (all outfiles should be open file handles);
    for multiple-genomic, only N=multiple_to_write lines will be written (negative N means all);
    if N=0, one line will be written that treats the read as unaligned, but with an XM:i:M optional tag field added,
    where M is the number of multiple alignments.  If unaligned_as_fasta, unaligned reads will be written
    as fasta instead of SAM format, and so will multiple if multiple_to_write is 0.
    """
    readcount = 1 if not input_collapsed_to_unique else get_seq_count_from_collapsed_header(readname)
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all([aln.aligned for aln in aln_list]), "Shouldn't see multiple unaligned lines per read!"
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        #  (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]):
            assert all([is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]), "Mixed cassette/other!"
            # the category is always cassette-multiple here (it used to depend on the warning flag, which was a bug);
            #  the flag only controls whether the warning is printed
            category = 'cassette-multiple'
            if not no_multi_cassette_warnings:
                print "Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "\
                      "first 3 positions %s" % (aln_list[0].read.seq,
                                                ', '.join(["%s %s %s" % (a.iv.chrom, a.iv.strand, a.iv.start)
                                                           for a in aln_list[:3]]))
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = sorted(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))[0]
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #  but that would be tricky, need to strip matching prefixes from them,
            #  what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        #  - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #    else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of
        #    multiple alignments.
        #  - if multiple_to_write>0, print that many normal SAM lines for N alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines
        #  to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    aln = aln_list[0]
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n'
                                                % (aln.read.name, aln.read.seq, aln.read.qualstr, len(aln_list)))
            else:
                # negative multiple_to_write means write all the alignments
                #  (a plain aln_list[:multiple_to_write] slice would silently drop the last one when it's -1)
                alns_to_write = aln_list if multiple_to_write < 0 else aln_list[:multiple_to_write]
                for aln in alns_to_write:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    # cassette-multiple is a subcategory of cassette, so count those reads in cassette as well (see docstring)
    if category == 'cassette-multiple':
        category_readcounts['cassette'] += readcount
    return category
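# A minimal caller sketch for categorize_reads_print_to_files, assuming HTSeq is available and the
#  aligner output keeps all alignments of a read on consecutive lines (bowtie does); the outfile
#  handles can be shared, as the docstring notes ('aligned.sam' and the outfile names are hypothetical):
#
#     import itertools
#     import HTSeq
#     category_readcounts = {'unaligned':0, 'cassette':0, 'multiple-genomic':0,
#                            'genomic-unique':0, 'cassette-multiple':0}
#     with open('unique.sam','w') as UNIQUE:
#         with open('other.sam','w') as OTHER:
#             for readname, alns in itertools.groupby(HTSeq.SAM_Reader('aligned.sam'),
#                                                     key=lambda aln: aln.read.name):
#                 categorize_reads_print_to_files(readname, list(alns), category_readcounts,
#                                                 OTHER, OTHER, OTHER, UNIQUE)
#     # category_readcounts now holds the per-category totals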