def categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, 
                                    GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True, multiple_to_write=-1, 
                                    input_collapsed_to_unique=False, no_warnings=False):
    """ Decide the proper category for each read, write to appropriate output file; return category counts. 

    readname_to_aln_list is a readname:alignment_list dictionary; reads are processed in sorted readname order.

    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple, 
     unless no_warnings is set), genomic-unique (single non-cassette alignment), 
     multiple-genomic (multiple non-cassette alignments). 
    If input_collapsed_to_unique, for the purpose of category counts each read will be counted as N reads, 
     with N determined from readname by get_seq_count_from_collapsed_header (fastx-collapser encoding).
    In the output category counts, cassette-multiple is a special subcategory - anything in it is also 
     counted in cassette.

    Each read is printed to the appropriate outfile (all outfiles should be open file handles); 
     if unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format. 
    For multiple-genomic reads: 
     - if multiple_to_write is 0, one fasta line is written if unaligned_as_fasta, otherwise an Exception 
        is raised (SAM output for that case is not implemented here);
     - otherwise the alignments in aln_list[:multiple_to_write] are written as SAM lines. 
       NOTE(review): with the default of -1 that slice leaves out the LAST alignment of each read - 
        confirm whether -1 was meant as "write all".

    Returns the category:readcount dictionary.
    Raises AssertionError if a read has multiple lines of which some are unaligned, or a mix of cassette 
     and non-cassette alignments.
    """
    category_readcounts = {'unaligned':0, 'cassette':0, 'multiple-genomic':0, 'genomic-unique':0, 'cassette-multiple':0}

    for readname,aln_list in sorted(readname_to_aln_list.items()):
        readcount = get_seq_count_from_collapsed_header(readname) if input_collapsed_to_unique else 1
        # if there's a single alignment, it's unaligned, cassette or genomic-unique
        if len(aln_list) == 1:
            aln = aln_list[0]
            if not aln.aligned:
                category_readcounts['unaligned'] += readcount
                if unaligned_as_fasta:  write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
                else:                   write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
            elif is_cassette_chromosome(aln.iv.chrom):
                category_readcounts['cassette'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
            else:
                category_readcounts['genomic-unique'] += readcount
                write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
            continue

        # otherwise (multiple alignments), it's cassette-multiple (weird!) or multiple-genomic
        assert all(aln.aligned for aln in aln_list), "Shouldn't see multiple unaligned lines per read!"
        # check each alignment once, and reuse the result for both the any and the all test
        on_cassette = [is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        if any(on_cassette):
            assert all(on_cassette), "Mixed cassette/other!"
            # multiple-cassette reads are counted both in cassette and in the cassette-multiple subcategory
            category_readcounts['cassette'] += readcount
            category_readcounts['cassette-multiple'] += readcount
            if not no_warnings:
                # single-argument print call: same output under the python2 print statement and python3 function;
                #  the one-element tuple keeps the formatting safe even if aln_list is itself a tuple.
                print("Warning: multiple cassette alignments! Printing all to cassette file.\n\t%s"%(aln_list,))
            for aln in aln_list:
                write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        # multiple genomic alignments - how many get written depends on multiple_to_write; 
        #  if it's 0, the outfile should be fasta, or else I guess it should be written as unaligned?
        #   (MAYBE-TODO writing single multiple as unaligned not implemented!)
        else:
            category_readcounts['multiple-genomic'] += readcount
            if multiple_to_write == 0:
                if not unaligned_as_fasta:
                    raise Exception("Writing 0 multiple alignments in SAM format NOT IMPLEMENTED!")
                write_fasta_line(readname, aln_list[0].read.seq, MULTIPLE_GENOMIC_FILE)
            else:
                for aln in aln_list[:multiple_to_write]:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    return category_readcounts
Example #2
0
def subsequence_counts(infile_reader, seq_length=None, input_collapsed_to_unique=False):
    """ Given an iterator of sequence records and desired subsequence length/end, return subsequence:count dict.

    Each record must have a .seq attribute (sliceable, convertible with str() - e.g. a Biopython Seq), 
     and a .name attribute if input_collapsed_to_unique is True.

    seq_length: if None (or 0), take whole seq; if N>0, take first N bases, if N<0, take last -N bases.
    If input_collapsed_to_unique is True, consider each sequence to be X reads, determined from seq.name 
     by get_seq_count_from_collapsed_header (the format used by fastx_collapser from FastX Toolkit).

    Returns a plain dictionary (not a defaultdict) of subsequence string to total read count.
    """
    seq_counter = defaultdict(int)    # a counter with a default value of 0
    # Decide once which end to take.  The explicit None check matters: comparing None to an int only 
    #  worked by accident in python2, and raises TypeError in python3.
    take_from_start = (seq_length is not None) and (seq_length > 0)
    for sequence in infile_reader: 
        N_seqs = get_seq_count_from_collapsed_header(sequence.name) if input_collapsed_to_unique else 1
        # convert Biopython Seq objects to plain strings - Seq objects aren't hashable correctly
        if take_from_start: subsequence = str(sequence.seq[0:seq_length])
        # a [None:] or [0:] slice is the whole sequence; a [-N:] slice is the last N bases
        else:               subsequence = str(sequence.seq[seq_length:])
        seq_counter[subsequence] += N_seqs
    return dict(seq_counter)
Example #3
0
def seq_count_and_lengths(seq_iterator, count_only=False, input_collapsed_to_unique=False):
    """ Go over seq_iterator once; return (total seq count, seq_length:seq_count dictionary). 

    The dictionary is left empty if count_only is true (len() is then never called on the sequences); 
     otherwise each element is binned by len(element), so the elements must support len(). 
    Each element counts as 1, unless input_collapsed_to_unique is true - in that case it counts as 
     the number returned by get_seq_count_from_collapsed_header for the element's .name attribute.
    """
    n_total = 0
    count_by_length = {}
    for record in seq_iterator: 
        # how many reads does this one record stand for?
        if input_collapsed_to_unique:
            weight = get_seq_count_from_collapsed_header(record.name)
        else:
            weight = 1
        n_total += weight
        if count_only:
            continue
        length = len(record)
        count_by_length[length] = count_by_length.get(length, 0) + weight
    return n_total, count_by_length
Example #4
0
def seq_split_by_length(infile, min_length=None, max_length=None, force_fasta_output=False, include_empty_files=False, 
                        ignore_zero_length_sequences=False, pad_filenames_for_sort=0, 
                        input_collapsed_to_unique=False, quiet=False):
    """ Split the sequences from infile into one output file per sequence length; print the length counts.

    See module docstring and optparse option help messages for the user-level description of the options.

    What the code here does:
     - the input format is picked by file extension (fa/fasta or fq/fastq, case-insensitive); 
        anything else ends the program through sys.exit with an error message.
     - a folder named like infile minus its extension is created with os.mkdir 
        (which raises if it cannot be created, e.g. if it already exists).
     - each sequence is written to <folder>/<length>bp.<extension>, with the length zero-padded to 
        pad_filenames_for_sort digits; sequences shorter than min_length all go to the file for 
        min_length-1 (with an _or_less suffix), sequences longer than max_length all go to the file for 
        max_length+1 (with an _or_more suffix).  Output is fasta if force_fasta_output or if the input is fasta, 
        fastq otherwise.
     - if ignore_zero_length_sequences, zero-length sequences are counted but not written.
     - if include_empty_files, empty files are also created for every length from the lower to the upper bound 
        (min_length/max_length if given, otherwise the smallest/largest length that has a file) that has no file yet.
     - unless quiet, the lines from _format_lengths for the per-length counts are printed; 
        if input_collapsed_to_unique, each sequence counts as the number of reads encoded in its name.

    Returns None.
    """
    # file format recognition (I could do it by trying to use FastaReader/FastqReader on it, but it's annoying)
    fasta_extensions = ['fa','fasta']
    fastq_extensions = ['fq','fastq']
    extension = os.path.splitext(infile)[1].lower()[1:]
    if extension in fasta_extensions:   
        infile_reader = FastaReader(infile)
    elif extension in fastq_extensions: 
        infile_reader = FastqReader(infile,qual_scale="solexa")
    else:       sys.exit("Error: input file %s (extension %s) needs to have a %s extension to be recognized!"%(infile, 
                            extension, '/'.join(fasta_extensions+fastq_extensions)))

    if force_fasta_output:
        extension = 'fa'
    # the output format is the same for every sequence, so decide it once
    write_as_fasta = force_fasta_output or extension in fasta_extensions

    ### make the output folder, and outfiles
    infile_base = os.path.splitext(infile)[0]
    outfolder = infile_base
    os.mkdir(outfolder)

    def open_outfile_for_length(seqlen):
        """ Open (for writing) the outfile for seqlen; one naming scheme for populated and empty files alike. """
        seqlen_string = "%0*dbp"%(pad_filenames_for_sort, seqlen)
        if min_length is not None and seqlen<min_length:    seqlen_string += '_or_less'
        elif max_length is not None and seqlen>max_length:  seqlen_string += '_or_more'
        filename = "%s.%s"%(seqlen_string,extension)
        return open(os.path.join(outfolder,filename), 'w')

    # a (length: open file object) dictionary, so I can keep them all open and close them at the end. 
    len_to_outfile_dict = {}

    # a counter with a default value of 0
    seq_counter = defaultdict(lambda: 0)

    # try/finally so that the outfiles get closed even if reading or writing fails partway through
    try:
        for seq in infile_reader: 
            seqlen = len(seq)
            # add the N_seqs to the seq counter (under the real length, before any min/max adjustment)
            N_seqs = get_seq_count_from_collapsed_header(seq.name) if input_collapsed_to_unique else 1
            seq_counter[seqlen] += N_seqs
            if ignore_zero_length_sequences and seqlen==0:
                continue
            # special length cases for when min/max length is set
            if min_length is not None and seqlen<min_length:    seqlen = min_length-1
            elif max_length is not None and seqlen>max_length:  seqlen = max_length+1
            # if outfile for that length doesn't exist, create it
            if seqlen not in len_to_outfile_dict:
                len_to_outfile_dict[seqlen] = open_outfile_for_length(seqlen)
            # write the sequence (fasta or fastq!) to the outfile!
            if write_as_fasta:
                seq.write_to_fasta_file(len_to_outfile_dict[seqlen])
            else:
                seq.write_to_fastq_file(len_to_outfile_dict[seqlen])

        # optionally add the empty files that had no sequences of that length
        if include_empty_files:
            lengths_with_files = list(len_to_outfile_dict)
            # bounds not given by the caller are taken from the files that exist (if there are any)
            first_length, last_length = min_length, max_length
            if first_length is None and lengths_with_files:     first_length = min(lengths_with_files)
            if last_length is None and lengths_with_files:      last_length = max(lengths_with_files)
            if first_length is not None and last_length is not None:
                # the range includes both bounds: caller-given bounds may have no file yet, 
                #  and bounds taken from existing files are just skipped by the check below.
                for seqlen in range(first_length, last_length+1):
                    if seqlen not in len_to_outfile_dict:
                        len_to_outfile_dict[seqlen] = open_outfile_for_length(seqlen)
    finally:
        # close all the files
        for FILE in len_to_outfile_dict.values():
            FILE.close()

    # format and print the seq counts by length
    if not quiet:
        if 0 in seq_counter and ignore_zero_length_sequences:
            print("(discarding zero-length sequences)")
        for line in _format_lengths(seq_counter, include_empty_files, 1):     
            # the trailing comma is the python2 print-statement way of not adding a newline; kept as it was, 
            #  so that the output stays the same.  NOTE(review): presumably the lines already end with 
            #  a newline - confirm against _format_lengths before porting this to python3.
            print(line),
def categorize_reads_print_to_files(readname,
                                    aln_list,
                                    category_readcounts,
                                    UNALIGNED_FILE,
                                    CASSETTE_FILE,
                                    MULTIPLE_GENOMIC_FILE,
                                    GENOMIC_UNIQUE_FILE,
                                    unaligned_as_fasta=False,
                                    multiple_to_write=-1,
                                    input_collapsed_to_unique=False,
                                    no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts. 
    
    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple), 
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments). 

    The read will be categorized, and printed to the appropriate file (all the uppercase arguments should be 
     open file objects; they can all be the SAME file object if desired.)

    category_readcounts is a category:count dictionary, modified in place: the count for the chosen category 
     goes up by 1, or if input_collapsed_to_unique, by the N determined from readname 
     by get_seq_count_from_collapsed_header (fastx-collapser encoding).

    A read with multiple cassette alignments gets the category cassette-multiple (and a printed warning) 
     if no_multi_cassette_warnings is False, and plain cassette (no warning) if it is True; 
     only that one category's count is increased by this function. 
     NOTE(review): the docstring used to say anything in cassette-multiple is also counted in cassette - 
      that is not done here, so presumably the caller is expected to handle it; confirm.
     Only one of the cassette alignments is written - the first by (chromosome, strand, start, end) - 
      with '_and_others' added to its chromosome name.  That is done IN PLACE on the alignment object, 
      so the caller's aln_list element is changed too.

    For multiple-genomic reads, the alignments in aln_list[:multiple_to_write] are written as SAM lines, 
     unless multiple_to_write is 0: then one line is written that treats the read as unaligned, 
     but with XM:i:M optional tag field added, where M is the number of multiple alignments.
     NOTE(review): with the default of -1 the slice leaves out the LAST alignment - confirm that -1 
      was not meant as "write all".
      
    If unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format, 
     and so will multiple if multiple_to_write is 0.

    Returns the category name.  Raises AssertionError if there are multiple lines of which some are unaligned, 
     or a mix of cassette and non-cassette alignments.
    """
    if input_collapsed_to_unique:
        readcount = get_seq_count_from_collapsed_header(readname)
    else:
        readcount = 1
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:
                write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:
                write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all(aln.aligned for aln in aln_list), \
            "Shouldn't see multiple unaligned lines per read!"
        # check each alignment once, and reuse the result for both the any and the all test
        on_cassette = [
            is_cassette_chromosome(aln.iv.chrom) for aln in aln_list
        ]
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any(on_cassette):
            assert all(on_cassette), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                first_positions = ', '.join(
                    "%s %s %s" % (a.iv.chrom, a.iv.strand, a.iv.start)
                    for a in aln_list[:3])
                # The two parts are joined into ONE string before printing: print(a, b) gives the text
                #  with the print function, but a tuple repr with the python2 print statement.
                print(' '.join([
                    "Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "
                    % aln_list[0].read.seq,
                    "first 3 positions %s" % first_positions,
                ]))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = min(
                aln_list,
                key=lambda a:
                (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #   but that would be tricky, need to strip matching prefixes from them,
            #   what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments:
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line,
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - otherwise print normal SAM lines for the aln_list[:multiple_to_write] alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                aln = aln_list[0]
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln.read.seq,
                                     MULTIPLE_GENOMIC_FILE)
                else:
                    # hand-built SAM line: flag 4, no reference or position, plus the XM tag
                    MULTIPLE_GENOMIC_FILE.write(
                        '%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n' %
                        (aln.read.name, aln.read.seq, aln.read.qualstr,
                         len(aln_list)))
            else:
                for aln in aln_list[:multiple_to_write]:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category
def categorize_reads_print_to_files(readname, aln_list, category_readcounts, UNALIGNED_FILE, CASSETTE_FILE, 
                                    MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, unaligned_as_fasta=False, multiple_to_write=-1, 
                                    input_collapsed_to_unique=False, no_multi_cassette_warnings=False):
    """ Decide the proper category for the read, write to appropriate output file; adjust category counts. 
    
    Categories: unaligned, cassette (one or more cassette alignments - print warning if multiple), 
     genomic-unique (single non-cassette alignment), multiple-genomic (multiple non-cassette alignments). 

    The read will be categorized, and printed to the appropriate file (all the uppercase arguments should be 
     open file objects; they can all be the SAME file object if desired.)

    category_readcounts is a category:count dictionary, modified in place: the count for the chosen category 
     goes up by 1, or if input_collapsed_to_unique, by the N determined from readname 
     by get_seq_count_from_collapsed_header (fastx-collapser encoding).

    A read with multiple cassette alignments gets the category cassette-multiple (and a printed warning) 
     if no_multi_cassette_warnings is False, and plain cassette (no warning) if it is True; 
     only that one category's count is increased by this function. 
     NOTE(review): the docstring used to say anything in cassette-multiple is also counted in cassette - 
      that is not done here, so presumably the caller is expected to handle it; confirm.
     Only one of the cassette alignments is written - the first by (chromosome, strand, start, end) - 
      with '_and_others' added to its chromosome name.  That is done IN PLACE on the alignment object, 
      so the caller's aln_list element is changed too.

    For multiple-genomic reads, the alignments in aln_list[:multiple_to_write] are written as SAM lines, 
     unless multiple_to_write is 0: then one line is written that treats the read as unaligned, 
     but with XM:i:M optional tag field added, where M is the number of multiple alignments.
     NOTE(review): with the default of -1 the slice leaves out the LAST alignment - confirm that -1 
      was not meant as "write all".
      
    If unaligned_as_fasta, unaligned reads will be written as fasta instead of SAM format, 
     and so will multiple if multiple_to_write is 0.

    Returns the category name.  Raises AssertionError if there are multiple lines of which some are unaligned, 
     or a mix of cassette and non-cassette alignments.
    """
    readcount = get_seq_count_from_collapsed_header(readname) if input_collapsed_to_unique else 1
    # if there's a single alignment, it's unaligned, cassette or genomic-unique
    if len(aln_list) == 1:
        aln = aln_list[0]
        if not aln.aligned:
            category = 'unaligned'
            if unaligned_as_fasta:  write_fasta_line(readname, aln.read.seq, UNALIGNED_FILE)
            else:                   write_SAM_line_from_HTSeq_aln(aln, UNALIGNED_FILE)
        elif is_cassette_chromosome(aln.iv.chrom):
            category = 'cassette'
            write_SAM_line_from_HTSeq_aln(aln, CASSETTE_FILE)
        else:
            category = 'genomic-unique'
            write_SAM_line_from_HTSeq_aln(aln, GENOMIC_UNIQUE_FILE)
    # if there are multiple alignments, it's cassette-multiple (weird!) or multiple-genomic
    else:
        assert all(aln.aligned for aln in aln_list), "Shouldn't see multiple unaligned lines per read!"
        # check each alignment once, and reuse the result for both the any and the all test
        on_cassette = [is_cassette_chromosome(aln.iv.chrom) for aln in aln_list]
        # multiple-cassette - shouldn't really happen, but write to CASSETTE_FILE
        # MAYBE-TODO come up with something better to do for multiple-cassette cases? If they ever happen.
        # (NOTE: sometimes they happen because I'm actually aligning to multiple cassettes - then they're fine.)
        if any(on_cassette):
            assert all(on_cassette), "Mixed cassette/other!"
            if not no_multi_cassette_warnings:
                first_positions = ', '.join("%s %s %s"%(a.iv.chrom, a.iv.strand, a.iv.start) for a in aln_list[:3])
                # The two parts are joined into ONE string before printing: print(a, b) gives the text 
                #  with the print function, but a tuple repr with the python2 print statement.
                print(' '.join(["Warning: multiple cassette alignments! Printing only one to cassette file. Seq %s, "
                                %aln_list[0].read.seq, "first 3 positions %s"%first_positions]))
                category = 'cassette-multiple'
            else:
                category = 'cassette'
            # first position alphabetically is chosen - MAYBE-TODO add other choice options?
            aln_to_print = min(aln_list, key=lambda a: (a.iv.chrom, a.iv.strand, a.iv.start, a.iv.end))
            # just add _and_others to the chromosome - MAYBE-TODO add something more informative, like list of names?
            #   but that would be tricky, need to strip matching prefixes from them, 
            #   what about multiple alignments to SAME chromosome, etc.
            aln_to_print.iv.chrom = aln_to_print.iv.chrom + '_and_others'
            write_SAM_line_from_HTSeq_aln(aln_to_print, CASSETTE_FILE)
        # multiple genomic alignments: 
        # - if multiple_to_write=0, treat multiple as unaligned - if unaligned_as_fasta, print fasta line, 
        #   else single unaligned SAM line, with XM:i:M optional tag field added, where M is the number of multiple alignments.
        # - otherwise print normal SAM lines for the aln_list[:multiple_to_write] alignments
        # MAYBE-TODO add an option to write multiple as unaligned to the main SAM file AND full multiple lines to another file?
        else:
            category = 'multiple-genomic'
            if multiple_to_write == 0:
                aln = aln_list[0]
                if unaligned_as_fasta:
                    write_fasta_line(readname, aln.read.seq, MULTIPLE_GENOMIC_FILE)
                else:
                    # hand-built SAM line: flag 4, no reference or position, plus the XM tag
                    MULTIPLE_GENOMIC_FILE.write('%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\tXM:i:%s\n'%(aln.read.name, aln.read.seq, 
                                                                                                 aln.read.qualstr, len(aln_list)))
            else:
                for aln in aln_list[:multiple_to_write]:
                    write_SAM_line_from_HTSeq_aln(aln, MULTIPLE_GENOMIC_FILE)
    category_readcounts[category] += readcount
    return category