Example #1
0
def stepwise_pileup_generator(index_positions: Iterator[IndexPosition],
                              alignment_file: pysam.AlignmentFile,
                              ) -> Iterator[MotifPileup]:
    """Yield one MotifPileup per index position, in input order.

    The index positions are walked in lockstep with the pileup columns that
    ``alignment_file.pileup`` reports for the chromosome of the first index
    position, starting at that position's ``start``. An index position whose
    ``start`` equals the ``reference_pos`` of a column gets the reads
    returned by ``pileups(column, idx.watson_base)``; every other index
    position gets an empty read list.

    Args:
        index_positions: index positions to report on. The walk only moves
            forwards and only one chromosome is queried, so the positions
            are presumably expected to be sorted by ``start`` and to lie on
            a single chromosome -- TODO confirm against callers.
        alignment_file: alignment file whose ``pileup`` method provides the
            columns.

    Yields:
        ``MotifPileup(reads=..., idx_pos=...)`` for every index position.
        Nothing is yielded when ``index_positions`` is empty.
    """
    # pylint: disable=stop-iteration-return
    # (the one unguarded next() below reads from a chain that ends in an
    # endless repeat(), so it cannot raise StopIteration)

    idx_pos_iterable = iter(index_positions)
    try:
        curr_idx = next(idx_pos_iterable)
    except StopIteration:
        # No index positions at all. A StopIteration escaping the generator
        # body would be converted into a RuntimeError (PEP 479), so end the
        # generator explicitly instead.
        return

    pileup_columns = alignment_file.pileup(reference=curr_idx.chrom,
                                           start=curr_idx.start,
                                           end=None,
                                           truncate=True)

    # Functional form required
    # https://github.com/python/mypy/issues/4349
    # noinspection PyPep8Naming
    EmptyPileupColumn = NamedTuple('EmptyPileupColumn', [('reference_pos', int)])

    # Sentinel columns with reference_pos == -1 are appended endlessly, so
    # running out of real pileup columns shows up as a position of -1 rather
    # than as a StopIteration.
    pileup_columns_iterator = chain(pileup_columns,
                                    repeat(EmptyPileupColumn(-1)))
    curr_pileup_column = next(pileup_columns_iterator)
    curr_pileup_pos = curr_pileup_column.reference_pos

    while True:
        try:
            if curr_pileup_pos == -1:
                # No pileup columns left: all remaining index positions are
                # uncovered.
                yield MotifPileup(reads=[], idx_pos=curr_idx)
                for curr_idx in idx_pos_iterable:
                    yield MotifPileup(reads=[], idx_pos=curr_idx)
                break
            elif curr_idx.start > curr_pileup_pos:
                # The pileup is behind the index position: skip the column.
                curr_pileup_column = next(pileup_columns_iterator)
                curr_pileup_pos = curr_pileup_column.reference_pos
                continue
            elif curr_idx.start < curr_pileup_pos:
                # The pileup has no column for this index position.
                yield MotifPileup(reads=[], idx_pos=curr_idx)
                curr_idx = next(idx_pos_iterable)
                continue
            elif curr_idx.start == curr_pileup_pos:
                pileup_reads = pileups(curr_pileup_column, curr_idx.watson_base)
                yield MotifPileup(reads=pileup_reads, idx_pos=curr_idx)
                curr_idx = next(idx_pos_iterable)
                curr_pileup_column = next(pileup_columns_iterator)
                curr_pileup_pos = curr_pileup_column.reference_pos
        except StopIteration:
            # The index positions are exhausted: end the generator.
            return
Example #2
0
def data_generator_pysam(my_args, name, start, stop, is_bulk):
    """Yield one pileup record per reported position in [start, stop).

    Opens the reference given by ``my_args.fasta`` and the alignment given
    by ``my_args.bulk`` (when ``is_bulk`` is true) or ``my_args.bam``, and
    iterates the pileup columns of contig ``name``. Both files are closed
    when the pileup is exhausted or the generator is closed early.

    Args:
        my_args: object with ``fasta``, ``bulk`` and ``bam`` attributes that
            are passed to ``FastaFile`` / ``AlignmentFile``.
        name: contig name.
        start: first reference position to report (used as the ``start``
            argument of ``fetch`` and ``pileup``).
        stop: position at which reporting ends; columns at ``pos >= stop``
            end the iteration.
        is_bulk: selects ``my_args.bulk`` instead of ``my_args.bam`` and a
            minimum mapping quality of 0 instead of 20.

    Yields:
        ``[name, pos, ref_base, str(n), read_bases, base_q, map_q]`` for each
        column, where ``n`` is ``get_num_aligned()``, ``read_bases`` is the
        upper-cased concatenation of ``get_query_sequences(...)``, and
        ``base_q`` / ``map_q`` are the qualities encoded as characters with
        the ``PHREDSCORE`` offset. All three strings are ``'*'`` when ``n``
        is 0. After the last column a single ``None`` is yielded.
    """
    fasta_file = FastaFile(my_args.fasta)
    try:
        ref = fasta_file.fetch(name, start, stop)

        pileup_kwargs = {
            'fastafile': fasta_file,
            'stepper': 'samtools',
            'adjust_capq_threshold': 50,
            'contig': name,
            'start': start,
            'stop': stop,
            'min_mapping_quality': 0 if is_bulk else 20,
            'min_base_quality': 13,
        }

        bam_file = AlignmentFile(my_args.bulk if is_bulk else my_args.bam, 'rb')
        try:
            for pileup_column in bam_file.pileup(**pileup_kwargs):
                pos = pileup_column.reference_pos

                # Only positions inside [start, stop) are reported.
                if pos >= stop:
                    break
                if pos < start:
                    continue

                read_bases_list = pileup_column.get_query_sequences(
                    mark_matches=True, mark_ends=True, add_indels=True)
                read_bases = ''.join(read_bases_list).upper()

                n = pileup_column.get_num_aligned()
                if n == 0:
                    read_bases = '*'
                    base_q = '*'
                    map_q = '*'
                else:
                    base_q = ''.join(
                        chr(int(quality) + PHREDSCORE)
                        for quality in pileup_column.get_query_qualities())
                    map_q = ''.join(
                        chr(int(quality) + PHREDSCORE)
                        for quality in pileup_column.get_mapping_qualities())

                # ref holds the bases from `start` onwards, hence the offset.
                yield [name, pos, ref[pos - start], str(n), read_bases, base_q, map_q]
        finally:
            bam_file.close()
    finally:
        fasta_file.close()

    # End-of-data marker for the consumer.
    yield None
Example #3
0
    def analyzeAlignment(self, alignmentOutputDirectory):
        """Parse the alignment and build a new consensus sequence from it.

        Reads 'AlignmentReference.fasta' and 'alignment.bam' from
        alignmentOutputDirectory and walks the pileup column by column. For
        each column the reads are tallied as deletion, insertion, match or
        mismatch, and the largest tally decides what goes into the consensus:
        the reference base, the reference base plus one extra base, nothing,
        or the most frequent aligned base.

        Side effects: writes 'AlignmentSummary.csv' (one row per column) and
        'Consensus.fasta' into alignmentOutputDirectory, describes every
        adjustment in self.wranglerLog, and prints progress to stdout.

        :param alignmentOutputDirectory: directory that holds the alignment
            files and receives the output files.
        :return: the AlignmentInfo object, with one AlignmentColumn appended
            to its alignmentColumns per pileup column.
        :raises Exception: if a read is a reference skip at a column, or an
            aligned base is not one of A, G, C, T.
        """
        print ('\nStep 2.) Parse the alignment and create a new consensus sequence.')

        # Load up the Alignment Reference file, we'll need it.
        alignmentReferenceFileName = join(alignmentOutputDirectory,'AlignmentReference.fasta')
        alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0]

        # Count the reads in the input file. Only the count is needed, so the
        # records are not kept in memory.
        totalReadCount = sum(1 for _ in parse(self.readInput, self.readInputFormat))

        # Pieces of the new consensus sequence; joined once the pileup is done.
        consensusPieces = []

        # Structured per-column statistics, returned to the caller.
        alignmentInfo = AlignmentInfo()

        # Keep a running total of adjustments made to the reference.
        # If this total is 0, then theoretically the consensus matches the alignment reference, and we're done.
        totalSequenceAdjustments = 0

        # Open the bam file
        bamfile = AlignmentFile(join(alignmentOutputDirectory,'alignment.bam'), 'rb')

        # Open alignment analysis text file
        alignmentSummaryFile = createOutputFile(join(alignmentOutputDirectory,'AlignmentSummary.csv'))

        try:
            alignmentSummaryFile.write('Ref_Position,Ref_Base,Reference_Adjustment,Aligned_Count,Unaligned_Count,Match_Count,Mismatch_Count,In_Count,Del_Count,A_Count,G_Count,C_Count,T_Count\n')

            # Iterate the reference sequence column by column.
            for pileupColumn in bamfile.pileup(alignmentRef.id):

                column = AlignmentColumn()
                column.referencePosition = pileupColumn.reference_pos
                column.referenceBase = alignmentRef[pileupColumn.reference_pos].upper()
                column.alignedCount = pileupColumn.nsegments
                column.unalignedCount = totalReadCount - column.alignedCount

                # Iterate the Reads at this position
                for pileupRead in pileupColumn.pileups:

                    # Deletions and insertions are tallied on their own. They
                    # do not take part in the A/G/C/T tally below: only reads
                    # that reach the match/mismatch comparison have a base
                    # looked up for this column.
                    if pileupRead.is_del == 1:
                        column.delCount += 1
                        continue
                    if pileupRead.indel > 0:
                        column.inCount += 1
                        continue

                    # Refskip (TODO What does this mean? no read aligned? Count these?)
                    if pileupRead.is_refskip:
                        print('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name)
                        raise Exception('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name)

                    # We have a base aligned at this position for this read.
                    currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()
                    if currentBase == column.referenceBase:
                        column.matchCount += 1
                    else:
                        column.mismatchCount += 1

                    # Count the nucleotide
                    if currentBase == 'A':
                        column.aCount += 1
                    elif currentBase == 'G':
                        column.gCount += 1
                    elif currentBase == 'C':
                        column.cCount += 1
                    elif currentBase == 'T':
                        column.tCount += 1
                    else:
                        print('Unknown Base found in Alignment at position ' + str(column.referencePosition) + ':' + currentBase)
                        raise Exception('Unknown Base in Alignment')

                    # TODO: What if the query insertion sequence is longer than one base?
                    # Maybe I can only adjust one base per iteration, is that okay?

                # Highest frequency base. max() returns the first maximum, so
                # ties resolve in the order A, G, C, T.
                mostFrequentBase, mostFrequentBaseCount = max(
                    (('A', column.aCount), ('G', column.gCount),
                     ('C', column.cCount), ('T', column.tCount)),
                    key=lambda baseAndCount: baseAndCount[1])

                # Add the next base to the new consensus sequence
                if (column.matchCount >= column.mismatchCount
                        and column.matchCount >= column.inCount
                        and column.matchCount >= column.delCount):
                    # Aligned bases match the reference, add reference base to the consensus.
                    referenceAdjustment = '-'
                    consensusPieces.append(column.referenceBase)

                elif (column.inCount >= column.mismatchCount
                        and column.inCount >= column.delCount):
                    # Aligned bases show an insertion.
                    # NOTE(review): the base appended after the reference base
                    # is the most frequent *aligned* base of this column, not
                    # a base taken from the inserted sequence -- confirm this
                    # is the intended placeholder.
                    # TODO: I need to insert multiple bases, if that is what the alignment suggests.
                    totalSequenceAdjustments += 1
                    referenceAdjustment = 'I'
                    consensusPieces.append(column.referenceBase + mostFrequentBase)

                    self.wranglerLog.write(
                        str(column.referencePosition) + ':Insertion'
                        + '\n(' + str(column.inCount) + '/' + str(column.alignedCount) + ') = '
                        + str((100.0 * column.inCount) / column.alignedCount) + '% of aligned reads'
                        + '\n(' + column.referenceBase + ' > ' + column.referenceBase + mostFrequentBase + ')'
                        + '\n')

                elif column.delCount >= column.mismatchCount:
                    # Reads show a deletion.
                    # Don't add anything to the consensus.
                    totalSequenceAdjustments += 1
                    referenceAdjustment = 'D'

                    self.wranglerLog.write(
                        str(column.referencePosition) + ':Deletion'
                        + '\n(' + str(column.delCount) + '/' + str(column.alignedCount) + ') = '
                        + str((100.0 * column.delCount) / column.alignedCount) + '% of aligned reads'
                        + '\n(' + column.referenceBase + ' > _)'
                        + '\n')

                else:
                    # Mismatch base.
                    # Add the highest read count base to the consensus.
                    # It might actually be the same base as the reference,
                    # because this just means there are more mismatches than matches.
                    # TODO: What to do with highly heterozygous Positions?
                    # I should report those that look particularly heterozygous, somewhere.
                    consensusPieces.append(mostFrequentBase)
                    totalSequenceAdjustments += 1
                    referenceAdjustment = 'M'

                    self.wranglerLog.write(
                        str(column.referencePosition) + ':Mismatch'
                        + '\n(' + str(mostFrequentBaseCount) + '/' + str(column.alignedCount) + ') = '
                        + str((100.0 * mostFrequentBaseCount) / column.alignedCount) + '% of aligned reads'
                        + '\n(' + column.referenceBase + ' > ' + mostFrequentBase + ')'
                        + '\n')

                # Write a line to the alignment Summary
                alignmentSummaryFile.write(','.join(str(value) for value in (
                    column.referencePosition,
                    column.referenceBase,
                    referenceAdjustment,
                    column.alignedCount,
                    column.unalignedCount,
                    column.matchCount,
                    column.mismatchCount,
                    column.inCount,
                    column.delCount,
                    column.aCount,
                    column.gCount,
                    column.cCount,
                    column.tCount,
                )) + '\n')

                alignmentInfo.alignmentColumns.append(column)
        finally:
            # Close the summary and the bam file even when a column raises.
            alignmentSummaryFile.close()
            bamfile.close()

        newConsensusSequence = ''.join(consensusPieces)

        print('\nTotal Sequence Adjustments:' + str(totalSequenceAdjustments) + ' (How many bases the consensus differs from the reference.)\n')

        # Write the newly constructed consensus sequence.
        currentConsensusSequenceFileName = join(alignmentOutputDirectory, 'Consensus.fasta')
        consensusWriter = createOutputFile(currentConsensusSequenceFileName)

        # TODO: How to i give this a better name? Can I find a gene guess or something?
        sequenceID = "Consensus_Sequence"

        write([SeqRecord(Seq(newConsensusSequence,
            IUPAC.unambiguous_dna),
            id=sequenceID, description="") ], consensusWriter, 'fasta')
        consensusWriter.close()

        self.wranglerLog.write('Total Sequence Adjustments:' + str(totalSequenceAdjustments) + '\n')

        return alignmentInfo
Example #4
0
    def phaseHeterozygousReads(self):
        """Split the input reads into clusters using heterozygous positions.

        Steps, all driven by the files in
        <self.outputRootDirectory>/HeterozygousAlignment:

        1. Use self.snps if it is non-empty; otherwise scan the pileup and
           store in self.snps every position where neither match, insertion,
           deletion nor mismatch accounts for more than 70% of the analyzed
           reads. The positions are written to 'HeterozygousBases.txt'.
        2. Build one distance array per read ID with one entry per position
           in self.snps: 1 for a match, -1 for a mismatch or indel, 0 where
           the read was not seen.
        3. Call self.clusterReads(distanceArrays, 2), write each cluster's
           reads and distance arrays to its 'Strand<N>ClusteredReads'
           directory, and run a new AlleleWrangler on each cluster.

        # TODO: Should this method accept a cluster count?
        # That will break some things. What things?
        # This method is only called from this file, in the summarizeAnalysis method.

        :return: dict merging the results of analyzeReads() for every
            cluster.
        """
        print('Splitting reads by heterozygous positions')

        # Get a list of reads for later.
        parsedReads = list(parse(self.readInput, self.readInputFormat))

        heterozygousConsensusDirectory = join(self.outputRootDirectory,'HeterozygousAlignment')

        # Open the bam file
        print ('opening final alignment_bamfile')
        bamfile = AlignmentFile(join(heterozygousConsensusDirectory,'alignment.bam'), 'rb')

        # Load up the Alignment Reference file, we'll need it.
        alignmentReferenceFileName = join(heterozygousConsensusDirectory,'AlignmentReference.fasta')
        alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0]

        # Sorted list of the distinct read IDs.
        print ('Making a list of Aligned Reads.')
        readIDs = sorted({read.id for read in parsedReads})

        # Heterozygous base list
        heterozygousBasesSummaryFile = createOutputFile(join(heterozygousConsensusDirectory, 'HeterozygousBases.txt'))
        heterozygousBasesSummaryFile.write('List of Heterozygous Bases (0-based):\n')

        if (self.snps is not None and len(self.snps) > 0):
            # A list of SNPs was passed in, I don't need to calculate them myself.
            # TODO: I could write alignment stats here, like I do when i self-calculate the hetero positions.
            # This is just a simple list of 0-based positions.
            for snp in self.snps:
                heterozygousBasesSummaryFile.write(str(snp) + '\n')

        else:
            # get list of Heterozygous Positions
            # TODO: I suppose I don't need to align 100% of reads to determine heterozygosity.
            # Maybe this would speed up if i use a smaller alignment, or stop the loop after X reads
            print('Getting a list of Heterozygous Positions:')
            self.snps = []

            # A cheap way to stop analysis early: a column is no longer read
            # once more than this many usable reads have been counted.
            # Potential problem: are these reads sorted somehow? Maybe my numbers are biased by only looking at the
            # first reads
            # Todo: This is another parameter that can be tuned. Add to inputs? Maybe.
            maxAnalyzedReadCounts = 1000

            # If more than this proportion of the reads agree on one outcome,
            # the position is not treated as heterozygous.
            # TODO: Should accepted match proportion be a commandline parameter?
            baseCutoff = .70

            for pileupColumn in bamfile.pileup(alignmentRef.id):
                readCount = 0
                matchCount = 0
                mismatchCount = 0
                insCount = 0
                delCount = 0

                referenceBase = alignmentRef.seq[pileupColumn.pos].upper()

                # Iterate the Reads at this position. Each read at each position is either:
                # ins, Del, match, mismatch.
                for pileupRead in pileupColumn.pileups:
                    alignedSegmentObject = pileupRead.alignment

                    # Secondary / supplementary reads are skipped here; per the
                    # original author they are FULL of snps and cause problems.
                    if alignedSegmentObject.is_secondary or alignedSegmentObject.is_supplementary:
                        continue

                    readCount += 1
                    if pileupRead.is_del == 1:
                        delCount += 1
                    elif pileupRead.indel > 0:
                        insCount += 1
                    else:
                        currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()
                        if currentBase == referenceBase:
                            matchCount += 1
                        else:
                            mismatchCount += 1

                    if readCount > maxAnalyzedReadCounts:
                        break

                if readCount == 0:
                    # Every read in this column was secondary/supplementary.
                    # There is nothing to base a call on, and the proportions
                    # below would divide by zero.
                    continue

                matchProportion = 1.0 * matchCount / readCount
                insertionProportion = 1.0 * insCount / readCount
                deletionProportion = 1.0 * delCount / readCount
                mismatchProportion = 1.0 * mismatchCount / readCount

                oneOutcomeDominates = (matchProportion > baseCutoff
                                       or insertionProportion > baseCutoff
                                       or deletionProportion > baseCutoff
                                       or mismatchProportion > baseCutoff)

                # If coverage is very low, we should not use this position.
                # NOTE(review): the original author already suspected this
                # check never fires. readCount is a subset of the reads in the
                # column, so this ratio is presumably always >= 1 -- confirm
                # and compare against the total read count instead?
                coverageTooLow = (1.0 * pileupColumn.n / readCount) < .25

                if oneOutcomeDominates or coverageTooLow:
                    continue

                # TODO: Instead of skipping hard-coded reference-specific regions (homopolymers,
                # big deletion regions), pass in a list of polymorphic positions to sort on.
                # A "whitelist" instead of a "blacklist".
                # TODO: If both alleles carry a different snp from the reference, the mismatch
                # proportion is high and the position is missed right now.

                heterozygousBasesSummaryFile.write(
                    str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n)
                    + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(insCount)
                    + '/' + str(matchCount) + '/' + str(mismatchCount)
                    + ' : ' + str(round(deletionProportion,2)) + '/'
                    + str(round(insertionProportion, 2)) + '/'
                    + str(round(matchProportion, 2)) + '/'
                    + str(round(mismatchProportion, 2))
                    + '\n')
                self.snps.append(pileupColumn.pos)

        heterozygousBasesSummaryFile.close()

        print('Calculating read distance arrays:')
        # Each heterozygous position is a "dimension"; a read's array holds one
        # value per position: 0 = not seen (default), 1 = match, -1 = mismatch
        # or indel.
        # TODO: A Bug! Initializing this list as 0s will bias the results.
        distanceArrays = {readID: [0] * len(self.snps) for readID in readIDs}

        # Position -> index into self.snps. setdefault keeps the first index
        # of a repeated position, like list.index() would.
        snpIndexByPosition = {}
        for snpIndex, snpPosition in enumerate(self.snps):
            snpIndexByPosition.setdefault(snpPosition, snpIndex)

        # pileup defaults to a maximum read depth of 8000, which left some
        # reads unanalyzed; lift the limit.
        # NOTE(review): unlike the position scan above, this pass does not
        # skip secondary/supplementary reads -- confirm that is intended.
        for pileupColumn in bamfile.pileup(alignmentRef.id,max_depth=99999999):
            currentColumn = pileupColumn.pos

            # Only do this if the column number exists in our list of heterozygous positions
            heterozygousPositionIndex = snpIndexByPosition.get(currentColumn)
            if heterozygousPositionIndex is None:
                continue

            currentAnalyzedReadCount = 0 # Only used for the progress message below.
            referenceBase = alignmentRef.seq[currentColumn].upper()

            for pileupRead in pileupColumn.pileups:
                currentAnalyzedReadCount += 1
                readID = pileupRead.alignment.query_name

                # In this model the distance is either 1 (match) or -1 (anything else).
                # TODO: if we're having problems splitting based on homopolymers check this spot.
                # Maybe, I want to count indels as 0, no distance.
                if pileupRead.is_del == 1 or pileupRead.indel > 0:
                    distance = -1
                else:
                    currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()
                    distance = 1 if currentBase == referenceBase else -1

                distanceArrays[readID][heterozygousPositionIndex] = distance

            print('At position ' + str(heterozygousPositionIndex + 1) + ' I analyzed ' + str(currentAnalyzedReadCount) + ' reads.')

        # Both pileup passes are done.
        bamfile.close()

        self.printDistanceArrays(distanceArrays, join(self.heterozygousDirectory, 'DistanceArrays.csv'))

        # TODO: The cluster count of 2 is hard-coded. I need to make a parameter for cluster count.
        readClusters = self.clusterReads(distanceArrays, 2)

        # Dictionary of results to return. Key is location of the consensus sequence.
        # Value is the # of reads represented in this consensus alignment.
        coverageResults = {}

        for zeroBasedClusterIndex, readCluster in enumerate(readClusters):
            # I want to call the Strand (1 and 2), not Strand (0 and 1).
            clusterIndex = zeroBasedClusterIndex + 1

            clusterReadIDs = set(readCluster.keys())

            clusterOutputDir = join(self.outputRootDirectory, 'Strand' + str(clusterIndex) + 'ClusteredReads')

            distanceArrayFileName = join(clusterOutputDir, 'Strand' + str(clusterIndex) + 'DistanceArrays.csv')
            self.printDistanceArrays(readCluster, distanceArrayFileName)

            readOutputFileName = join(clusterOutputDir, 'Strand' + str(clusterIndex) + 'Reads.' + self.readInputFormat)
            readOutputFile = createOutputFile(readOutputFileName)

            # Loop parsed reads, grab reads belonging to this cluster.
            # FYI it looks like each input is clustered in the output, i haven't found a missing read yet. I should still check.
            for readObject in parsedReads:
                if readObject.id in clusterReadIDs:
                    write([readObject], readOutputFile, self.readInputFormat)

            readOutputFile.close()

            currentWranglerObject = AlleleWrangler(
                readOutputFileName
                , join(self.outputRootDirectory, 'Strand' + str(clusterIndex) + 'Alignment')
                , join(self.heterozygousDirectory, 'AlignmentReference.fasta')
                , 6
                , self.numberThreads
                , False
                , self.snps)
            currentCoverageResults = currentWranglerObject.analyzeReads()

            # Merge the dictionaries of coverage values and return them.
            for key in currentCoverageResults.keys():
                coverageResults[key] = currentCoverageResults[key]

        print ('Done Phasing Reads.')
        return coverageResults