def stepwise_pileup_generator(index_positions: Iterator[IndexPosition],
                              alignment_file: pysam.AlignmentFile,
                              ) -> Iterator[MotifPileup]:
    """Yield one MotifPileup per index position, stepping through the pileup.

    The pileup is opened once, on the chromosome of the first index
    position and starting at that position's ``start``. The index positions
    and the pileup columns are then advanced in lock-step:

    - index position before the current pileup column (or the pileup is
      exhausted): a MotifPileup with ``reads=[]`` is yielded,
    - index position equal to the current pileup column: a MotifPileup with
      the result of ``pileups(column, idx.watson_base)`` is yielded,
    - index position after the current pileup column: the pileup is advanced.

    NOTE(review): this only works if ``index_positions`` is sorted by
    ``start`` and all positions lie on the chromosome of the first one -
    confirm against the callers.

    Args:
        index_positions: iterable of index positions; ``chrom``, ``start``
            and ``watson_base`` are the attributes read here.
        alignment_file: object providing a pysam-style ``pileup`` method.

    Yields:
        Exactly one MotifPileup per index position, in input order. Nothing
        is yielded when ``index_positions`` is empty.
    """
    # pylint: disable=stop-iteration-return
    # (the next() on the padded pileup iterator below cannot raise)

    idx_pos_iterable = iter(index_positions)
    try:
        curr_idx = next(idx_pos_iterable)
    except StopIteration:
        # Empty input. Letting StopIteration escape a generator body is
        # converted into RuntimeError by PEP 479, so end cleanly instead.
        return

    pileup_columns = alignment_file.pileup(reference=curr_idx.chrom,
                                           start=curr_idx.start,
                                           end=None,
                                           truncate=True)

    # Functional form required
    # https://github.com/python/mypy/issues/4349
    # noinspection PyPep8Naming
    EmptyPileupColumn = NamedTuple('EmptyPileupColumn',
                                   [('reference_pos', int)])

    # Pad the pileup with an endless run of sentinel columns (position -1)
    # so that exhaustion of the real pileup shows up as a position value
    # instead of as a StopIteration.
    pileup_columns_iterator = chain(pileup_columns,
                                    repeat(EmptyPileupColumn(-1)))
    curr_pileup_column = next(pileup_columns_iterator)
    curr_pileup_pos = curr_pileup_column.reference_pos

    try:
        while True:
            if curr_pileup_pos == -1:
                # Pileup exhausted: every remaining index position is
                # uncovered.
                yield MotifPileup(reads=[], idx_pos=curr_idx)
                for curr_idx in idx_pos_iterable:
                    yield MotifPileup(reads=[], idx_pos=curr_idx)
                return

            if curr_idx.start > curr_pileup_pos:
                # Pileup is behind the index position: catch up.
                curr_pileup_column = next(pileup_columns_iterator)
                curr_pileup_pos = curr_pileup_column.reference_pos
            elif curr_idx.start < curr_pileup_pos:
                # No pileup column at this index position.
                yield MotifPileup(reads=[], idx_pos=curr_idx)
                curr_idx = next(idx_pos_iterable)
            else:
                # Pileup column found for this index position.
                pileup_reads = pileups(curr_pileup_column,
                                       curr_idx.watson_base)
                yield MotifPileup(reads=pileup_reads, idx_pos=curr_idx)
                curr_idx = next(idx_pos_iterable)
                curr_pileup_column = next(pileup_columns_iterator)
                curr_pileup_pos = curr_pileup_column.reference_pos
    except StopIteration:
        # The index positions ran out: the generator is finished.
        return
def data_generator_pysam(my_args, name, start, stop, is_bulk):
    """Yield pileup-style rows for the region ``name:start-stop``.

    For every pileup column whose reference position lies in
    ``[start, stop)`` a list
    ``[name, pos, ref_base, str(n), read_bases, base_q, map_q]`` is yielded,
    where ``n`` is ``get_num_aligned()`` of the column and the three string
    fields are ``'*'`` when ``n`` is 0. After the last column a single
    ``None`` is yielded as an end marker.

    Both input files are closed as soon as the region has been processed,
    and also when the consumer abandons the generator early.

    Args:
        my_args: object with the attributes ``fasta`` (reference FASTA
            path), ``bulk`` and ``bam`` (alignment file paths).
        name: contig name.
        start: first reference position of the region.
        stop: end of the region (exclusive).
        is_bulk: if true, read ``my_args.bulk`` with a minimum mapping
            quality of 0; otherwise read ``my_args.bam`` with a minimum
            mapping quality of 20.
    """
    fasta_file = FastaFile(my_args.fasta)
    bam_file = None
    try:
        ref = fasta_file.fetch(name, start, stop)
        pileup_kwargs = {
            'fastafile': fasta_file,
            'stepper': 'samtools',
            'adjust_capq_threshold': 50,
            'contig': name,
            'start': start,
            'stop': stop,
            'min_mapping_quality': 0 if is_bulk else 20,
            'min_base_quality': 13,
        }
        bam_path = my_args.bulk if is_bulk else my_args.bam
        bam_file = AlignmentFile(bam_path, 'rb')

        for pileup_column in bam_file.pileup(**pileup_kwargs):
            pos = pileup_column.reference_pos
            # The pileup is not truncated to the region, so columns outside
            # [start, stop) have to be filtered out here.
            if pos >= stop:
                break
            if pos < start:
                continue

            read_bases_list = pileup_column.get_query_sequences(
                mark_matches=True, mark_ends=True, add_indels=True)
            read_bases = ''.join(read_bases_list).upper()
            n = pileup_column.get_num_aligned()
            if n == 0:
                read_bases = '*'
                base_q = '*'
                map_q = '*'
            else:
                # PHREDSCORE is presumably the ASCII offset of the quality
                # encoding - confirm at its definition.
                base_q = ''.join(
                    chr(int(q) + PHREDSCORE)
                    for q in pileup_column.get_query_qualities())
                map_q = ''.join(
                    chr(int(q) + PHREDSCORE)
                    for q in pileup_column.get_mapping_qualities())

            yield [name, pos, ref[pos - start], str(n),
                   read_bases, base_q, map_q]
    finally:
        # The yielded rows hold plain strings only, so the files are no
        # longer needed once the loop is left.
        if bam_file is not None:
            bam_file.close()
        fasta_file.close()

    yield None
def analyzeAlignment(self, alignmentOutputDirectory):
    """Parse alignment.bam column by column and write a new consensus.

    Reads 'AlignmentReference.fasta' and 'alignment.bam' from
    alignmentOutputDirectory and writes, into the same directory,
    'AlignmentSummary.csv' (one row of counts per pileup column) and
    'Consensus.fasta'. Every column where the consensus departs from the
    reference is also described in self.wranglerLog.

    Per column the decision is, in this order (ties go to the earlier rule):
      '-'  matches >= mismatches, insertions and deletions: keep reference base
      'I'  insertions >= mismatches and deletions: reference base followed by
           the most frequent aligned base
      'D'  deletions >= mismatches: nothing is added
      'M'  otherwise: the most frequent aligned base

    Reads self.readInput, self.readInputFormat and self.wranglerLog.

    :param alignmentOutputDirectory: directory holding the alignment inputs;
        the output files are written there as well.
    :return: the AlignmentInfo object; one AlignmentColumn per pileup column
        is appended to its alignmentColumns list.
    :raises Exception: if a read is a reference skip at some column, or if an
        aligned base is not one of A, G, C, T.
    """
    print ('\nStep 2.) Parse the alignment and create a new consensus sequence.')

    # Load up the Alignment Reference file, we'll need it.
    alignmentReferenceFileName = join(alignmentOutputDirectory, 'AlignmentReference.fasta')
    alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0]

    # Count the reads in the input file without keeping them all in memory.
    totalReadCount = sum(1 for _ in parse(self.readInput, self.readInputFormat))

    # Pieces of the new consensus sequence; joined once at the end.
    consensusPieces = []

    alignmentInfo = AlignmentInfo()

    # Keep a running total of adjustments made to the reference.
    # If this total is 0, then theoretically the consensus matches the
    # alignment reference, and we're done.
    totalSequenceAdjustments = 0

    def tallyColumn(pileupColumn):
        # Count, for one pileup column, how its reads relate to the reference.
        # Assumes a fresh AlignmentColumn starts with all counts at 0 - confirm
        # in the AlignmentColumn class.
        column = AlignmentColumn()
        column.referencePosition = pileupColumn.reference_pos
        column.referenceBase = alignmentRef[pileupColumn.reference_pos].upper()
        column.alignedCount = pileupColumn.nsegments
        column.unalignedCount = totalReadCount - column.alignedCount

        for pileupRead in pileupColumn.pileups:
            if pileupRead.is_del == 1:
                column.delCount += 1
            elif pileupRead.indel > 0:
                # An insertion follows this position; the read's base at the
                # position itself is not counted.
                column.inCount += 1
            elif pileupRead.is_refskip:
                # TODO: What does a refskip mean here? Count these?
                print('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name)
                raise Exception('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name)
            else:
                # A base of this read is aligned at this position.
                currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()

                if currentBase == column.referenceBase:
                    column.matchCount += 1
                else:
                    column.mismatchCount += 1

                if currentBase == 'A':
                    column.aCount += 1
                elif currentBase == 'G':
                    column.gCount += 1
                elif currentBase == 'C':
                    column.cCount += 1
                elif currentBase == 'T':
                    column.tCount += 1
                else:
                    print('Unknown Base found in Alignment at position ' + str(column.referencePosition) + ':' + currentBase)
                    raise Exception('Unknown Base in Alignment')
        return column

    def logAdjustment(column, label, supportingCount, change):
        # Describe one consensus adjustment in the wrangler log.
        self.wranglerLog.write(str(column.referencePosition) + ':' + label
            + '\n(' + str(supportingCount) + '/' + str(column.alignedCount) + ') = '
            + str((100.0 * supportingCount) / column.alignedCount) + '% of aligned reads'
            + '\n(' + change + ')' + '\n')

    bamfile = AlignmentFile(join(alignmentOutputDirectory, 'alignment.bam'), 'rb')
    try:
        alignmentSummaryFile = createOutputFile(join(alignmentOutputDirectory, 'AlignmentSummary.csv'))
        try:
            alignmentSummaryFile.write('Ref_Position,Ref_Base,Reference_Adjustment,Aligned_Count,Unaligned_Count,Match_Count,Mismatch_Count,In_Count,Del_Count,A_Count,G_Count,C_Count,T_Count\n')

            # Iterate the reference sequence column by column.
            # NOTE(review): pileup() is called with its default maximum read
            # depth here; phaseHeterozygousReads raises it explicitly - check
            # whether the same is wanted for deep alignments.
            for pileupColumn in bamfile.pileup(alignmentRef.id):
                currentAlignmentColumn = tallyColumn(pileupColumn)
                referenceBase = currentAlignmentColumn.referenceBase
                matchCount = currentAlignmentColumn.matchCount
                mismatchCount = currentAlignmentColumn.mismatchCount
                inCount = currentAlignmentColumn.inCount
                delCount = currentAlignmentColumn.delCount

                # Highest frequency base. max() returns the first maximal
                # entry, so ties are resolved in the order A, G, C, T.
                mostFrequentBase, mostFrequentBaseCount = max(
                    (('A', currentAlignmentColumn.aCount),
                     ('G', currentAlignmentColumn.gCount),
                     ('C', currentAlignmentColumn.cCount),
                     ('T', currentAlignmentColumn.tCount)),
                    key=lambda baseAndCount: baseAndCount[1])

                if matchCount >= mismatchCount and matchCount >= inCount and matchCount >= delCount:
                    # Aligned bases match the reference.
                    referenceAdjustment = '-'
                    consensusPieces.append(referenceBase)
                elif inCount >= mismatchCount and inCount >= delCount:
                    # Aligned bases show an insertion: add the reference base
                    # and the insertion base.
                    # TODO: insert multiple bases, if that is what the
                    # alignment suggests.
                    totalSequenceAdjustments += 1
                    referenceAdjustment = 'I'
                    consensusPieces.append(referenceBase + mostFrequentBase)
                    logAdjustment(currentAlignmentColumn, 'Insertion', inCount,
                        referenceBase + ' > ' + referenceBase + mostFrequentBase)
                elif delCount >= mismatchCount:
                    # Reads show a deletion: add nothing to the consensus.
                    totalSequenceAdjustments += 1
                    referenceAdjustment = 'D'
                    logAdjustment(currentAlignmentColumn, 'Deletion', delCount,
                        referenceBase + ' > _')
                else:
                    # Mismatch: add the most frequent base. It may equal the
                    # reference base, because this branch only means that
                    # mismatches outnumber matches.
                    # TODO: What to do with highly heterozygous positions?
                    consensusPieces.append(mostFrequentBase)
                    totalSequenceAdjustments += 1
                    referenceAdjustment = 'M'
                    logAdjustment(currentAlignmentColumn, 'Mismatch', mostFrequentBaseCount,
                        referenceBase + ' > ' + mostFrequentBase)

                # Write a line to the alignment summary.
                summaryFields = (
                    currentAlignmentColumn.referencePosition,
                    referenceBase,
                    referenceAdjustment,
                    currentAlignmentColumn.alignedCount,
                    currentAlignmentColumn.unalignedCount,
                    matchCount,
                    mismatchCount,
                    inCount,
                    delCount,
                    currentAlignmentColumn.aCount,
                    currentAlignmentColumn.gCount,
                    currentAlignmentColumn.cCount,
                    currentAlignmentColumn.tCount,
                )
                alignmentSummaryFile.write(','.join(str(field) for field in summaryFields) + '\n')

                alignmentInfo.alignmentColumns.append(currentAlignmentColumn)
        finally:
            alignmentSummaryFile.close()
    finally:
        bamfile.close()

    print('\nTotal Sequence Adjustments:' + str(totalSequenceAdjustments) + ' (How many bases the consensus differs from the reference.)\n')

    # Write the newly constructed consensus sequence.
    newConsensusSequence = ''.join(consensusPieces)
    currentConsensusSequenceFileName = join(alignmentOutputDirectory, 'Consensus.fasta')
    consensusWriter = createOutputFile(currentConsensusSequenceFileName)
    try:
        # TODO: How to give this a better name? A gene guess or something?
        sequenceID = "Consensus_Sequence"
        write([SeqRecord(Seq(newConsensusSequence, IUPAC.unambiguous_dna), id=sequenceID, description="")], consensusWriter, 'fasta')
    finally:
        consensusWriter.close()

    self.wranglerLog.write('Total Sequence Adjustments:' + str(totalSequenceAdjustments) + '\n')

    return alignmentInfo
def phaseHeterozygousReads(self):
    """Split the input reads into two clusters by heterozygous positions.

    Steps:
      1. Use self.snps as the list of (0-based) heterozygous positions if it
         is non-empty; otherwise derive the list from
         'HeterozygousAlignment/alignment.bam' and store it in self.snps.
         The positions are written to 'HeterozygousBases.txt'.
      2. Build one vector per read ID with one entry per position:
         1 = matches the reference, -1 = mismatch / insertion / deletion,
         0 = the read was not seen at that position.
      3. Cluster the vectors with self.clusterReads(distanceArrays, 2), write
         the reads of every cluster to a 'Strand<N>ClusteredReads' directory
         and run a new AlleleWrangler on each of those read files.

    Reads self.readInput, self.readInputFormat, self.outputRootDirectory,
    self.heterozygousDirectory, self.numberThreads and self.snps.

    :return: dictionary merged from the analyzeReads() results of the
        per-cluster AlleleWrangler objects.
    """
    # TODO: Should this method accept a cluster count?
    print('Splitting reads by heterozygous positions')

    # Keep the parsed reads; they are written out again per cluster below.
    parsedReads = list(parse(self.readInput, self.readInputFormat))

    heterozygousConsensusDirectory = join(self.outputRootDirectory, 'HeterozygousAlignment')

    def classifyPileupRead(pileupRead, referenceBase):
        # Relate one read to the reference at the current column:
        # 'deletion', 'insertion', 'match' or 'mismatch'.
        if pileupRead.is_del == 1:
            return 'deletion'
        if pileupRead.indel > 0:
            return 'insertion'
        currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()
        if currentBase == referenceBase:
            return 'match'
        return 'mismatch'

    # Open the bam file
    print ('opening final alignment_bamfile')
    bamfile = AlignmentFile(join(heterozygousConsensusDirectory, 'alignment.bam'), 'rb')
    try:
        # Load up the Alignment Reference file, we'll need it.
        alignmentReferenceFileName = join(heterozygousConsensusDirectory, 'AlignmentReference.fasta')
        alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0]

        # Sorted list of the distinct read IDs.
        print ('Making a list of Aligned Reads.')
        readIDs = sorted(set(read.id for read in parsedReads))

        heterozygousBasesSummaryFile = createOutputFile(join(heterozygousConsensusDirectory, 'HeterozygousBases.txt'))
        try:
            heterozygousBasesSummaryFile.write('List of Heterozygous Bases (0-based):\n')

            if self.snps is not None and len(self.snps) > 0:
                # Positions were passed in, no need to calculate them.
                # This is just a simple list of 0-based positions.
                for snp in self.snps:
                    heterozygousBasesSummaryFile.write(str(snp) + '\n')
            else:
                print('Getting a list of Heterozygous Positions:')
                self.snps = []

                # Cheap way to stop analysis early: stop counting a column
                # once more than this many reads have been counted.
                # Potential problem: are the reads sorted somehow?
                # TODO: This could become an input parameter.
                maxAnalyzedReadCounts = 1000
                # A column where one of match / insertion / deletion /
                # mismatch exceeds this proportion is not heterozygous.
                # TODO: Should this be a commandline parameter?
                baseCutoff = .70

                for pileupColumn in bamfile.pileup(alignmentRef.id):
                    position = pileupColumn.reference_pos
                    referenceBase = alignmentRef.seq[position].upper()

                    categoryCounts = {'deletion': 0, 'insertion': 0, 'match': 0, 'mismatch': 0}
                    readCount = 0
                    for pileupRead in pileupColumn.pileups:
                        alignedSegmentObject = pileupRead.alignment
                        # Secondary / supplementary alignments are full of
                        # SNPs; leave them out of the counts.
                        if alignedSegmentObject.is_secondary or alignedSegmentObject.is_supplementary:
                            continue
                        readCount += 1
                        categoryCounts[classifyPileupRead(pileupRead, referenceBase)] += 1
                        if readCount > maxAnalyzedReadCounts:
                            break

                    if readCount == 0:
                        # Only secondary / supplementary alignments cover this
                        # column. There is nothing to base a call on, and the
                        # proportions below would divide by zero.
                        continue

                    delCount = categoryCounts['deletion']
                    insCount = categoryCounts['insertion']
                    matchCount = categoryCounts['match']
                    mismatchCount = categoryCounts['mismatch']

                    matchProportion = (1.0 * matchCount / readCount)
                    insertionProportion = (1.0 * insCount / readCount)
                    deletionProportion = (1.0 * delCount / readCount)
                    mismatchProportion = (1.0 * mismatchCount / readCount)

                    if (matchProportion > baseCutoff
                            or insertionProportion > baseCutoff
                            or deletionProportion > baseCutoff):
                        continue
                    # Meant to skip columns with very low coverage.
                    # NOTE(review): readCount is counted from the reads of
                    # this same column, so this ratio is probably never
                    # below .25 - kept unchanged, needs a real coverage test.
                    if (1.0 * pileupColumn.nsegments / readCount) < .25:
                        continue
                    if mismatchProportion > baseCutoff:
                        # TODO: two alleles that both differ from the
                        # reference at this position are missed here.
                        continue

                    heterozygousBasesSummaryFile.write(str(position) + ', Coverage ' + str(pileupColumn.nsegments)
                        + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(insCount)
                        + '/' + str(matchCount) + '/' + str(mismatchCount)
                        + ' : ' + str(round(deletionProportion, 2)) + '/' + str(round(insertionProportion, 2))
                        + '/' + str(round(matchProportion, 2)) + '/' + str(round(mismatchProportion, 2)) + '\n')
                    self.snps.append(position)
        finally:
            heterozygousBasesSummaryFile.close()

        print('Calculating read distance arrays:')
        # One vector per read, one dimension per heterozygous position.
        # 0 is the default (read not seen at the position), 1 is a match,
        # -1 is an indel or a mismatch.
        # TODO: Initializing with 0s may bias the results.
        distanceArrays = {readID: [0] * len(self.snps) for readID in readIDs}

        # Position -> index into the vectors. setdefault keeps the first
        # index if a position occurs more than once in self.snps.
        snpIndexByPosition = {}
        for snpIndex, snpPosition in enumerate(self.snps):
            snpIndexByPosition.setdefault(snpPosition, snpIndex)

        # pileup() limits the read depth by default, which silently drops
        # reads; raise the limit so that every read is visited.
        for pileupColumn in bamfile.pileup(alignmentRef.id, max_depth=99999999):
            currentColumn = pileupColumn.reference_pos
            heterozygousPositionIndex = snpIndexByPosition.get(currentColumn)
            if heterozygousPositionIndex is None:
                # Not a heterozygous position.
                continue

            referenceBase = alignmentRef.seq[currentColumn].upper()
            currentAnalyzedReadCount = 0
            for pileupRead in pileupColumn.pileups:
                currentAnalyzedReadCount += 1
                readID = pileupRead.alignment.query_name
                if classifyPileupRead(pileupRead, referenceBase) == 'match':
                    distanceArrays[readID][heterozygousPositionIndex] = 1
                else:
                    distanceArrays[readID][heterozygousPositionIndex] = -1

            print('At position ' + str(heterozygousPositionIndex + 1) + ' I analyzed ' + str(currentAnalyzedReadCount) + ' reads.')
    finally:
        bamfile.close()

    self.printDistanceArrays(distanceArrays, join(self.heterozygousDirectory, 'DistanceArrays.csv'))

    # TODO: The cluster count should become a parameter.
    readClusters = self.clusterReads(distanceArrays, 2)

    # Dictionary of results to return. Key is location of the consensus
    # sequence, value is the # of reads represented in that alignment.
    coverageResults = {}

    for zeroBasedClusterIndex, readCluster in enumerate(readClusters):
        # Call them Strand 1 and 2, not Strand 0 and 1.
        clusterIndex = zeroBasedClusterIndex + 1
        strandPrefix = 'Strand' + str(clusterIndex)

        clusterOutputDir = join(self.outputRootDirectory, strandPrefix + 'ClusteredReads')

        distanceArrayFileName = join(clusterOutputDir, strandPrefix + 'DistanceArrays.csv')
        self.printDistanceArrays(readCluster, distanceArrayFileName)

        readOutputFileName = join(clusterOutputDir, strandPrefix + 'Reads.' + self.readInputFormat)
        readOutputFile = createOutputFile(readOutputFileName)
        try:
            # Write the parsed reads that belong to this cluster, in input order.
            idsInCluster = set(readCluster.keys())
            for readObject in parsedReads:
                if readObject.id in idsInCluster:
                    write([readObject], readOutputFile, self.readInputFormat)
        finally:
            readOutputFile.close()

        currentWranglerObject = AlleleWrangler(
            readOutputFileName
            , join(self.outputRootDirectory, strandPrefix + 'Alignment')
            , join(self.heterozygousDirectory, 'AlignmentReference.fasta')
            , 6
            , self.numberThreads
            , False
            , self.snps)
        currentCoverageResults = currentWranglerObject.analyzeReads()

        # Merge the dictionaries of coverage values and return them.
        coverageResults.update(currentCoverageResults)

    print ('Done Phasing Reads.')
    return coverageResults