def _interleaveUnalignedGenes(firstGenes, secondGenes):
    """Drain two gene lists from the rear into a single merged list.

    Genes are popped alternately from the end of ``firstGenes`` and then
    ``secondGenes`` (both lists are emptied in place) and pushed onto the
    front of the result. '-' gap placeholders are dropped.
    """
    merged = []
    while firstGenes or secondGenes:
        for genes in (firstGenes, secondGenes):
            if genes:
                gene = genes.pop()
                if gene != '-':
                    merged.insert(0, gene)
    return merged


def determineAncestor(op1, op2, startPosition, endPosition, aligned1, aligned2,
                      sequence1, sequence2, opIndex1, opIndex2, event, strain1, strain2):
    """Build the ancestral operon for a pair of locally aligned operons.

    The ancestor starts as a copy of ``event.operon1Alignment``. Gaps recorded
    on the event that are not duplicates are inserted into it, and the genes
    left outside the aligned region of both operons are attached in front of
    and behind it.

    Parameters:
        op1, op2: the two operons (gene sequences) being compared.
        startPosition, endPosition: pairs of 1-based alignment bounds; index 0
            belongs to op1 and index 1 to op2.
        aligned1, aligned2: aligned sections, only printed here.
        sequence1, sequence2: forwarded to findUniqueGenes for each operon.
        opIndex1, opIndex2: forwarded to findUniqueGenes for each operon.
        event: alignment event providing operon1Alignment/operon2Alignment,
            operon1Gaps/operon2Gaps and operon1GapIndexes/operon2GapIndexes;
            receives the resulting ancestral operon.
        strain1, strain2: strains that accumulate duplication/deletion events.

    Returns:
        (ancestralOperon, strain1, strain2)
    """
    ancestralOperon = copy.deepcopy(event.operon1Alignment)
    updatedUnaligned = []

    print("Aligned Sections")
    print(aligned1)
    print(aligned2)

    #Collect the unaligned sections of operon 1 first, then operon 2
    operonInputs = ((op1, sequence1, opIndex1), (op2, sequence2, opIndex2))
    for which, (op, sequence, opIndex) in enumerate(operonInputs):
        alignedRange = (startPosition[which] - 1, endPosition[which] - 1)
        unaligned = getUnaligned(op, alignedRange[0], alignedRange[1])
        print("Unaligned sections")
        print(unaligned)
        for geneList in unaligned:
            if geneList:
                #Only the filtered gene list is needed at this stage
                _, _, _, updatedGeneList = findUniqueGenes(geneList, sequence, opIndex, alignedRange)
                updatedUnaligned.append(updatedGeneList)
            else:
                updatedUnaligned.append([])
    print(updatedUnaligned)

    #NOTE(review): this assumes getUnaligned yields exactly two sections per operon
    #(presumably before/after the aligned region), giving entries 0-3 - confirm
    unaligned1 = _interleaveUnalignedGenes(updatedUnaligned[0], updatedUnaligned[2])
    unaligned2 = _interleaveUnalignedGenes(updatedUnaligned[1], updatedUnaligned[3])

    print('Differences detected between these two operons!')
    operon1Gaps = event.operon1Gaps
    operon2Gaps = event.operon2Gaps
    operon1GapIndexes = event.operon1GapIndexes
    operon2GapIndexes = event.operon2GapIndexes

    print(operon1Gaps)
    print(operon1GapIndexes)
    print(operon2Gaps)
    print(operon2GapIndexes)

    print('These are the extra genes for operon 1: %s' % (operon1Gaps))
    print('These are the indexes for extra genes in operon 1: %s' % (operon1GapIndexes))
    print('These are the extra genes for operon 2: %s' % (operon2Gaps))
    print('These are the indexes for extra genes in operon 2: %s' % (operon2GapIndexes))

    #Checks if these extra genes are duplicates by checking if they exist within the alignment and removes them if they do
    operon1Gaps, duplicateSizesWithinAlignment1 = checkForMatchesInAlignment(operon1Gaps, event.operon1Alignment)
    operon2Gaps, duplicateSizesWithinAlignment2 = checkForMatchesInAlignment(operon2Gaps, event.operon2Alignment)

    #NOTE(review): argument order here is (sizes, strain); other callers in this file
    #pass (strain, sizes, details) - confirm which helper signature this version targets
    strain1 = addDuplicationEventsToStrain(duplicateSizesWithinAlignment1, strain1)
    strain2 = addDuplicationEventsToStrain(duplicateSizesWithinAlignment2, strain2)

    i = len(operon1Gaps)
    j = len(operon2Gaps)

    while i > 0 or j > 0:
        #Process the gap with the biggest index first: inserting rear to front keeps the
        #indexes of the remaining gaps valid. Operon 1 is used when operon 2 has no gaps
        #left or when the indexes tie.
        useOperon1 = i > 0 and (j <= 0 or operon1GapIndexes[i - 1] >= operon2GapIndexes[j - 1])
        if useOperon1:
            slot = i - 1
            gap = operon1Gaps[slot]
            gapIndexes = operon1GapIndexes
            _, deletionSizes, duplicationSizes, _ = findUniqueGenes(gap, sequence1, opIndex1)
            strain1 = addDuplicationEventsToStrain(duplicationSizes, strain1)
            #A gene missing from the other operon counts as a deletion in the other strain
            strain2 = addDeletionEventsToStrain(deletionSizes, strain2)
            i -= 1
        else:
            slot = j - 1
            gap = operon2Gaps[slot]
            gapIndexes = operon2GapIndexes
            _, deletionSizes, duplicationSizes, _ = findUniqueGenes(gap, sequence2, opIndex2)
            strain2 = addDuplicationEventsToStrain(duplicationSizes, strain2)
            strain1 = addDeletionEventsToStrain(deletionSizes, strain1)
            j -= 1

        if len(gap) > 0:
            #Reversing and inserting at a fixed index restores the original gene order
            gap.reverse()
            for gene in gap:
                ancestralOperon.insert(gapIndexes[slot], gene)

    #Set ancestral operon
    ancestralOperon = unaligned1 + ancestralOperon + unaligned2
    event.setAncestralOperonGeneSequence(ancestralOperon)

    return ancestralOperon, strain1, strain2
def _insertGapAndRecordDeletion(ancestralOperon, gap, gapPositions, insertIndex, fragmentDetails, otherStrain):
    """Insert a non-empty gap into the ancestor and log it as a deletion in the other strain.

    ``gap`` is reversed in place and each gene is inserted at ``insertIndex``,
    which restores the original gene order inside ``ancestralOperon``. A
    details string of the form 'gene position, gene position;' is built and
    handed to addDeletionEventsToStrain together with the gap size.

    Returns the strain returned by addDeletionEventsToStrain.
    """
    gap.reverse()
    entries = []
    for k, gene in enumerate(gap):
        ancestralOperon.insert(insertIndex, gene)
        #Compute the gene position based on whether the operon was originally in the negative orientation
        #NOTE(review): the gap has been reversed but gapPositions has not, so gene k is paired with
        #position k of the unreversed list - confirm this pairing is intended
        if fragmentDetails.isNegativeOrientation == False:
            genePos = gapPositions[k] + fragmentDetails.startPositionInGenome
        else:
            genePos = fragmentDetails.startPositionInGenome + len(fragmentDetails.sequence) - gapPositions[k] - 1
        entries.append(gene + ' ' + str(genePos))
    deletionDetails = ', '.join(entries) + ';' #';' marks the end of the deleted segment
    deletionSizes = [len(gap)] #Size of segment
    return addDeletionEventsToStrain(otherStrain, deletionSizes, deletionDetails)


def reconstructOperonSequence(event, strain1, strain2):
    """Reconstruct the ancestral operon for a globally aligned pair of operons.

    The ancestor starts as a copy of ``event.operon1Alignment``. When the
    event score is 0 that copy is stored as-is. Otherwise every gap (extra
    genes in one operon) is checked for being a duplicate, first within the
    alignment and then within the strain's other fragments; whatever remains
    is inserted into the ancestor and recorded as a deletion in the opposite
    strain.

    Parameters:
        event: alignment event providing the alignments, gaps, gap indexes,
            gap positions and fragment details; receives the ancestral operon.
        strain1, strain2: strains that accumulate duplication/deletion events.

    Returns:
        (event, strain1, strain2)
    """
    ancestralOperon = copy.deepcopy(event.operon1Alignment) #The alignments will be identical except when codon mismatches or substitutions occur

    if event.score == 0:
        print('No differences detected between these two operons')
        event.setAncestralOperonGeneSequence(ancestralOperon)
        return event, strain1, strain2

    print('Differences detected between these two operons!')
    operon1Gaps = event.operon1Gaps
    operon2Gaps = event.operon2Gaps
    operon1GapIndexes = event.operon1GapIndexes
    operon2GapIndexes = event.operon2GapIndexes
    operon1GapPositions = event.operon1GapPositions
    operon2GapPositions = event.operon2GapPositions

    print('These are the extra genes for operon 1: %s' % (operon1Gaps))
    print('These are the indexes for extra genes in operon 1: %s' % (operon1GapIndexes))
    print('These are the positions of the extra genes in operon 1: %s' % (operon1GapPositions))
    print('These are the extra genes for operon 2: %s' % (operon2Gaps))
    print('These are the indexes for extra genes in operon 2: %s' % (operon2GapIndexes))
    print('These are the positions of the extra genes in operon 2: %s' % (operon2GapPositions))

    #Step 1: Check if these extra genes are the result of a duplicate event within the alignment, remove them if they are
    operon1Gaps, operon1GapPositions, duplicateSizesWithinAlignment1, duplicationDetails1 = checkForMatchesWithinAlignment(
        operon1Gaps, event.operon1Alignment, operon1GapPositions, event.fragmentDetails1)
    operon2Gaps, operon2GapPositions, duplicateSizesWithinAlignment2, duplicationDetails2 = checkForMatchesWithinAlignment(
        operon2Gaps, event.operon2Alignment, operon2GapPositions, event.fragmentDetails2)

    #Add the details to the respective strain
    strain1 = addDuplicationEventsToStrain(strain1, duplicateSizesWithinAlignment1, duplicationDetails1)
    strain2 = addDuplicationEventsToStrain(strain2, duplicateSizesWithinAlignment2, duplicationDetails2)

    #Step 2: Check if these extra genes are the result of a duplication event within another operon,
    #remove them if they are, else insert them
    i = len(operon1Gaps)
    j = len(operon2Gaps)

    while i > 0 or j > 0:
        #Process the gap with the biggest index first: inserting rear to front keeps the
        #indexes of the remaining gaps valid. Operon 1 is used when operon 2 has no gaps
        #left or when the indexes tie.
        useOperon1 = i > 0 and (j <= 0 or operon1GapIndexes[i - 1] >= operon2GapIndexes[j - 1])
        if useOperon1:
            duplicationSizes, duplicationDetails, operon1Gaps[i - 1], operon1GapPositions[i - 1] = checkForMatchesWithinOperons(
                strain1.genomeFragments, event.fragmentDetails1, operon1Gaps[i - 1], operon1GapPositions[i - 1])
            strain1 = addDuplicationEventsToStrain(strain1, duplicationSizes, duplicationDetails) #Adds duplication details to strain

            if len(operon1Gaps[i - 1]) > 0:
                #Genes still present here were lost in the other strain
                strain2 = _insertGapAndRecordDeletion(
                    ancestralOperon, operon1Gaps[i - 1], operon1GapPositions[i - 1],
                    operon1GapIndexes[i - 1], event.fragmentDetails1, strain2)
            i -= 1
        else:
            duplicationSizes, duplicationDetails, operon2Gaps[j - 1], operon2GapPositions[j - 1] = checkForMatchesWithinOperons(
                strain2.genomeFragments, event.fragmentDetails2, operon2Gaps[j - 1], operon2GapPositions[j - 1])
            strain2 = addDuplicationEventsToStrain(strain2, duplicationSizes, duplicationDetails) #Adds duplication details to strain

            if len(operon2Gaps[j - 1]) > 0:
                #Genes still present here were lost in the other strain
                strain1 = _insertGapAndRecordDeletion(
                    ancestralOperon, operon2Gaps[j - 1], operon2GapPositions[j - 1],
                    operon2GapIndexes[j - 1], event.fragmentDetails2, strain1)
            j -= 1

    #Set ancestral operon
    event.setAncestralOperonGeneSequence(ancestralOperon)

    return event, strain1, strain2
def findOrthologsBySelfGlobalAlignment(strain, coverageTracker, sibling):
    """Classify every unmarked fragment of ``strain`` as a duplication or a loss.

    Each fragment whose ``coverageTracker`` entry is False is compared with the
    already marked fragments of the same strain:
      * an operon (more than one gene) is globally aligned against every
        marked operon and the lowest-scoring candidate within the difference
        threshold wins;
      * a singleton (other than 'Origin'/'Terminus') is matched to the nearest
        marked fragment that contains its gene.
    A match is recorded as a duplication in ``strain``; no match is recorded
    as a loss, i.e. a deletion in ``sibling``. Every processed entry of
    ``coverageTracker`` is set to True in place.

    Returns:
        (duplicationEvents, lossEvents, coverageTracker, strain, sibling)
    """
    print('Performing self-global alignment on strain: %s' % (strain.name))

    lossEvents = []
    duplicationEvents = []

    #Index the fragments once instead of scanning the whole list for every (i, j) pair;
    #setdefault keeps the first fragment per index, like the first hit of a linear scan
    fragmentByIndex = {}
    for fragment in strain.genomeFragments:
        fragmentByIndex.setdefault(fragment.fragmentIndex, fragment)

    def buildEvent(strainName, fragment1, fragment2, technique):
        """Create the comparison event for two fragments of the same strain."""
        newEvent = Event(0)
        newEvent.setFragmentDetails1(fragment1)
        newEvent.setFragmentDetails2(fragment2)
        newEvent.setGenome1Name(strainName)
        newEvent.setGenome2Name(strainName)
        newEvent.setTechnique(technique)
        newEvent.setAncestralOperonGeneSequence(copy.deepcopy(fragment1.sequence)) #Set the ancestral operon sequence
        return newEvent

    def describeGenes(fragment):
        """Return 'gene position, gene position;' for the genes of a fragment."""
        genes = copy.deepcopy(fragment.sequence)
        if fragment.isNegativeOrientation == True:
            #Reverse the genes if the operon was originally negative so positions line up
            genes.reverse()
        start = fragment.startPositionInGenome
        return ', '.join(gene + ' ' + str(start + offset) for offset, gene in enumerate(genes)) + ';'

    for i in range(len(coverageTracker)):
        if coverageTracker[i] != False:
            continue #Fragment has already been marked

        bestScore = 1000 #Lower scores win here, so start with some large number
        bestEvent = None
        minDistance = 1000 #Minimum distance from singleton to a fragment holding an identical gene
        unmarkedFragment = fragmentByIndex.get(i)

        if len(unmarkedFragment.sequence) > 1:
            #We're processing an operon
            for j in range(len(coverageTracker)):
                currFragment = fragmentByIndex.get(j)
                if i != j and coverageTracker[j] == True and len(currFragment.sequence) > 1:
                    op1 = unmarkedFragment.sequence
                    op2 = currFragment.sequence
                    event = buildEvent(strain.name, unmarkedFragment, currFragment, 'Self Global Alignment (Operon)')

                    score, event = performGlobalAlignment(op1, op2, event) #Perform the global alignment
                    event.setScore(score)

                    #Only comparisons whose gene content differs by at most a third are interesting
                    threshold = max(len(op1), len(op2)) // 3
                    numOperonDifferences = computeOperonDifferences(op1, op2)
                    if numOperonDifferences <= threshold and score < bestScore:
                        bestScore = score
                        bestEvent = event

        #Make sure an origin or a terminus doesn't get mapped with a singleton gene
        #NOTE(review): an unmatched 'Origin'/'Terminus' singleton falls through to the loss
        #branch below and is recorded as a deletion in the sibling - confirm this is intended
        elif (len(unmarkedFragment.sequence) == 1
                and unmarkedFragment.description != 'Origin'
                and unmarkedFragment.description != 'Terminus'):
            for j in range(len(coverageTracker)):
                if i != j and coverageTracker[j] == True:
                    currFragment = fragmentByIndex.get(j)
                    op1 = unmarkedFragment.sequence
                    op2 = currFragment.sequence
                    event = buildEvent(strain.name, unmarkedFragment, currFragment, 'Self Global Alignment (Singleton)')

                    #Keep the closest fragment that contains the singleton gene
                    if op1[0] in op2 and abs(i - j) < minDistance:
                        minDistance = abs(i - j)
                        event.setScore(0)
                        bestEvent = event

        if bestEvent is not None:
            #A match was found meaning the operon is a duplicate, therefore do not add it into the ancestor
            globals.trackingId += 1
            bestEvent.trackingEventId = globals.trackingId
            coverageTracker[i] = True
            duplicationEvents.append(bestEvent)

            #Increment the duplicate counter with the size of the operon since the operon is a duplication
            duplicationDetails = describeGenes(bestEvent.fragmentDetails1)
            strain = addDuplicationEventsToStrain(
                strain, [len(bestEvent.fragmentDetails1.sequence)], duplicationDetails)

            print('\n&&&&&& Self Global Alignment &&&&&')
            print(bestEvent.toString())
            print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n')
        else:
            #No match was found, so it must have been lost in the sibling; include it in the ancestor
            coverageTracker[i] = True
            globals.trackingId += 1

            event = Event(globals.trackingId)
            event.setScore(-1)
            event.setFragmentDetails1(unmarkedFragment)
            event.setFragmentDetails2(unmarkedFragment)
            event.setGenome1Name(strain.name)
            event.setGenome2Name(strain.name)
            event.setTechnique('Self Global Alignment (No match!)')
            event.setAncestralOperonGeneSequence(copy.deepcopy(unmarkedFragment.sequence))
            lossEvents.append(event)

            #Increment the loss counter with the size of the operon since the operon is a loss
            deletionDetails = describeGenes(event.fragmentDetails1)
            sibling = addDeletionEventsToStrain(
                sibling, [len(event.fragmentDetails1.sequence)], deletionDetails)

            print('\n&&&&&& Self Global Alignment &&&&&')
            print(event.toString())
            print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n')

    return duplicationEvents, lossEvents, coverageTracker, strain, sibling
def findOrthologsBySelfGlobalAlignment(strain, coverageTracker, sibling):
    """Classify every unmarked fragment of ``strain`` as a duplication or a loss.

    Each fragment whose ``coverageTracker`` entry is False is compared with the
    other fragments of the same strain that are not flagged ``isDuplicate``:
      * an operon (more than one gene) is globally aligned against every such
        operon; the highest positive score wins and ties go to the closer
        fragment;
      * a singleton (other than 'Origin'/'Terminus') is matched to the nearest
        such fragment that contains its gene.
    A match is recorded as a duplication. When the matched fragment was itself
    still unmarked, the event is additionally recorded as a loss (a "cyclic
    duplication") and the matched fragment is flagged as a duplicate. No match
    is recorded as a loss, i.e. a deletion in ``sibling``. Processed entries
    of ``coverageTracker`` are set to True in place.

    Returns:
        (duplicationEvents, lossEvents, coverageTracker, strain, sibling)
    """
    if globals.printToConsole:
        print('Performing self-global alignment on strain: %s' % (strain.name))

    lossEvents = []
    duplicationEvents = []

    #Index the fragments once instead of scanning the whole list for every (i, j) pair;
    #setdefault keeps the first fragment per index, like the first hit of a linear scan
    fragmentByIndex = {}
    for fragment in strain.genomeFragments:
        fragmentByIndex.setdefault(fragment.fragmentIndex, fragment)

    def buildEvent(strainName, fragment1, fragment2, technique):
        """Create the comparison event for two fragments of the same strain."""
        newEvent = Event(0)
        newEvent.setFragmentDetails1(fragment1)
        newEvent.setFragmentDetails2(fragment2)
        newEvent.setGenome1Name(strainName)
        newEvent.setGenome2Name(strainName)
        newEvent.setTechnique(technique)
        newEvent.setAncestralOperonGeneSequence(copy.deepcopy(fragment1.sequence)) #Set the ancestral operon sequence
        return newEvent

    for i in range(len(coverageTracker)):
        if coverageTracker[i] != False:
            continue #Fragment has already been marked

        bestScore = -1000 #Higher scores win here, so start below any score that can be accepted
        bestEvent = None
        minDistance = 1000 #Distance to the best candidate so far; used as a tie breaker
        bestJ = -999 #Index of the best candidate; only read once bestEvent has been set
        unmarkedFragment = fragmentByIndex.get(i)

        if len(unmarkedFragment.sequence) > 1:
            #We're processing an operon
            for j in range(len(coverageTracker)):
                currFragment = fragmentByIndex.get(j)
                if i != j and currFragment.isDuplicate == False and len(currFragment.sequence) > 1:
                    op1 = unmarkedFragment.sequence
                    op2 = currFragment.sequence
                    event = buildEvent(strain.name, unmarkedFragment, currFragment, 'Self Global Alignment (Operon)')

                    score, event = performGlobalAlignment(op1, op2, event) #Perform the global alignment
                    event.setScore(score)

                    #Accept a strictly better positive score, or an equal score from a closer fragment
                    isBetterScore = score > 0 and score > bestScore
                    isCloserTie = score == bestScore and abs(i - j) < minDistance
                    if isBetterScore or isCloserTie:
                        bestScore = score
                        bestEvent = event
                        minDistance = abs(i - j)
                        bestJ = j

        #Make sure an origin or a terminus doesn't get mapped with a singleton gene
        elif (len(unmarkedFragment.sequence) == 1
                and unmarkedFragment.description != 'Origin'
                and unmarkedFragment.description != 'Terminus'):
            for j in range(len(coverageTracker)):
                currFragment = fragmentByIndex.get(j)
                if i != j and currFragment.isDuplicate == False:
                    op1 = unmarkedFragment.sequence
                    op2 = currFragment.sequence
                    event = buildEvent(strain.name, unmarkedFragment, currFragment, 'Self Global Alignment (Singleton)')

                    #Keep the closest fragment that contains the singleton gene
                    if op1[0] in op2 and abs(i - j) < minDistance:
                        minDistance = abs(i - j)
                        event.setScore(0)
                        bestEvent = event
                        bestJ = j

        if bestEvent is not None:
            #A match was found meaning the operon is a duplicate, therefore do not add it into the ancestor
            #Handle special case where two unmarked operons are selected as the best matches
            cycleDuplication = False
            if coverageTracker[i] == False and coverageTracker[bestJ] == False:
                bestEvent.fragmentDetails2.isDuplicate = True
                coverageTracker[bestJ] = True
                cycleDuplication = True

                bestEvent.setScore(-1)
                bestEvent.setAncestralOperonGeneSequence(
                    copy.deepcopy(bestEvent.fragmentDetails2.sequence)) #Insert source as ancestral operon
                bestEvent.setTechnique('Self Global Alignment (Cyclic Duplication!)')
                lossEvents.append(bestEvent)

            globals.trackingId += 1
            coverageTracker[i] = True
            bestEvent.trackingEventId = globals.trackingId

            details1 = bestEvent.fragmentDetails1
            details2 = bestEvent.fragmentDetails2
            if len(details1.sequence) > 1 and len(details2.sequence) > 1:
                #NOTE(review): the return value is discarded, so this presumably updates
                #strain/sibling in place - confirm against handleDuplicateDetails
                handleDuplicateDetails(bestEvent, strain, sibling, cycleDuplication)
            else:
                #Singleton was mapped to an operon
                singleton = details1 if len(details1.sequence) == 1 else details2
                gene = singleton.sequence[0]
                position = singleton.startPositionInGenome
                tempString = gene + ' ' + str(position) + ';'
                strain = addDuplicationEventsToStrain(strain, [1], tempString)

            #NOTE(review): the source indentation was lost; this append is assumed to apply to
            #both operon and singleton matches (the same event may also sit in lossEvents) - confirm
            duplicationEvents.append(bestEvent)

            if globals.printToConsole:
                print('\n&&&&&& Self Global Alignment &&&&&')
                print(bestEvent.toString())
                print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n')
        else:
            #No match was found, so it must have been lost in the sibling; include it in the ancestor
            coverageTracker[i] = True
            globals.trackingId += 1

            event = Event(globals.trackingId)
            event.setScore(-1)
            event.setFragmentDetails1(unmarkedFragment)
            event.setFragmentDetails2(unmarkedFragment)
            event.setGenome1Name(strain.name)
            event.setGenome2Name(strain.name)
            event.setTechnique('Self Global Alignment (No match!)')
            event.setAncestralOperonGeneSequence(copy.deepcopy(unmarkedFragment.sequence))
            lossEvents.append(event)

            position = event.fragmentDetails1.startPositionInGenome
            op = copy.deepcopy(event.fragmentDetails1.sequence)
            if event.fragmentDetails1.isNegativeOrientation == False:
                entries = [gene + ' ' + str(n + position) for n, gene in enumerate(op)]
            else:
                #Negative orientation: positions count down from the end and the entries are
                #listed in reverse gene order so that positions still read ascending
                entries = [gene + ' ' + str(position + len(op) - n - 1) for n, gene in enumerate(op)]
                entries.reverse()
            tempString = ', '.join(entries) + ';'

            #Increment the loss counter with the size of the operon since the operon is a loss
            sibling = addDeletionEventsToStrain(
                sibling, [len(event.fragmentDetails1.sequence)], tempString)

            if globals.printToConsole:
                print('\n&&&&&& Self Global Alignment &&&&&')
                print(event.toString())
                print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n')

    return duplicationEvents, lossEvents, coverageTracker, strain, sibling
def _buildSelfAlignmentEvent(strain, op1, op2, index1, index2, technique):
    """Create the Event describing one comparison of two entries of a strain.

    Both entries come from the same strain, so the strain's name is stored as
    genome 1 and genome 2. When the two entries have different orientations,
    the one flagged as negative is reversed in place so that both gene lists
    read in the same direction.

    Args:
        strain: object providing getName().
        op1: raw sequence entry for the entry being resolved.
        op2: raw sequence entry for the candidate partner.
        index1: index of op1 in the strain's sequence.
        index2: index of op2 in the strain's sequence.
        technique: label stored on the event through setTechnique().

    Returns:
        Tuple (event, operon1, operon2, geneContentDifferences) where operon1
        and operon2 are the gene lists produced by
        formatAndComputeOperonDifferences (possibly reversed).
    """
    geneContentDifferences, operon1, operon2, _numUniqueGenes = formatAndComputeOperonDifferences(
        op1, op2)

    #Checks if either operons are in the - orientation
    negativeOrientationOp1 = reverseSequence(op1)
    negativeOrientationOp2 = reverseSequence(op2)

    #Tracks whether we reversed the operons
    operon1Reversed = False
    operon2Reversed = False

    #Create event for this comparison
    event = Event(0)
    event.setGenome1Operon(operon1)
    event.setGenome2Operon(operon2)
    event.setGenome1Name(strain.getName())
    event.setGenome2Name(strain.getName())
    event.isOriginallyNegativeOrientationOp1(negativeOrientationOp1)
    event.isOriginallyNegativeOrientationOp2(negativeOrientationOp2)
    event.setOperon1Index(index1)
    event.setOperon2Index(index2)
    event.setTechnique(technique)

    #If the orientations differ, flip the negative one to the positive orientation
    if negativeOrientationOp1 != negativeOrientationOp2:
        if negativeOrientationOp1:
            operon1.reverse()
            operon1Reversed = True
            negativeOrientationOp1 = False
        if negativeOrientationOp2:
            operon2.reverse()
            operon2Reversed = True
            negativeOrientationOp2 = False

    #Sanity check kept from the original implementation
    if negativeOrientationOp1 != negativeOrientationOp2:
        print('Check code! These operons should be in the same orientation!')

    #Track whether these operons were reversed
    event.isReversedOp1(operon1Reversed)
    event.isReversedOp2(operon2Reversed)

    #The ancestral sequence starts out as a private copy of operon 1
    event.setAncestralOperonGeneSequence(copy.deepcopy(operon1))
    return event, operon1, operon2, geneContentDifferences


def findOrthologsBySelfGlobalAlignment(strain, coverageTracker, targetStrain):
    """Resolve every unmarked entry of a strain as a duplication or a loss.

    Each entry whose coverageTracker flag is falsy is compared with the
    already-marked entries of the same strain:

    * an operon (more than one comma-separated gene) is globally aligned with
      every marked operon; the lowest-scoring alignment whose gene content
      difference is at most a third of the longer operon wins;
    * a singleton is matched with the closest (by index) marked entry that
      contains its gene.

    A match is recorded as a duplication event and passed to
    addDuplicationEventsToStrain for `strain`. Without a match a loss event
    is created and passed to addDeletionEventsToStrain for `targetStrain`.
    The entry is marked in coverageTracker either way, so it can serve as a
    partner for entries processed later.

    Args:
        strain: object providing getName() and getSequence().
        coverageTracker: list of flags, mutated in place; assumed to hold
            booleans (the code only ever stores True).
        targetStrain: strain that receives the deletion events.

    Returns:
        Tuple (duplicationEvents, lossEvents, coverageTracker, targetStrain,
        strain). Note that targetStrain comes before strain.
    """
    print('Performing self-global alignment on strain: %s' % (strain.getName()))

    lossEvents = []
    duplicationEvents = []
    sequence = strain.getSequence()

    for x in range(len(coverageTracker)):
        #Skip entries that are already marked
        if coverageTracker[x]:
            continue

        #NOTE(review): 1000 is a sentinel, not infinity. A score or an index
        #distance of 1000 or more can never be selected - confirm intended.
        bestScore = 1000
        bestEvent = None
        minDistance = 1000

        if len(sequence[x].split(',')) > 1:
            #This is an operon: align against every other marked operon
            for y in range(len(sequence)):
                if x != y and len(sequence[y].split(',')) > 1 and coverageTracker[y]:
                    event, operon1, operon2, geneContentDifferences = _buildSelfAlignmentEvent(
                        strain, sequence[x], sequence[y], x, y,
                        'Self Global Alignment (Operon)')

                    #Perform the global alignment
                    score, event = performGlobalAlignment(operon1, operon2, event)
                    event.setScore(score)

                    threshold = max(len(operon1), len(operon2)) // 3
                    if geneContentDifferences <= threshold and score < bestScore:
                        bestScore = score
                        bestEvent = event
        else:
            #This is a singleton gene: find the nearest marked entry holding it
            for y in range(len(sequence)):
                if x != y and coverageTracker[y]:
                    event, operon1, operon2, geneContentDifferences = _buildSelfAlignmentEvent(
                        strain, sequence[x], sequence[y], x, y,
                        'Self Global Alignment (Singleton)')

                    if operon1[0] in operon2 and abs(x - y) < minDistance:
                        minDistance = abs(x - y)
                        event.setScore(0)
                        bestEvent = event

        coverageTracker[x] = True
        globals.trackingId += 1

        if bestEvent is not None:
            #A partner was found, so this entry is a duplication
            bestEvent.trackingEventId = globals.trackingId
            duplicationEvents.append(bestEvent)

            #Size the duplication from the selected event rather than from
            #whichever event the comparison loop happened to create last
            strain = addDuplicationEventsToStrain(
                [len(bestEvent.genome1Operon)], strain)

            print('\n&&&&&& Self Global Alignment &&&&&')
            bestEvent.printEvent()
            print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n')
        else:
            #No partner was found, so record the entry as a loss
            event = Event(globals.trackingId)
            event.setScore(-1)
            event.setGenome2Operon([])
            event.setGenome1Name(strain.getName())
            event.setGenome2Name('None')
            event.isOriginallyNegativeOrientationOp1(reverseSequence(sequence[x]))
            event.isOriginallyNegativeOrientationOp2(False)
            event.setOperon1Index(x)
            event.setOperon2Index(-1)
            event.setTechnique('Self Global Alignment (No match!)')
            event.isReversedOp1(False)
            event.isReversedOp2(False)

            #Strip every '-', '[' and ']' from the raw entry, then split it
            #into gene names
            operonGenes = sequence[x].replace('-', '').replace('[', '').replace(']', '')
            ancestralGenes = [gene.strip() for gene in operonGenes.split(',')]
            event.setAncestralOperonGeneSequence(ancestralGenes)
            event.setGenome1Operon(copy.deepcopy(ancestralGenes))
            lossEvents.append(event)

            #Increment the loss counter with the size of the operon
            targetStrain = addDeletionEventsToStrain(
                [len(event.genome1Operon)], targetStrain)

            print('\n&&&&&& Self Global Alignment &&&&&')
            event.printEvent()
            print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n')

    return duplicationEvents, lossEvents, coverageTracker, targetStrain, strain
def reconstructOperonSequence(event, strain1, strain2):
    """Rebuild the ancestral operon for a pair of aligned operons.

    The ancestor starts as a deep copy of event.operon1Alignment. When the
    event score is 0 that copy is stored unchanged. Otherwise the gap genes of
    both operons are first screened with checkForMatchesInAlignment (the
    reported sizes are added to the owning strain as duplications), and the
    gaps that remain are inserted into the ancestor from the highest index
    down so earlier insert positions stay valid.

    For each gap, findUniqueGenes is run against the strain that owns it; the
    duplication sizes go to that strain and the deletion sizes go to the other
    one. When both operons have a gap at the same index, operon 1's gap is
    handled first.

    NOTE(review): the gap lists are assumed to stay parallel to the gap index
    lists after checkForMatchesInAlignment - confirm against that helper.

    Args:
        event: alignment event providing score, operon1Alignment,
            operon2Alignment, the gap/gap-index lists and the operon indexes.
        strain1: strain that owns operon 1.
        strain2: strain that owns operon 2.

    Returns:
        Tuple (event, strain1, strain2) with the updated strains.
    """
    rebuilt = copy.deepcopy(event.operon1Alignment)

    #Identical operons: the alignment already is the ancestor
    if event.score == 0:
        print('No differences detected between these two operons')
        event.setAncestralOperonGeneSequence(rebuilt)
        return event, strain1, strain2

    print('Differences detected between these two operons!')
    gaps1 = event.operon1Gaps
    gaps2 = event.operon2Gaps
    gapIndexes1 = event.operon1GapIndexes
    gapIndexes2 = event.operon2GapIndexes

    print('These are the extra genes for operon 1: %s' % (gaps1))
    print('These are the indexes for extra genes in operon 1: %s' % (gapIndexes1))
    print('These are the extra genes for operon 2: %s' % (gaps2))
    print('These are the indexes for extra genes in operon 2: %s' % (gapIndexes2))

    #Gap genes that also occur inside the alignment are duplicates
    gaps1, duplicatesInAlignment1 = checkForMatchesInAlignment(
        gaps1, event.operon1Alignment)
    gaps2, duplicatesInAlignment2 = checkForMatchesInAlignment(
        gaps2, event.operon2Alignment)
    strain1 = addDuplicationEventsToStrain(duplicatesInAlignment1, strain1)
    strain2 = addDuplicationEventsToStrain(duplicatesInAlignment2, strain2)

    remaining1 = len(gaps1)
    remaining2 = len(gaps2)
    while remaining1 > 0 or remaining2 > 0:
        #Decide which operon supplies the next gap: the larger index wins,
        #and a tie (or an exhausted operon 2) goes to operon 1
        if remaining1 > 0 and remaining2 > 0:
            useOperon1 = gapIndexes1[remaining1 - 1] >= gapIndexes2[remaining2 - 1]
        else:
            useOperon1 = remaining1 > 0

        if useOperon1:
            last = remaining1 - 1
            _found, deletionSizes, duplicationSizes, _unaligned = findUniqueGenes(
                gaps1[last], strain1.formattedSequence,
                strain1.sequenceConversion[event.operon1Index])
            strain1 = addDuplicationEventsToStrain(duplicationSizes, strain1)
            strain2 = addDeletionEventsToStrain(deletionSizes, strain2)

            if len(gaps1[last]) > 0:
                #Reversing and inserting at one fixed index restores gene order
                gaps1[last].reverse()
                for gene in gaps1[last]:
                    rebuilt.insert(gapIndexes1[last], gene)
            remaining1 = last
        else:
            last = remaining2 - 1
            _found, deletionSizes, duplicationSizes, _unaligned = findUniqueGenes(
                gaps2[last], strain2.formattedSequence,
                strain2.sequenceConversion[event.operon2Index])
            strain2 = addDuplicationEventsToStrain(duplicationSizes, strain2)
            strain1 = addDeletionEventsToStrain(deletionSizes, strain1)

            if len(gaps2[last]) > 0:
                #Reversing and inserting at one fixed index restores gene order
                gaps2[last].reverse()
                for gene in gaps2[last]:
                    rebuilt.insert(gapIndexes2[last], gene)
            remaining2 = last

    #Set ancestral operon
    event.setAncestralOperonGeneSequence(rebuilt)
    return event, strain1, strain2