Exemple #1
0
	def translate_selection_reverse_complement(self):
		'''Translate the reverse-complement of the selected DNA.

		The selection is read with self.get_selection(). The selected DNA is
		fetched with genbank.gb.GetDNA(), reverse-complemented with dna.RC()
		and translated with dna.Translate(). The protein and the original
		(not reverse-complemented) DNA are handed to self.translate_output().

		Raises ValueError if the selection end is -1 (treated as an empty selection).
		'''
		sel_start, sel_end = self.get_selection()

		#an end position of -1 is treated as "nothing selected"
		if sel_end == -1:
			raise ValueError('Cannot translate an empty selection')

		selected_dna = genbank.gb.GetDNA(sel_start, sel_end)
		rc_dna = dna.RC(selected_dna)
		translated = dna.Translate(rc_dna)
		self.translate_output(translated, selected_dna, 'complement strand')
Exemple #2
0
    def assemble_de_novo(self):
        '''
        Print the overlap of two hard-coded reads in both orientations.

        Work in progress: the AB1 data of the reads 'p423-GPD-RV' and
        'p423-GPD-FW' is taken from self.seqdata. The result of
        self.getOverlap() is printed twice: first for the two reads as they
        are, then with the second read reverse-complemented (dna.RC).
        '''
        #TODO: decide which file type to use when a read has several sources; AB1 for now.
        #TODO: only the simple case of exactly two sequences is covered so far.
        rv_read = self.seqdata['p423-GPD-RV']['AB1'].getDNA()
        fw_read = self.seqdata['p423-GPD-FW']['AB1'].getDNA()

        #both reads as they are
        overlap_as_is = self.getOverlap(rv_read, fw_read)
        print(overlap_as_is)

        #second read reverse-complemented
        overlap_flipped = self.getOverlap(rv_read, dna.RC(fw_read))
        print(overlap_flipped)
Exemple #3
0
	def RCObject(self):
		'''
		Reverse-complement this object.

		Flips the reverse-complement flag (getRC/setRC) and replaces the stored
		DNA with its reverse complement (dna.RC). The trace and the quality
		values are not updated yet.
		'''
		#TODO: this still needs a lot of work before it is fully correct

		#work out the flipped flag; it stays None (and setRC is skipped) when
		#getRC() is neither True nor False
		flipped_flag = None
		if self.getRC() is True:
			flipped_flag = False
		elif self.getRC() is False:
			flipped_flag = True
		if flipped_flag is not None:
			self.setRC(flipped_flag)

		#TODO: update the trace (how?)

		#TODO: update the quality values (how?)

		#swap the sequence for its reverse complement
		current_dna = self.getDNA()
		self.setDNA(dna.RC(current_dna))
Exemple #4
0
	def assemble_de_novo(self, seq_type='SEQ.CLIPPED', threshold_score=10):
		'''
		Assemble the sequence reads into a contig without a reference sequence.

		The reads are sorted from longest to shortest and the longest one seeds
		the contig. In every round each unused read is aligned against the
		current contig with self.getOverlap(), once as it is and once
		reverse-complemented (dna.RC). The read with the highest-scoring
		alignment is merged into the contig. This is repeated until every read
		has been used or no remaining read reaches threshold_score. The result
		is stored with self.setContig().

		seq_type defines what type of sequence data should be used (AB1, SFC,
		SEQ, FASTQ...); it is the key looked up in each entry of self.seqdata.
		There may be several sources for a single sequence.

		threshold_score is the minimum alignment score (the 'score' entry of
		the getOverlap() result) an alignment needs to be considered.

		Raises ValueError if the forward and the reverse alignment of a read
		score equally at or above the threshold (only one orientation can be
		biologically correct), or if an aligned pair of characters cannot be
		merged.

		TODO: the entire sequence objects need to be reverse-complemented for
		reverse hits (see RCObject), not just the sequence, and the objects
		should be stored in assembly order for later use.
		TODO: improve the performance of this algorithm, then consider cython.
		'''
		#izip_longest was renamed to zip_longest in Python 3; support both
		zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest

		#sort the sequence objects based on sequence LENGTH, longest first, shortest last
		seq_list = sorted(
			self.seqdata.values(),
			key=lambda entry: len(entry[seq_type].getDNA()),
			reverse=True)

		#the longest sequence seeds the contig
		contig = seq_list[0][seq_type].getDNA()
		used_index = {0}

		#find the best match for the contig, merge it, start over. At most one round per remaining read.
		for _ in range(len(seq_list) - 1):
			best_score = 0 #best alignment score seen in this round
			best = None #alignment (getOverlap result) belonging to best_score
			index_of_best = None

			for index, entry in enumerate(seq_list):
				if index in used_index:
					continue

				read = entry[seq_type].getDNA()
				forward = self.getOverlap(contig, read) #read as it is
				reverse = self.getOverlap(contig, dna.RC(read)) #read reverse-complemented
				fw_score = forward['score']
				rv_score = reverse['score']

				#neither orientation aligns well enough
				if fw_score < threshold_score and rv_score < threshold_score:
					continue

				#at least one is above the threshold here, so equal scores mean both are.
				#They can't both be valid, because only one alignment is biologically correct.
				if fw_score == rv_score:
					raise ValueError('Both FW and RV have the same score.')

				#the higher-scoring orientation is necessarily the one above the threshold
				candidate = forward if fw_score > rv_score else reverse
				if candidate['score'] > best_score:
					best_score = candidate['score']
					best = candidate
					index_of_best = index

			#make sure that something actually aligned
			if best is None:
				print('No further alignments possible even though sequences are left.')
				break

			#merge the two aligned sequences column by column into the new contig.
			#A missing position (aligned strings of unequal length) is treated as a gap.
			#NOTE(review): a gap paired with N (or with another gap) is rejected below - confirm that is intended.
			bases = []
			for i, j in zip_longest(best['seq1'], best['seq2'], fillvalue='-'):
				if i == j and i.upper() in 'GATCN':
					bases.append(i)
				elif i.upper() in 'GATC' and j.upper() in '-N':
					bases.append(i)
				elif i.upper() in '-N' and j.upper() in 'GATC':
					bases.append(j)
				elif i != j and i.upper() in 'GATC' and j.upper() in 'GATC':
					#both are real bases but they disagree: just pick the contig's one.
					#TODO: use the basecall qualities to pick the better one.
					bases.append(i)
				else:
					raise ValueError('unanticipated base value: %s, %s' % (i, j))
			contig = ''.join(bases)

			#this read is now part of the contig
			used_index.add(index_of_best)

		#store the finished (or partially finished) contig
		self.setContig(contig)