Esempio n. 1
0
	def __init__(self, name, len, vec=None):
		"""
		Store the identifier and length, build the backing SeqVector and
		start with an empty annotation dict

		name -- unique identifier
		len  -- length of vector (shadows the builtin inside this method only)
		vec  -- when not None, replaces the .vec of the new SeqVector
		"""
		self.name = name
		self.len = len
		# Build and (optionally) fill the backing vector before attaching it.
		backing = SeqVector(0, self.len, 0)
		if vec is not None:
			backing.vec = vec
		self.__seqvector = backing
		self.annotations = {}
Esempio n. 2
0
class DF(SeqVector):
	"""
	Named, annotated wrapper around a SeqVector count matrix

	The data lives in a private SeqVector instance created in __init__;
	its .vec is indexed [nucleotide row, position column], with rows looked
	up through SeqVector.mapping. Most methods simply forward to that
	instance. Although DF derives from SeqVector, __init__ does not call the
	parent constructor.
	"""
	number_of_nucleotides = len(SeqVector.mapping)

	def __init__(self, name, len, vec=None):
		"""
		name -- unique identifier
		len  -- length of vector (the name shadows the builtin, but only
		        inside this method; kept so keyword callers still work)
		vec  -- when not None, replaces the .vec of the new SeqVector
		"""
		self.name = name # unique identifier
		self.len = len # length of vector
		# NOTE(review): meaning of the 0 arguments is defined by SeqVector - confirm there
		self.__seqvector = SeqVector(0, self.len, 0)
		if vec is not None:
			self.__seqvector.vec = vec
		self.annotations = {}

	def __add__(self, other):
		"""
		Adding one DF to self

		This is in-place: self's vec is incremented by other's vec and
		self is returned, so `a + b` modifies `a`.
		"""
		self.__seqvector.vec += other.__seqvector.vec
		return self

	def __iter__(self):
		"""
		Iterate through the entries of SeqVector.rev_mapping
		"""
		for nt in SeqVector.rev_mapping:
			yield nt

	def __len__(self):
		"""
		Return self.len
		"""
		return self.len

	def __str__(self):
		"""
		Return the name, a newline, then str() of the underlying vec
		"""
		return self.name + '\n' + \
				str(self.__seqvector.vec)

	def __getitem__(self, nt):
		"""
		Return the whole row of vec belonging to nucleotide nt
		"""
		return self.__seqvector.vec[SeqVector.mapping[nt], :]

	def add_annotation(self, k, v):
		"""
		Store v under key k in self.annotations (overwrites an existing k)
		"""
		self.annotations[k] = v

	def add_to_vec(self, nt, positions, counts):
		"""
		*add* to vec[nt] every counts[i] for positions[i]
		(forwarded to SeqVector.add_to_vec)
		"""
		self.__seqvector.add_to_vec(nt, positions, counts)

	def change_vec_mask(self, mask):
		"""
		Set self.len to len(mask) and forward mask to
		SeqVector.change_vec_mask
		"""
		self.len = len(mask)
		self.__seqvector.change_vec_mask(mask)

	def consensus_seq(self, gapped=False, percent_cutoff=.7):
		"""
		Calls self.__seqvector.consensus_seq with the same arguments
		"""
		return self.__seqvector.consensus_seq(gapped, percent_cutoff)

	def get_compressed_vec(self, ignoreN=True):
		"""
		Calls self.__seqvector.get_compressed_vec
		"""
		return self.__seqvector.get_compressed_vec(ignoreN)

	def get_counts_at_pos(self, i, ignoreN=True):
		"""
		Return count of position i (slice)

		With ignoreN the row of 'N' is left out of the returned column;
		otherwise the full column is returned.
		"""
		column = self.__seqvector.vec[:, i]
		if not ignoreN:
			return column
		n_row = SeqVector.mapping['N']
		# Distinct loop name: the original reused `i`, clobbering the parameter.
		keep = [row for row in range(DF.number_of_nucleotides) if row != n_row]
		return column[keep]

	def get_vec_diff_sqsum(self, other, ignoreN=True):
		"""
		Return the sum, over the first self.len positions, of the Euclidean
		norm of the per-nucleotide difference between self's and other's vec

		With ignoreN the 'N' row is excluded from each norm. Positions whose
		norm is NaN contribute nothing to the sum.
		"""
		diff = self.__seqvector.vec - other.__seqvector.vec
		# -1 never equals a row index, so nothing is skipped when ignoreN is off.
		skip_row = SeqVector.mapping['N'] if ignoreN else -1
		# The row filter is the same for every column, so build it once.
		rows = [r for r in range(DF.number_of_nucleotides) if r != skip_row]
		result = 0
		for j in range(self.len):
			c = math.sqrt(sum(diff[r, j]**2 for r in rows))
			if not np.isnan(c):
				result += c
		return result

	def normalized_vec(self, ignoreN=True):
		"""
		Divide vec, in place on this DF, by the per-position totals from
		get_compressed_vec(ignoreN); returns None

		Totals below 1 are raised to 1 before dividing, so an empty position
		is divided by 1 rather than by 0.
		"""
		p = self.get_compressed_vec(ignoreN)
		for i in range(self.len):
			p[i] = max(1, p[i])
		self.__seqvector.vec = self.__seqvector.vec * 1. / p
		# NOTE: after this, the values at a position may not add up to 1;
		# an earlier attempt to correct for that was abandoned.

	def normalized_vec_add(self, other, vec_pre_normalized, ignoreN):
		"""
		Replace self's vec by the equal-weight (0.5 / 0.5) mix of self and other

		vec_pre_normalized -- if true, the two vecs are averaged as they are;
		                      otherwise each is first divided by its own
		                      get_compressed_vec(ignoreN)
		"""
		if vec_pre_normalized:
			self.__seqvector.vec = self.__seqvector.vec * 0.5 + \
					other.__seqvector.vec * 0.5
		else:
			# NOTE(review): unlike normalized_vec, the totals are not clamped
			# here, so a zero total is divided by as-is - confirm intended.
			self.__seqvector.vec = self.__seqvector.vec * 0.5 / self.get_compressed_vec(ignoreN)
			self.__seqvector.vec += other.__seqvector.vec * 0.5 / other.get_compressed_vec(ignoreN)

	@property
	def nonzero(self):
		"""
		Return a list of non-zero columns
		because .vec is row(nucleotide) x column(position)
		we take the unified set of columns and return them
		in ascending order
		"""
		columns = self.__seqvector.vec.nonzero()[1]
		return sorted(set(columns)) # uniquify, then sort positions

	@staticmethod
	def nucleotides():
		"""
		Iterate through the entries of SeqVector.mapping
		"""
		for nt in SeqVector.mapping: yield nt

	@property
	def vec(self):
		"""
		The underlying SeqVector's vec (read-only; see assign_vec to replace it)
		"""
		return self.__seqvector.vec

	@property
	def nt_count(self, ignoreN=True):
		"""
		Return the sum of all entries of vec, leaving out the 'N' row

		Because this is a property, ignoreN cannot be supplied through
		attribute access and is therefore always True.
		"""
		# list(...) so that .remove works whether range returns a list or not
		inds = list(range(self.number_of_nucleotides))
		if ignoreN:
			inds.remove(SeqVector.mapping['N'])
		return self.__seqvector.vec[inds, :].sum()


	def assign_vec(self, vec):
		"""
		Replace the underlying SeqVector's vec with the given one
		"""
		self.__seqvector.vec = vec