Exemple #1
0
	def mergeBlock(self, data_ls_2d, mask_ls_2d, gene_id_set, bad_gene_id):
		"""
		08-28-05
			merge the block of same genes
		02-24-06
			just average
		"""
		gene_id_set.remove(bad_gene_id)
		gene_id = gene_id_set.pop()
		if self.debug:
			print gene_id
		ar = array(data_ls_2d, mask = mask_ls_2d, fill_value=100000000)	#02-24-06 fill_value for graph_modeling
		if len(data_ls_2d)==1:	#no need to do average
			ar = array(data_ls_2d[0], mask=mask_ls_2d[0])
			if self.debug:
				print ar
			return gene_id, ar
		max_ls = []
		"""
		for i in range(ar.shape[0]):
			signed_max_value = maximum(ar[i,:])
			signed_min_value = minimum(ar[i,:])
			max_value = max(abs(signed_max_value), abs(signed_min_value))	#02-17-06
			max_ls.append(max_value)
			if self.debug:
				print "ar", ar
				print "max_value", max_value
				print "max_ls", max_ls
			ar[i,:] = ar[i,:]/max_value
			if self.debug:
				print "ar divided by max_value", ar
		"""
		new_ar = average(ar) #*max(max_ls)	#02-17-06
		if self.debug:
			print "average(ar)", average(ar)
			print "max(max_ls)", max(max_ls)
			print "new_ar(after average and multiplication of max(max_ls)", new_ar
		
		"""
		#02-24-06
		for i in range(len(ar)):
			for j in range(i+1, len(ar)):
				edge_data = graph_modeling.ind_cor(ar[i].tolist(), ar[j].tolist(), -1)
				#print "correlation between %s and %s is %s"%(i, j, edge_data.value)
				#raw_input("Continue? : ")
				self.cor_list.append(edge_data.value)
		"""
		return gene_id, new_ar
Exemple #2
0
	def data_read_in(self, infname, no_of_nas):
		"""
		05-09-05

		Read a tab-delimited file into masked arrays, one per row.

		infname: path of the input file. The first column of every row is the
			gene id; the remaining columns are floats or the literal 'NA'.
		no_of_nas: maximum number of 'NA' entries tolerated in a row. Rows with
			more are dropped. A false value (0/None) disables the filter.

		Returns (list_of_mas, list_of_gene_ids), two parallel lists. 'NA'
		entries are stored as 1e20 and masked; they are not replaced by a mean.
		Blank lines are skipped (they used to raise IndexError on row[0]).
		The input file is always closed, even if parsing fails.
		"""
		sys.stderr.write("Reading data...")
		list_of_mas = []
		list_of_gene_ids = []
		inf = open(infname, 'r')
		try:
			for row in csv.reader(inf, delimiter='\t'):
				if not row:	#blank line, nothing to parse
					continue
				data_ls = []
				mask_ls = []
				for item in row[1:]:	#ignore the first column, it's the id
					if item=='NA':
						data_ls.append(1e20)	#placeholder, hidden by the mask
						mask_ls.append(1)
					else:
						data_ls.append(float(item))
						mask_ls.append(0)
				if no_of_nas and sum(mask_ls)>no_of_nas:	#too many NAs
					continue
				list_of_gene_ids.append(row[0])
				list_of_mas.append(array(data_ls, mask=mask_ls))
		finally:
			inf.close()
		sys.stderr.write("Done.\n")
		return list_of_mas, list_of_gene_ids
	def data_read_in(self, infname, no_of_nas):
		"""
		05-09-05

		Read a tab-delimited file with a header line into masked arrays.

		infname: path of the input file. The first line is a header and is
			ignored. In every other row the first column (edge id) is ignored
			and the remaining columns are floats or the literal 'NA'.
		no_of_nas: maximum number of 'NA' entries tolerated in a row. Rows with
			more are dropped. A false value (0/None) disables the filter.

		Returns the list of masked arrays, one per kept row. 'NA' entries are
		stored as 1.1 and masked.
		An empty file yields an empty list (reader.next() used to raise
		StopIteration), blank lines are skipped, and the input file is always
		closed, even if parsing fails.
		"""
		sys.stderr.write("Reading data...")
		list_of_mas = []
		inf = open(infname, 'r')
		try:
			reader = csv.reader(inf, delimiter='\t')
			for row_no, row in enumerate(reader):
				if row_no==0 or not row:	#the header line or a blank line
					continue
				data_ls = []
				mask_ls = []
				for item in row[1:]:	#ignore the first edge id
					if item=='NA':
						data_ls.append(1.1)	#placeholder, hidden by the mask
						mask_ls.append(1)
					else:
						data_ls.append(float(item))
						mask_ls.append(0)
				if no_of_nas and sum(mask_ls)>no_of_nas:	#too many NAs
					continue
				list_of_mas.append(array(data_ls, mask=mask_ls))
		finally:
			inf.close()
		sys.stderr.write("Done.\n")
		return list_of_mas
Exemple #4
0
	def get_ma_array_out_of_list(self, expr_list, take_log, round_one=0):
		"""
		12-22-05
			turn a list of expression strings into a masked array

		expr_list: sequence of strings. 'NA' becomes a masked entry (stored
			as 1e20), '' is dropped entirely, anything else is parsed as float.
		take_log: if true, the natural log of every parsed value is stored.
			Values <=10 are first replaced: by 10 when round_one is true,
			otherwise by a random number drawn from [e, 10].
		round_one: see take_log.

		12-22-05
			in the second round, take random to avoid high correlation caused by a series of 10
		"""
		values = []
		flags = []
		for entry in expr_list:
			if entry == '':
				#empty entries leave no trace in the output
				continue
			if entry == 'NA':
				values.append(1e20)
				flags.append(1)
				continue
			number = float(entry)
			if take_log:
				if number<=10:
					#floor small values before the log; randomised outside round one
					if round_one:
						number = 10
					else:
						number = random.uniform(math.e, 10)
				number = math.log(number)
			values.append(number)
			flags.append(0)
		return array(values, mask=flags)
Exemple #5
0
	def transform_one_file(self, src_pathname, delimiter, outputdir, b_instance, threshold, type, no_of_valids):
		"""
		08-09-05
			add type
		08-29-05
			add no_of_valids to cut genes with too few valid values
		"""
		reader = csv.reader(file(src_pathname), delimiter=delimiter)
		filename = os.path.basename(src_pathname)
		output_filename = os.path.join(outputdir, filename)
		std_list = []
		for row in reader:
			gene_id = row[0]
			new_row = []
			mask_ls = []
			for i in range(1, len(row)):
				if row[i] == 'NA':
					new_row.append(1e20)
					mask_ls.append(1)
				elif row[i] == '':
					#ignore empty entry
					continue
				else:
					value = float(row[i])
					if type==1:
						if value<=10:
							value = 10
						value = math.log(value)
					new_row.append(value)
					mask_ls.append(0)
			ma_array = array(new_row, mask=mask_ls)
			if self.debug:
				print "The data vector is ",ma_array
				print "Its mask is ", ma_array.mask()
			if len(ma_array.compressed())>=no_of_valids:	#at least two samples, otherwise, correlation can't be calculated
				#08-29-05	no_of_valids controls not too many NA's, which is for graph_modeling
				std = MLab.std(ma_array.compressed())	#disregard the NAs
				if self.debug:
					print "std is ",std
					raw_input("Continue?(Y/n)")
				std_list.append(std)
		del reader
		if len(std_list)>100:
			r.png('%s.png'%output_filename)
			r.hist(std_list, main='histogram',xlab='std',ylab='freq')
			r.dev_off()
	def transpose_and_output(self, outfname, list_of_top_mas):
		"""
		05-09-05
			--ls_NA_fillin()

		Write the transposed data matrix to a tab-delimited file.

		outfname: path of the output file (overwritten).
		list_of_top_mas: list of masked arrays; raw_data() of each one becomes
			a row of the matrix before it is transposed.

		Output layout: first line is the shape of the transposed matrix,
		second line is "column", "column" followed by the column indices,
		then one line per matrix row: the row index twice, followed by the row
		as returned by self.ls_NA_fillin().
		The output file is always closed, so everything written is flushed to
		disk before "Done." is reported.
		"""
		sys.stderr.write("Outputing the data...")
		matrix = transpose(array([ma.raw_data() for ma in list_of_top_mas]))
		outf = open(outfname, 'w')
		try:
			writer = csv.writer(outf, delimiter='\t')
			writer.writerow(matrix.shape)
			writer.writerow(["column", "column"]+list(range(len(matrix[0]))))
			for i in range(matrix.shape[0]):
				ls_with_NA_filled = self.ls_NA_fillin(matrix[i])
				writer.writerow([i, i]+ls_with_NA_filled)
		finally:
			outf.close()
		sys.stderr.write("Done.\n")