Example #1
	def all_chem_adjust(self, mz_heat, cp_heat, output):
		"""Does vector compression, ignoring biological activity. Uses all baskets from a run, not
		just those that are within a biological cutoff range.
		"""
		new_cp = cp.cp(None)
		for feat in cp_heat.features():
			new_cp[str(feat)] = cp.feature(str(feat))
		
		runcount = 0
		n = 0
		av_dist = 0
			
		widgets = ['VectorMove: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ',\
		ETA(), ' ', FileTransferSpeed()]
		
		pbar = ProgressBar(widgets=widgets, maxval=len(cp_heat.fingerprints())).start()
		
		largest_scaler = None
		#Maps each run (as a string) to its [run_vector, add_vector] pair
		add_vectors = dict()
		for run in cp_heat.fingerprints():
			output.write("\t" + str(run) + "\n")
			runcount += 1
			pbar.update(runcount)
			
			labels = run.keys()
			#This is the original vector fingerprint in log scale
			run_vec = numpy.log(run.values(), dtype=float)
			#This is going to be a list of vectors, one from each basket, in log scale
			bask_vectors = []
			#This is the number of vectors, one for each connection, with multiple connections per basket
			vector_num = 0
			for bask in mz_heat.grab_basks(str(run)):
				#If there's only one run, then the vector has nothing to connect to
				if len(bask.keys()) <= 1:
					continue
				#This is the average value of the scaler, for use in line plots later
				av_scaler = 0.0
				for connect_run in bask.keys():
					cprun = connect_run.replace("_", "")
					#Don't connect the query run to itself, that's not useful
					if cprun == str(run):
						continue
					#If the run isn't in the cp_heatmap, just continue; that means it also wasn't used for creating synthetic fingerprints
					if cprun not in cp_heat.map:
						continue
					#Get the vector difference between the target and the source (still in log scale).
					#Make sure label values are in the same order between the vectors.
					vec_dif = numpy.log(numpy.array([cp_heat[cprun][val] for val in labels]) - run_vec)
					scaler = self.bask_prob(bask, cp_heat[cprun]) + self.bask_prob(bask, run)
					if largest_scaler is None or scaler >= largest_scaler:
						largest_scaler = scaler
					av_scaler = numpy.logaddexp(av_scaler, scaler)
					bask_vectors.append(vec_dif + scaler)
					vector_num += 1
				if vector_num != 0:
					output.write("\t\t\t{}; {}\n".format(str(bask), numpy.exp(av_scaler) / vector_num))
			#This is the sum of the basket vectors, still in log scale
			add_vector = logsumexp(bask_vectors, axis=0)
			if vector_num != 0:
				add_vector -= numpy.log(vector_num)
#			if not largest_scaler == 0:
#				add_vector /= 2 * largest_scaler
			add_vectors[str(run)] = [run_vec, add_vector]

		for run, (run_vec, add_vector) in add_vectors.items():
			add_vector = numpy.exp(add_vector - (numpy.log(2) + largest_scaler))
			run_vec = numpy.exp(run_vec) + add_vector
			new_cp[str(run)] = cp.fingerprint(str(run))
			for param, value in zip(labels, run_vec):
				new_cp[param][str(run)] = new_cp[str(run)][param] = value
				
			output.write("\t\tRun Movement: {}\n".format(numpy.sqrt(add_vector.dot(add_vector))))
				
			av_dist += numpy.sqrt(add_vector.dot(add_vector))
			n += 1
		pbar.finish()

		output.write("Average Movement: " + str(av_dist / n) + "\n")			
		return new_cp
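The numerically delicate step in all_chem_adjust is averaging the basket vectors in log scale: logsumexp(bask_vectors, axis=0) - log(vector_num) is the log of the arithmetic mean of the exponentiated vectors, computed without overflow. A minimal sketch of that identity, assuming logsumexp comes from scipy.special (the snippet above uses whichever logsumexp its module imports) and using hypothetical toy values:

import numpy
from scipy.special import logsumexp

# Toy log-scale vectors standing in for bask_vectors (hypothetical values).
bask_vectors = numpy.log(numpy.array([[1.0, 2.0, 4.0],
                                      [3.0, 6.0, 8.0]]))

# Log-space mean: logsumexp across the vectors, minus the log of how many there are.
log_mean = logsumexp(bask_vectors, axis=0) - numpy.log(len(bask_vectors))

# The same mean computed directly in linear space.
direct_mean = numpy.exp(bask_vectors).mean(axis=0)

assert numpy.allclose(numpy.exp(log_mean), direct_mean)  # both are [2.0, 4.0, 6.0]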
Example #2
	def chem_adjust(self, mz_heat, cp_heat, output):
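		"""Does vector compression using only baskets within a biological cutoff range:
		a basket contributes to the adjustment vector only if it contains at least two
		runs from the query run's cluster in cp_heat.
		"""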
		new_cp = cp.cp(None)
		for feat in cp_heat.features():
			new_cp[str(feat)] = cp.feature(str(feat))
		
		runcount = 0
		n = 0
		av_dist = 0
			
		widgets = ['VectorMove: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ',\
		ETA(), ' ', FileTransferSpeed()]
		
		pbar = ProgressBar(widgets=widgets, maxval=len(cp_heat.fingerprints())).start()
		
		
		
		for run in cp_heat.fingerprints():
			output.write("\t" + str(run) + "\n")
			runcount += 1
			pbar.update(runcount)

			all_basks = list(mz_heat.grab_basks(str(run)))

			inruns = set([inrun + "_" for inrun in cp_heat.cluster(str(run), max_tolerance=0.5, min_tolerance=0.65)])
#			antiruns = set([antirun + "_" for antirun in cp_heat.anticluster(str(run), pmax=-0.2, fraction=1)])
			
			basks = []
			for bask in all_basks:
				bruns = set(bask.keys())
#				if len(bruns & antiruns) > 0:
#					continue
				if len(bruns & inruns) < 2:
					continue
				basks.append(bask)
			
			labels = run.keys()
			run_vec = numpy.array(run.values(), dtype=float)
			add_vector = numpy.zeros(len(labels))
			vector_num = 0
			largest_scaler = 0.0
			for bask in basks:
				av_scaler = 0.0
				for connect_run in bask.keys():
					cprun = connect_run.replace("_", "")
					if cprun == str(run):
						continue
					if cprun not in cp_heat.map:
						continue
					vec_dif = numpy.array([cp_heat[cprun][val] for val in labels]) - run_vec
					scaler = self.bask_prob(bask, cp_heat[cprun]) * self.bask_prob(bask, run)
					if scaler >= largest_scaler:
						largest_scaler = scaler
					av_scaler += scaler
					add_vector += vec_dif * scaler
					vector_num += 1
				if vector_num != 0:
					av_scaler /= vector_num
					output.write("\t\t\t{}; {}\n".format(str(bask), av_scaler))
			if vector_num != 0:
				add_vector /= vector_num
			if largest_scaler != 0:
				add_vector /= 2 * largest_scaler

			run_vec += add_vector
			new_cp[str(run)] = cp.fingerprint(str(run))
			for param, value in zip(labels, run_vec):
				new_cp[param][str(run)] = new_cp[str(run)][param] = value
				
			output.write("\t\tRun Movement: {}\n".format(numpy.sqrt(add_vector.dot(add_vector))))
				
			av_dist += numpy.sqrt(add_vector.dot(add_vector))
			n += 1
		pbar.finish()

		output.write("Average Movement: " + str(av_dist / n) + "\n")			
		return new_cp
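In chem_adjust the adjustment is built directly in linear space: each difference vector is weighted by the product of the two basket probabilities, the weighted sum is averaged over the number of connections, and the result is damped by twice the largest weight before being added to the query fingerprint. A compact sketch of that arithmetic, with hypothetical difference vectors and weights standing in for vec_dif and scaler:

import numpy

# Hypothetical difference vectors (connected fingerprint minus query fingerprint)
# and the basket-probability products that weight them.
vec_difs = [numpy.array([1.0, -2.0]), numpy.array([3.0, 0.5])]
scalers = [0.2, 0.6]

add_vector = numpy.zeros(2)
for vec_dif, scaler in zip(vec_difs, scalers):
	add_vector += vec_dif * scaler

add_vector /= len(vec_difs)     # average over connections (vector_num)
add_vector /= 2 * max(scalers)  # damp by twice the largest weight

run_vec = numpy.array([10.0, 10.0])  # hypothetical query fingerprint
run_vec += add_vector                # the shifted fingerprint stored in new_cp
print(numpy.sqrt(add_vector.dot(add_vector)))  # the "Run Movement" distance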