def all_chem_adjust(self, mz_heat, cp_heat, output):
    """Does vector compression, ignoring biological activity.

    Uses all baskets from a run, not just those that are within a biological
    cutoff range. The arithmetic is carried out on log-scaled values
    (``numpy.log``, ``numpy.logaddexp``, ``logsumexp``).

    For every fingerprint in ``cp_heat`` the method writes the run name to
    ``output``, collects one log-scale vector per (basket, connected run)
    pair and combines them with ``logsumexp``. Intermediate values are
    printed to stdout with Python 2 ``print`` statements.

    NOTE(review): with the statements ordered as below, the bare ``return``
    in the per-run loop is reached during the first run, so the method
    returns ``None`` and nothing after that ``return`` executes. It looks
    like leftover debugging -- confirm before relying on the return value.

    Args:
        mz_heat: object providing ``grab_basks(run_name)``; each basket is
            used through ``keys()`` and is passed to ``self.bask_prob``.
        cp_heat: object providing ``features()``, ``fingerprints()``, a
            ``map`` attribute supporting ``in``, and lookup by run name.
        output: object with a ``write(str)`` method that receives the
            textual report.

    Returns:
        ``None`` whenever at least one fingerprint is iterated (see the note
        above). ``new_cp`` is only returned by the final statement, which is
        preceded by ``av_dist / n``; ``n`` is still 0 there when no
        fingerprint was iterated.
    """
    # Fresh container holding one empty feature per feature of cp_heat.
    new_cp = cp.cp(None)
    for feat in cp_heat.features():
        new_cp[str(feat)] = cp.feature(str(feat))
    runcount = 0
    n = 0
    av_dist = 0
    widgets = ['VectorMove: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ',\
               ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=len(cp_heat.fingerprints())).start()
    # Starts as None, so the first `scaler >= largest_scaler` test below
    # compares a number with None -- presumably relying on Python 2 ordering.
    largest_scaler = None
    # This is a dictionary of run (as string) - [run_vector, add_vector] pairs
    add_vectors = dict()
    for run in cp_heat.fingerprints():
        output.write("\t" + str(run) + "\n")
        runcount += 1
        pbar.update(runcount)
        labels = run.keys()
        # This is the original vector fingerprint in log scale
        run_vec = numpy.log(run.values(), dtype=float)
        print run_vec
        # This is going to be a list of vectors, one from each basket, in log scale
        bask_vectors = []
        # This is the number of vectors, one for each connection, with multiple
        # connections per basket
        vector_num = 0
        for bask in mz_heat.grab_basks(str(run)):
            # If there's only one run, then the vector has nothing to connect to
            if len(bask.keys()) <= 1:
                continue
            # This is the average value of the scaler, for use in line plots later
            av_scaler = 0.0
            for connect_run in bask.keys():
                # Basket keys are compared with run names after stripping "_".
                cprun = connect_run.replace("_", "")
                # Don't connect the query run to itself, that's not useful
                if cprun == str(run):
                    continue
                # If the run isn't in the cp_heatmap, just continue; that means it
                # also wasn't used for creating synthetic fingerprints
                if cprun not in cp_heat.map:
                    continue
                # Get the vector difference between the target and the source, but
                # it's in log scale. Also, make sure label values are in the same
                # order between the vectors. Remember log scale!
                # NOTE(review): run_vec is already log-scaled, while the cp_heat
                # values are not visibly logged before the subtraction, and the
                # log is taken of the difference -- confirm this is intended.
                vec_dif = numpy.log(numpy.array([cp_heat[cprun][val] for val in labels]) - run_vec)
                # Sum of the two bask_prob values; presumably bask_prob returns
                # log-probabilities here -- TODO confirm.
                scaler = self.bask_prob(bask, cp_heat[cprun]) + self.bask_prob(bask, run)
                print scaler
                if scaler >= largest_scaler:
                    largest_scaler = scaler
                av_scaler = numpy.logaddexp(av_scaler, scaler)
                bask_vectors.append(vec_dif + scaler)
                print "bask_ind", vec_dif+scaler
                vector_num += 1
            # vector_num is the running count for the whole run, not only for
            # this basket.
            if not vector_num == 0:
                output.write("\t\t\t{}; {}\n".format(str(bask), numpy.exp(av_scaler) / vector_num))
        # This is the sum of the basket vectors, still in log scale
        print "all", bask_vectors
        add_vector = logsumexp(bask_vectors, axis=0)
        print "summed", add_vector
        # Subtracting log(vector_num) divides by the vector count in log scale.
        if not vector_num == 0:
            add_vector -= numpy.log(vector_num)
        print "averaged", add_vector
        print 'large', largest_scaler
        # NOTE(review): this bare return ends the method during the first run;
        # everything below it is unreachable as written.
        return
        # if not largest_scaler == 0:
        #     add_vector /= 2 * largest_scaler
        add_vectors[str(run)] = [run_vec, add_vector]
    for run, (run_vec, add_vector) in add_vectors.items():
        # Leave log scale; subtracting (log(2) + largest_scaler) first is a
        # division by 2 * exp(largest_scaler).
        add_vector = numpy.exp(add_vector - (numpy.log(2) + largest_scaler))
        run_vec = numpy.exp(run_vec) + add_vector
        new_cp[str(run)] = cp.fingerprint(str(run))
        # `labels` still holds the keys of the last run iterated in the loop above.
        for param, value in zip(labels, run_vec):
            new_cp[param][str(run)] = new_cp[str(run)][param] = value
        # Euclidean length of the applied shift.
        output.write("\t\tRun Movement: {}\n".format(numpy.sqrt(add_vector.dot(add_vector))))
        av_dist += numpy.sqrt(add_vector.dot(add_vector))
        n += 1
    pbar.finish()
    output.write("Average Movement: " + str(av_dist / n) + "\n")
    return new_cp
def chem_adjust(self, mz_heat, cp_heat, output):
    """Shift every fingerprint towards the runs it shares baskets with.

    For each fingerprint in ``cp_heat`` the baskets returned by
    ``mz_heat.grab_basks`` are filtered down to those containing at least
    two members of the run's cluster (``cp_heat.cluster``). Every other run
    in a kept basket that is also present in ``cp_heat.map`` contributes the
    difference between its fingerprint and the current one, weighted by the
    product of the two ``self.bask_prob`` values. The weighted differences
    are averaged, divided by twice the largest weight seen for the run, and
    added to the fingerprint. A textual report is written to ``output``.

    Args:
        mz_heat: object providing ``grab_basks(run_name)``; each basket is
            used through ``keys()`` and is passed to ``self.bask_prob``.
        cp_heat: object providing ``features()``, ``fingerprints()``,
            ``cluster(...)``, a ``map`` attribute supporting ``in``, and
            lookup by run name.
        output: object with a ``write(str)`` method.

    Returns:
        A new ``cp.cp`` container holding the adjusted value of every
        (run, feature) pair, stored under both ``[run][feature]`` and
        ``[feature][run]``.
    """
    new_cp = cp.cp(None)
    for feat in cp_heat.features():
        new_cp[str(feat)] = cp.feature(str(feat))

    # Fetch the fingerprints once and reuse them for the bar and the loop.
    fingerprints = cp_heat.fingerprints()
    widgets = ['VectorMove: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ',
               ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=len(fingerprints)).start()

    total_movement = 0.0
    moved_runs = 0
    for runcount, run in enumerate(fingerprints, 1):
        run_name = str(run)
        output.write("\t" + run_name + "\n")
        pbar.update(runcount)

        all_basks = list(mz_heat.grab_basks(run_name))
        # Cluster members get a trailing "_" so they can be intersected with
        # basket keys (the same character is stripped again further down).
        inruns = {
            inrun + "_"
            for inrun in cp_heat.cluster(run_name, max_tolerance=0.5, min_tolerance=0.65)
        }
        # Keep only baskets that contain at least two members of the cluster.
        basks = [bask for bask in all_basks if len(set(bask.keys()) & inruns) >= 2]

        # list(...) keeps this working when keys()/values() return views
        # rather than lists.
        labels = list(run.keys())
        run_vec = numpy.array(list(run.values()), dtype=float)
        add_vector = numpy.zeros(len(labels))
        vector_num = 0
        largest_scaler = 0.0

        for bask in basks:
            av_scaler = 0.0
            for connect_run in bask.keys():
                cprun = connect_run.replace("_", "")
                # A run is never connected to itself, and runs missing from
                # the cp heat map are ignored.
                if cprun == run_name or cprun not in cp_heat.map:
                    continue
                neighbour = cp_heat[cprun]
                # Read the neighbour in `labels` order so both vectors line up.
                vec_dif = numpy.array([neighbour[val] for val in labels]) - run_vec
                scaler = self.bask_prob(bask, neighbour) * self.bask_prob(bask, run)
                if scaler >= largest_scaler:
                    largest_scaler = scaler
                av_scaler += scaler
                add_vector += vec_dif * scaler
                vector_num += 1
            if vector_num != 0:
                # NOTE(review): vector_num counts the vectors of every basket
                # handled so far for this run, not only this basket's, so the
                # reported per-basket average is diluted -- confirm intent.
                av_scaler /= vector_num
                output.write("\t\t\t{}; {}\n".format(str(bask), av_scaler))

        if vector_num != 0:
            add_vector /= vector_num
        if largest_scaler != 0:
            add_vector /= 2 * largest_scaler
        run_vec += add_vector

        new_cp[run_name] = cp.fingerprint(run_name)
        for param, value in zip(labels, run_vec):
            new_cp[param][run_name] = new_cp[run_name][param] = value

        # Euclidean length of the shift that was applied to this run.
        movement = numpy.sqrt(add_vector.dot(add_vector))
        output.write("\t\tRun Movement: {}\n".format(movement))
        total_movement += movement
        moved_runs += 1

    pbar.finish()
    # Avoid a ZeroDivisionError when no run was processed.
    average_movement = total_movement / moved_runs if moved_runs else 0.0
    output.write("Average Movement: " + str(average_movement) + "\n")
    return new_cp