def __init__(self, id, organism, membership, ratios, seqtype, config_params=None):
    """Set up a motif scoring function for a single sequence type.

    Initializes the scoring base class, configures the MEME suite from
    `config_params`, creates the two run logs, retrieves the scan sequences
    for every row name in `ratios` and builds the reverse map through
    `__build_reverse_map`.
    """
    scoring.ScoringFunctionBase.__init__(
        self, id, organism, membership, ratios, config_params=config_params)

    # attributes accessible by subclasses
    self.seqtype = seqtype
    self.__setup_meme_suite(config_params)
    meme_params = config_params['MEME']
    self.num_motif_func = util.get_iter_fun(
        meme_params, "nmotifs", config_params['num_iterations'])

    # state carried over between iterations
    self.__last_motif_infos = None
    self.__last_iteration_result = {}
    self.all_pvalues = None
    self.last_result = None

    self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
    self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

    # sequences of all genes in the input ratios, in sorted gene order
    gene_names = sorted(ratios.row_names)
    self.used_seqs = organism.sequences_for_genes_scan(
        gene_names, seqtype=self.seqtype)

    logging.debug("building reverse map...")
    began_at = util.current_millis()
    self.reverse_map = self.__build_reverse_map(ratios)
    logging.debug("reverse map built in %d ms.",
                  util.current_millis() - began_at)

    # caches the results of the previous meme run
    self.__last_results = None
def do_compute(self, iteration_result, ref_matrix=None):
    """Compute the network score matrix over all configured networks.

    For every network the per-cluster scores are computed and folded into
    a genes x clusters matrix using the network's weight. The raw
    per-network scores are pickled to `network_scores_pickle_path()` and
    kept in `self.network_scores`. Finally `subtract_with_quantile(0.99)`
    is applied to the matrix, which is then returned.

    `iteration_result` and `ref_matrix` are accepted for interface
    compatibility and are not read here.
    """
    # networks are loaded on first use and cached afterwards
    if self.__networks is None:
        self.__networks = retrieve_networks(self.__organism)

    gene_names = self.gene_names()
    num_clusters = self.num_clusters()
    matrix = dm.DataMatrix(len(gene_names), num_clusters, gene_names)

    # a dictionary that holds the scores of each gene in a given cluster
    network_iteration_scores = {cluster: {}
                                for cluster in xrange(1, num_clusters + 1)}
    network_scores = {}
    for network in self.__networks:
        logging.info("Compute scores for network '%s', WEIGHT: %f",
                     network.name, network.weight)
        start_time = util.current_millis()
        network_score = self.__compute_network_cluster_scores(network)
        network_scores[network.name] = network_score
        self.__update_score_matrix(matrix, network_score, network.weight)
        elapsed = util.current_millis() - start_time
        logging.info("NETWORK '%s' SCORING TIME: %f s.",
                     network.name, (elapsed / 1000.0))
        # additional scoring information, not used for the actual clustering
        self.__update_network_iteration_scores(network_iteration_scores,
                                               network_score, network.weight)

    # The result was never used by the original code; the call itself is
    # kept in case it has side effects (NOTE(review): confirm and drop).
    compute_iteration_scores(network_iteration_scores)

    # pickle streams are binary data, so write through a binary handle
    with open(self.network_scores_pickle_path(), 'wb') as outfile:
        cPickle.dump(network_scores, outfile)

    # immediately use in means computation
    self.network_scores = network_scores
    matrix.subtract_with_quantile(0.99)
    return matrix
def update(self, matrix, row_scores, column_scores, iteration,
           num_iterations, add_fuzz=True):
    """Top-level membership update for one iteration.

    Optionally fuzzifies the row and column scores, derives the density
    scores from them, applies the size compensation and finally updates
    the memberships from the resulting density scores.

    The unused timing locals, the unused rows-per-cluster list and the
    commented-out log statements of the previous version were removed.
    """
    if add_fuzz:
        row_scores, column_scores = self.__fuzzify(
            row_scores, column_scores, iteration, num_iterations)

    rd_scores, cd_scores = get_density_scores(self, row_scores, column_scores)
    compensate_size(self, matrix, rd_scores, cd_scores)
    self.__update_memberships(rd_scores, cd_scores)
def __init__(self, organism, membership, matrix, meme_suite, seqtype,
             sequence_filters=None, pvalue_filter=None, scaling_func=None,
             update_in_iteration=lambda iteration: True,
             motif_in_iteration=lambda iteration: True,
             config_params=None):
    """Create a motif scoring function.

    organism, meme_suite and seqtype are stored as public attributes;
    sequence_filters and pvalue_filter are stored privately.
    update_in_iteration and motif_in_iteration are callables taking an
    iteration number; both default to "always".

    Fixes over the previous version:
    - `run_in_iteration` was read although it is not a parameter, and the
      two schedule parameters were never stored.
    - `sequence_filters` no longer uses a shared mutable default list.
    """
    scoring.ScoringFunctionBase.__init__(self, membership,
                                         matrix, scaling_func, config_params)

    # attributes accessible by subclasses
    self.organism = organism
    self.meme_suite = meme_suite
    self.seqtype = seqtype
    self.update_in_iteration = update_in_iteration
    self.motif_in_iteration = motif_in_iteration
    # NOTE(review): the old code assigned self.run_in_iteration from an
    # undefined name. The attribute is kept as an alias of the update
    # schedule in case other code reads it - confirm and remove if unused.
    self.run_in_iteration = update_in_iteration

    self.__sequence_filters = ([] if sequence_filters is None
                               else sequence_filters)
    self.__pvalue_filter = pvalue_filter
    self.__last_computed_result = None

    # precompute the sequences for all genes that are referenced in the
    # input ratios, they are used as a basis to compute the background
    # distribution for every cluster
    self.used_seqs = organism.sequences_for_genes_scan(
        sorted(matrix.row_names()), seqtype=self.seqtype)
    logging.info("used sequences for motifing retrieved")

    logging.info("building reverse map...")
    start_time = util.current_millis()
    self.reverse_map = self.__build_reverse_map(matrix)
    logging.info("reverse map built in %d ms.",
                 util.current_millis() - start_time)
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    For every cluster the rows whose first membership entry equals the
    cluster number form a submatrix; its negated column scores become one
    column of a (columns x clusters) score table. Each data column is then
    assigned `num_clusters_per_column` clusters via `util.rorder`.

    Returns a list with one `util.rorder` result per data column.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    row_names = data_matrix.row_names

    # one score column per cluster, computed on that cluster's submatrix
    cscores = np.zeros([num_cols, num_clusters])
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [row_names[row_index]
                                for row_index in xrange(num_rows)
                                if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        _, scores = scoring.compute_column_scores_submatrix(submatrix)
        cscores.T[cluster_num - 1] = -scores

    start_time = util.current_millis()
    column_members = [util.rorder(cscores[i], num_clusters_per_column)
                      for i in xrange(num_cols)]
    elapsed = util.current_millis() - start_time
    # elapsed is in milliseconds: divide (not modulo) to report seconds
    logging.debug("seed column members in %f s.", elapsed / 1000.0)
    return column_members
def compute_row_scores(membership, matrix, num_clusters, use_multiprocessing):
    """For each cluster 1, 2, .. num_clusters compute the row scores for
    each row name in the input matrix.

    The per-cluster scores are computed, passed through
    `__replace_non_numeric_values` and arranged into a DataMatrix whose
    rows are indexed by gene and whose columns represent clusters. The
    result is returned sorted by row name.

    The timing variables of the previous version were only used by
    commented-out log statements and were removed.
    """
    cluster_row_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, use_multiprocessing)
    cluster_row_scores = __replace_non_numeric_values(
        cluster_row_scores, membership, matrix, num_clusters)

    # rearrange result into a DataMatrix, where rows are indexed by gene
    # and columns represent clusters
    values = np.zeros((matrix.num_rows(), num_clusters))

    # note that cluster is 0 based on a matrix
    for cluster in xrange(num_clusters):
        values[:, cluster] = cluster_row_scores[cluster]

    result = dm.DataMatrix(matrix.num_rows(), num_clusters,
                           row_names=matrix.row_names(),
                           values=values)
    return result.sorted_by_row_name()
def compute_row_scores(membership, matrix, num_clusters, config_params):
    """Compute the row scores of every row for clusters 1..num_clusters.

    Returns a DataMatrix with one row per row of `matrix` (same row names)
    and one column per cluster.
    """
    scoring_started = util.current_millis()
    per_cluster_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, config_params)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version
    logging.debug("__compute_row_scores_for_clusters() in %f s.",
                  (util.current_millis() - scoring_started) / 1000.0)

    # Arrange the scores as a genes x clusters table; column index k holds
    # the scores stored at position k of the per-cluster result.
    assembly_started = util.current_millis()
    score_table = np.zeros((matrix.num_rows, num_clusters))
    column = 0
    while column < num_clusters:
        score_table[:, column] = per_cluster_scores[column]
        column += 1

    scores_matrix = dm.DataMatrix(matrix.num_rows, num_clusters,
                                  row_names=matrix.row_names,
                                  values=score_table)
    logging.debug("made result matrix in %f s.",
                  (util.current_millis() - assembly_started) / 1000.0)
    return scores_matrix
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    Builds a (columns x clusters) table of negated column scores, one
    cluster at a time, from the submatrix of rows whose first membership
    entry is that cluster. Every data column then receives
    `num_clusters_per_column` clusters as chosen by `util.rorder`.

    Returns a list with one `util.rorder` result per data column.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    row_names = data_matrix.row_names

    # create a submatrix for each cluster and score its columns
    cscores = np.zeros([num_cols, num_clusters])
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [row_names[row_index]
                                for row_index in xrange(num_rows)
                                if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        _, scores = scoring.compute_column_scores_submatrix(submatrix)
        cscores.T[cluster_num - 1] = -scores

    start_time = util.current_millis()
    column_members = [util.rorder(cscores[i], num_clusters_per_column)
                      for i in xrange(num_cols)]
    elapsed = util.current_millis() - start_time
    # elapsed is in milliseconds: divide (not modulo) to report seconds
    logging.info("seed column members in %f s.", elapsed / 1000.0)
    return column_members
def compute_row_scores(membership, matrix, num_clusters, use_multiprocessing):
    """Compute the row scores of every row for clusters 1..num_clusters.

    Returns a DataMatrix with one row per row of `matrix` (same row names)
    and one column per cluster.
    """
    scoring_started = util.current_millis()
    per_cluster_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, use_multiprocessing)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version
    logging.info("__compute_row_scores_for_clusters() in %f s.",
                 (util.current_millis() - scoring_started) / 1000.0)

    # Arrange the scores as a genes x clusters table; column index k holds
    # the scores stored at position k of the per-cluster result.
    assembly_started = util.current_millis()
    score_table = np.zeros((matrix.num_rows, num_clusters))
    column = 0
    while column < num_clusters:
        score_table[:, column] = per_cluster_scores[column]
        column += 1

    scores_matrix = dm.DataMatrix(matrix.num_rows, num_clusters,
                                  row_names=matrix.row_names,
                                  values=score_table)
    logging.info("made result matrix in %f s.",
                 (util.current_millis() - assembly_started) / 1000.0)
    return scores_matrix
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    For every cluster the negated first row of the column-score matrix of
    the cluster's submatrix is collected. For every data column the
    per-cluster scores are passed to `order`, and the first
    `num_clusters_per_column` entries of its result are kept.

    Returns a list with one such entry list per data column.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    row_names = data_matrix.row_names

    # create a submatrix for each cluster and score its columns
    column_scores = []
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [row_names[row_index]
                                for row_index in xrange(num_rows)
                                if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        scores = -scoring.compute_column_scores_submatrix(submatrix).values[0]
        column_scores.append(scores)

    start_time = util.current_millis()
    column_members = []
    for column_index in xrange(num_cols):
        scores_to_order = [column_scores[cluster_index][column_index]
                           for cluster_index in xrange(num_clusters)]
        column_members.append(
            order(scores_to_order)[:num_clusters_per_column])
    elapsed = util.current_millis() - start_time
    # elapsed is in milliseconds: divide (not modulo) to report seconds
    logging.info("seed column members in %f s.", elapsed / 1000.0)
    return column_members
def __compute(self):
    """Compute the network score matrix over all cached networks.

    Networks are retrieved on first use and cached. Each network's cluster
    scores are folded into a genes x clusters matrix using the network's
    weight. Returns the matrix minus its 0.99 quantile.

    The timing variables and the commented-out diagnostics of the previous
    version were unused and have been removed.
    """
    # networks are cached
    if self.__networks is None:
        self.__networks = retrieve_networks(self.__organism)

    gene_names = self.gene_names()
    matrix = dm.DataMatrix(len(gene_names), self.num_clusters(), gene_names)

    for network in self.__networks:
        network_score = self.__compute_network_cluster_scores(network)
        self.__update_score_matrix(matrix, network_score, network.weight())

    return matrix - matrix.quantile(0.99)
def run_iterations(self, row_scoring, col_scoring):
    """Run all iterations from 'start_iteration' to 'num_iterations'.

    Per iteration: compute row and column scores, update the membership,
    optionally save a checkpoint, log the mean network score / motif
    p-values and write results and stats at their configured frequencies.
    After the loop the clusters are post-adjusted and scoring is forced
    once more under iteration number num_iterations + 1.
    """
    self.report_params()
    self.write_start_info()

    for iteration in range(self['start_iteration'],
                           self['num_iterations'] + 1):
        logging.info("Iteration # %d", iteration)
        iteration_result = {'iteration': iteration}

        rscores = row_scoring.compute(iteration_result)
        start_time = util.current_millis()
        cscores = col_scoring.compute(iteration_result)
        elapsed = util.current_millis() - start_time
        logging.info("computed column_scores in %f s.", elapsed / 1000.0)
        self.membership().update(self.ratio_matrix, rscores, cscores,
                                 self['num_iterations'], iteration_result)

        if (iteration > 0 and self.CHECKPOINT_INTERVAL and
                iteration % self.CHECKPOINT_INTERVAL == 0):
            self.save_checkpoint_data(iteration, row_scoring, col_scoring)

        # summary values filled in by the scoring functions, if present
        mean_net_score = iteration_result.get('networks', 0.0)
        if 'motif-pvalue' in iteration_result:
            mean_mot_pvalues = iteration_result['motif-pvalue']
            mean_mot_pvalue = "".join(
                " '%s' = %f" % (seqtype, pvalue)
                for seqtype, pvalue in mean_mot_pvalues.items())
        else:
            mean_mot_pvalue = "NA"
        logging.info('mean net = %s | mean mot = %s',
                     str(mean_net_score), mean_mot_pvalue)

        if iteration == 1 or (iteration % RESULT_FREQ == 0):
            self.write_results(iteration_result)

        if iteration == 1 or (iteration % STATS_FREQ == 0):
            self.write_stats(iteration_result)
            # run infos should be written with the same frequency as stats
            self.write_runlog(row_scoring, iteration)

        gc.collect()

    logging.info("Postprocessing: Adjusting the clusters....")
    self.membership().postadjust()

    # the post-processing pass is stored under num_iterations + 1
    iteration = self['num_iterations'] + 1
    iteration_result = {'iteration': iteration}
    logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                 iteration_result['iteration'])
    row_scoring.compute_force(iteration_result)
    self.write_results(iteration_result)
    self.write_stats(iteration_result)
    self.write_finish_info()
    # parenthesized form prints the same text on Python 2 and Python 3
    print("Done !!!!")
def __init__(self, organism, membership, matrix, meme_suite, seqtype,
             sequence_filters=None, scaling_func=None, num_motif_func=None,
             update_in_iteration=lambda iteration: True,
             motif_in_iteration=lambda iteration: True,
             config_params=None):
    """Create a motif scoring function.

    organism, meme_suite, seqtype, num_motif_func and the two schedules
    (update_in_iteration, motif_in_iteration: callables taking an
    iteration number) are stored as public attributes; sequence_filters is
    stored privately.

    Changes over the previous version: `sequence_filters` no longer uses a
    shared mutable default list, and the disabled sequence-cache code that
    was kept as a bare string literal was removed.
    """
    # run_in_iteration does not apply here, since we actually have
    # two schedules, motif_in_iteration and update_in_iteration here
    scoring.ScoringFunctionBase.__init__(self, membership,
                                         matrix, scaling_func,
                                         run_in_iteration=None,
                                         config_params=config_params)

    # attributes accessible by subclasses
    self.organism = organism
    self.meme_suite = meme_suite
    self.seqtype = seqtype
    self.update_in_iteration = update_in_iteration
    self.motif_in_iteration = motif_in_iteration
    self.num_motif_func = num_motif_func

    self.__sequence_filters = ([] if sequence_filters is None
                               else sequence_filters)
    self.__last_run_results = None
    self.__last_iteration_result = {}

    self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
    self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

    # precompute the sequences for all genes that are referenced in the
    # input ratios, they are used as a basis to compute the background
    # distribution for every cluster
    used_genes = sorted(matrix.row_names)
    self.used_seqs = organism.sequences_for_genes_scan(
        used_genes, seqtype=self.seqtype)

    logging.info("building reverse map...")
    start_time = util.current_millis()
    self.reverse_map = self.__build_reverse_map(matrix)
    logging.info("reverse map built in %d ms.",
                 util.current_millis() - start_time)
def compute(self, iteration_result, ref_matrix=None):
    """Compute the column scores for the current membership.

    Delegates to `compute_column_scores` with this function's membership,
    matrix and cluster count and returns its result. `iteration_result`
    and `ref_matrix` are accepted for interface compatibility and are not
    read here.

    The timing variables of the previous version only fed a commented-out
    log statement and were removed.
    """
    return compute_column_scores(self.membership(), self.matrix(),
                                 self.num_clusters())
def compute(self, iteration_result, ref_matrix):
    """Compute the set enrichment scores.

    Note: will return None if not computed yet and the result of a previous
    scoring if the function is not supposed to actually run in this
    iteration.

    The matrix, membership, reference matrix and current set type are
    published through module-level globals that `compute_cluster_score`
    reads. With multiprocessing enabled the clusters are scored in a
    worker pool, which is now always closed and joined, even if scoring
    raises.
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_REF_MATRIX, SET_SET_TYPE
    iteration = iteration_result['iteration']

    if self.__run_in_iteration(iteration):
        logging.info("Compute scores for set enrichment...")
        start_time = util.current_millis()
        gene_names = self.gene_names()
        num_genes = len(gene_names)
        num_clusters = self.num_clusters()
        matrix = dm.DataMatrix(num_genes, num_clusters, gene_names)
        use_multiprocessing = self.config_params[
            scoring.KEY_MULTIPROCESSING]
        SET_MATRIX = self.matrix()
        SET_MEMBERSHIP = self.membership()
        SET_REF_MATRIX = ref_matrix

        # NOTE(review): every set type writes into the same matrix cells,
        # so the last set type's scores are the ones that remain.
        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            if use_multiprocessing:
                pool = mp.Pool()
                try:
                    results = pool.map(
                        compute_cluster_score,
                        [(cluster, self.bonferroni_cutoff())
                         for cluster in xrange(1, num_clusters + 1)])
                finally:
                    # release the worker processes in every case
                    pool.close()
                    pool.join()
            else:
                results = [
                    compute_cluster_score((cluster,
                                           self.bonferroni_cutoff()))
                    for cluster in xrange(1, num_clusters + 1)]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            for cluster in xrange(1, num_clusters + 1):
                # store the best enriched set determined
                scores, min_set, min_pvalue = results[cluster - 1]
                self.__last_min_enriched_set[set_type][cluster] = (
                    min_set, min_pvalue)

                for row in xrange(num_genes):
                    matrix[row][cluster - 1] = scores[row]

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
        self.__last_computed_result = matrix

    return self.__last_computed_result
def weighted_row_means(matrix, weights):
    """Compute weighted row means.

    Multiplies `matrix` by `weights`, takes the row means of the product
    via `util.row_means` and divides them by the sum of the non-NaN
    weights.

    The timing variables of the previous version only fed a commented-out
    log statement and were removed.
    """
    # Multiply each column of matrix with each component of the weight
    # vector. The original author measured this to be about 1000x faster
    # than apply_along_axis() (125 s vs. 0.125 s).
    # NOTE(review): `*` broadcasts elementwise for ndarrays but is a matrix
    # product for np.matrix operands - confirm which type callers pass.
    scaled = weights * matrix

    # NaN weights are masked out so they do not contribute to the scale
    scale = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
    return util.row_means(scaled) / scale
def __fuzzify(self, row_scores, column_scores, iteration, num_iterations):
    """Provide an iteration-specific fuzzification.

    Collects the scores of the current cluster members (rows for the row
    scores, columns for the column scores), draws noise from
    `util.sd_rnorm` based on those values and the iteration's fuzzy
    coefficient, and adds the noise in place to the value arrays obtained
    from `row_scores.values()` and `column_scores.values()`.

    Returns the (row_scores, column_scores) objects that were passed in.
    """
    fuzzy_coeff = std_fuzzy_coefficient(iteration, num_iterations)
    num_row_fuzzy_values = row_scores.num_rows() * row_scores.num_columns()
    num_col_fuzzy_values = (column_scores.num_rows() *
                            column_scores.num_columns())

    # optimization: unwrap the numpy arrays to access them directly
    row_score_values = row_scores.values()
    col_score_values = column_scores.values()

    def member_values(scores, values, members_for_cluster):
        """Collect values[row][col] for all rows that are members of
        cluster col + 1, cluster by cluster."""
        names = scores.row_names()
        num_rows = scores.num_rows()
        collected = []
        for col in xrange(scores.num_columns()):
            # a set keeps the membership test cheap inside the row loop
            members = set(members_for_cluster(col + 1))
            collected.extend(values[row][col]
                             for row in xrange(num_rows)
                             if names[row] in members)
        return collected

    # Note: If there are no non-NaN values in row_sd_values, row_rnorm
    # will have all NaNs
    row_sd_values = member_values(row_scores, row_score_values,
                                  self.rows_for_cluster)
    row_rnorm = util.sd_rnorm(row_sd_values, num_row_fuzzy_values,
                              fuzzy_coeff)

    # Note: If there are no non-NaN values in col_sd_values, col_rnorm
    # will have all NaNs
    col_sd_values = member_values(column_scores, col_score_values,
                                  self.columns_for_cluster)
    col_rnorm = util.sd_rnorm(col_sd_values, num_col_fuzzy_values,
                              fuzzy_coeff)

    # add fuzzy values to the row/column scores
    row_score_values += np.array(row_rnorm).reshape(
        row_scores.num_rows(), row_scores.num_columns())
    col_score_values += np.array(col_rnorm).reshape(
        column_scores.num_rows(), column_scores.num_columns())
    return row_scores, column_scores
def compute(self, iteration_result, ref_matrix=None):
    """Compute the row scores for the current membership.

    Logs the time taken, records the run together with the scaling for
    iteration_result['iteration'] in the run log and returns the scores.
    """
    began_at = util.current_millis()
    membership = self.membership()
    matrix = self.matrix()
    num_clusters = self.num_clusters()
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    row_scores = compute_row_scores(membership, matrix, num_clusters,
                                    use_multiprocessing)

    millis_taken = util.current_millis() - began_at
    logging.info("ROW SCORING TIME: %f s.", (millis_taken / 1000.0))

    current_scaling = self.scaling(iteration_result['iteration'])
    self.run_log.log(True, current_scaling)
    return row_scores
def compute(self, iteration_result, ref_matrix=None):
    """Compute method for the specified iteration.

    Note: will return None if not computed yet and the result of a previous
    scoring if the function is not supposed to actually run in this
    iteration.

    When the motif schedule fires, the p-values are recomputed through
    `compute_pvalues`. When p-values are available and the update schedule
    fires, they are remapped from feature ids to gene names through
    `reverse_map` and written into a fresh genes x clusters matrix, which
    becomes the cached result. Both schedules are recorded in their run
    logs and iteration_result['motifs'] is set in every case.
    """
    iteration = iteration_result['iteration']
    logging.info('Scoring motifs...')
    global_start_time = util.current_millis()

    if self.motif_in_iteration(iteration):  # meme.iter in R
        logging.info('Running Motifing...')
        self.__last_iteration_result = {}
        self.__last_pvalues = self.compute_pvalues(
            self.__last_iteration_result, iteration)

    # running MEME and store the result for the non-motifing iterations
    # to reuse
    if (self.__last_pvalues is not None and
            self.update_in_iteration(iteration)):  # mot.iter in R
        logging.info('Recomputing motif scores...')
        # running the scoring itself
        # values are returned here - consider remapping them per gene
        remapped = {}
        for cluster in self.__last_pvalues:
            pvalues_k = self.__last_pvalues[cluster]
            remapped[cluster] = {self.reverse_map[feature_id]: pvalue
                                 for feature_id, pvalue in pvalues_k.items()}

        # convert remapped to an actual scoring matrix
        gene_names = self.gene_names()
        num_clusters = self.num_clusters()
        matrix = dm.DataMatrix(len(gene_names), num_clusters, gene_names)
        for row_index in xrange(matrix.num_rows()):
            row = matrix.row_name(row_index)
            for cluster in xrange(1, num_clusters + 1):
                # direct dict membership avoids building key lists
                if cluster in remapped and row in remapped[cluster]:
                    matrix[row_index][cluster - 1] = remapped[cluster][row]

        global_elapsed = util.current_millis() - global_start_time
        logging.info("GLOBAL MOTIF TIME: %d seconds",
                     (global_elapsed / 1000.0))
        self.__last_computed_result = matrix

    self.update_log.log(self.update_in_iteration(iteration),
                        self.scaling(iteration))
    self.motif_log.log(self.motif_in_iteration(iteration),
                       self.scaling(iteration))
    iteration_result['motifs'] = self.__last_iteration_result
    return self.__last_computed_result
def __init__(self, organism, membership, ratios, meme_suite, seqtype,
             sequence_filters=None, scaling_func=None, num_motif_func=None,
             update_in_iteration=lambda iteration: True,
             motif_in_iteration=lambda iteration: True,
             config_params=None):
    """Create a motif scoring function.

    meme_suite, seqtype, num_motif_func and the two schedules
    (update_in_iteration, motif_in_iteration: callables taking an
    iteration number) are stored as public attributes; sequence_filters is
    stored privately. The scan sequences of all genes in `ratios` are
    retrieved and the reverse map is built through `__build_reverse_map`.

    `sequence_filters` defaults to None instead of a shared mutable list;
    None is replaced by a fresh empty list.
    """
    # run_in_iteration does not apply here, since we actually have
    # two schedules, motif_in_iteration and update_in_iteration here
    scoring.ScoringFunctionBase.__init__(self, organism, membership,
                                         ratios, scaling_func,
                                         schedule=None,
                                         config_params=config_params)

    # attributes accessible by subclasses
    self.meme_suite = meme_suite
    self.seqtype = seqtype
    self.update_in_iteration = update_in_iteration
    self.motif_in_iteration = motif_in_iteration
    self.num_motif_func = num_motif_func

    self.__sequence_filters = ([] if sequence_filters is None
                               else sequence_filters)
    self.__last_motif_infos = None
    self.__last_iteration_result = {}
    self.all_pvalues = None
    self.last_result = None

    self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
    self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

    used_genes = sorted(ratios.row_names)
    self.used_seqs = organism.sequences_for_genes_scan(
        used_genes, seqtype=self.seqtype)

    logging.info("building reverse map...")
    start_time = util.current_millis()
    self.reverse_map = self.__build_reverse_map(ratios)
    logging.info("reverse map built in %d ms.",
                 util.current_millis() - start_time)
def run_iterations(self, row_scoring, col_scoring):
    """Run all iterations and the optional post-processing step.

    Every iteration from 'start_iteration' to 'num_iterations' is run via
    `run_iteration`, followed by a garbage collection. If 'postadjust' is
    set, the clusters are adjusted afterwards, scoring is forced once more
    and the results are stored under iteration number num_iterations + 1.
    """
    self.report_params()
    self.write_start_info()

    for iteration in range(self['start_iteration'],
                           self['num_iterations'] + 1):
        start_time = util.current_millis()
        self.run_iteration(row_scoring, col_scoring, iteration)
        # garbage collection after everything in iteration went out of scope
        gc.collect()
        elapsed = util.current_millis() - start_time
        logging.info("performed iteration %d in %f s.",
                     iteration, elapsed / 1000.0)

    # run post processing after the last iteration. We store the results
    # in num_iterations + 1 to have a clean separation
    if self['postadjust']:
        logging.info("Postprocessing: Adjusting the clusters....")
        # run combiner using the weights of the last iteration
        rscores = row_scoring.combine_cached(self['num_iterations'])
        rd_scores = memb.get_row_density_scores(self.membership(), rscores)
        logging.info("Recomputed combined + density scores.")
        memb.postadjust(self.membership(), rd_scores)
        logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                     self['num_iterations'])
        iteration_result = {'iteration': self['num_iterations'] + 1}
        combined_scores = row_scoring.compute_force(iteration_result)

        # write the combined scores for benchmarking/diagnostics;
        # pickle streams are binary data, so use a binary handle
        with open(self.combined_rscores_pickle_path(), 'wb') as outfile:
            cPickle.dump(combined_scores, outfile)

        self.write_results(iteration_result)
        self.write_stats(iteration_result)
        # NOTE(review): `iteration` is the loop variable of the loop above;
        # it is unbound if that loop did not execute - confirm the intent.
        self.update_iteration(iteration)

        if self['debug']:
            # write complete result into a cmresults.tsv
            conn = self.__dbconn()
            try:
                path = os.path.join(self['output_dir'],
                                    'cmresults-postproc.tsv.bz2')
                with bz2.BZ2File(path, 'w') as outfile:
                    debug.write_iteration(conn, outfile,
                                          self['num_iterations'] + 1,
                                          self['num_clusters'],
                                          self['output_dir'])
            finally:
                # close the connection even if the dump fails
                conn.close()

    self.write_finish_info()
    logging.info("Done !!!!")
def run_iteration(self, row_scoring, col_scoring, iteration):
    """Run a single iteration.

    Computes row and column scores, updates the membership, optionally
    saves a checkpoint, logs the mean network score / motif p-values,
    writes results and stats at their configured frequencies and, in
    debug mode, dumps the complete iteration result into a bz2-compressed
    TSV file in the output directory.
    """
    logging.info("Iteration # %d", iteration)
    iteration_result = {'iteration': iteration}

    rscores = row_scoring.compute(iteration_result)
    start_time = util.current_millis()
    cscores = col_scoring.compute(iteration_result)
    elapsed = util.current_millis() - start_time
    if elapsed > 0.0001:
        logging.info("computed column_scores in %f s.", elapsed / 1000.0)

    self.membership().update(self.ratio_matrix, rscores, cscores,
                             self['num_iterations'], iteration_result)

    if (iteration > 0 and self['checkpoint_interval'] and
            iteration % self['checkpoint_interval'] == 0):
        self.save_checkpoint_data(iteration, row_scoring, col_scoring)

    # summary values filled in by the scoring functions, if present
    mean_net_score = iteration_result.get('networks', 0.0)
    if 'motif-pvalue' in iteration_result:
        mean_mot_pvalues = iteration_result['motif-pvalue']
        mean_mot_pvalue = "".join(
            " '%s' = %f" % (seqtype, pvalue)
            for seqtype, pvalue in mean_mot_pvalues.items())
    else:
        mean_mot_pvalue = "NA"
    logging.info('mean net = %s | mean mot = %s',
                 str(mean_net_score), mean_mot_pvalue)

    if iteration == 1 or (iteration % self['result_freq'] == 0):
        self.write_results(iteration_result)

    if iteration == 1 or (iteration % self['stats_freq'] == 0):
        self.write_stats(iteration_result)
        self.update_iteration(iteration)

    if self['debug']:
        # write complete result into a cmresults.tsv
        conn = self.__dbconn()
        try:
            path = os.path.join(self['output_dir'],
                                'cmresults-%04d.tsv.bz2' % iteration)
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'],
                                      self['output_dir'])
        finally:
            # close the connection even if the dump fails
            conn.close()
def quantile_normalize_scores(matrices, weights=None):
    """Quantile normalize scores against each other.

    The matrices are flattened and sorted via `as_sorted_flat_values`; the
    row means of that table (weighted when `weights` is given) are then
    used by `qm_result_matrices` to build the result, which is returned.

    `weights` is tested with `is not None`: comparing an array with
    `!= None` is evaluated elementwise by numpy and has no single truth
    value. The unused timing variables of the previous version were
    removed.
    """
    flat_values = as_sorted_flat_values(matrices)

    if weights is not None:
        tmp_mean = weighted_row_means(flat_values, weights)
    else:
        tmp_mean = util.row_means(flat_values)

    return qm_result_matrices(matrices, tmp_mean)
def get_col_density_scores(membership, col_scores):
    """Compute the column density scores for all clusters.

    The bandwidth is one hundredth of the column score range, but at
    least 0.001. Returns a new DataMatrix with the shape and names of
    `col_scores` whose column k - 1 holds the result of `get_cc_scores`
    for cluster k.
    """
    cluster_count = membership.num_clusters()
    score_span = abs(col_scores.max() - col_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)

    density = dm.DataMatrix(col_scores.num_rows, col_scores.num_columns,
                            col_scores.row_names, col_scores.column_names)
    density_values = density.values

    began_at = util.current_millis()
    cluster = 1
    while cluster <= cluster_count:
        # Assign through the transpose so that a whole column is written
        # in a single numpy assignment instead of row by row.
        density_values.T[cluster - 1] = get_cc_scores(
            membership, col_scores, bandwidth, cluster)
        cluster += 1
    millis_taken = util.current_millis() - began_at

    logging.info("CC_SCORES IN %f s.", millis_taken / 1000.0)
    return density
def run_iteration(self, row_scoring, col_scoring, iteration):
    """Run a single iteration.

    Computes row and column scores, updates the membership, optionally
    saves a checkpoint, logs the mean network score / motif p-values,
    writes results and stats at their configured frequencies and, in
    debug mode, dumps the complete iteration result into a bz2-compressed
    TSV file in the output directory.
    """
    logging.info("Iteration # %d", iteration)
    iteration_result = {'iteration': iteration}

    rscores = row_scoring.compute(iteration_result)
    start_time = util.current_millis()
    cscores = col_scoring.compute(iteration_result)
    elapsed = util.current_millis() - start_time
    if elapsed > 0.0001:
        logging.info("computed column_scores in %f s.", elapsed / 1000.0)

    self.membership().update(self.ratio_matrix, rscores, cscores,
                             self['num_iterations'], iteration_result)

    if (iteration > 0 and self['checkpoint_interval'] and
            iteration % self['checkpoint_interval'] == 0):
        self.save_checkpoint_data(iteration, row_scoring, col_scoring)

    # summary values filled in by the scoring functions, if present
    mean_net_score = iteration_result.get('networks', 0.0)
    if 'motif-pvalue' in iteration_result:
        mean_mot_pvalues = iteration_result['motif-pvalue']
        mean_mot_pvalue = "".join(
            " '%s' = %f" % (seqtype, pvalue)
            for seqtype, pvalue in mean_mot_pvalues.items())
    else:
        mean_mot_pvalue = "NA"
    logging.info('mean net = %s | mean mot = %s',
                 str(mean_net_score), mean_mot_pvalue)

    if iteration == 1 or (iteration % self['result_freq'] == 0):
        self.write_results(iteration_result)

    if iteration == 1 or (iteration % self['stats_freq'] == 0):
        self.write_stats(iteration_result)
        self.update_iteration(iteration)

    if self['debug']:
        # write complete result into a cmresults.tsv
        conn = self.__dbconn()
        try:
            path = os.path.join(self['output_dir'],
                                'cmresults-%04d.tsv.bz2' % iteration)
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'],
                                      self['output_dir'])
        finally:
            # close the connection even if the dump fails
            conn.close()
def run_iteration(self, iteration):
    """Run one iteration: score, update the membership and write output.

    Arguments:
    iteration -- number of the iteration to run

    Row and column scores come from self.row_scoring and self.column_scoring.
    Unless 'minimize_io' is set, results and stats are written in iteration 1
    and then every 'result_freq' / 'stats_freq' iterations. If 'dump_results'
    is among the debug options, the complete result is dumped into a
    bz2-compressed cmresults file in iteration 1 and every 'debug_freq'
    iterations.
    """
    logging.info("Iteration # %d", iteration)
    iteration_result = {'iteration': iteration, 'score_means': {}}

    rscores = self.row_scoring.compute(iteration_result)
    start_time = util.current_millis()
    cscores = self.column_scoring.compute(iteration_result)
    elapsed = util.current_millis() - start_time
    if elapsed > 0.0001:
        logging.debug("computed column_scores in %f s.", elapsed / 1000.0)

    self.membership().update(self.ratios, rscores, cscores,
                             self['num_iterations'], iteration_result)

    # the scoring functions may have left summary values in iteration_result
    mean_net_score = iteration_result.get('networks', 0.0)
    if 'motif-pvalue' in iteration_result:
        mean_mot_pvalues = iteration_result['motif-pvalue']
        mean_mot_pvalue = "".join(
            " '%s' = %f" % (seqtype, mean_mot_pvalues[seqtype])
            for seqtype in mean_mot_pvalues.keys())
    else:
        mean_mot_pvalue = "NA"
    logging.debug('mean net = %s | mean mot = %s',
                  str(mean_net_score), mean_mot_pvalue)

    # Reduce I/O: skip the regular result/stat writes when requested
    if not self['minimize_io']:
        if iteration == 1 or (iteration % self['result_freq'] == 0):
            self.write_results(iteration_result)

        if iteration == 1 or (iteration % self['stats_freq'] == 0):
            self.write_stats(iteration_result)
            self.update_iteration(iteration)

    if 'dump_results' in self['debug'] and (
            iteration == 1 or (iteration % self['debug_freq'] == 0)):
        # write complete result into a cmresults.tsv
        conn = self.__dbconn()
        try:
            path = os.path.join(self['output_dir'],
                                'cmresults-%04d.tsv.bz2' % iteration)
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'],
                                      self['output_dir'])
        finally:
            # the connection was previously leaked on every dump
            conn.close()
def get_row_density_scores(membership, row_scores):
    """Build the row density score matrix for all clusters.

    Density scores are meant to improve small clusters. The bandwidth handed
    to get_rr_scores() is one hundredth of the value range of row_scores, but
    never less than 0.001. The result has the same dimensions and
    row/column names as row_scores.
    """
    cluster_count = membership.num_clusters()
    value_range = abs(row_scores.max() - row_scores.min())
    bandwidth = max(value_range / 100.0, 0.001)
    density = dm.DataMatrix(row_scores.num_rows, row_scores.num_columns,
                            row_scores.row_names, row_scores.column_names)
    density_values = density.values

    began = util.current_millis()
    for cluster in xrange(1, cluster_count + 1):
        # writing through the transpose fills a whole result column at once
        density_values.T[cluster - 1] = get_rr_scores(
            membership, row_scores, bandwidth, cluster)
    logging.info("RR_SCORES IN %f s.",
                 (util.current_millis() - began) / 1000.0)
    return density
def update(self, matrix, row_scores, column_scores,
           num_iterations, iteration_result):
    """Top-level membership update for one iteration.

    Steps, each of them timed and logged: fuzzify the scores, derive the
    density scores, compensate for cluster size, then update the row and the
    column memberships. In the last iteration the (possibly fuzzed) row
    scores are pickled so the post adjustment step can use them.
    """
    def seconds_since(began):
        """Seconds elapsed since the millisecond timestamp began."""
        return (util.current_millis() - began) / 1000.0

    began = util.current_millis()
    row_scores, column_scores = fuzzify(self, row_scores, column_scores,
                                        num_iterations, iteration_result,
                                        self.__config_params['add_fuzz'])
    logging.info("fuzzify took %f s.", seconds_since(began))

    # only the final iteration's row scores are needed for post adjustment
    if iteration_result['iteration'] == num_iterations:
        with open(self.pickle_path(), 'w') as outfile:
            cPickle.dump(row_scores, outfile)

    began = util.current_millis()
    rd_scores, cd_scores = get_density_scores(self, row_scores,
                                              column_scores)
    logging.info("GET_DENSITY_SCORES() took %f s.", seconds_since(began))

    began = util.current_millis()
    compensate_size(self, matrix, rd_scores, cd_scores)
    logging.info("COMPENSATE_SIZE() took %f s.", seconds_since(began))

    began = util.current_millis()
    update_for_rows(self, rd_scores,
                    self.__config_params['multiprocessing'],
                    self.__config_params['debug'])
    logging.info("update_for rdscores finished in %f s.",
                 seconds_since(began))

    began = util.current_millis()
    update_for_cols(self, cd_scores,
                    self.__config_params['multiprocessing'],
                    self.__config_params['debug'])
    logging.info("update_for cdscores finished in %f s.",
                 seconds_since(began))
def do_compute(self, iteration_result, ref_matrix=None):
    """Compute the network score matrix (genes x clusters).

    Every network returned by self.networks() is scored per cluster and
    folded into the result matrix using the network's weight. The score
    means derived from the per-network scores are kept in self.score_means.
    """
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    scores_by_network = {}

    for network in self.networks():
        logging.debug("Compute scores for network '%s', WEIGHT: %f",
                      network.name, network.weight)
        began = util.current_millis()
        cluster_scores = self.__compute_network_cluster_scores(network)
        scores_by_network[network.name] = cluster_scores
        self.__update_score_matrix(matrix, cluster_scores, network.weight)
        took = util.current_millis() - began
        logging.debug("NETWORK '%s' SCORING TIME: %f s.",
                      network.name, (took / 1000.0))

    # compute and store score means
    self.score_means = self.__update_score_means(scores_by_network)
    return matrix
def compute(self, iteration_result, ref_matrix=None):
    """Compute the combined score of all scoring functions for one iteration.

    Arguments:
    iteration_result -- dictionary holding at least the key 'iteration'
    ref_matrix -- optional reference matrix passed on to the functions

    Returns the sum of the result matrices, each multiplied by the scaling
    of the function that produced it, or None if no function returned a
    matrix. More than one matrix is quantile normalized before summing.
    """
    result_matrices = []
    score_scalings = []
    reference_matrix = ref_matrix
    iteration = iteration_result['iteration']

    for scoring_function in self.__scoring_functions:
        # This is actually a hack in order to propagate a reference matrix
        # to the compute function: without an explicit one, the first
        # result computed so far serves as reference
        if reference_matrix is None and len(result_matrices) > 0:
            reference_matrix = result_matrices[0]

        matrix = scoring_function.compute(iteration_result, reference_matrix)
        if matrix is not None:
            result_matrices.append(matrix)
            score_scalings.append(scoring_function.scaling(iteration))
            if self.__log_subresults:
                self.__log_subresult(scoring_function, matrix)

    if len(result_matrices) == 0:
        return None

    if len(result_matrices) > 1:
        result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                       score_scalings)

    # score_scalings is aligned with result_matrices. Indexing the scoring
    # functions instead picks the wrong scaling as soon as one of them
    # returned None and was therefore skipped.
    combined_score = result_matrices[0] * score_scalings[0]
    for matrix, scaling in zip(result_matrices[1:], score_scalings[1:]):
        combined_score += matrix * scaling
    return combined_score
def __init__(self, id, organism, membership, ratios, seqtype,
             config_params=None):
    """Set up a motif scoring function for the sequence type seqtype.

    Besides the base class initialization this configures the MEME suite,
    creates the run logs, fetches the scan sequences for all genes in the
    ratio matrix and builds the reverse map.
    """
    scoring.ScoringFunctionBase.__init__(self, id, organism, membership,
                                         ratios,
                                         config_params=config_params)
    # attributes accessible by subclasses
    self.seqtype = seqtype
    self.__setup_meme_suite(config_params)
    self.num_motif_func = util.get_iter_fun(config_params['MEME'],
                                            "nmotifs",
                                            config_params['num_iterations'])

    self.__last_motif_infos = None
    self.__last_iteration_result = {}
    self.all_pvalues = None
    self.last_result = None

    self.update_log = scoring.RunLog("motif-score-" + seqtype,
                                     config_params)
    self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

    genes_in_ratios = sorted(ratios.row_names)
    self.used_seqs = organism.sequences_for_genes_scan(
        genes_in_ratios, seqtype=self.seqtype)

    logging.debug("building reverse map...")
    began = util.current_millis()
    self.reverse_map = self.__build_reverse_map(ratios)
    logging.debug("reverse map built in %d ms.",
                  util.current_millis() - began)

    # caches the results of the previous meme run
    self.__last_results = None
def __combine(self, result_matrices, score_scalings, iteration):
    """Combine result matrices into one weighted score matrix.

    Arguments:
    result_matrices -- the matrices to combine
    score_scalings -- one scaling per matrix, in the same order
    iteration -- current iteration number (kept for interface
                 compatibility, the scalings are passed in explicitly)

    With the 'quantile_normalize' option set and more than one matrix, the
    matrices are quantile normalized first. Returns the sum of all matrices
    multiplied by their scaling, or None if there are no matrices.
    """
    if len(result_matrices) > 1 and self.__config_params['quantile_normalize']:
        start_time = util.current_millis()
        result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                       score_scalings)
        elapsed = util.current_millis() - start_time
        logging.info("quantile normalize in %f s.", elapsed / 1000.0)

    if len(result_matrices) == 0:
        return None

    start_time = util.current_millis()
    # use the scalings that belong to the matrices: the position of a matrix
    # in result_matrices need not match the position of its scoring function
    # when some functions did not produce a result
    combined_score = result_matrices[0] * score_scalings[0]
    for index in range(1, len(result_matrices)):
        combined_score += result_matrices[index] * score_scalings[index]
    elapsed = util.current_millis() - start_time
    logging.info("combined score in %f s.", elapsed / 1000.0)
    return combined_score
def quantile_normalize_scores(matrices, weights=None):
    """Quantile normalize the score matrices against each other.

    The sorted values of every matrix form one column of an intermediate
    matrix. Its row means (weighted if weights are given, NaN weights are
    ignored when summing the weights) are handed to qm_result_matrices()
    together with the input matrices.
    """
    logging.info("COMPUTING WEIGHTED MEANS...")
    began = util.current_millis()
    # one column per input matrix, holding that matrix' values in sorted
    # order
    sorted_values = [np.sort(m.values.flatten()) for m in matrices]
    flat_values = np.asarray(sorted_values).T
    logging.info("flattened/sorted score matrices in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    if weights is None:
        tmp_mean = util.row_means(flat_values)
    else:
        # broadcasting multiplies each column with its weight, this was
        # reported as roughly 1000x faster than apply_along_axis()
        scaled = weights * flat_values
        weight_sum = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
        tmp_mean = util.row_means(scaled) / weight_sum
    logging.info("weighted means in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    result = qm_result_matrices(matrices, tmp_mean)
    logging.info("result matrices built in %f s.",
                 (util.current_millis() - began) / 1000.0)
    return result
def quantile_normalize_scores(matrices, weights=None):
    """Quantile normalize the score matrices against each other.

    The sorted values of every matrix form one column of an intermediate
    matrix. Its row means (weighted if weights are given, NaN weights are
    ignored when summing the weights) are handed to qm_result_matrices()
    together with the input matrices.
    """
    logging.info("COMPUTING WEIGHTED MEANS...")
    began = util.current_millis()
    # one column per input matrix, holding that matrix' values in sorted
    # order
    sorted_values = [np.sort(m.values.flatten()) for m in matrices]
    flat_values = np.asarray(sorted_values).T
    logging.info("flattened/sorted score matrices in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    if weights is None:
        tmp_mean = util.row_means(flat_values)
    else:
        # broadcasting multiplies each column with its weight, this was
        # reported as roughly 1000x faster than apply_along_axis()
        scaled = weights * flat_values
        weight_sum = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
        tmp_mean = util.row_means(scaled) / weight_sum
    logging.info("weighted means in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    result = qm_result_matrices(matrices, tmp_mean)
    logging.info("result matrices built in %f s.",
                 (util.current_millis() - began) / 1000.0)
    return result
def compute(self, iteration_result, ref_matrix=None):
    """Compute the motif score matrix for the specified iteration.

    If the function is scheduled to run in this iteration, the p-values of
    compute_pvalues() are remapped from feature ids to gene names and stored
    in a genes x clusters matrix, which is cached. In all other iterations
    the cached result of the previous run is returned.
    """
    iteration = iteration_result['iteration']
    if self.run_in_iteration(iteration):
        logging.info('Scoring motifs...')
        global_start_time = util.current_millis()

        # here is the main compute of the cluster scores
        pvalues = self.compute_pvalues(iteration_result)

        # the p-values are keyed by feature id, remap them per gene
        remapped = {}
        for cluster in pvalues:
            pvalues_genes = {}
            for feature_id, pvalue in pvalues[cluster].items():
                pvalues_genes[self.reverse_map[feature_id]] = pvalue
            remapped[cluster] = pvalues_genes

        # convert remapped to an actual scoring matrix
        matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())
        for row_index in xrange(matrix.num_rows()):
            row = matrix.row_name(row_index)
            for cluster in xrange(1, self.num_clusters() + 1):
                # test membership on the dictionaries themselves, going
                # through keys() creates a list per test in Python 2
                if cluster in remapped and row in remapped[cluster]:
                    matrix[row_index][cluster - 1] = remapped[cluster][row]

        global_elapsed = util.current_millis() - global_start_time
        logging.info("GLOBAL MOTIF TIME: %d seconds",
                     (global_elapsed / 1000.0))
        self.__last_computed_result = matrix

    return self.__last_computed_result
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """Combine n result matrices into one score matrix.

    Arguments:
    result_matrices -- the score matrices, matrix 0 is assumed to be the
                       gene expression score
    score_scalings -- one scaling per matrix, in the same order
    membership -- cluster membership, used to collect the scores of the
                  cluster members when not quantile normalizing
    iteration -- current iteration number
    config_params -- configuration, the keys 'quantile_normalize' and
                     'debug' are always read

    Every matrix is modified in place (extreme values fixed, 0.99 quantile
    subtracted). Returns a DataMatrix named like matrix 0 that holds the
    scaled sum, or None if result_matrices is empty.
    """
    quantile_normalize = config_params['quantile_normalize']
    if len(result_matrices) == 0:
        # nothing to combine; without this guard the non-normalizing path
        # failed on result_matrices[0]
        return None

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug'] and
                (iteration == 1 or
                 (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args'][
                'functions']
            m.write_tsv_file(
                os.path.join(config_params['output_dir'],
                             'score-%s-%04d.tsv' % (funs[i]['id'],
                                                    iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(
                result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index
                     for index, name in enumerate(mat.row_names)}

        # collect the matrix 0 scores of all cluster members
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])

        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)

        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols, mat.row_names,
                                    mat.column_names, values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                # bring matrix i onto the scale of the row scores, with
                # fallbacks for sparse matrices whose quantile is 0
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values',
                        i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value',
                        i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    start_time = util.current_millis()
    # assuming same format of all matrices
    combined_score = np.zeros(in_matrices[0].shape)
    for i, values in enumerate(in_matrices):
        combined_score += values * score_scalings[i]
    elapsed = util.current_millis() - start_time
    logging.debug("combined score in %f s.", elapsed / 1000.0)

    matrix0 = result_matrices[0]  # as reference for names
    return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                         matrix0.row_names, matrix0.column_names,
                         values=combined_score)
def run_iteration(self, iteration, force=False):
    """Run a single cMonkey iteration

    Keyword arguments:
    iteration -- The iteration number to run
    force     -- Set to true to force recalculations (DEFAULT:FALSE)

    Unless 'minimize_io' is set, results are written in iteration 1 and
    every 'result_freq' iterations. Stats are written in iteration 1 and
    every 'stats_freq' iterations. If 'dump_results' is among the debug
    options, the complete result is dumped into a bz2-compressed cmresults
    file in iteration 1 and every 'debug_freq' iterations.
    """
    logging.info("Iteration # %d", iteration)
    iteration_result = {'iteration': iteration, 'score_means': {}}

    if force:
        rscores = self.row_scoring.compute_force(iteration_result)
    else:
        rscores = self.row_scoring.compute(iteration_result)

    start_time = util.current_millis()
    if force:
        cscores = self.column_scoring.compute_force(iteration_result)
    else:
        cscores = self.column_scoring.compute(iteration_result)
    elapsed = util.current_millis() - start_time
    if elapsed > 0.0001:
        logging.debug("computed column_scores in %f s.", elapsed / 1000.0)

    self.membership().update(self.ratios, rscores, cscores,
                             self['num_iterations'], iteration_result)

    # the scoring functions may have left summary values in iteration_result
    mean_net_score = iteration_result.get('networks', 0.0)
    if 'motif-pvalue' in iteration_result:
        mean_mot_pvalues = iteration_result['motif-pvalue']
        mean_mot_pvalue = "".join(
            " '%s' = %f" % (seqtype, mean_mot_pvalues[seqtype])
            for seqtype in mean_mot_pvalues.keys())
    else:
        mean_mot_pvalue = "NA"
    logging.debug('mean net = %s | mean mot = %s',
                  str(mean_net_score), mean_mot_pvalue)

    # Reduce I/O: skip the regular result writes when requested
    if not self['minimize_io']:
        if iteration == 1 or (iteration % self['result_freq'] == 0):
            self.write_results(iteration_result)

    # This should not be too much writing, so it is kept out of the
    # minimize_io option
    if iteration == 1 or (iteration % self['stats_freq'] == 0):
        self.write_stats(iteration_result)
        self.update_iteration(iteration)

    if 'dump_results' in self['debug'] and (
            iteration == 1 or (iteration % self['debug_freq'] == 0)):
        # write complete result into a cmresults.tsv
        conn = self.__dbconn()
        try:
            path = os.path.join(self['output_dir'],
                                'cmresults-%04d.tsv.bz2' % iteration)
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'],
                                      self['output_dir'])
        finally:
            # the connection was previously leaked on every dump
            conn.close()
def run_iterations(self, start_iter=None, num_iter=None):
    """Run a range of iterations, followed by the optional post-processing.

    Keyword arguments:
    start_iter -- first iteration to run (default: 'start_iteration')
    num_iter   -- iteration to stop before, i.e. the exclusive end of the
                  range (default: 'num_iterations' + 1)

    Returns immediately in interactive mode. When resuming, the first
    iteration is run with forced recalculation. With 'postadjust' set, the
    clusters are adjusted after the last iteration and the results are
    stored under iteration number 'num_iterations' + 1.
    """
    if start_iter is None:
        start_iter = self['start_iteration']
    if num_iter is None:
        num_iter = self['num_iterations'] + 1

    if self.config_params['interactive']:  # stop here in interactive mode
        return

    for iteration in range(start_iter, num_iter):
        start_time = util.current_millis()
        # force recalculation if this is the first iteration of a resume
        force = (iteration == start_iter) and (self['resume'] == True)
        self.run_iteration(iteration, force=force)

        # garbage collection after everything in iteration went out of scope
        gc.collect()
        elapsed = util.current_millis() - start_time
        logging.debug("performed iteration %d in %f s.",
                      iteration, elapsed / 1000.0)

        if 'profile_mem' in self['debug'] and (
                iteration == 1 or iteration % 100 == 0):
            with open(os.path.join(self['output_dir'], 'memprofile.tsv'),
                      'a') as outfile:
                self.write_mem_profile(outfile, iteration)

    # run post processing after the last iteration. We store the results in
    # num_iterations + 1 to have a clean separation
    if self['postadjust']:
        logging.info("Postprocessing: Adjusting the clusters....")
        # run combiner using the weights of the last iteration
        rscores = self.row_scoring.combine_cached(self['num_iterations'])
        rd_scores = memb.get_row_density_scores(self.membership(), rscores)
        logging.info("Recomputed combined + density scores.")
        memb.postadjust(self.membership(), rd_scores)

        BSCM_obj = self.column_scoring.get_BSCM()
        if BSCM_obj is not None:
            # NOTE(review): the return value was never used here, so this
            # presumably relies on resplit_clusters() changing the
            # membership in place - confirm
            BSCM_obj.resplit_clusters(self.membership(), cutoff=0.05)

        logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                     self['num_iterations'])
        iteration_result = {'iteration': self['num_iterations'] + 1,
                            'score_means': {}}
        combined_scores = self.row_scoring.compute_force(iteration_result)

        # write the combined scores for benchmarking/diagnostics
        with open(self.combined_rscores_pickle_path(), 'w') as outfile:
            cPickle.dump(combined_scores, outfile)

        self.write_results(iteration_result)
        self.write_stats(iteration_result)
        # NOTE(review): 'iteration' is the loop variable from above, it is
        # undefined if the range was empty - confirm which number is meant
        self.update_iteration(iteration)

        # default behaviour:
        # always write complete result into a cmresults.tsv for R/cmonkey
        # compatibility
        conn = self.__dbconn()
        try:
            path = os.path.join(self['output_dir'],
                                'cmresults-postproc.tsv.bz2')
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile,
                                      self['num_iterations'] + 1,
                                      self['num_clusters'],
                                      self['output_dir'])

            # additionally: run tomtom on the motifs if requested
            if (self['MEME']['global_background'] == 'True' and
                    self['Postprocessing']['run_tomtom'] == 'True'):
                meme.run_tomtom(conn, self['output_dir'],
                                self['MEME']['version'])
        finally:
            # the connection was never closed before
            conn.close()

    self.write_finish_info()
    logging.info("Done !!!!")
def do_compute(self, iteration_result, ref_matrix):
    """Compute the set enrichment score matrix (genes x clusters).

    Arguments:
    iteration_result -- dictionary holding at least the key 'iteration'
    ref_matrix -- reference matrix, its minimum is passed to the scoring

    For every set type the clusters are scored with compute_cluster_score()
    (through a process pool if multiprocessing is configured) and the
    scores, multiplied by the set type's weight, are added to the result.
    The best enriched set and its p-value per cluster are appended as one
    line per iteration and set type to setEnrichment_set.csv and
    setEnrichment_pvalue.csv in the output directory.

    The inputs of compute_cluster_score() are handed over through module
    level globals, which are reset before returning.
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS, CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    SET_MATRIX = self.ratios
    SET_MEMBERSHIP = self.membership
    SET_SYNONYMS = self.organism.thesaurus()

    # the canonical names and indexes are computed once and then reused
    if CANONICAL_ROWNAMES is None:
        CANONICAL_ROWNAMES = set(
            SET_SYNONYMS[n] if n in SET_SYNONYMS else n
            for n in self.ratios.row_names)

    if CANONICAL_ROW_INDEXES is None:
        CANONICAL_ROW_INDEXES = {}
        for index, row in enumerate(self.ratios.row_names):
            if row in SET_SYNONYMS:
                CANONICAL_ROW_INDEXES[SET_SYNONYMS[row]] = index
            else:
                CANONICAL_ROW_INDEXES[row] = index

    ref_min_score = ref_matrix.min()
    logging.info('REF_MIN_SCORE: %f', ref_min_score)

    set_filepath = os.path.join(self.config_params['output_dir'],
                                'setEnrichment_set.csv')
    pval_filepath = os.path.join(self.config_params['output_dir'],
                                 'setEnrichment_pvalue.csv')

    for set_type in self.__set_types:
        SET_SET_TYPE = set_type
        logging.info("PROCESSING SET TYPE '%s'", set_type.name)
        start1 = util.current_millis()
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(
                    compute_cluster_score,
                    [(cluster, self.bonferroni_cutoff(), ref_min_score)
                     for cluster in xrange(1, self.num_clusters() + 1)])
        else:
            results = []
            for cluster in xrange(1, self.num_clusters() + 1):
                results.append(compute_cluster_score(
                    (cluster, self.bonferroni_cutoff(), ref_min_score)))

        elapsed1 = util.current_millis() - start1
        logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                     elapsed1 / 1000.0)

        min_sets = []
        p_values = []
        for cluster in xrange(1, self.num_clusters() + 1):
            # store the best enriched set determined
            scores, min_set, min_pvalue = results[cluster - 1]
            min_sets.append(min_set)
            p_values.append(min_pvalue)

            for row in xrange(len(self.gene_names())):
                matrix.values[row][cluster - 1] += (scores[row] *
                                                    set_type.weight)

        # new files get a header line with the cluster numbers; the context
        # manager makes sure both files are closed even if a write fails
        is_new = not os.path.exists(set_filepath)
        mode = 'w' if is_new else 'a'
        with open(set_filepath, mode) as set_file, \
                open(pval_filepath, mode) as pv_file:
            if is_new:
                header = ',' + ','.join(
                    [str(i) for i in xrange(1, self.num_clusters() + 1)])
                set_file.write(header)
                pv_file.write(header)
            set_file.write('\n' + str(iteration_result['iteration']) + ',' +
                           ','.join([str(i) for i in min_sets]))
            pv_file.write('\n' + str(iteration_result['iteration']) + ',' +
                          ','.join([str(i) for i in p_values]))

    logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                 (util.current_millis() - start_time) / 1000.0)

    # cleanup
    SET_SET_TYPE = None
    SET_MATRIX = None
    SET_MEMBERSHIP = None
    SET_SYNONYMS = None
    return matrix
def do_compute(self, iteration_result, ref_matrix):
    """Compute the set enrichment score matrix (genes x clusters).

    Arguments:
    iteration_result -- dictionary holding at least the key 'iteration'
    ref_matrix -- reference matrix, made available to the scoring through
                  the global SET_REF_MATRIX

    For every set type the clusters are scored with compute_cluster_score()
    (through a process pool if multiprocessing is configured). The best
    enriched set and its p-value per cluster are remembered and appended as
    one line per iteration and set type to out/setEnrichment_set.csv and
    out/setEnrichment_pvalue.csv.
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_REF_MATRIX, SET_SET_TYPE
    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    SET_MATRIX = self.ratios
    SET_MEMBERSHIP = self.membership
    SET_REF_MATRIX = ref_matrix

    set_filepath = 'out/setEnrichment_set.csv'
    pval_filepath = 'out/setEnrichment_pvalue.csv'

    for set_type in self.__set_types:
        SET_SET_TYPE = set_type
        logging.info("PROCESSING SET TYPE '%s'", set_type.name)
        start1 = util.current_millis()
        if use_multiprocessing:
            pool = mp.Pool()
            try:
                results = pool.map(
                    compute_cluster_score,
                    [(cluster, self.bonferroni_cutoff())
                     for cluster in xrange(1, self.num_clusters() + 1)])
            finally:
                # shut the workers down even if the scoring fails
                pool.close()
                pool.join()
        else:
            results = []
            for cluster in xrange(1, self.num_clusters() + 1):
                results.append(compute_cluster_score(
                    (cluster, self.bonferroni_cutoff())))

        elapsed1 = util.current_millis() - start1
        logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                     elapsed1 / 1000.0)

        min_sets = []
        p_values = []
        for cluster in xrange(1, self.num_clusters() + 1):
            # store the best enriched set determined
            scores, min_set, min_pvalue = results[cluster - 1]
            self.__last_min_enriched_set[set_type][cluster] = (min_set,
                                                               min_pvalue)
            min_sets.append(min_set)
            p_values.append(min_pvalue)

            for row in xrange(len(self.gene_names())):
                matrix.values[row][cluster - 1] = scores[row]

        # new files get a header line with the cluster numbers; the context
        # manager makes sure both files are closed even if a write fails
        is_new = not os.path.exists(set_filepath)
        mode = 'w' if is_new else 'a'
        with open(set_filepath, mode) as set_file, \
                open(pval_filepath, mode) as pv_file:
            if is_new:
                header = ',' + ','.join(
                    [str(i) for i in xrange(1, self.num_clusters() + 1)])
                set_file.write(header)
                pv_file.write(header)
            set_file.write('\n' + str(iteration_result['iteration']) + ',' +
                           ','.join([str(i) for i in min_sets]))
            pv_file.write('\n' + str(iteration_result['iteration']) + ',' +
                          ','.join([str(i) for i in p_values]))

    logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                 (util.current_millis() - start_time) / 1000.0)
    return matrix
def compute_pvalues(self, iteration_result, num_motifs, force):
    """Compute motif scores.

    Returns a dictionary that maps each cluster to its p-values, a sparse
    feature id -> p-value mapping. The motif information and the p-values
    are stored in iteration_result[cluster] as well.

    The sequences going into meme can be influenced with a list of sequence
    filter functions of the signature
      (seqs, feature_ids, distance) -> seqs
    which are applied in the order they appear in the list.

    Unless force is set, clusters whose feature ids are the same as in the
    previous run are not recomputed, their previous results are reused.
    """
    global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

    def record(cluster, pvalues, run_result):
        """Store the motif results of one cluster."""
        cluster_pvalues[cluster] = pvalues
        if run_result:
            self.__last_motif_infos[cluster] = run_result.motif_infos
        iteration_result[cluster]['motif-info'] = meme_json(run_result)
        iteration_result[cluster]['pvalues'] = pvalues

    cluster_pvalues = {}
    min_rows = self.config_params['memb.min_cluster_rows_allowed']
    max_rows = self.config_params['memb.max_cluster_rows_allowed']
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    # extract the sequences for each cluster, slow
    began = util.current_millis()
    SEQUENCE_FILTERS = self.__sequence_filters
    ORGANISM = self.organism
    MEMBERSHIP = self.membership
    with util.get_mp_pool(self.config_params) as pool:
        seq_args = [(cluster, self.seqtype)
                    for cluster in xrange(1, self.num_clusters() + 1)]
        seqs_list = pool.map(cluster_seqs, seq_args)
    SEQUENCE_FILTERS = None
    ORGANISM = None
    MEMBERSHIP = None
    logging.debug("prepared sequences in %d ms.",
                  util.current_millis() - began)

    # Make the parameters, this is fast enough
    began = util.current_millis()
    params = {}
    for cluster in xrange(1, self.num_clusters() + 1):
        # Pass the previous run's seed if possible
        previous_motif_infos = None
        if self.__last_motif_infos is not None:
            previous_motif_infos = self.__last_motif_infos.get(cluster, None)

        seqs, feature_ids = seqs_list[cluster - 1]
        params[cluster] = ComputeScoreParams(
            iteration_result['iteration'], cluster, feature_ids, seqs,
            self.used_seqs, self.meme_runner(), min_rows, max_rows,
            num_motifs, previous_motif_infos,
            self.config_params['output_dir'],
            self.config_params['num_iterations'],
            self.config_params['debug'])
    logging.debug("prepared MEME parameters in %d ms.",
                  util.current_millis() - began)

    # create motif result map if necessary
    for cluster in xrange(1, self.num_clusters() + 1):
        if cluster not in iteration_result:
            iteration_result[cluster] = {}

    # Optimization: only keep the parameters of the clusters that changed
    # since last time, the others reuse their last results
    if not force and self.__last_results is not None:
        count_before = len(params)
        params = {
            cluster: params[cluster]
            for cluster in xrange(1, self.num_clusters() + 1)
            if params[cluster].feature_ids != self.__last_results[cluster][0]
        }
        unchanged = count_before - len(params)
        if unchanged > 0:
            logging.debug("%d clusters did not change !!!", unchanged)

    # compute and store motif results
    self.__last_motif_infos = {}
    if self.__last_results is None:
        self.__last_results = {}

    if use_multiprocessing:
        with util.get_mp_pool(self.config_params) as pool:
            results = pool.map(compute_cluster_score, params.values())
            results = {r[0]: r[1:] for r in results}  # indexed by cluster

            for cluster in xrange(1, self.num_clusters() + 1):
                if cluster in results:
                    pvalues, run_result = results[cluster]
                    self.__last_results[cluster] = (
                        params[cluster].feature_ids, pvalues, run_result)
                else:
                    feature_ids, pvalues, run_result = self.__last_results[
                        cluster]
                record(cluster, pvalues, run_result)
    else:
        for cluster in xrange(1, self.num_clusters() + 1):
            if cluster in params:
                _, pvalues, run_result = compute_cluster_score(
                    params[cluster])
                self.__last_results[cluster] = (
                    params[cluster].feature_ids, pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]
            record(cluster, pvalues, run_result)

    return cluster_pvalues
def combine(result_matrices, score_scalings, membership, quantile_normalize):
    """Combine n result matrices into one score matrix.

    Arguments:
    result_matrices -- the score matrices, matrix 0 is assumed to be the
                       gene expression score
    score_scalings -- one scaling per matrix, in the same order
    membership -- cluster membership, used to collect the scores of the
                  cluster members when not quantile normalizing
    quantile_normalize -- whether to quantile normalize the matrices
                          instead of rescaling them

    The extreme values of every matrix are fixed in place. Returns a
    DataMatrix named like matrix 0 that holds the scaled sum, or None if
    result_matrices is empty.
    """
    if len(result_matrices) == 0:
        # nothing to combine; without this guard the non-normalizing path
        # failed on result_matrices[0]
        return None

    for m in result_matrices:
        m.fix_extreme_values()

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.info("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index
                     for index, name in enumerate(mat.row_names)}

        # collect the matrix 0 scores of all cluster members
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row]][cluster - 1]
                        for row in row_members])

        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)

        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols, mat.row_names,
                                    mat.column_names, values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.info("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    # a very sparse matrix cannot be rescaled: dividing by 0
                    # would fill it with inf/nan, so it is used unscaled
                    logging.error("very sparse score !!!")
                else:
                    values = values / qqq * abs(rs_quant)
                in_matrices.append(values)

    start_time = util.current_millis()
    # assuming same format of all matrices
    combined_score = np.zeros(in_matrices[0].shape)
    for i, values in enumerate(in_matrices):
        combined_score += values * score_scalings[i]
    elapsed = util.current_millis() - start_time
    logging.info("combined score in %f s.", elapsed / 1000.0)

    matrix0 = result_matrices[0]  # as reference for names
    return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                         matrix0.row_names, matrix0.column_names,
                         values=combined_score)
def compute_pvalues(self, iteration_result, num_motifs):
    """Compute motif scores.
    The result is a dictionary from cluster -> (feature_id, pvalue)
    containing a sparse gene-to-pvalue mapping for each cluster

    In order to influence the sequences
    that go into meme, the user can specify a list of sequence filter
    functions that have the signature
    (seqs, feature_ids, distance) -> seqs
    These filters are applied in the order they appear in the list.

    iteration_result -- result map of the current iteration; its
                        'iteration' entry is read, and for every cluster
                        the 'motif-info' and 'pvalues' entries are written
    num_motifs -- number of motifs, passed on to ComputeScoreParams

    As a side effect, the motif infos of this run are remembered so they
    can be handed to the next run as previous_motif_infos.
    """
    global MOTIF_PARAMS, SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

    cluster_pvalues = {}
    min_cluster_rows_allowed = self.config_params[
        'memb.min_cluster_rows_allowed']
    max_cluster_rows_allowed = self.config_params[
        'memb.max_cluster_rows_allowed']
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    clusters = range(1, self.num_clusters() + 1)

    # extract the sequences for each cluster, slow
    start_time = util.current_millis()
    # The module globals are set before the pool is created and are reset
    # in the finally clause, so they do not stay set when the extraction
    # fails.
    SEQUENCE_FILTERS = self.__sequence_filters
    ORGANISM = self.organism
    MEMBERSHIP = self.membership
    try:
        cluster_seqs_params = [(cluster, self.seqtype)
                               for cluster in clusters]
        if use_multiprocessing:
            pool = mp.Pool()
            try:
                seqs_list = pool.map(cluster_seqs, cluster_seqs_params)
            finally:
                # always release the worker processes
                pool.close()
                pool.join()
        else:
            # honor the multiprocessing setting for this step as well
            seqs_list = [cluster_seqs(seqs_params)
                         for seqs_params in cluster_seqs_params]
    finally:
        SEQUENCE_FILTERS = None
        ORGANISM = None
        MEMBERSHIP = None

    logging.info("prepared sequences in %d ms.",
                 util.current_millis() - start_time)

    # Make the parameters, this is fast enough
    start_time = util.current_millis()
    last_motif_infos = self.__last_motif_infos
    params = []
    for cluster, (seqs, feature_ids) in zip(clusters, seqs_list):
        # Pass the previous run's seed if possible
        if last_motif_infos is not None:
            previous_motif_infos = last_motif_infos.get(cluster, None)
        else:
            previous_motif_infos = None

        params.append(
            ComputeScoreParams(
                iteration_result['iteration'], cluster,
                feature_ids, seqs, self.used_seqs,
                self.meme_runner(),
                min_cluster_rows_allowed,
                max_cluster_rows_allowed,
                num_motifs,
                previous_motif_infos,
                self.config_params.get('keep_memeout', False),
                self.config_params['output_dir'],
                self.config_params['num_iterations'],
                self.config_params['debug']))

    logging.info("prepared MEME parameters in %d ms.",
                 util.current_millis() - start_time)

    # create motif result map if necessary
    for cluster in clusters:
        if cluster not in iteration_result:
            iteration_result[cluster] = {}

    # compute and store motif results
    # compute_cluster_score only receives the cluster number, the
    # parameters are published through the MOTIF_PARAMS module global
    MOTIF_PARAMS = params
    self.__last_motif_infos = {}
    try:
        if use_multiprocessing:
            pool = mp.Pool()
            try:
                results = pool.map(compute_cluster_score, clusters)
            finally:
                pool.close()
                pool.join()
        else:
            results = [compute_cluster_score(cluster)
                       for cluster in clusters]

        for cluster, (pvalues, run_result) in zip(clusters, results):
            cluster_pvalues[cluster] = pvalues
            if run_result:
                self.__last_motif_infos[cluster] = run_result.motif_infos
            iteration_result[cluster]['motif-info'] = meme_json(run_result)
            iteration_result[cluster]['pvalues'] = pvalues
    finally:
        # cleanup
        MOTIF_PARAMS = None

    return cluster_pvalues