def qm_result_matrices(matrices, tmp_mean, multiprocessing=True):
    """Replace every matrix value by the entry of tmp_mean at its rank.

    :param matrices: sequence of matrix objects exposing values,
        row_names and column_names
    :param tmp_mean: indexable object that is looked up with the rank
        values of each matrix
    :param multiprocessing: when true, the work is delegated to rank_fun
        through a process pool, otherwise it is done in this process
    :return: list with one result per input matrix, in input order
    """
    if not multiprocessing:
        # serial variant: rank, look up, and rebuild one matrix at a time
        ranked_matrices = []
        for source in matrices:
            source_values = source.values
            num_rows, num_cols = source_values.shape
            ranks = util.rrank_matrix(source_values)
            looked_up = np.reshape(tmp_mean[ranks], (num_rows, num_cols))
            ranked_matrices.append(
                DataMatrix(num_rows, num_cols,
                           source.row_names, source.column_names,
                           values=looked_up))
        return ranked_matrices

    # parallel variant: each worker receives a plain tuple
    with util.get_mp_pool() as pool:
        work_items = [(source.values, source.row_names,
                       source.column_names, tmp_mean)
                      for source in matrices]
        return pool.map(rank_fun, work_items)
def __compute_network_cluster_scores(self, network):
    """Compute the network scores of every cluster for the given network.

    The network, the set of gene names and the membership are published
    through module-level globals while scoring runs. The original comment
    explains why: the objects are huge and read-only, and sharing them via
    Manager.list() or similar was far too slow. The globals are reset to
    None before this method returns, including when scoring raises.

    :param network: network object, stored in COMPUTE_NETWORK while scoring
    :return: dict mapping every cluster number in 1..num_clusters to the
        value compute_network_scores() produced for it
    """
    global COMPUTE_NETWORK, ALL_GENES, NETWORK_SCORE_MEMBERSHIP
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    try:
        COMPUTE_NETWORK = network
        ALL_GENES = set(self.gene_names())  # optimization: O(1) lookup
        NETWORK_SCORE_MEMBERSHIP = self.membership

        clusters = xrange(1, self.num_clusters() + 1)
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                map_results = pool.map(compute_network_scores, clusters)
            # pool.map() keeps input order, so position i belongs to
            # cluster i + 1
            result = {cluster: map_results[cluster - 1]
                      for cluster in clusters}
        else:
            result = {cluster: compute_network_scores(cluster)
                      for cluster in clusters}
    finally:
        # cleanup, also on failure, so the large objects are not kept
        # alive by the module globals
        COMPUTE_NETWORK = None
        ALL_GENES = None
        NETWORK_SCORE_MEMBERSHIP = None
    return result
def __compute_network_cluster_scores(self, network):
    """Compute the network scores of every cluster for the given network.

    The network, the set of gene names and the membership are published
    through module-level globals while scoring runs. The original comment
    explains why: the objects are huge and read-only, and sharing them via
    Manager.list() or similar was far too slow. The globals are reset to
    None before this method returns, including when scoring raises.

    :param network: network object, stored in COMPUTE_NETWORK while scoring
    :return: dict mapping every cluster number in 1..num_clusters to the
        value compute_network_scores() produced for it
    """
    global COMPUTE_NETWORK, ALL_GENES, NETWORK_SCORE_MEMBERSHIP
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    try:
        COMPUTE_NETWORK = network
        ALL_GENES = set(self.gene_names())  # optimization: O(1) lookup
        NETWORK_SCORE_MEMBERSHIP = self.membership

        clusters = xrange(1, self.num_clusters() + 1)
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                map_results = pool.map(compute_network_scores, clusters)
            # pool.map() keeps input order, so position i belongs to
            # cluster i + 1
            result = {cluster: map_results[cluster - 1]
                      for cluster in clusters}
        else:
            result = {cluster: compute_network_scores(cluster)
                      for cluster in clusters}
    finally:
        # cleanup, also on failure, so the large objects are not kept
        # alive by the module globals
        COMPUTE_NETWORK = None
        ALL_GENES = None
        NETWORK_SCORE_MEMBERSHIP = None
    return result
def compute_column_scores(membership, matrix, num_clusters, config_params):
    """Build the condition-by-cluster matrix of column scores.

    Every cluster with more than one row member is scored with
    compute_column_scores_submatrix(); clusters with fewer rows and NaN
    scores receive a substitution value, the 0.95 quantile of the scores
    of the columns that are members of their cluster.

    :param membership: provides rows_for_cluster() / columns_for_cluster()
    :param matrix: input matrix; its columns become the result rows
    :param num_clusters: number of clusters, numbered 1..num_clusters
    :param config_params: config_params['multiprocessing'] selects the
        pooled or the serial code path
    :return: dm.DataMatrix with matrix.num_columns rows and num_clusters
        columns, after fix_extreme_values() has been applied
    """
    cluster_numbers = xrange(1, num_clusters + 1)

    def submatrix_for(cluster):
        """row submatrix of a cluster, None if it has fewer than 2 rows"""
        member_rows = membership.rows_for_cluster(cluster)
        if len(member_rows) <= 1:
            return None
        return matrix.submatrix_by_name(row_names=member_rows)

    def substitution_for(all_scores):
        """0.95 quantile over the scores of all member columns"""
        member_scores = []
        for cluster in cluster_numbers:
            member_columns = membership.columns_for_cluster(cluster)
            entry = all_scores[cluster - 1]
            if entry is None:
                continue
            names, values = entry
            for position, name in enumerate(names):
                if name in member_columns:
                    member_scores.append(values[position])
        return util.quantile(member_scores, 0.95)

    if config_params['multiprocessing']:
        with util.get_mp_pool(config_params) as pool:
            submatrices = [submatrix_for(cluster)
                           for cluster in cluster_numbers]
            cluster_column_scores = pool.map(compute_column_scores_submatrix,
                                             submatrices)
    else:
        cluster_column_scores = [
            compute_column_scores_submatrix(submatrix_for(cluster))
            for cluster in cluster_numbers]

    substitution = substitution_for(cluster_column_scores)

    # Convert scores into a matrix that have the clusters as columns
    # and conditions in the rows
    result = dm.DataMatrix(matrix.num_columns, num_clusters,
                           row_names=matrix.column_names)
    cells = result.values
    for cluster_index in xrange(num_clusters):
        entry = cluster_column_scores[cluster_index]
        if entry is None:
            # unscored cluster: the whole column gets the substitution
            for row_index in xrange(matrix.num_columns):
                cells[row_index, cluster_index] = substitution
            continue

        _, values = entry
        values[np.isnan(values)] = substitution  # patched in place
        for row_index in xrange(matrix.num_columns):
            cells[row_index, cluster_index] = values[row_index]

    result.fix_extreme_values()
    return result
def __compute_row_scores_for_clusters(membership, matrix, num_clusters,
                                      config_params):
    """Compute the pure row scores for the specified clusters without
    normalization.

    The matrix and the membership are stored in module-level globals
    before the work is handed out. According to the original comment this
    happens before the fork to save memory and pickling time. Both globals
    are reset to None before returning, including when scoring raises.

    :param membership: stored in ROW_SCORE_MEMBERSHIP while scoring
    :param matrix: stored in ROW_SCORE_MATRIX while scoring
    :param num_clusters: number of clusters, numbered 1..num_clusters
    :param config_params: config_params['multiprocessing'] selects the
        pooled or the serial code path
    :return: list with the result of compute_row_scores_for_cluster() for
        each cluster, in cluster order
    """
    global ROW_SCORE_MATRIX, ROW_SCORE_MEMBERSHIP

    try:
        # note that we set the data into globals before we fork it off
        # to save memory and pickling time
        ROW_SCORE_MATRIX = matrix
        ROW_SCORE_MEMBERSHIP = membership

        clusters = xrange(1, num_clusters + 1)
        if config_params['multiprocessing']:
            with util.get_mp_pool(config_params) as pool:
                result = pool.map(compute_row_scores_for_cluster, clusters)
        else:
            result = [compute_row_scores_for_cluster(cluster)
                      for cluster in clusters]
    finally:
        # cleanup, also on failure
        ROW_SCORE_MATRIX = None
        ROW_SCORE_MEMBERSHIP = None
    return result
def compute_column_scores(membership, matrix, num_clusters, config_params,
                          BSCM_obj=None):
    """Compute the column scores for the specified number of clusters.

    Without BSCM_obj, every cluster with more than one row member is
    scored with compute_column_scores_submatrix(), either through a
    process pool or serially. With BSCM_obj, the scores of a cluster are
    taken from BSCM_obj.getPvals() for the cluster's row names instead.
    Clusters with fewer than two rows and NaN scores receive a
    substitution value, the 0.95 quantile of the scores of the columns
    that are members of their cluster.

    :param membership: provides rows_for_cluster() / columns_for_cluster()
    :param matrix: input matrix; its columns become the result rows
    :param num_clusters: number of clusters, numbered 1..num_clusters
    :param config_params: 'multiprocessing' is read when BSCM_obj is None,
        'num_cores' is read when BSCM_obj is given (None means 1 core)
    :param BSCM_obj: optional object providing
        getPvals(row_names, num_cores=...) that returns a mapping from
        experiment name to score
    :return: dm.DataMatrix with matrix.num_columns rows and num_clusters
        columns, after fix_extreme_values() has been applied
    """
    def compute_substitution(cluster_column_scores):
        """calculate substitution value for missing column scores"""
        membership_values = []
        for cluster in xrange(1, num_clusters + 1):
            columns = membership.columns_for_cluster(cluster)
            column_scores = cluster_column_scores[cluster - 1]
            if column_scores is not None:
                colnames, scores = column_scores
                for col, colname in enumerate(colnames):
                    if colname in columns:
                        membership_values.append(scores[col])
        return util.quantile(membership_values, 0.95)

    def make_submatrix(cluster):
        """row submatrix of a cluster, None if it has fewer than 2 rows"""
        row_names = membership.rows_for_cluster(cluster)
        if len(row_names) > 1:
            return matrix.submatrix_by_name(row_names=row_names)
        else:
            return None

    cluster_column_scores = []  # To be filled or overwritten
    if BSCM_obj is None:
        if config_params['multiprocessing']:
            with util.get_mp_pool(config_params) as pool:
                cluster_column_scores = pool.map(
                    compute_column_scores_submatrix,
                    [make_submatrix(cluster)
                     for cluster in xrange(1, num_clusters + 1)])
        else:
            for cluster in xrange(1, num_clusters + 1):
                cluster_column_scores.append(
                    compute_column_scores_submatrix(make_submatrix(cluster)))
    else:  # if BSCM_obj exists
        num_cores = config_params['num_cores']
        if num_cores is None:
            num_cores = 1

        for cluster in xrange(1, num_clusters + 1):
            # build the submatrix only once per cluster
            submatrix = make_submatrix(cluster)
            if submatrix is None:
                cluster_column_scores.append(None)
                continue

            cur_column_scores = BSCM_obj.getPvals(submatrix.row_names,
                                                  num_cores=num_cores)
            # materialize names and scores as lists so that they are real
            # sequences regardless of what keys()/values() return
            exp_names = list(cur_column_scores.keys())
            exp_scores = np.array(list(cur_column_scores.values()))
            cluster_column_scores.append((exp_names, exp_scores))

    substitution = compute_substitution(cluster_column_scores)

    # Convert scores into a matrix that have the clusters as columns
    # and conditions in the rows
    result = dm.DataMatrix(matrix.num_columns, num_clusters,
                           row_names=matrix.column_names)
    rvalues = result.values
    for cluster in xrange(num_clusters):
        column_scores = cluster_column_scores[cluster]
        if column_scores is None:
            for row_index in xrange(matrix.num_columns):
                rvalues[row_index, cluster] = substitution
            continue

        _, scores = column_scores
        scores[np.isnan(scores)] = substitution
        # NOTE(review): scores are copied by position, which presumably
        # relies on them being ordered like the columns of matrix, also
        # for the values obtained from BSCM_obj.getPvals() - confirm
        for row_index in xrange(matrix.num_columns):
            rvalues[row_index, cluster] = scores[row_index]

    result.fix_extreme_values()
    return result
def do_compute(self, iteration_result, ref_matrix):
    """Compute the weighted set enrichment scores for every set type.

    For each set type in self.__set_types, compute_cluster_score() is run
    for every cluster, through a process pool when multiprocessing is
    configured. The per-gene scores are multiplied by the weight of the
    set type and accumulated into the result matrix. One row holding the
    best set, respectively its p-value, per cluster is appended to
    'setEnrichment_set.csv' and 'setEnrichment_pvalue.csv' in the
    configured output directory; a header row is written first when the
    set file does not exist yet.

    The SET_* module globals are populated while scoring runs and reset
    to None afterwards, including when scoring raises.

    :param iteration_result: dict; only iteration_result['iteration'] is
        read, it labels the rows appended to the CSV files
    :param ref_matrix: object whose min() is passed to every
        compute_cluster_score() call
    :return: dm.DataMatrix (genes x clusters) with the accumulated scores
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS
    global CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    num_genes = len(self.gene_names())
    num_clusters = self.num_clusters()
    matrix = dm.DataMatrix(num_genes, num_clusters, self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    try:
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_SYNONYMS = self.organism.thesaurus()

        # The canonical names and indexes are only computed when still
        # unset and are deliberately not reset in the cleanup below.
        if CANONICAL_ROWNAMES is None:
            CANONICAL_ROWNAMES = set(
                SET_SYNONYMS[name] if name in SET_SYNONYMS else name
                for name in self.ratios.row_names)

        if CANONICAL_ROW_INDEXES is None:
            row_indexes = {}
            for index, row in enumerate(self.ratios.row_names):
                canonical = SET_SYNONYMS[row] if row in SET_SYNONYMS else row
                row_indexes[canonical] = index
            CANONICAL_ROW_INDEXES = row_indexes

        ref_min_score = ref_matrix.min()
        logging.info('REF_MIN_SCORE: %f', ref_min_score)

        output_dir = self.config_params['output_dir']
        set_filepath = os.path.join(output_dir, 'setEnrichment_set.csv')
        pval_filepath = os.path.join(output_dir, 'setEnrichment_pvalue.csv')

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            score_args = [(cluster, self.bonferroni_cutoff(), ref_min_score)
                          for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    results = pool.map(compute_cluster_score, score_args)
            else:
                results = [compute_cluster_score(args) for args in score_args]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            # Existence of the set file decides for both files whether a
            # fresh file with header is started or rows are appended.
            is_new = not os.path.exists(set_filepath)
            mode = 'w' if is_new else 'a'
            # context managers close both files even if storing fails
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pval_file:
                if is_new:
                    header = ',' + ','.join(
                        [str(i) for i in xrange(1, num_clusters + 1)])
                    set_file.write(header)
                    pval_file.write(header)

                min_sets = []
                pvalues = []
                for cluster in xrange(1, num_clusters + 1):
                    # store the best enriched set determined
                    scores, min_set, min_pvalue = results[cluster - 1]
                    min_sets.append(min_set)
                    pvalues.append(min_pvalue)

                    for row in xrange(num_genes):
                        matrix.values[row][cluster - 1] += (
                            scores[row] * set_type.weight)

                row_label = str(iteration_result['iteration'])
                set_file.write('\n' + row_label + ',' +
                               ','.join([str(i) for i in min_sets]))
                pval_file.write('\n' + row_label + ',' +
                                ','.join([str(i) for i in pvalues]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
    finally:
        # cleanup, also on failure
        SET_SET_TYPE = None
        SET_MATRIX = None
        SET_MEMBERSHIP = None
        SET_SYNONYMS = None
    return matrix
def do_compute(self, iteration_result, ref_matrix):
    """Compute the weighted set enrichment scores for every set type.

    For each set type in self.__set_types, compute_cluster_score() is run
    for every cluster, through a process pool when multiprocessing is
    configured. The per-gene scores are multiplied by the weight of the
    set type and accumulated into the result matrix. One row holding the
    best set, respectively its p-value, per cluster is appended to
    'setEnrichment_set.csv' and 'setEnrichment_pvalue.csv' in the
    configured output directory; a header row is written first when the
    set file does not exist yet.

    The SET_* module globals are populated while scoring runs and reset
    to None afterwards, including when scoring raises.

    :param iteration_result: dict; only iteration_result['iteration'] is
        read, it labels the rows appended to the CSV files
    :param ref_matrix: object whose min() is passed to every
        compute_cluster_score() call
    :return: dm.DataMatrix (genes x clusters) with the accumulated scores
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS
    global CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    num_genes = len(self.gene_names())
    num_clusters = self.num_clusters()
    matrix = dm.DataMatrix(num_genes, num_clusters, self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    try:
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_SYNONYMS = self.organism.thesaurus()

        # The canonical names and indexes are only computed when still
        # unset and are deliberately not reset in the cleanup below.
        if CANONICAL_ROWNAMES is None:
            CANONICAL_ROWNAMES = set(
                SET_SYNONYMS[name] if name in SET_SYNONYMS else name
                for name in self.ratios.row_names)

        if CANONICAL_ROW_INDEXES is None:
            row_indexes = {}
            for index, row in enumerate(self.ratios.row_names):
                canonical = SET_SYNONYMS[row] if row in SET_SYNONYMS else row
                row_indexes[canonical] = index
            CANONICAL_ROW_INDEXES = row_indexes

        ref_min_score = ref_matrix.min()
        logging.info('REF_MIN_SCORE: %f', ref_min_score)

        output_dir = self.config_params['output_dir']
        set_filepath = os.path.join(output_dir, 'setEnrichment_set.csv')
        pval_filepath = os.path.join(output_dir, 'setEnrichment_pvalue.csv')

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            score_args = [(cluster, self.bonferroni_cutoff(), ref_min_score)
                          for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    results = pool.map(compute_cluster_score, score_args)
            else:
                results = [compute_cluster_score(args) for args in score_args]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            # Existence of the set file decides for both files whether a
            # fresh file with header is started or rows are appended.
            is_new = not os.path.exists(set_filepath)
            mode = 'w' if is_new else 'a'
            # context managers close both files even if storing fails
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pval_file:
                if is_new:
                    header = ',' + ','.join(
                        [str(i) for i in xrange(1, num_clusters + 1)])
                    set_file.write(header)
                    pval_file.write(header)

                min_sets = []
                pvalues = []
                for cluster in xrange(1, num_clusters + 1):
                    # store the best enriched set determined
                    scores, min_set, min_pvalue = results[cluster - 1]
                    min_sets.append(min_set)
                    pvalues.append(min_pvalue)

                    for row in xrange(num_genes):
                        matrix.values[row][cluster - 1] += (
                            scores[row] * set_type.weight)

                row_label = str(iteration_result['iteration'])
                set_file.write('\n' + row_label + ',' +
                               ','.join([str(i) for i in min_sets]))
                pval_file.write('\n' + row_label + ',' +
                                ','.join([str(i) for i in pvalues]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
    finally:
        # cleanup, also on failure
        SET_SET_TYPE = None
        SET_MATRIX = None
        SET_MEMBERSHIP = None
        SET_SYNONYMS = None
    return matrix
def compute_pvalues(self, iteration_result, num_motifs, force):
    """Compute motif scores for all clusters.

    The work is done in three steps:

    1. the sequences of every cluster are extracted with cluster_seqs()
       through a process pool; the sequence filters, the organism and
       the membership are published in module globals for that step only
    2. a ComputeScoreParams object is built per cluster; unless force is
       set, clusters whose feature ids equal those of the last run are
       dropped so that their previous results are reused
    3. compute_cluster_score() is run for the remaining clusters (pooled
       or serial, depending on the configuration) and the results are
       stored in iteration_result and remembered for the next call

    In order to influence the sequences that go into meme, the user
    can specify a list of sequence filter functions that have the signature
    (seqs, feature_ids, distance) -> seqs
    These filters are applied in the order they appear in the list.

    :param iteration_result: dict that is updated in place; reads
        iteration_result['iteration'] and writes 'motif-info' and
        'pvalues' into iteration_result[cluster]
    :param num_motifs: passed on to ComputeScoreParams
    :param force: when true, every cluster is recomputed even if its
        feature ids did not change
    :return: dict mapping cluster number to the pvalues produced by
        compute_cluster_score() (or reused from the previous run)
    """
    global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP
    cluster_pvalues = {}
    min_cluster_rows_allowed = self.config_params['memb.min_cluster_rows_allowed']
    max_cluster_rows_allowed = self.config_params['memb.max_cluster_rows_allowed']
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    clusters = xrange(1, self.num_clusters() + 1)

    def record(cluster, pvalues, run_result):
        """store the outcome of one cluster in all result containers"""
        cluster_pvalues[cluster] = pvalues
        if run_result:
            self.__last_motif_infos[cluster] = run_result.motif_infos
        # meme_json() is also called for a falsy run_result
        iteration_result[cluster]['motif-info'] = meme_json(run_result)
        iteration_result[cluster]['pvalues'] = pvalues

    # extract the sequences for each cluster, slow
    start_time = util.current_millis()
    try:
        SEQUENCE_FILTERS = self.__sequence_filters
        ORGANISM = self.organism
        MEMBERSHIP = self.membership

        with util.get_mp_pool(self.config_params) as pool:
            cluster_seqs_params = [(cluster, self.seqtype)
                                   for cluster in clusters]
            seqs_list = pool.map(cluster_seqs, cluster_seqs_params)
    finally:
        # reset the globals even if the extraction failed
        SEQUENCE_FILTERS = None
        ORGANISM = None
        MEMBERSHIP = None
    logging.debug("prepared sequences in %d ms.",
                  util.current_millis() - start_time)

    # Make the parameters, this is fast enough
    start_time = util.current_millis()
    params = {}
    for cluster in clusters:
        # Pass the previous run's seed if possible
        if self.__last_motif_infos is not None:
            previous_motif_infos = self.__last_motif_infos.get(cluster, None)
        else:
            previous_motif_infos = None

        seqs, feature_ids = seqs_list[cluster - 1]
        params[cluster] = ComputeScoreParams(iteration_result['iteration'],
                                             cluster, feature_ids, seqs,
                                             self.used_seqs,
                                             self.meme_runner(),
                                             min_cluster_rows_allowed,
                                             max_cluster_rows_allowed,
                                             num_motifs,
                                             previous_motif_infos,
                                             self.config_params['output_dir'],
                                             self.config_params['num_iterations'],
                                             self.config_params['debug'])
    logging.debug("prepared MEME parameters in %d ms.",
                  util.current_millis() - start_time)

    # create motif result map if necessary
    for cluster in clusters:
        if cluster not in iteration_result:
            iteration_result[cluster] = {}

    # Optimization:
    # if the cluster hasn't changed since last time, reuse the last results
    # we do this by filtering out the parameters of the clusters that did not
    # change
    if not force and self.__last_results is not None:
        oldlen = len(params)
        params = {cluster: params[cluster]
                  for cluster in clusters
                  if params[cluster].feature_ids != self.__last_results[cluster][0]}
        newlen = len(params)
        if oldlen - newlen > 0:
            logging.debug("%d clusters did not change !!!", oldlen - newlen)

    # compute and store motif results
    self.__last_motif_infos = {}
    if self.__last_results is None:
        self.__last_results = {}

    if use_multiprocessing:
        with util.get_mp_pool(self.config_params) as pool:
            results = pool.map(compute_cluster_score, params.values())
        # every result starts with its cluster number
        results = {r[0]: r[1:] for r in results}  # indexed by cluster

        for cluster in clusters:
            if cluster in results:
                pvalues, run_result = results[cluster]
                self.__last_results[cluster] = (params[cluster].feature_ids,
                                                pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]
            record(cluster, pvalues, run_result)
    else:
        for cluster in clusters:
            if cluster in params:
                _, pvalues, run_result = compute_cluster_score(params[cluster])
                self.__last_results[cluster] = (params[cluster].feature_ids,
                                                pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]
            record(cluster, pvalues, run_result)

    return cluster_pvalues
def compute_pvalues(self, iteration_result, num_motifs, force):
    """Compute motif scores for all clusters.

    The work is done in three steps:

    1. the sequences of every cluster are extracted with cluster_seqs()
       through a process pool; the sequence filters, the organism and
       the membership are published in module globals for that step only
    2. a ComputeScoreParams object is built per cluster; unless force is
       set, clusters whose feature ids equal those of the last run are
       dropped so that their previous results are reused
    3. compute_cluster_score() is run for the remaining clusters (pooled
       or serial, depending on the configuration) and the results are
       stored in iteration_result and remembered for the next call

    In order to influence the sequences that go into meme, the user
    can specify a list of sequence filter functions that have the signature
    (seqs, feature_ids, distance) -> seqs
    These filters are applied in the order they appear in the list.

    :param iteration_result: dict that is updated in place; reads
        iteration_result['iteration'] and writes 'motif-info' and
        'pvalues' into iteration_result[cluster]
    :param num_motifs: passed on to ComputeScoreParams
    :param force: when true, every cluster is recomputed even if its
        feature ids did not change
    :return: dict mapping cluster number to the pvalues produced by
        compute_cluster_score() (or reused from the previous run)
    """
    global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP
    cluster_pvalues = {}
    min_cluster_rows_allowed = self.config_params[
        'memb.min_cluster_rows_allowed']
    max_cluster_rows_allowed = self.config_params[
        'memb.max_cluster_rows_allowed']
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    clusters = xrange(1, self.num_clusters() + 1)

    def record(cluster, pvalues, run_result):
        """store the outcome of one cluster in all result containers"""
        cluster_pvalues[cluster] = pvalues
        if run_result:
            self.__last_motif_infos[cluster] = run_result.motif_infos
        # meme_json() is also called for a falsy run_result
        iteration_result[cluster]['motif-info'] = meme_json(run_result)
        iteration_result[cluster]['pvalues'] = pvalues

    # extract the sequences for each cluster, slow
    start_time = util.current_millis()
    try:
        SEQUENCE_FILTERS = self.__sequence_filters
        ORGANISM = self.organism
        MEMBERSHIP = self.membership

        with util.get_mp_pool(self.config_params) as pool:
            cluster_seqs_params = [
                (cluster, self.seqtype) for cluster in clusters
            ]
            seqs_list = pool.map(cluster_seqs, cluster_seqs_params)
    finally:
        # reset the globals even if the extraction failed
        SEQUENCE_FILTERS = None
        ORGANISM = None
        MEMBERSHIP = None
    logging.debug("prepared sequences in %d ms.",
                  util.current_millis() - start_time)

    # Make the parameters, this is fast enough
    start_time = util.current_millis()
    params = {}
    for cluster in clusters:
        # Pass the previous run's seed if possible
        if self.__last_motif_infos is not None:
            previous_motif_infos = self.__last_motif_infos.get(
                cluster, None)
        else:
            previous_motif_infos = None

        seqs, feature_ids = seqs_list[cluster - 1]
        params[cluster] = ComputeScoreParams(
            iteration_result['iteration'], cluster, feature_ids, seqs,
            self.used_seqs, self.meme_runner(), min_cluster_rows_allowed,
            max_cluster_rows_allowed, num_motifs, previous_motif_infos,
            self.config_params['output_dir'],
            self.config_params['num_iterations'],
            self.config_params['debug'])
    logging.debug("prepared MEME parameters in %d ms.",
                  util.current_millis() - start_time)

    # create motif result map if necessary
    for cluster in clusters:
        if cluster not in iteration_result:
            iteration_result[cluster] = {}

    # Optimization:
    # if the cluster hasn't changed since last time, reuse the last results
    # we do this by filtering out the parameters of the clusters that did not
    # change
    if not force and self.__last_results is not None:
        oldlen = len(params)
        params = {
            cluster: params[cluster]
            for cluster in clusters
            if params[cluster].feature_ids != self.__last_results[cluster][0]
        }
        newlen = len(params)
        if oldlen - newlen > 0:
            logging.debug("%d clusters did not change !!!", oldlen - newlen)

    # compute and store motif results
    self.__last_motif_infos = {}
    if self.__last_results is None:
        self.__last_results = {}

    if use_multiprocessing:
        with util.get_mp_pool(self.config_params) as pool:
            results = pool.map(compute_cluster_score, params.values())
        # every result starts with its cluster number
        results = {r[0]: r[1:] for r in results}  # indexed by cluster

        for cluster in clusters:
            if cluster in results:
                pvalues, run_result = results[cluster]
                self.__last_results[cluster] = (
                    params[cluster].feature_ids, pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]
            record(cluster, pvalues, run_result)
    else:
        for cluster in clusters:
            if cluster in params:
                _, pvalues, run_result = compute_cluster_score(
                    params[cluster])
                self.__last_results[cluster] = (
                    params[cluster].feature_ids, pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]
            record(cluster, pvalues, run_result)

    return cluster_pvalues