Esempio n. 1
0
    def __init__(self, id, organism, membership, ratios, seqtype, config_params=None):
        """Create a motif ScoringFunction for the sequence type `seqtype`.

        Initializes the scoring base class, delegates the MEME suite setup to
        __setup_meme_suite(), creates the two run logs and retrieves the scan
        sequences and the reverse map for the rows of `ratios`.

        Note: config_params is subscripted below ('MEME', 'num_iterations'),
        so despite its None default it has to be provided.
        """
        scoring.ScoringFunctionBase.__init__(
            self, id, organism, membership, ratios,
            config_params=config_params)

        # public attributes, available to subclasses
        self.seqtype = seqtype
        self.__setup_meme_suite(config_params)
        meme_settings = config_params['MEME']
        total_iterations = config_params['num_iterations']
        self.num_motif_func = util.get_iter_fun(meme_settings, "nmotifs",
                                                total_iterations)

        # state carried over between iterations
        self.__last_motif_infos = None
        self.__last_iteration_result = {}
        self.all_pvalues = None
        self.last_result = None

        # one run log for the scoring schedule, one for the motif schedule
        self.update_log = scoring.RunLog("motif-score-" + seqtype,
                                         config_params)
        self.motif_log = scoring.RunLog("motif-motif-" + seqtype,
                                        config_params)

        # sequences are looked up for the row names in sorted order
        self.used_seqs = organism.sequences_for_genes_scan(
            sorted(ratios.row_names), seqtype=self.seqtype)

        logging.debug("building reverse map...")
        map_start = util.current_millis()
        self.reverse_map = self.__build_reverse_map(ratios)
        map_millis = util.current_millis() - map_start
        logging.debug("reverse map built in %d ms.", map_millis)

        # holds the results of the previous meme run
        self.__last_results = None
Esempio n. 2
0
    def do_compute(self, iteration_result, ref_matrix=None):
        """Compute the weighted network score matrix over all networks.

        The networks are retrieved on first use and cached on the instance.
        For every network the per-cluster scores are computed, added to the
        result matrix using the network's weight, and collected in a
        dictionary keyed by network name. That dictionary is pickled to
        network_scores_pickle_path() and stored in self.network_scores.

        iteration_result and ref_matrix are accepted for interface
        compatibility; this method does not read them.

        Returns the score matrix after subtract_with_quantile(0.99).
        """
        # networks are cached; identity comparison is the correct None test
        if self.__networks is None:
            self.__networks = retrieve_networks(self.__organism)

        matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())

        # a dictionary that holds the scores of each gene in a given cluster
        network_iteration_scores = {cluster: {}
                                    for cluster in xrange(1, self.num_clusters() + 1)}
        network_scores = {}
        for network in self.__networks:
            logging.info("Compute scores for network '%s', WEIGHT: %f",
                         network.name, network.weight)
            start_time = util.current_millis()
            network_score = self.__compute_network_cluster_scores(network)
            network_scores[network.name] = network_score
            self.__update_score_matrix(matrix, network_score, network.weight)
            elapsed = util.current_millis() - start_time
            logging.info("NETWORK '%s' SCORING TIME: %f s.",
                         network.name, (elapsed / 1000.0))
            # additional scoring information, not used for the actual clustering
            self.__update_network_iteration_scores(network_iteration_scores,
                                                   network_score, network.weight)
            # NOTE(review): the result is never read in this method; the call
            # is kept in case compute_iteration_scores() has side effects
            iteration_scores = compute_iteration_scores(network_iteration_scores)

        # NOTE(review): the file is opened in text mode, which only suits the
        # ASCII pickle protocol; keep consistent with whatever reads it back
        with open(self.network_scores_pickle_path(), 'w') as outfile:
            cPickle.dump(network_scores, outfile)
        # immediately use in means computation
        self.network_scores = network_scores
        matrix.subtract_with_quantile(0.99)
        return matrix
Esempio n. 3
0
    def update(self, matrix, row_scores, column_scores, iteration,
               num_iterations, add_fuzz=True):
        """Top-level membership update for one iteration.

        If add_fuzz is true, the row and column scores are first passed
        through __fuzzify() together with the iteration counters. The density
        scores are then computed, size-compensated against `matrix`, and
        handed to __update_memberships().
        """
        if add_fuzz:
            row_scores, column_scores = self.__fuzzify(
                row_scores, column_scores, iteration, num_iterations)

        rd_scores, cd_scores = get_density_scores(self, row_scores,
                                                  column_scores)
        # compensate_size() returns nothing that is used here; presumably it
        # adjusts rd_scores/cd_scores in place - TODO confirm
        compensate_size(self, matrix, rd_scores, cd_scores)
        self.__update_memberships(rd_scores, cd_scores)
Esempio n. 4
0
    def __init__(self, organism, membership, matrix,
                 meme_suite, seqtype,
                 sequence_filters=None,
                 pvalue_filter=None,
                 scaling_func=None,
                 update_in_iteration=lambda iteration: True,
                 motif_in_iteration=lambda iteration: True,
                 config_params=None):
        """Create a motif ScoringFunction.

        organism         -- provides sequences_for_genes_scan()
        membership       -- passed on to the scoring base class
        matrix           -- input ratios; row_names() determines the genes
        meme_suite       -- stored as self.meme_suite
        seqtype          -- sequence type used for the sequence lookup
        sequence_filters -- optional list of filters (default: no filters)
        pvalue_filter    -- optional p-value filter
        scaling_func     -- passed on to the scoring base class
        update_in_iteration, motif_in_iteration
                         -- callables taking the iteration number
        config_params    -- passed on to the scoring base class
        """
        scoring.ScoringFunctionBase.__init__(self, membership,
                                             matrix, scaling_func,
                                             config_params)
        # attributes accessible by subclasses
        self.organism = organism
        self.meme_suite = meme_suite
        self.seqtype = seqtype
        # The original code read an undefined name `run_in_iteration` here
        # and dropped both schedule arguments. Keep the two schedules.
        self.update_in_iteration = update_in_iteration
        self.motif_in_iteration = motif_in_iteration
        # NOTE(review): run_in_iteration is kept as an attribute for existing
        # readers; aliasing it to the update schedule is an assumption - confirm
        self.run_in_iteration = update_in_iteration
        # a fresh list per instance instead of a shared mutable default
        self.__sequence_filters = ([] if sequence_filters is None
                                   else sequence_filters)
        self.__pvalue_filter = pvalue_filter
        self.__last_computed_result = None

        # precompute the sequences for all genes that are referenced in the
        # input ratios, they are used as a basis to compute the background
        # distribution for every cluster
        self.used_seqs = organism.sequences_for_genes_scan(
            sorted(matrix.row_names()), seqtype=self.seqtype)
        logging.info("used sequences for motifing retrieved")
        logging.info("building reverse map...")
        start_time = util.current_millis()
        self.reverse_map = self.__build_reverse_map(matrix)
        logging.info("reverse map built in %d ms.",
                     util.current_millis() - start_time)
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    For every cluster the rows whose first membership entry equals the
    cluster number form a submatrix; its negated column scores fill one
    column of a (num_columns x num_clusters) score array. Each data column
    is then passed through util.rorder() with num_clusters_per_column.

    Returns a list with one util.rorder() result per data column.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    # one score column per cluster
    cscores = np.zeros([num_cols, num_clusters])
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [
            data_matrix.row_names[row_index]
            for row_index in xrange(num_rows)
            if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        _, scores = scoring.compute_column_scores_submatrix(submatrix)
        # cluster numbers are 1-based, array columns 0-based
        cscores.T[cluster_num - 1] = -scores

    start_time = util.current_millis()
    column_members = [util.rorder(cscores[i], num_clusters_per_column)
                      for i in xrange(num_cols)]
    elapsed = util.current_millis() - start_time
    # milliseconds -> seconds needs a division (the original used modulo)
    logging.debug("seed column members in %f s.", elapsed / 1000.0)
    return column_members
Esempio n. 6
0
def compute_row_scores(membership, matrix, num_clusters,
                       use_multiprocessing):
    """For each cluster 1, 2, .. num_clusters compute the row scores
    for each row name in the input matrix.

    The per-cluster scores are computed, passed through
    __replace_non_numeric_values(), and rearranged into a DataMatrix whose
    rows are indexed by gene and whose columns represent clusters.

    Returns that DataMatrix, sorted by row name.
    """
    cluster_row_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, use_multiprocessing)
    cluster_row_scores = __replace_non_numeric_values(cluster_row_scores,
                                                      membership,
                                                      matrix, num_clusters)

    num_rows = matrix.num_rows()
    values = np.zeros((num_rows, num_clusters))

    # note that cluster is 0 based on a matrix
    for cluster in xrange(num_clusters):
        values[:, cluster] = cluster_row_scores[cluster]
    result = dm.DataMatrix(num_rows, num_clusters,
                           row_names=matrix.row_names(),
                           values=values)
    return result.sorted_by_row_name()
def compute_row_scores(membership, matrix, num_clusters, config_params):
    """Compute the row scores of every row for clusters 1..num_clusters.

    Returns a DataMatrix with one row per row of the input matrix and one
    column per cluster.
    """
    began = util.current_millis()
    scores_by_cluster = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, config_params)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version

    scoring_secs = (util.current_millis() - began) / 1000.0
    logging.debug("__compute_row_scores_for_clusters() in %f s.",
                  scoring_secs)

    # lay the scores out gene-by-cluster; column k holds the entry
    # scores_by_cluster[k] (0-based on the matrix side)
    began = util.current_millis()
    values = np.zeros((matrix.num_rows, num_clusters))
    for column in xrange(num_clusters):
        values[:, column] = scores_by_cluster[column]

    result = dm.DataMatrix(matrix.num_rows, num_clusters,
                           row_names=matrix.row_names, values=values)
    build_secs = (util.current_millis() - began) / 1000.0
    logging.debug("made result matrix in %f s.", build_secs)
    return result
Esempio n. 8
0
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    For every cluster the rows whose first membership entry equals the
    cluster number form a submatrix; its negated column scores fill one
    column of a (num_columns x num_clusters) score array. Each data column
    is then passed through util.rorder() with num_clusters_per_column.

    Returns a list with one util.rorder() result per data column.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    # one score column per cluster
    cscores = np.zeros([num_cols, num_clusters])
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [
            data_matrix.row_names[row_index]
            for row_index in xrange(num_rows)
            if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        _, scores = scoring.compute_column_scores_submatrix(submatrix)
        # cluster numbers are 1-based, array columns 0-based
        cscores.T[cluster_num - 1] = -scores

    start_time = util.current_millis()
    column_members = [
        util.rorder(cscores[i], num_clusters_per_column)
        for i in xrange(num_cols)
    ]
    elapsed = util.current_millis() - start_time
    # milliseconds -> seconds needs a division (the original used modulo)
    logging.info("seed column members in %f s.", elapsed / 1000.0)
    return column_members
Esempio n. 9
0
def compute_row_scores(membership, matrix, num_clusters, use_multiprocessing):
    """Compute the row scores of every row for clusters 1..num_clusters.

    Returns a DataMatrix with one row per row of the input matrix and one
    column per cluster.
    """
    began = util.current_millis()
    scores_by_cluster = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, use_multiprocessing)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version

    scoring_secs = (util.current_millis() - began) / 1000.0
    logging.info("__compute_row_scores_for_clusters() in %f s.", scoring_secs)

    # lay the scores out gene-by-cluster; column k holds the entry
    # scores_by_cluster[k] (0-based on the matrix side)
    began = util.current_millis()
    values = np.zeros((matrix.num_rows, num_clusters))
    for column in xrange(num_clusters):
        values[:, column] = scores_by_cluster[column]

    result = dm.DataMatrix(matrix.num_rows, num_clusters,
                           row_names=matrix.row_names, values=values)
    build_secs = (util.current_millis() - began) / 1000.0
    logging.info("made result matrix in %f s.", build_secs)
    return result
Esempio n. 10
0
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    For every cluster the rows whose first membership entry equals the
    cluster number form a submatrix, and the negated first row of its
    column score values is recorded. For every data column the per-cluster
    scores are passed to order(), and the first num_clusters_per_column
    entries of that result are kept.

    Returns a list with one such entry per data column.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    # one score vector per cluster, stored at index cluster_num - 1
    column_scores = []
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [
            data_matrix.row_names[row_index]
            for row_index in xrange(num_rows)
            if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        # subscripting binds tighter than the unary minus: -(values[0])
        scores = -(scoring.compute_column_scores_submatrix(submatrix).values)[0]
        column_scores.append(scores)

    column_members = []
    start_time = util.current_millis()
    for column_index in xrange(num_cols):
        scores_to_order = [column_scores[cluster_index][column_index]
                           for cluster_index in xrange(num_clusters)]
        column_members.append(order(scores_to_order)[:num_clusters_per_column])
    elapsed = util.current_millis() - start_time
    # milliseconds -> seconds needs a division (the original used modulo)
    logging.info("seed column members in %f s.", elapsed / 1000.0)
    return column_members
Esempio n. 11
0
    def __compute(self):
        """Compute the weighted network score matrix over all networks.

        The networks are retrieved on first use and cached on the instance.
        For every network the per-cluster scores are computed and added to
        the result matrix using the network's weight.

        Returns the score matrix minus its 0.99 quantile.
        """
        # networks are cached; identity comparison is the correct None test
        if self.__networks is None:
            self.__networks = retrieve_networks(self.__organism)

        matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())

        for network in self.__networks:
            network_score = self.__compute_network_cluster_scores(network)
            self.__update_score_matrix(matrix, network_score, network.weight())
        return matrix - matrix.quantile(0.99)
Esempio n. 12
0
    def run_iterations(self, row_scoring, col_scoring):
        """Run all iterations from 'start_iteration' to 'num_iterations'
        (inclusive), then post-adjust the clusters and write final results.

        Per iteration: compute row and column scores, update the membership,
        optionally save a checkpoint, log the mean network score and motif
        p-values found in the iteration result, and write results/stats at
        the RESULT_FREQ/STATS_FREQ frequencies (and always at iteration 1).
        """
        self.report_params()
        self.write_start_info()

        for iteration in range(self['start_iteration'],
                               self['num_iterations'] + 1):
            logging.info("Iteration # %d", iteration)
            iteration_result = {'iteration': iteration}

            rscores = row_scoring.compute(iteration_result)
            start_time = util.current_millis()
            cscores = col_scoring.compute(iteration_result)
            elapsed = util.current_millis() - start_time
            logging.info("computed column_scores in %f s.", elapsed / 1000.0)

            self.membership().update(self.ratio_matrix, rscores, cscores,
                                     self['num_iterations'], iteration_result)

            if (iteration > 0 and self.CHECKPOINT_INTERVAL and
                    iteration % self.CHECKPOINT_INTERVAL == 0):
                self.save_checkpoint_data(iteration, row_scoring, col_scoring)

            # summary values are only present if the scoring calls above
            # put them into iteration_result
            mean_net_score = iteration_result.get('networks', 0.0)
            if 'motif-pvalue' in iteration_result:
                mean_mot_pvalues = iteration_result['motif-pvalue']
                mean_mot_pvalue = "".join(
                    " '%s' = %f" % (seqtype, pvalue)
                    for seqtype, pvalue in mean_mot_pvalues.items())
            else:
                mean_mot_pvalue = "NA"

            logging.info('mean net = %s | mean mot = %s', str(mean_net_score), mean_mot_pvalue)

            if iteration == 1 or (iteration % RESULT_FREQ == 0):
                self.write_results(iteration_result)

            if iteration == 1 or (iteration % STATS_FREQ == 0):
                self.write_stats(iteration_result)
                # run infos should be written with the same frequency as stats
                self.write_runlog(row_scoring, iteration)

            gc.collect()

        logging.info("Postprocessing: Adjusting the clusters....")
        self.membership().postadjust()
        # the post-adjusted state is reported as one extra iteration
        iteration = self['num_iterations'] + 1
        iteration_result = {'iteration': iteration}
        logging.info("Adjusted. Now re-run scoring (iteration: %d)", iteration_result['iteration'])
        row_scoring.compute_force(iteration_result)
        self.write_results(iteration_result)
        self.write_stats(iteration_result)
        self.write_finish_info()
        # parenthesized form prints the same text with or without the
        # print statement
        print("Done !!!!")
Esempio n. 13
0
    def __init__(self, organism, membership, matrix,
                 meme_suite, seqtype,
                 sequence_filters=None,
                 scaling_func=None,
                 num_motif_func=None,
                 update_in_iteration=lambda iteration: True,
                 motif_in_iteration=lambda iteration: True,
                 config_params=None):
        """Create a motif ScoringFunction.

        organism         -- provides sequences_for_genes_scan()
        membership       -- passed on to the scoring base class
        matrix           -- input ratios; row_names determines the genes
        meme_suite       -- stored as self.meme_suite
        seqtype          -- sequence type, also used in the run log names
        sequence_filters -- optional list of filters (default: no filters)
        scaling_func     -- passed on to the scoring base class
        num_motif_func   -- stored as self.num_motif_func
        update_in_iteration, motif_in_iteration
                         -- callables taking the iteration number
        config_params    -- passed on to the base class and the run logs
        """
        # run_in_iteration does not apply here, since we actually have
        # two schedules, motif_in_iteration and update_in_iteration here
        scoring.ScoringFunctionBase.__init__(self, membership,
                                             matrix, scaling_func,
                                             run_in_iteration=None,
                                             config_params=config_params)
        # attributes accessible by subclasses
        self.organism = organism
        self.meme_suite = meme_suite
        self.seqtype = seqtype
        self.update_in_iteration = update_in_iteration
        self.motif_in_iteration = motif_in_iteration
        self.num_motif_func = num_motif_func

        # a fresh list per instance instead of a shared mutable default
        self.__sequence_filters = ([] if sequence_filters is None
                                   else sequence_filters)
        self.__last_run_results = None
        self.__last_iteration_result = {}

        self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
        self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

        # precompute the sequences for all genes that are referenced in the
        # input ratios, they are used as a basis to compute the background
        # distribution for every cluster
        used_genes = sorted(matrix.row_names)
        self.used_seqs = organism.sequences_for_genes_scan(
            used_genes, seqtype=self.seqtype)

        # Disabled sequence cache setup (previously a no-op string literal):
        #
        # self.seq_cache = sequence_cache.SequenceCache('sequence_cache.db')
        # dist = self.config_params['scan_distances'][self.seqtype]
        # self.seq_cache.add_sequence_type(self.seqtype, dist[0], dist[1])
        #
        # logging.info("used sequences retrieved, building cache...")
        # start_time = util.current_millis()
        # seq_data = [(gene, seq[1], seq[0].contig, seq[0].start,
        #              seq[0].end, seq[0].reverse)
        #             for gene, seq in self.used_seqs.items()]
        # self.seq_cache.add_sequences(self.seqtype, dist[0], dist[1], seq_data)
        # logging.info("used sequences cache built in %d ms.",
        #              util.current_millis() - start_time)

        logging.info("building reverse map...")
        start_time = util.current_millis()
        self.reverse_map = self.__build_reverse_map(matrix)
        logging.info("reverse map built in %d ms.",
                     util.current_millis() - start_time)
Esempio n. 14
0
 def compute(self, iteration_result, ref_matrix=None):
     """Compute the column scores for the current membership and matrix.

     iteration_result and ref_matrix are accepted for interface
     compatibility; this method does not read them.

     Returns the result of compute_column_scores().
     """
     return compute_column_scores(self.membership(),
                                  self.matrix(),
                                  self.num_clusters())
Esempio n. 15
0
    def compute(self, iteration_result, ref_matrix):
        """Compute the set enrichment scores for the current iteration.

        If __run_in_iteration() is true for iteration_result['iteration'],
        a cluster score is computed for every set type and cluster, the best
        enriched set and its p-value are remembered per set type and
        cluster, and the scores are written into a new gene x cluster
        matrix which becomes the last computed result.

        Returns the last computed result; in iterations where the function
        does not run this is whatever an earlier run stored.
        """
        global SET_MATRIX, SET_MEMBERSHIP, SET_REF_MATRIX, SET_SET_TYPE
        iteration = iteration_result['iteration']

        if self.__run_in_iteration(iteration):
            logging.info("Compute scores for set enrichment...")
            start_time = util.current_millis()
            matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                                   self.gene_names())
            use_multiprocessing = self.config_params[
                scoring.KEY_MULTIPROCESSING]
            # inputs for compute_cluster_score() are published as module
            # globals before any pool is created; presumably so that worker
            # processes can read them - TODO confirm
            SET_MATRIX = self.matrix()
            SET_MEMBERSHIP = self.membership()
            SET_REF_MATRIX = ref_matrix

            for set_type in self.__set_types:
                SET_SET_TYPE = set_type
                logging.info("PROCESSING SET TYPE '%s'", set_type.name)
                start1 = util.current_millis()
                cluster_args = [(cluster, self.bonferroni_cutoff())
                                for cluster in xrange(1,
                                                      self.num_clusters() + 1)]
                if use_multiprocessing:
                    pool = mp.Pool()
                    try:
                        results = pool.map(compute_cluster_score,
                                           cluster_args)
                    finally:
                        # always release the workers, even if map() raised,
                        # and wait for them to exit before the next pool
                        pool.close()
                        pool.join()
                else:
                    results = [compute_cluster_score(args)
                               for args in cluster_args]

                elapsed1 = util.current_millis() - start1
                logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                             elapsed1 / 1000.0)

                for cluster in xrange(1, self.num_clusters() + 1):
                    # store the best enriched set determined
                    scores, min_set, min_pvalue = results[cluster - 1]
                    self.__last_min_enriched_set[set_type][cluster] = (
                        min_set, min_pvalue)

                    # clusters are 1-based, matrix columns 0-based
                    for row in xrange(len(self.gene_names())):
                        matrix[row][cluster - 1] = scores[row]

            logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                         (util.current_millis() - start_time) / 1000.0)
            self.__last_computed_result = matrix

        return self.__last_computed_result
Esempio n. 16
0
def weighted_row_means(matrix, weights):
    """Compute weighted row means.

    The matrix is multiplied by the weights, the row means of the product
    are taken with util.row_means(), and the result is divided by the sum
    of the weights, where NaN weights are masked out of that sum.
    """
    # multiply each column of matrix with each component of the weight
    # vector in one vectorized expression; the original author measured a
    # speedup from 125 s. to 0.125 seconds over apply_along_axis()
    scaled = weights * matrix
    scale = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
    return util.row_means(scaled) / scale
Esempio n. 17
0
    def __fuzzify(self, row_scores, column_scores, iteration,
                  num_iterations):
        """Provide an iteration-specific fuzzification.

        The scores of the current cluster members (rows resp. columns) are
        collected and passed, together with the fuzzy coefficient for this
        iteration, to util.sd_rnorm(), which supplies one value per matrix
        cell. Those values are added in place to the value arrays obtained
        from row_scores and column_scores.

        Returns the (row_scores, column_scores) objects that were passed in.
        """
        fuzzy_coeff = std_fuzzy_coefficient(iteration, num_iterations)
        num_rscore_rows = row_scores.num_rows()
        num_rscore_cols = row_scores.num_columns()
        num_cscore_rows = column_scores.num_rows()
        num_cscore_cols = column_scores.num_columns()

        # optimization: unwrap the numpy arrays to access them directly
        row_score_values = row_scores.values()
        col_score_values = column_scores.values()

        # row scores of the rows that are members of each cluster;
        # score column `col` belongs to cluster `col + 1`
        row_sd_values = []
        row_names = row_scores.row_names()
        for col in xrange(num_rscore_cols):
            cluster_rows = self.rows_for_cluster(col + 1)
            row_sd_values.extend(row_score_values[row][col]
                                 for row in xrange(num_rscore_rows)
                                 if row_names[row] in cluster_rows)

        # Note: If there are no non-NaN values in row_sd_values, row_rnorm
        # will have all NaNs
        row_rnorm = util.sd_rnorm(row_sd_values,
                                  num_rscore_rows * num_rscore_cols,
                                  fuzzy_coeff)

        # same for the column scores of each cluster's member columns
        col_sd_values = []
        row_names = column_scores.row_names()
        for col in xrange(num_cscore_cols):
            cluster_cols = self.columns_for_cluster(col + 1)
            col_sd_values.extend(col_score_values[row][col]
                                 for row in xrange(num_cscore_rows)
                                 if row_names[row] in cluster_cols)

        # Note: If there are no non-NaN values in col_sd_values, col_rnorm
        # will have all NaNs
        col_rnorm = util.sd_rnorm(col_sd_values,
                                  num_cscore_rows * num_cscore_cols,
                                  fuzzy_coeff)

        # add fuzzy values to the row/column scores; += updates the
        # unwrapped arrays in place
        row_score_values += np.array(row_rnorm).reshape(
            num_rscore_rows, num_rscore_cols)
        col_score_values += np.array(col_rnorm).reshape(
            num_cscore_rows, num_cscore_cols)
        return row_scores, column_scores
Esempio n. 18
0
    def compute(self, iteration_result, ref_matrix=None):
        """Compute the row scores, log the elapsed time and record the run
        together with the scaling for iteration_result['iteration'].

        ref_matrix is not read by this method.
        """
        began = util.current_millis()
        membership = self.membership()
        matrix = self.matrix()
        num_clusters = self.num_clusters()
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        row_scores = compute_row_scores(membership, matrix, num_clusters,
                                        use_multiprocessing)

        seconds = (util.current_millis() - began) / 1000.0
        logging.info("ROW SCORING TIME: %f s.", seconds)
        iteration = iteration_result['iteration']
        self.run_log.log(True, self.scaling(iteration))
        return row_scores
Esempio n. 19
0
    def compute(self, iteration_result, ref_matrix=None):
        """Compute the motif scores for the specified iteration.

        If motif_in_iteration() is true for the iteration, the p-values are
        recomputed with compute_pvalues(). If p-values are available and
        update_in_iteration() is true, they are remapped from feature ids to
        genes through self.reverse_map and written into a new
        gene x cluster matrix, which becomes the last computed result.

        The last motif iteration result is stored in
        iteration_result['motifs']. ref_matrix is not read by this method.

        Returns the last computed result; in iterations where the scores
        are not recomputed this is whatever an earlier run stored.
        """
        iteration = iteration_result['iteration']
        logging.info('Scoring motifs...')
        global_start_time = util.current_millis()
        if self.motif_in_iteration(iteration):  # meme.iter in R
            logging.info('Running Motifing...')
            # running MEME and store the result for the non-motifing
            # iterations to reuse
            self.__last_iteration_result = {}
            self.__last_pvalues = self.compute_pvalues(self.__last_iteration_result, iteration)

        if self.__last_pvalues is not None and self.update_in_iteration(iteration):  # mot.iter in R
            logging.info('Recomputing motif scores...')
            # running the scoring itself
            # values are returned here - consider remapping them per gene
            remapped = {}
            for cluster in self.__last_pvalues:
                pvalues_k = self.__last_pvalues[cluster]
                remapped[cluster] = {
                    self.reverse_map[feature_id]: pvalue
                    for feature_id, pvalue in pvalues_k.items()}

            # convert remapped to an actual scoring matrix
            matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                                   self.gene_names())
            for row_index in xrange(matrix.num_rows()):
                row = matrix.row_name(row_index)
                for cluster in xrange(1, self.num_clusters() + 1):
                    # clusters are 1-based, matrix columns 0-based
                    if cluster in remapped and row in remapped[cluster]:
                        matrix[row_index][cluster - 1] = remapped[cluster][row]
            global_elapsed = util.current_millis() - global_start_time
            logging.info("GLOBAL MOTIF TIME: %d seconds",
                         (global_elapsed / 1000.0))
            self.__last_computed_result = matrix
            self.update_log.log(self.update_in_iteration(iteration),
                                self.scaling(iteration))
            self.motif_log.log(self.motif_in_iteration(iteration),
                               self.scaling(iteration))
        iteration_result['motifs'] = self.__last_iteration_result
        return self.__last_computed_result
Esempio n. 20
0
    def __init__(self,
                 organism,
                 membership,
                 ratios,
                 meme_suite,
                 seqtype,
                 sequence_filters=None,
                 scaling_func=None,
                 num_motif_func=None,
                 update_in_iteration=lambda iteration: True,
                 motif_in_iteration=lambda iteration: True,
                 config_params=None):
        """Create a motif ScoringFunction.

        organism         -- passed on to the base class; also provides
                            sequences_for_genes_scan()
        membership       -- passed on to the scoring base class
        ratios           -- input ratios; row_names determines the genes
        meme_suite       -- stored as self.meme_suite
        seqtype          -- sequence type, also used in the run log names
        sequence_filters -- optional list of filters (default: no filters)
        scaling_func     -- passed on to the scoring base class
        num_motif_func   -- stored as self.num_motif_func
        update_in_iteration, motif_in_iteration
                         -- callables taking the iteration number
        config_params    -- passed on to the base class and the run logs
        """
        # run_in_iteration does not apply here, since we actually have
        # two schedules, motif_in_iteration and update_in_iteration here
        scoring.ScoringFunctionBase.__init__(self,
                                             organism,
                                             membership,
                                             ratios,
                                             scaling_func,
                                             schedule=None,
                                             config_params=config_params)
        # attributes accessible by subclasses
        self.meme_suite = meme_suite
        self.seqtype = seqtype
        self.update_in_iteration = update_in_iteration
        self.motif_in_iteration = motif_in_iteration
        self.num_motif_func = num_motif_func

        # a fresh list per instance instead of a shared mutable default
        self.__sequence_filters = ([] if sequence_filters is None
                                   else sequence_filters)
        self.__last_motif_infos = None
        self.__last_iteration_result = {}
        self.all_pvalues = None
        self.last_result = None

        self.update_log = scoring.RunLog("motif-score-" + seqtype,
                                         config_params)
        self.motif_log = scoring.RunLog("motif-motif-" + seqtype,
                                        config_params)

        # sequences are looked up for the row names in sorted order
        used_genes = sorted(ratios.row_names)
        self.used_seqs = organism.sequences_for_genes_scan(
            used_genes, seqtype=self.seqtype)

        logging.info("building reverse map...")
        start_time = util.current_millis()
        self.reverse_map = self.__build_reverse_map(ratios)
        logging.info("reverse map built in %d ms.",
                     util.current_millis() - start_time)
Esempio n. 21
0
    def run_iterations(self, row_scoring, col_scoring):
        """Runs all configured iterations and the optional post adjustment.

        Iterations go from self['start_iteration'] to self['num_iterations']
        (inclusive), each followed by a garbage collection. If
        self['postadjust'] is set, the clusters are adjusted afterwards and
        the re-computed scores are written under iteration number
        num_iterations + 1.

        row_scoring: row scoring function, must provide combine_cached() and
                     compute_force() in addition to what run_iteration() uses
        col_scoring: column scoring function, passed on to run_iteration()
        """
        self.report_params()
        self.write_start_info()
        for iteration in range(self['start_iteration'],
                               self['num_iterations'] + 1):
            start_time = util.current_millis()
            self.run_iteration(row_scoring, col_scoring, iteration)
            # garbage collection after everything in iteration went out of scope
            gc.collect()
            elapsed = util.current_millis() - start_time
            logging.info("performed iteration %d in %f s.", iteration,
                         elapsed / 1000.0)

        # run post processing after the last iteration. We store the results in
        # num_iterations + 1 to have a clean separation
        if self['postadjust']:
            logging.info("Postprocessing: Adjusting the clusters....")
            # run combiner using the weights of the last iteration
            rscores = row_scoring.combine_cached(self['num_iterations'])
            rd_scores = memb.get_row_density_scores(self.membership(), rscores)
            logging.info("Recomputed combined + density scores.")
            memb.postadjust(self.membership(), rd_scores)
            logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                         self['num_iterations'])

            iteration_result = {'iteration': self['num_iterations'] + 1}
            combined_scores = row_scoring.compute_force(iteration_result)

            # write the combined scores for benchmarking/diagnostics
            with open(self.combined_rscores_pickle_path(), 'w') as outfile:
                cPickle.dump(combined_scores, outfile)

            self.write_results(iteration_result)
            self.write_stats(iteration_result)
            # NOTE(review): 'iteration' is the loop variable of the last
            # regular iteration, not num_iterations + 1, and is unbound if
            # the loop above did not run at all - confirm this is intended.
            self.update_iteration(iteration)

            if self['debug']:
                # write complete result into a cmresults.tsv
                conn = self.__dbconn()
                try:
                    path = os.path.join(self['output_dir'],
                                        'cmresults-postproc.tsv.bz2')
                    with bz2.BZ2File(path, 'w') as outfile:
                        debug.write_iteration(conn, outfile,
                                              self['num_iterations'] + 1,
                                              self['num_clusters'],
                                              self['output_dir'])
                finally:
                    # release the connection even if the dump fails
                    conn.close()

        self.write_finish_info()
        logging.info("Done !!!!")
Esempio n. 22
0
    def run_iteration(self, row_scoring, col_scoring, iteration):
        """Runs a single iteration.

        Computes the row and column scores, updates the membership with
        them, and then - depending on the configuration - saves a checkpoint,
        writes results and stats, and writes a compressed debug dump.

        row_scoring: row scoring function, compute() is called on it
        col_scoring: column scoring function, compute() is called on it
        iteration: number of the iteration to run
        """
        logging.info("Iteration # %d", iteration)
        iteration_result = {'iteration': iteration}
        rscores = row_scoring.compute(iteration_result)
        start_time = util.current_millis()
        cscores = col_scoring.compute(iteration_result)
        elapsed = util.current_millis() - start_time
        if elapsed > 0.0001:
            logging.info("computed column_scores in %f s.", elapsed / 1000.0)

        self.membership().update(self.ratio_matrix, rscores, cscores,
                                 self['num_iterations'], iteration_result)

        if (iteration > 0 and self['checkpoint_interval']
                and iteration % self['checkpoint_interval'] == 0):
            self.save_checkpoint_data(iteration, row_scoring, col_scoring)

        # summary values the scoring functions may have left in the result
        mean_net_score = iteration_result.get('networks', 0.0)
        if 'motif-pvalue' in iteration_result:
            mean_mot_pvalue = "".join(
                " '%s' = %f" % (seqtype, pvalue)
                for seqtype, pvalue in iteration_result['motif-pvalue'].items())
        else:
            mean_mot_pvalue = "NA"

        logging.info('mean net = %s | mean mot = %s', str(mean_net_score),
                     mean_mot_pvalue)

        if iteration == 1 or (iteration % self['result_freq'] == 0):
            self.write_results(iteration_result)

        if iteration == 1 or (iteration % self['stats_freq'] == 0):
            self.write_stats(iteration_result)
            self.update_iteration(iteration)

        if self['debug']:
            # write complete result into a cmresults.tsv
            conn = self.__dbconn()
            try:
                path = os.path.join(self['output_dir'],
                                    'cmresults-%04d.tsv.bz2' % iteration)
                with bz2.BZ2File(path, 'w') as outfile:
                    debug.write_iteration(conn, outfile, iteration,
                                          self['num_clusters'],
                                          self['output_dir'])
            finally:
                # release the connection even if the dump fails
                conn.close()
Esempio n. 23
0
    def run_iterations(self, row_scoring, col_scoring):
        """Runs all configured iterations and the optional post adjustment.

        Iterations go from self['start_iteration'] to self['num_iterations']
        (inclusive), each followed by a garbage collection. If
        self['postadjust'] is set, the clusters are adjusted afterwards and
        the re-computed scores are written under iteration number
        num_iterations + 1.

        row_scoring: row scoring function, must provide combine_cached() and
                     compute_force() in addition to what run_iteration() uses
        col_scoring: column scoring function, passed on to run_iteration()
        """
        self.report_params()
        self.write_start_info()
        for iteration in range(self['start_iteration'],
                               self['num_iterations'] + 1):
            start_time = util.current_millis()
            self.run_iteration(row_scoring, col_scoring, iteration)
            # garbage collection after everything in iteration went out of scope
            gc.collect()
            elapsed = util.current_millis() - start_time
            logging.info("performed iteration %d in %f s.", iteration, elapsed / 1000.0)

        # run post processing after the last iteration. We store the results in
        # num_iterations + 1 to have a clean separation
        if self['postadjust']:
            logging.info("Postprocessing: Adjusting the clusters....")
            # run combiner using the weights of the last iteration
            rscores = row_scoring.combine_cached(self['num_iterations'])
            rd_scores = memb.get_row_density_scores(self.membership(), rscores)
            logging.info("Recomputed combined + density scores.")
            memb.postadjust(self.membership(), rd_scores)
            logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                         self['num_iterations'])

            iteration_result = {'iteration': self['num_iterations'] + 1}
            combined_scores = row_scoring.compute_force(iteration_result)

            # write the combined scores for benchmarking/diagnostics
            with open(self.combined_rscores_pickle_path(), 'w') as outfile:
                cPickle.dump(combined_scores, outfile)

            self.write_results(iteration_result)
            self.write_stats(iteration_result)
            # NOTE(review): 'iteration' is the loop variable of the last
            # regular iteration, not num_iterations + 1, and is unbound if
            # the loop above did not run at all - confirm this is intended.
            self.update_iteration(iteration)

            if self['debug']:
                # write complete result into a cmresults.tsv
                conn = self.__dbconn()
                try:
                    path = os.path.join(self['output_dir'], 'cmresults-postproc.tsv.bz2')
                    with bz2.BZ2File(path, 'w') as outfile:
                        debug.write_iteration(conn, outfile,
                                              self['num_iterations'] + 1,
                                              self['num_clusters'], self['output_dir'])
                finally:
                    # release the connection even if the dump fails
                    conn.close()

        self.write_finish_info()
        logging.info("Done !!!!")
Esempio n. 24
0
def quantile_normalize_scores(matrices, weights=None):
    """quantile normalize scores against each other

    matrices: the score matrices to normalize
    weights: optional weights; if given, the means are computed with
             weighted_row_means(), otherwise with util.row_means()
    returns: the result of qm_result_matrices() for the computed means
    """
    flat_values = as_sorted_flat_values(matrices)
    # identity test on purpose: "!= None" would be evaluated element-wise
    # if the weights are passed as a numpy array
    if weights is not None:
        tmp_mean = weighted_row_means(flat_values, weights)
    else:
        tmp_mean = util.row_means(flat_values)
    return qm_result_matrices(matrices, tmp_mean)
Esempio n. 25
0
def get_col_density_scores(membership, col_scores):
    """Computes the column density scores of all clusters.

    The bandwidth handed to get_cc_scores() is 1/100 of the absolute range
    of col_scores, but never less than 0.001.

    membership: the membership object, provides the number of clusters
    col_scores: the column score matrix
    returns: a new DataMatrix with the dimensions and names of col_scores,
             whose column for cluster k holds get_cc_scores() for k
    """
    cluster_count = membership.num_clusters()
    score_span = abs(col_scores.max() - col_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    density = dm.DataMatrix(col_scores.num_rows, col_scores.num_columns,
                            col_scores.row_names, col_scores.column_names)
    values = density.values

    began = util.current_millis()
    # clusters are numbered from 1, matrix columns from 0; writing to a row
    # of the transpose fills a whole column in one numpy assignment
    for column in xrange(cluster_count):
        values.T[column] = get_cc_scores(membership, col_scores,
                                         bandwidth, column + 1)

    logging.info("CC_SCORES IN %f s.",
                 (util.current_millis() - began) / 1000.0)
    return density
Esempio n. 26
0
    def run_iteration(self, row_scoring, col_scoring, iteration):
        """Runs a single iteration.

        Computes the row and column scores, updates the membership with
        them, and then - depending on the configuration - saves a checkpoint,
        writes results and stats, and writes a compressed debug dump.

        row_scoring: row scoring function, compute() is called on it
        col_scoring: column scoring function, compute() is called on it
        iteration: number of the iteration to run
        """
        logging.info("Iteration # %d", iteration)
        iteration_result = {'iteration': iteration}
        rscores = row_scoring.compute(iteration_result)
        start_time = util.current_millis()
        cscores = col_scoring.compute(iteration_result)
        elapsed = util.current_millis() - start_time
        if elapsed > 0.0001:
            logging.info("computed column_scores in %f s.", elapsed / 1000.0)

        self.membership().update(self.ratio_matrix, rscores, cscores,
                                 self['num_iterations'], iteration_result)

        if (iteration > 0 and self['checkpoint_interval'] and
                iteration % self['checkpoint_interval'] == 0):
            self.save_checkpoint_data(iteration, row_scoring, col_scoring)

        # summary values the scoring functions may have left in the result
        mean_net_score = iteration_result.get('networks', 0.0)
        if 'motif-pvalue' in iteration_result:
            mean_mot_pvalue = "".join(
                " '%s' = %f" % (seqtype, pvalue)
                for seqtype, pvalue in iteration_result['motif-pvalue'].items())
        else:
            mean_mot_pvalue = "NA"

        logging.info('mean net = %s | mean mot = %s', str(mean_net_score), mean_mot_pvalue)

        if iteration == 1 or (iteration % self['result_freq'] == 0):
            self.write_results(iteration_result)

        if iteration == 1 or (iteration % self['stats_freq'] == 0):
            self.write_stats(iteration_result)
            self.update_iteration(iteration)

        if self['debug']:
            # write complete result into a cmresults.tsv
            conn = self.__dbconn()
            try:
                path = os.path.join(self['output_dir'], 'cmresults-%04d.tsv.bz2' % iteration)
                with bz2.BZ2File(path, 'w') as outfile:
                    debug.write_iteration(conn, outfile, iteration,
                                          self['num_clusters'], self['output_dir'])
            finally:
                # release the connection even if the dump fails
                conn.close()
Esempio n. 27
0
    def run_iteration(self, iteration):
        """Performs one iteration: scoring, membership update and reporting.

        Results and stats are only written if self['minimize_io'] is not
        set. The compressed debug dump is written if 'dump_results' is in
        self['debug'] and the iteration is 1 or a multiple of
        self['debug_freq'].

        iteration: number of the iteration to run
        """
        logging.info("Iteration # %d", iteration)
        iteration_result = {'iteration': iteration, 'score_means': {}}
        rscores = self.row_scoring.compute(iteration_result)
        col_started = util.current_millis()
        cscores = self.column_scoring.compute(iteration_result)
        col_elapsed = util.current_millis() - col_started
        if col_elapsed > 0.0001:
            logging.debug("computed column_scores in %f s.", col_elapsed / 1000.0)

        self.membership().update(self.ratios, rscores, cscores,
                                 self['num_iterations'], iteration_result)

        # summary values the scoring functions may have left in the result
        net_summary = iteration_result.get('networks', 0.0)
        if 'motif-pvalue' in iteration_result:
            pvalue_means = iteration_result['motif-pvalue']
            mot_summary = "".join(
                " '%s' = %f" % (seqtype, pvalue_means[seqtype])
                for seqtype in pvalue_means.keys())
        else:
            mot_summary = "NA"

        logging.debug('mean net = %s | mean mot = %s', str(net_summary), mot_summary)

        # Reduce I/O, will write the results to database only on a debug run
        if not self['minimize_io']:
            if iteration == 1 or (iteration % self['result_freq'] == 0):
                self.write_results(iteration_result)

            if iteration == 1 or (iteration % self['stats_freq'] == 0):
                self.write_stats(iteration_result)
                self.update_iteration(iteration)

        dump_requested = 'dump_results' in self['debug']
        if dump_requested and (iteration == 1 or
                               (iteration % self['debug_freq'] == 0)):
            # write complete result into a cmresults.tsv
            # NOTE(review): the connection is not closed here - confirm
            # whether __dbconn() hands out a shared connection.
            conn = self.__dbconn()
            dump_name = 'cmresults-%04d.tsv.bz2' % iteration
            dump_path = os.path.join(self['output_dir'], dump_name)
            with bz2.BZ2File(dump_path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'], self['output_dir'])
Esempio n. 28
0
def get_row_density_scores(membership, row_scores):
    """Computes the row density scores of all clusters.

    According to the original note, density scores improve small clusters.
    The bandwidth handed to get_rr_scores() is 1/100 of the absolute range
    of row_scores, but never less than 0.001.

    membership: the membership object, provides the number of clusters
    row_scores: the row score matrix
    returns: a new DataMatrix with the dimensions and names of row_scores,
             whose column for cluster k holds get_rr_scores() for k
    """
    cluster_count = membership.num_clusters()
    score_span = abs(row_scores.max() - row_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    density = dm.DataMatrix(row_scores.num_rows, row_scores.num_columns,
                            row_scores.row_names, row_scores.column_names)
    values = density.values

    began = util.current_millis()
    # clusters are numbered from 1, matrix columns from 0; writing to a row
    # of the transpose fills a whole column in one numpy assignment
    for column in xrange(cluster_count):
        values.T[column] = get_rr_scores(membership, row_scores,
                                         bandwidth, column + 1)

    logging.info("RR_SCORES IN %f s.",
                 (util.current_millis() - began) / 1000.0)
    return density
Esempio n. 29
0
    def update(self, matrix, row_scores, column_scores, num_iterations,
               iteration_result):
        """Top-level update, runs the whole update sequence once.

        The steps are, in this order and each timed and logged: fuzzify(),
        get_density_scores(), compensate_size(), update_for_rows() and
        update_for_cols(). In the last iteration, i.e. when
        iteration_result['iteration'] equals num_iterations, the (potentially
        fuzzed) row scores are also pickled to self.pickle_path() so that the
        post adjustment step can use them.
        """
        config = self.__config_params

        began = util.current_millis()
        row_scores, column_scores = fuzzify(self, row_scores, column_scores,
                                            num_iterations, iteration_result,
                                            config['add_fuzz'])
        logging.info("fuzzify took %f s.",
                     (util.current_millis() - began) / 1000.0)

        # only the last iteration needs the pickled row scores
        if iteration_result['iteration'] == num_iterations:
            with open(self.pickle_path(), 'w') as outfile:
                cPickle.dump(row_scores, outfile)

        began = util.current_millis()
        rd_scores, cd_scores = get_density_scores(self, row_scores,
                                                  column_scores)
        logging.info("GET_DENSITY_SCORES() took %f s.",
                     (util.current_millis() - began) / 1000.0)

        began = util.current_millis()
        compensate_size(self, matrix, rd_scores, cd_scores)
        logging.info("COMPENSATE_SIZE() took %f s.",
                     (util.current_millis() - began) / 1000.0)

        began = util.current_millis()
        update_for_rows(self, rd_scores,
                        config['multiprocessing'], config['debug'])
        logging.info("update_for rdscores finished in %f s.",
                     (util.current_millis() - began) / 1000.0)

        began = util.current_millis()
        update_for_cols(self, cd_scores,
                        config['multiprocessing'], config['debug'])
        logging.info("update_for cdscores finished in %f s.",
                     (util.current_millis() - began) / 1000.0)
Esempio n. 30
0
    def do_compute(self, iteration_result, ref_matrix=None):
        """Computes the score matrix from all networks.

        Every network in self.networks() is scored and merged into the
        result using the network's weight. As a side effect,
        self.score_means is replaced with the means derived from the
        per-network scores. Neither iteration_result nor ref_matrix is read
        in this method.

        returns: a DataMatrix with one row per gene name and one column per
                 cluster
        """
        score_matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                                     self.gene_names())
        scores_by_network = {}
        for net in self.networks():
            logging.debug("Compute scores for network '%s', WEIGHT: %f",
                          net.name, net.weight)
            began = util.current_millis()
            cluster_scores = self.__compute_network_cluster_scores(net)
            scores_by_network[net.name] = cluster_scores
            self.__update_score_matrix(score_matrix, cluster_scores, net.weight)
            duration = util.current_millis() - began
            logging.debug("NETWORK '%s' SCORING TIME: %f s.", net.name,
                          (duration / 1000.0))

        # compute and store score means
        self.score_means = self.__update_score_means(scores_by_network)
        return score_matrix
Esempio n. 31
0
    def do_compute(self, iteration_result, ref_matrix=None):
        """Computes the score matrix from all networks.

        Every network in self.networks() is scored and merged into the
        result using the network's weight. As a side effect,
        self.score_means is replaced with the means derived from the
        per-network scores. Neither iteration_result nor ref_matrix is read
        in this method.

        returns: a DataMatrix with one row per gene name and one column per
                 cluster
        """
        score_matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                                     self.gene_names())
        scores_by_network = {}
        for net in self.networks():
            logging.debug("Compute scores for network '%s', WEIGHT: %f",
                          net.name, net.weight)
            began = util.current_millis()
            cluster_scores = self.__compute_network_cluster_scores(net)
            scores_by_network[net.name] = cluster_scores
            self.__update_score_matrix(score_matrix, cluster_scores, net.weight)
            duration = util.current_millis() - began
            logging.debug("NETWORK '%s' SCORING TIME: %f s.",
                          net.name, (duration / 1000.0))

        # compute and store score means
        self.score_means = self.__update_score_means(scores_by_network)
        return score_matrix
Esempio n. 32
0
    def compute(self, iteration_result, ref_matrix=None):
        """compute scores for one iteration

        Runs every scoring function, collects the matrices that are not None
        together with the scaling of the function that produced them,
        quantile normalizes them if there is more than one, and sums them up
        weighted by their scalings.

        iteration_result: result dictionary, 'iteration' is read from it and
                          it is passed on to each scoring function
        ref_matrix: optional reference matrix for the scoring functions; if
                    omitted, the first computed matrix takes that role
        returns: the combined score matrix, or None if no scoring function
                 returned a matrix
        """
        result_matrices = []
        score_scalings = []
        reference_matrix = ref_matrix
        iteration = iteration_result['iteration']
        for scoring_function in self.__scoring_functions:
            # This  is actually a hack in order to propagate
            # a reference matrix to the compute function
            # This could have negative impact on scalability
            if reference_matrix is None and len(result_matrices) > 0:
                reference_matrix = result_matrices[0]

            matrix = scoring_function.compute(iteration_result, reference_matrix)
            if matrix is not None:
                result_matrices.append(matrix)
                score_scalings.append(scoring_function.scaling(iteration))

                if self.__log_subresults:
                    self.__log_subresult(scoring_function, matrix)

        if len(result_matrices) > 1:
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)

        if len(result_matrices) == 0:
            return None

        # score_scalings runs parallel to result_matrices. Indexing
        # self.__scoring_functions by matrix position would pick the wrong
        # scaling as soon as one function returned None and was skipped.
        combined_score = result_matrices[0] * score_scalings[0]
        for index in xrange(1, len(result_matrices)):
            combined_score += result_matrices[index] * score_scalings[index]
        return combined_score
Esempio n. 33
0
    def __init__(self,
                 id,
                 organism,
                 membership,
                 ratios,
                 seqtype,
                 config_params=None):
        """Initializes the scoring function on top of ScoringFunctionBase.

        id, organism, membership, ratios, config_params: forwarded to
            scoring.ScoringFunctionBase.__init__()
        seqtype: selects the sequences fetched from the organism and is
            appended to the names of the two run logs
        config_params: despite the None default it has to support item
            access, since 'MEME' and 'num_iterations' are read from it
        """
        scoring.ScoringFunctionBase.__init__(
            self, id, organism, membership, ratios,
            config_params=config_params)

        # attributes accessible by subclasses
        self.seqtype = seqtype
        self.__setup_meme_suite(config_params)
        meme_params = config_params['MEME']
        self.num_motif_func = util.get_iter_fun(
            meme_params, "nmotifs", config_params['num_iterations'])

        # nothing has been computed yet
        self.__last_motif_infos = None
        self.__last_iteration_result = {}
        self.all_pvalues = None
        self.last_result = None

        self.update_log = scoring.RunLog("motif-score-" + seqtype,
                                         config_params)
        self.motif_log = scoring.RunLog("motif-motif-" + seqtype,
                                        config_params)

        gene_names = sorted(ratios.row_names)
        self.used_seqs = organism.sequences_for_genes_scan(
            gene_names, seqtype=self.seqtype)

        logging.debug("building reverse map...")
        map_started = util.current_millis()
        self.reverse_map = self.__build_reverse_map(ratios)
        logging.debug("reverse map built in %d ms.",
                      util.current_millis() - map_started)

        # caches the results of the previous meme run
        self.__last_results = None
Esempio n. 34
0
def get_col_density_scores(membership, col_scores):
    """Computes the column density scores of all clusters.

    The bandwidth handed to get_cc_scores() is 1/100 of the absolute range
    of col_scores, but never less than 0.001.

    membership: the membership object, provides the number of clusters
    col_scores: the column score matrix
    returns: a new DataMatrix with the dimensions and names of col_scores,
             whose column for cluster k holds get_cc_scores() for k
    """
    cluster_count = membership.num_clusters()
    score_span = abs(col_scores.max() - col_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    density = dm.DataMatrix(col_scores.num_rows, col_scores.num_columns,
                            col_scores.row_names, col_scores.column_names)
    values = density.values

    began = util.current_millis()
    # clusters are numbered from 1, matrix columns from 0; writing to a row
    # of the transpose fills a whole column in one numpy assignment
    for column in xrange(cluster_count):
        values.T[column] = get_cc_scores(membership, col_scores,
                                         bandwidth, column + 1)

    logging.info("CC_SCORES IN %f s.",
                 (util.current_millis() - began) / 1000.0)
    return density
Esempio n. 35
0
    def __combine(self, result_matrices, score_scalings, iteration):
        """Combines the result matrices into a single score matrix.

        If there is more than one matrix and 'quantile_normalize' is set in
        the configuration, the matrices are quantile normalized first with
        score_scalings as weights. The matrices are then summed up, each
        multiplied by the scaling of the scoring function at the same
        position.

        returns: the combined matrix, or None if result_matrices is empty
        """
        normalize = (len(result_matrices) > 1 and
                     self.__config_params['quantile_normalize'])
        if normalize:
            began = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            logging.info("quantile normalize in %f s.",
                         (util.current_millis() - began) / 1000.0)

        if not len(result_matrices) > 0:
            return None

        began = util.current_millis()
        # NOTE(review): the scalings are looked up by position in
        # self.__scoring_functions instead of being taken from
        # score_scalings; this presumes that result_matrices is aligned
        # with the scoring functions - confirm against the caller.
        functions = self.__scoring_functions
        total = result_matrices[0] * functions[0].scaling(iteration)
        for position in xrange(1, len(result_matrices)):
            total += (result_matrices[position] *
                      functions[position].scaling(iteration))
        logging.info("combined score in %f s.",
                     (util.current_millis() - began) / 1000.0)
        return total
Esempio n. 36
0
    def __init__(self, organism, membership, ratios,
                 meme_suite, seqtype,
                 sequence_filters=None,
                 scaling_func=None,
                 num_motif_func=None,
                 update_in_iteration=lambda iteration: True,
                 motif_in_iteration=lambda iteration: True,
                 config_params=None):
        """creates a ScoringFunction

        organism: forwarded to the base class and queried through
                  sequences_for_genes_scan() for the scan sequences
        membership, ratios, scaling_func, config_params: forwarded to
                  scoring.ScoringFunctionBase.__init__(); the sorted
                  ratios.row_names are the genes whose sequences are fetched
        meme_suite: stored as self.meme_suite
        seqtype: selects the sequences and is appended to the run log names
        sequence_filters: optional list of filters, stored privately; when
                  omitted, every instance gets its own new empty list
        num_motif_func: stored as self.num_motif_func
        update_in_iteration, motif_in_iteration: the two schedules of this
                  function, callables that take the iteration number
        """
        # run_in_iteration does not apply here, since we actually have
        # two schedules, motif_in_iteration and update_in_iteration here
        scoring.ScoringFunctionBase.__init__(self, organism, membership,
                                             ratios, scaling_func,
                                             schedule=None,
                                             config_params=config_params)
        # attributes accessible by subclasses
        self.meme_suite = meme_suite
        self.seqtype = seqtype
        self.update_in_iteration = update_in_iteration
        self.motif_in_iteration = motif_in_iteration
        self.num_motif_func = num_motif_func

        # A default of [] in the signature would be one list object shared by
        # all instances created without filters, so create it per instance.
        if sequence_filters is None:
            sequence_filters = []
        self.__sequence_filters = sequence_filters
        self.__last_motif_infos = None
        self.__last_iteration_result = {}
        self.all_pvalues = None
        self.last_result = None

        self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
        self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

        used_genes = sorted(ratios.row_names)
        self.used_seqs = organism.sequences_for_genes_scan(
            used_genes, seqtype=self.seqtype)

        logging.info("building reverse map...")
        start_time = util.current_millis()
        self.reverse_map = self.__build_reverse_map(ratios)
        logging.info("reverse map built in %d ms.",
                     util.current_millis() - start_time)
Esempio n. 37
0
def quantile_normalize_scores(matrices, weights=None):
    """Quantile normalizes the given score matrices against each other.

    matrices: the score matrices; the 'values' attribute of each one is
              flattened and sorted
    weights: optional weights, one per matrix; NaN weights are left out of
             the weight sum
    returns: the result of qm_result_matrices() for the computed row means
    """
    logging.info("COMPUTING WEIGHTED MEANS...")
    began = util.current_millis()

    # one column per input matrix, each column holding the values of that
    # matrix in sorted order
    sorted_columns = [np.sort(m.values.flatten()) for m in matrices]
    flat_values = np.asarray(sorted_columns).T

    logging.info("flattened/sorted score matrices in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    if weights is None:
        tmp_mean = util.row_means(flat_values)
    else:
        # Scale every column with its weight in a single array
        # multiplication; the original author measured this as about 1000x
        # faster than apply_along_axis().
        weighted = weights * flat_values
        weight_sum = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
        tmp_mean = util.row_means(weighted) / weight_sum
    logging.info("weighted means in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    normalized = qm_result_matrices(matrices, tmp_mean)
    logging.info("result matrices built in %f s.",
                 (util.current_millis() - began) / 1000.0)
    return normalized
Esempio n. 38
0
def quantile_normalize_scores(matrices, weights=None):
    """Quantile normalizes the given score matrices against each other.

    matrices: the score matrices; the 'values' attribute of each one is
              flattened and sorted
    weights: optional weights, one per matrix; NaN weights are left out of
             the weight sum
    returns: the result of qm_result_matrices() for the computed row means
    """
    logging.info("COMPUTING WEIGHTED MEANS...")
    began = util.current_millis()

    # one column per input matrix, each column holding the values of that
    # matrix in sorted order
    sorted_columns = [np.sort(m.values.flatten()) for m in matrices]
    flat_values = np.asarray(sorted_columns).T

    logging.info("flattened/sorted score matrices in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    if weights is None:
        tmp_mean = util.row_means(flat_values)
    else:
        # Scale every column with its weight in a single array
        # multiplication; the original author measured this as about 1000x
        # faster than apply_along_axis().
        weighted = weights * flat_values
        weight_sum = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
        tmp_mean = util.row_means(weighted) / weight_sum
    logging.info("weighted means in %f s.",
                 (util.current_millis() - began) / 1000.0)

    began = util.current_millis()
    normalized = qm_result_matrices(matrices, tmp_mean)
    logging.info("result matrices built in %f s.",
                 (util.current_millis() - began) / 1000.0)
    return normalized
Esempio n. 39
0
    def update(self, matrix, row_scores, column_scores,
               num_iterations, iteration_result):
        """Top-level update, runs the whole update sequence once.

        The steps are, in this order and each timed and logged: fuzzify(),
        get_density_scores(), compensate_size(), update_for_rows() and
        update_for_cols(). In the last iteration, i.e. when
        iteration_result['iteration'] equals num_iterations, the (potentially
        fuzzed) row scores are also pickled to self.pickle_path() so that the
        post adjustment step can use them.
        """
        config = self.__config_params

        began = util.current_millis()
        row_scores, column_scores = fuzzify(self, row_scores, column_scores,
                                            num_iterations, iteration_result,
                                            config['add_fuzz'])
        logging.info("fuzzify took %f s.",
                     (util.current_millis() - began) / 1000.0)

        # only the last iteration needs the pickled row scores
        if iteration_result['iteration'] == num_iterations:
            with open(self.pickle_path(), 'w') as outfile:
                cPickle.dump(row_scores, outfile)

        began = util.current_millis()
        rd_scores, cd_scores = get_density_scores(self, row_scores,
                                                  column_scores)
        logging.info("GET_DENSITY_SCORES() took %f s.",
                     (util.current_millis() - began) / 1000.0)

        began = util.current_millis()
        compensate_size(self, matrix, rd_scores, cd_scores)
        logging.info("COMPENSATE_SIZE() took %f s.",
                     (util.current_millis() - began) / 1000.0)

        began = util.current_millis()
        update_for_rows(self, rd_scores,
                        config['multiprocessing'], config['debug'])
        logging.info("update_for rdscores finished in %f s.",
                     (util.current_millis() - began) / 1000.0)

        began = util.current_millis()
        update_for_cols(self, cd_scores,
                        config['multiprocessing'], config['debug'])
        logging.info("update_for cdscores finished in %f s.",
                     (util.current_millis() - began) / 1000.0)
Esempio n. 40
0
    def compute(self, iteration_result, ref_matrix=None):
        """compute method for the specified iteration
        Note: will return None if not computed yet and the result of a previous
        scoring if the function is not supposed to actually run in this iteration

        iteration_result: result dictionary, 'iteration' is read from it and
                          it is passed on to compute_pvalues()
        ref_matrix: not used by this method
        returns: a DataMatrix (gene names x clusters) holding the p-values
                 that compute_pvalues() produced, remapped from feature ids
                 to the names in self.reverse_map
        """
        iteration = iteration_result['iteration']
        if self.run_in_iteration(iteration):
            logging.info('Scoring motifs...')
            global_start_time = util.current_millis()

            # here is the main compute of the cluster scores
            pvalues = self.compute_pvalues(iteration_result)
            # values are returned per feature id - remap them per gene
            remapped = {}
            for cluster in pvalues:
                remapped[cluster] = {
                    self.reverse_map[feature_id]: pvalue
                    for feature_id, pvalue in pvalues[cluster].items()}

            # convert remapped to an actual scoring matrix
            matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                                   self.gene_names())
            for row_index in xrange(matrix.num_rows()):
                row = matrix.row_name(row_index)
                for cluster in xrange(1, self.num_clusters() + 1):
                    # test the dictionaries directly: keys() would build a
                    # new list for every single row/cluster combination
                    if cluster in remapped and row in remapped[cluster]:
                        matrix[row_index][cluster - 1] = remapped[cluster][row]
            global_elapsed = util.current_millis() - global_start_time
            logging.info("GLOBAL MOTIF TIME: %d seconds",
                         (global_elapsed / 1000.0))
            self.__last_computed_result = matrix

        return self.__last_computed_result
Esempio n. 41
0
def get_row_density_scores(membership, row_scores):
    """Build the row density score matrix for all clusters.

    The result has the same dimensions and names as `row_scores`; column
    `cluster - 1` is filled with the output of `get_rr_scores` for that
    cluster. The bandwidth handed to `get_rr_scores` is 1/100 of the value
    range of `row_scores`, but never less than 0.001.
    (Original note: getting density scores improves small clusters.)
    """
    cluster_count = membership.num_clusters()
    score_span = abs(row_scores.max() - row_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    density = dm.DataMatrix(row_scores.num_rows, row_scores.num_columns,
                            row_scores.row_names, row_scores.column_names)
    # Writing through the transpose lets numpy assign a whole cluster
    # column in one go instead of cell by cell.
    columns = density.values.T

    started = util.current_millis()
    for column in xrange(cluster_count):
        columns[column] = get_rr_scores(membership, row_scores, bandwidth,
                                        column + 1)

    logging.info("RR_SCORES IN %f s.",
                 (util.current_millis() - started) / 1000.0)
    return density
Esempio n. 42
0
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """Combine n result matrices into one score matrix, weighted by scalings.

    Every input matrix is first extreme-value fixed and shifted by its 0.99
    quantile (both in place). Then, depending on
    config_params['quantile_normalize']:

    - True:  the matrices are quantile normalized against each other (only
             when there is more than one);
    - False: matrix 0 (assumed to be the gene expression score) is centered
             and scaled with the median and MAD (falling back to the standard
             deviation) of the scores of the current cluster members; every
             other matrix is rescaled so its 0.01 quantile matches the one of
             the scaled matrix 0.

    The result is sum(matrix[i] * score_scalings[i]), wrapped in a DataMatrix
    that carries the row/column names of matrix 0.

    Keyword arguments:
    result_matrices -- list of DataMatrix objects of identical shape
    score_scalings  -- one weight per matrix, indexed like result_matrices
    membership      -- cluster membership, used on the non-quantile path
    iteration       -- current iteration, only used for the debug dump
    config_params   -- reads 'quantile_normalize', 'debug', 'debug_freq' and,
                       when dumping, 'pipeline' and 'output_dir'

    Returns None when result_matrices is empty.
    """
    quantile_normalize = config_params['quantile_normalize']

    # Nothing to combine. Checked up front because the non-quantile path
    # below indexes result_matrices[0] and would fail on an empty list.
    if len(result_matrices) == 0:
        return None

    # debug mode: print scoring matrices before combining
    dump_scores = ('dump_scores' in config_params['debug']
                   and (iteration == 1 or
                        (iteration % config_params['debug_freq'] == 0)))

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        if dump_scores:
            funs = config_params['pipeline']['row-scoring']['args'][
                'functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                             compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(
                result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []  # scores of all (member row, cluster) pairs
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([
                mat.values[index_map[row], cluster - 1] for row in row_members
            ])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows,
                                    num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                # find a non-zero reference value to scale by; sparse score
                # matrices can have a 0.01 quantile of exactly 0
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values',
                        i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    start_time = util.current_millis()
    # assuming same format of all matrices
    combined_score = np.zeros(in_matrices[0].shape)
    for i, values in enumerate(in_matrices):
        combined_score += values * score_scalings[i]

    elapsed = util.current_millis() - start_time
    logging.debug("combined score in %f s.", elapsed / 1000.0)
    matrix0 = result_matrices[0]  # as reference for names
    return dm.DataMatrix(matrix0.num_rows,
                         matrix0.num_columns,
                         matrix0.row_names,
                         matrix0.column_names,
                         values=combined_score)
Esempio n. 43
0
    def run_iteration(self, iteration, force=False):
        """Run a single cMonkey iteration.

        Computes the row and column scores, updates the cluster membership
        with them, logs the mean network/motif scores found in the iteration
        result and writes results, stats and debug dumps according to the
        configured frequencies.

        Keyword arguments:
        iteration -- The iteration number to run
        force     -- Set to true to force recalculations (DEFAULT:FALSE)
        """
        logging.info("Iteration # %d", iteration)
        iteration_result = {'iteration': iteration, 'score_means': {}}

        if force:
            rscores = self.row_scoring.compute_force(iteration_result)
        else:
            rscores = self.row_scoring.compute(iteration_result)

        start_time = util.current_millis()
        if force:
            cscores = self.column_scoring.compute_force(iteration_result)
        else:
            cscores = self.column_scoring.compute(iteration_result)
        elapsed = util.current_millis() - start_time
        if elapsed > 0.0001:
            logging.debug("computed column_scores in %f s.", elapsed / 1000.0)

        self.membership().update(self.ratios, rscores, cscores,
                                 self['num_iterations'], iteration_result)

        # summary values for the log line; both entries are optional
        mean_net_score = iteration_result.get('networks', 0.0)
        if 'motif-pvalue' in iteration_result:
            mean_mot_pvalue = "".join(
                " '%s' = %f" % (seqtype, pvalue)
                for seqtype, pvalue in iteration_result['motif-pvalue'].items())
        else:
            mean_mot_pvalue = "NA"

        logging.debug('mean net = %s | mean mot = %s', str(mean_net_score), mean_mot_pvalue)

        # Reduce I/O, will write the results to database only on a debug run
        if not self['minimize_io']:
            if iteration == 1 or (iteration % self['result_freq'] == 0):
                self.write_results(iteration_result)

        # This should not be too much writing, so we can keep it OUT of minimize_io option...?
        if iteration == 1 or (iteration % self['stats_freq'] == 0):
            self.write_stats(iteration_result)
            self.update_iteration(iteration)

        if 'dump_results' in self['debug'] and (iteration == 1 or
                                                (iteration % self['debug_freq'] == 0)):
            # write complete result into a cmresults.tsv
            # NOTE(review): the connection is not closed in this method
            conn = self.__dbconn()
            path = os.path.join(self['output_dir'], 'cmresults-%04d.tsv.bz2' % iteration)
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'], self['output_dir'])
Esempio n. 44
0
    def run_iterations(self, start_iter=None, num_iter=None):
        """Run the iterations in range(start_iter, num_iter), then post-process.

        Keyword arguments:
        start_iter -- first iteration to run (DEFAULT: self['start_iteration'])
        num_iter   -- exclusive upper bound (DEFAULT: self['num_iterations'] + 1)

        Returns immediately, without running anything, in interactive mode.
        If self['postadjust'] is set, the clusters are adjusted after the last
        iteration and the scoring is re-run once more; those results are
        stored under iteration number num_iterations + 1.
        """
        if start_iter is None:
            start_iter = self['start_iteration']
        if num_iter is None:
            num_iter = self['num_iterations'] + 1

        if self.config_params['interactive']:  # stop here in interactive mode
            return

        # remembered for update_iteration() in the post-processing step; stays
        # None when the range is empty so that step does not hit an unbound
        # loop variable
        last_iteration = None
        for iteration in range(start_iter, num_iter):
            start_time = util.current_millis()

            # 02-09-15 Force recalculation if first iteration of a resume.
            # Kept as an explicit comparison so that only a real True (not any
            # truthy config value) triggers the forced run.
            force = (iteration == start_iter) and (self['resume'] == True)
            self.run_iteration(iteration, force=force)
            last_iteration = iteration
            # garbage collection after everything in iteration went out of scope
            gc.collect()
            elapsed = util.current_millis() - start_time
            logging.debug("performed iteration %d in %f s.", iteration, elapsed / 1000.0)

            if 'profile_mem' in self['debug'] and (iteration == 1 or iteration % 100 == 0):
                with open(os.path.join(self['output_dir'], 'memprofile.tsv'), 'a') as outfile:
                    self.write_mem_profile(outfile, iteration)

        # run post processing after the last iteration. We store the results in
        # num_iterations + 1 to have a clean separation
        if self['postadjust']:
            logging.info("Postprocessing: Adjusting the clusters....")
            # run combiner using the weights of the last iteration
            rscores = self.row_scoring.combine_cached(self['num_iterations'])
            rd_scores = memb.get_row_density_scores(self.membership(), rscores)
            logging.info("Recomputed combined + density scores.")
            memb.postadjust(self.membership(), rd_scores)

            BSCM_obj = self.column_scoring.get_BSCM()
            if BSCM_obj is not None:
                # NOTE(review): the return value was bound to a local that was
                # never read; confirm resplit_clusters updates the membership
                # in place.
                BSCM_obj.resplit_clusters(self.membership(), cutoff=0.05)

            logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                         self['num_iterations'])
            iteration_result = {'iteration': self['num_iterations'] + 1,
                                'score_means': {}}

            combined_scores = self.row_scoring.compute_force(iteration_result)

            # write the combined scores for benchmarking/diagnostics
            with open(self.combined_rscores_pickle_path(), 'w') as outfile:
                cPickle.dump(combined_scores, outfile)

            self.write_results(iteration_result)
            self.write_stats(iteration_result)
            if last_iteration is not None:
                self.update_iteration(last_iteration)

            # default behaviour:
            # always write complete result into a cmresults.tsv for R/cmonkey
            # compatibility
            # NOTE(review): conn is never closed here (open question carried
            # over from the original author).
            conn = self.__dbconn()
            path = os.path.join(self['output_dir'], 'cmresults-postproc.tsv.bz2')
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile,
                                      self['num_iterations'] + 1,
                                      self['num_clusters'], self['output_dir'])

            # additionally: run tomtom on the motifs if requested
            if (self['MEME']['global_background'] == 'True' and
                    self['Postprocessing']['run_tomtom'] == 'True'):
                meme.run_tomtom(conn, self['output_dir'], self['MEME']['version'])

        self.write_finish_info()
        logging.info("Done !!!!")
    def do_compute(self, iteration_result, ref_matrix):
        """Compute the set enrichment score matrix for this iteration.

        For every configured set type, `compute_cluster_score` is evaluated
        once per cluster (through a process pool when multiprocessing is
        enabled) and the returned per-row scores, multiplied by the set
        type's weight, are added to a fresh genes x clusters DataMatrix.
        The best enriched set and its p-value per cluster are appended as one
        CSV row per set type to setEnrichment_set.csv and
        setEnrichment_pvalue.csv in the output directory.

        The inputs of `compute_cluster_score` are handed over through the
        module-level SET_* globals, which are reset to None before returning.
        The CANONICAL_* globals are filled on first use and kept.

        Keyword arguments:
        iteration_result -- dict of the current iteration; 'iteration' is
                            written as the first CSV column
        ref_matrix       -- reference score matrix; only its minimum is used

        Returns the newly computed DataMatrix.
        """
        global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS, CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
        logging.info("Compute scores for set enrichment...")
        start_time = util.current_millis()
        matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        synonyms = self.organism.thesaurus()
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_SYNONYMS = synonyms

        if CANONICAL_ROWNAMES is None:
            CANONICAL_ROWNAMES = {
                synonyms[name] if name in synonyms else name
                for name in self.ratios.row_names
            }

        if CANONICAL_ROW_INDEXES is None:
            CANONICAL_ROW_INDEXES = {}
            for index, row in enumerate(self.ratios.row_names):
                canonical = synonyms[row] if row in synonyms else row
                CANONICAL_ROW_INDEXES[canonical] = index

        ref_min_score = ref_matrix.min()
        logging.info('REF_MIN_SCORE: %f', ref_min_score)

        set_filepath = os.path.join(self.config_params['output_dir'],
                                    'setEnrichment_set.csv')
        pval_filepath = os.path.join(self.config_params['output_dir'],
                                     'setEnrichment_pvalue.csv')

        num_clusters = self.num_clusters()
        num_rows = len(self.gene_names())
        # CSV header: empty first cell, then the cluster numbers
        header = ',' + ','.join(
            [str(i) for i in xrange(1, num_clusters + 1)])

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            score_args = [(cluster, self.bonferroni_cutoff(), ref_min_score)
                          for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    results = pool.map(compute_cluster_score, score_args)
            else:
                results = [compute_cluster_score(args) for args in score_args]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            min_sets = []
            min_pvalues = []
            for cluster in xrange(1, num_clusters + 1):
                # store the best enriched set determined
                scores, min_set, min_pvalue = results[cluster - 1]
                min_sets.append(min_set)
                min_pvalues.append(min_pvalue)

                for row in xrange(num_rows):
                    matrix.values[row][cluster -
                                       1] += scores[row] * set_type.weight

            # The existence of the set file decides for BOTH files whether
            # they are created with a header or appended to. `with` makes
            # sure the handles are closed even if a write fails.
            mode = 'a' if os.path.exists(set_filepath) else 'w'
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pv_file:
                if mode == 'w':
                    set_file.write(header)
                    pv_file.write(header)
                row_prefix = '\n' + str(iteration_result['iteration']) + ','
                set_file.write(row_prefix +
                               ','.join([str(i) for i in min_sets]))
                pv_file.write(row_prefix +
                              ','.join([str(i) for i in min_pvalues]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
        # cleanup
        SET_SET_TYPE = None
        SET_MATRIX = None
        SET_MEMBERSHIP = None
        SET_SYNONYMS = None

        return matrix
Esempio n. 46
0
    def do_compute(self, iteration_result, ref_matrix):
        """Compute the set enrichment score matrix for this iteration.

        For every configured set type, `compute_cluster_score` is evaluated
        once per cluster (through a process pool when multiprocessing is
        enabled). The per-row scores are written into a fresh
        genes x clusters DataMatrix and the best enriched set and p-value per
        cluster are remembered on the instance and appended as one CSV row
        per set type to out/setEnrichment_set.csv and
        out/setEnrichment_pvalue.csv.

        The inputs of `compute_cluster_score` are handed over through the
        module-level SET_* globals.

        Keyword arguments:
        iteration_result -- dict of the current iteration; 'iteration' is
                            written as the first CSV column
        ref_matrix       -- reference score matrix, published as SET_REF_MATRIX

        Returns the newly computed DataMatrix.
        """
        global SET_MATRIX, SET_MEMBERSHIP, SET_REF_MATRIX, SET_SET_TYPE
        logging.info("Compute scores for set enrichment...")
        start_time = util.current_millis()
        matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_REF_MATRIX = ref_matrix

        # NOTE(review): the output paths are relative to the working
        # directory and hard-coded - confirm this is intended.
        set_filepath = 'out/setEnrichment_set.csv'
        pval_filepath = 'out/setEnrichment_pvalue.csv'

        num_clusters = self.num_clusters()
        num_rows = len(self.gene_names())
        # CSV header: empty first cell, then the cluster numbers
        header = ',' + ','.join(
            [str(i) for i in xrange(1, num_clusters + 1)])

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            score_args = [(cluster, self.bonferroni_cutoff())
                          for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                pool = mp.Pool()
                try:
                    results = pool.map(compute_cluster_score, score_args)
                finally:
                    # release the worker processes even if map() raised
                    pool.close()
                    pool.join()
            else:
                results = [compute_cluster_score(args) for args in score_args]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            min_sets = []
            min_pvalues = []
            for cluster in xrange(1, num_clusters + 1):
                # store the best enriched set determined
                scores, min_set, min_pvalue = results[cluster - 1]
                self.__last_min_enriched_set[set_type][cluster] = (min_set,
                                                                   min_pvalue)
                min_sets.append(min_set)
                min_pvalues.append(min_pvalue)

                # NOTE(review): plain assignment - with several set types the
                # last one overwrites the scores of the earlier ones.
                for row in xrange(num_rows):
                    matrix.values[row][cluster - 1] = scores[row]

            # The existence of the set file decides for BOTH files whether
            # they are created with a header or appended to. `with` makes
            # sure the handles are closed even if a write fails.
            mode = 'a' if os.path.exists(set_filepath) else 'w'
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pv_file:
                if mode == 'w':
                    set_file.write(header)
                    pv_file.write(header)
                row_prefix = '\n' + str(iteration_result['iteration']) + ','
                set_file.write(row_prefix +
                               ','.join([str(i) for i in min_sets]))
                pv_file.write(row_prefix +
                              ','.join([str(i) for i in min_pvalues]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
        return matrix
Esempio n. 47
0
    def compute_pvalues(self, iteration_result, num_motifs, force):
        """Compute motif scores.
        The result is a dictionary from cluster -> (feature_id, pvalue)
        containing a sparse gene-to-pvalue mapping for each cluster

        In order to influence the sequences
        that go into meme, the user can specify a list of sequence filter
        functions that have the signature
        (seqs, feature_ids, distance) -> seqs
        These filters are applied in the order they appear in the list.

        Keyword arguments:
        iteration_result -- dict of the current iteration; 'iteration' is read
                            and iteration_result[cluster] receives the entries
                            'motif-info' and 'pvalues' for every cluster
        num_motifs       -- passed through to ComputeScoreParams
        force            -- if true, recompute every cluster, even those whose
                            feature ids did not change since the last run

        Side effects: refreshes the per-cluster result cache and the motif
        infos that are handed to the next run as `previous_motif_infos`.
        """
        global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

        cluster_pvalues = {}
        min_cluster_rows_allowed = self.config_params[
            'memb.min_cluster_rows_allowed']
        max_cluster_rows_allowed = self.config_params[
            'memb.max_cluster_rows_allowed']
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        clusters = xrange(1, self.num_clusters() + 1)

        # extract the sequences for each cluster, slow
        # the inputs of cluster_seqs are handed over through module globals
        start_time = util.current_millis()
        SEQUENCE_FILTERS = self.__sequence_filters
        ORGANISM = self.organism
        MEMBERSHIP = self.membership

        # NOTE(review): this step uses the pool regardless of the
        # multiprocessing setting read above.
        with util.get_mp_pool(self.config_params) as pool:
            cluster_seqs_params = [(cluster, self.seqtype)
                                   for cluster in clusters]
            seqs_list = pool.map(cluster_seqs, cluster_seqs_params)

        SEQUENCE_FILTERS = None
        ORGANISM = None
        MEMBERSHIP = None
        logging.debug("prepared sequences in %d ms.",
                      util.current_millis() - start_time)

        # Make the parameters, this is fast enough
        start_time = util.current_millis()
        params = {}
        for cluster in clusters:
            # Pass the previous run's seed if possible
            if self.__last_motif_infos is not None:
                previous_motif_infos = self.__last_motif_infos.get(
                    cluster, None)
            else:
                previous_motif_infos = None

            seqs, feature_ids = seqs_list[cluster - 1]
            params[cluster] = ComputeScoreParams(
                iteration_result['iteration'], cluster, feature_ids, seqs,
                self.used_seqs, self.meme_runner(), min_cluster_rows_allowed,
                max_cluster_rows_allowed, num_motifs, previous_motif_infos,
                self.config_params['output_dir'],
                self.config_params['num_iterations'],
                self.config_params['debug'])

        logging.debug("prepared MEME parameters in %d ms.",
                      util.current_millis() - start_time)

        # create motif result map if necessary
        for cluster in clusters:
            if cluster not in iteration_result:
                iteration_result[cluster] = {}

        # Optimization:
        # if the cluster hasn't changed since last time, reuse the last results
        # we do this by filtering out the parameters of the clusters that did not
        # change
        if not force and self.__last_results is not None:
            oldlen = len(params)
            params = {
                cluster: params[cluster]
                for cluster in clusters
                if params[cluster].feature_ids != self.__last_results[cluster][0]
            }
            newlen = len(params)
            if oldlen - newlen > 0:
                logging.debug("%d clusters did not change !!!",
                              oldlen - newlen)

        # compute and store motif results
        self.__last_motif_infos = {}
        if self.__last_results is None:
            self.__last_results = {}

        def record(cluster, fresh):
            """Book one cluster's outcome. `fresh` is the (pvalues, run_result)
            pair just returned by compute_cluster_score, or None to reuse the
            cached result of the previous run."""
            if fresh is not None:
                pvalues, run_result = fresh
                self.__last_results[cluster] = (
                    params[cluster].feature_ids, pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]

            cluster_pvalues[cluster] = pvalues
            if run_result:
                self.__last_motif_infos[cluster] = run_result.motif_infos
            iteration_result[cluster]['motif-info'] = meme_json(run_result)
            iteration_result[cluster]['pvalues'] = pvalues

        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(compute_cluster_score, params.values())
            # each result is (cluster, pvalues, run_result); index by cluster
            results = {r[0]: r[1:] for r in results}
            for cluster in clusters:
                record(cluster, results.get(cluster))
        else:
            for cluster in clusters:
                fresh = None
                if cluster in params:
                    fresh = compute_cluster_score(params[cluster])[1:]
                record(cluster, fresh)

        return cluster_pvalues
Esempio n. 48
0
def combine(result_matrices, score_scalings, membership, quantile_normalize):
    """Combine n score matrices into one weighted score matrix.

    fix_extreme_values() is called on every input matrix first. The matrices
    are then brought onto a comparable scale and summed, each one weighted
    with the entry of score_scalings at the same index.

    Parameters:
    result_matrices    -- list of score matrices, matrix 0 serves as the
                          reference for the row and column names of the result
    score_scalings     -- weights, indexed in parallel to result_matrices
    membership         -- cluster membership, only consulted when
                          quantile_normalize is false
    quantile_normalize -- if true, the matrices are normalized through
                          dm.quantile_normalize_scores() (only done when there
                          is more than one matrix); if false, matrix 0 is
                          centered and scaled using the scores of the current
                          cluster members and every further matrix is rescaled
                          to the 1% quantile of that result

    Returns the combined scores as a DataMatrix, or None if result_matrices
    is empty.
    """
    # Nothing to combine. Checking this first keeps the non-normalizing
    # branch from indexing into an empty list.
    if len(result_matrices) == 0:
        return None

    for m in result_matrices:
        m.fix_extreme_values()

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.info("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        # we assume matrix 0 is always the gene expression score
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # collect the scores of all (member row, cluster) combinations
        rsm = []
        for cluster in range(1, membership.num_clusters() + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row]][cluster - 1]
                        for row in row_members])

        scale = util.mad(rsm)
        if scale == 0:  # fall back so that we do not divide by 0
            scale = util.r_stddev(rsm)

        if scale != 0:
            rsvalues = (mat.values - util.median(rsm)) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices = [rscores.values]

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.info("RS_QUANT = %f", rs_quant)
            for matrix in result_matrices[1:]:
                values = matrix.values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    # dividing by 0 would fill the combined score with
                    # inf/nan, so a sparse score is passed on unscaled
                    logging.error("very sparse score !!!")
                else:
                    values = values / qqq * abs(rs_quant)
                in_matrices.append(values)

    start_time = util.current_millis()
    # assuming same format of all matrices
    combined_score = np.zeros(in_matrices[0].shape)
    for i, values in enumerate(in_matrices):
        combined_score += values * score_scalings[i]

    elapsed = util.current_millis() - start_time
    logging.info("combined score in %f s.", elapsed / 1000.0)
    matrix0 = result_matrices[0]  # as reference for names
    return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                         matrix0.row_names, matrix0.column_names,
                         values=combined_score)
Esempio n. 49
0
    def compute_pvalues(self, iteration_result, num_motifs):
        """Compute motif scores for all clusters.

        The result is a dictionary from cluster -> (feature_id, pvalue)
        containing a sparse gene-to-pvalue mapping for each cluster.

        In order to influence the sequences
        that go into meme, the user can specify a list of sequence filter
        functions that have the signature
        (seqs, feature_ids, distance) -> seqs
        These filters are applied in the order they appear in the list.

        Parameters:
        iteration_result -- dictionary of the current iteration, its
                            'iteration' entry is read and for every cluster
                            the entries 'motif-info' and 'pvalues' are stored
                            under iteration_result[cluster]
        num_motifs       -- number of motifs, passed on to the score
                            computation

        Side effects: the motif infos remembered from the previous call are
        replaced by the ones of this run. The module level variables
        SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP and MOTIF_PARAMS are set while
        the computations run and are reset to None afterwards.
        """
        global MOTIF_PARAMS, SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

        def map_in_pool(func, args):
            """maps func over args in a fresh process pool, the pool is
            released even if one of the workers fails"""
            pool = mp.Pool()
            try:
                return pool.map(func, args)
            finally:
                pool.close()
                pool.join()

        cluster_pvalues = {}
        min_cluster_rows_allowed = self.config_params[
            'memb.min_cluster_rows_allowed']
        max_cluster_rows_allowed = self.config_params[
            'memb.max_cluster_rows_allowed']
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        clusters = list(range(1, self.num_clusters() + 1))

        # extract the sequences for each cluster, slow
        # The module level variables have to be assigned before the pool is
        # created, and they are reset even if the extraction fails
        start_time = util.current_millis()
        SEQUENCE_FILTERS = self.__sequence_filters
        ORGANISM = self.organism
        MEMBERSHIP = self.membership
        try:
            seqs_list = map_in_pool(cluster_seqs,
                                    [(cluster, self.seqtype)
                                     for cluster in clusters])
        finally:
            SEQUENCE_FILTERS = None
            ORGANISM = None
            MEMBERSHIP = None
        logging.info("prepared sequences in %d ms.",
                     util.current_millis() - start_time)

        # Make the parameters, this is fast enough
        start_time = util.current_millis()
        params = []
        for cluster in clusters:
            # Pass the previous run's seed if possible
            if self.__last_motif_infos is not None:
                previous_motif_infos = self.__last_motif_infos.get(
                    cluster, None)
            else:
                previous_motif_infos = None

            seqs, feature_ids = seqs_list[cluster - 1]
            params.append(
                ComputeScoreParams(
                    iteration_result['iteration'],
                    cluster, feature_ids, seqs, self.used_seqs,
                    self.meme_runner(), min_cluster_rows_allowed,
                    max_cluster_rows_allowed, num_motifs, previous_motif_infos,
                    self.config_params.get('keep_memeout', False),
                    self.config_params['output_dir'],
                    self.config_params['num_iterations'],
                    self.config_params['debug']))

        logging.info("prepared MEME parameters in %d ms.",
                     util.current_millis() - start_time)

        # create motif result map if necessary
        for cluster in clusters:
            if cluster not in iteration_result:
                iteration_result[cluster] = {}

        # compute motif results, MOTIF_PARAMS is assigned before the pool
        # is created and is always cleaned up
        MOTIF_PARAMS = params
        self.__last_motif_infos = {}
        try:
            if use_multiprocessing:
                results = map_in_pool(compute_cluster_score, clusters)
            else:
                results = [compute_cluster_score(cluster)
                           for cluster in clusters]
        finally:
            MOTIF_PARAMS = None

        # store motif results
        for cluster, (pvalues, run_result) in zip(clusters, results):
            cluster_pvalues[cluster] = pvalues
            if run_result:
                self.__last_motif_infos[cluster] = run_result.motif_infos
            iteration_result[cluster]['motif-info'] = meme_json(run_result)
            iteration_result[cluster]['pvalues'] = pvalues

        return cluster_pvalues