Example #1
0
def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best').

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix.

    For every cluster 1..num_clusters, the rows whose first membership
    entry equals that cluster are collected into a submatrix, which is
    scored with scoring.compute_column_scores_submatrix(). The negated
    scores fill one column of a (num_columns x num_clusters) table. Each
    row of that table is then handed to util.rorder() together with
    num_clusters_per_column.

    Returns a list holding one util.rorder() result per column of
    data_matrix.
    """
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    row_names = data_matrix.row_names

    # one score column per cluster (cluster numbers are 1-based)
    cscores = np.zeros([num_cols, num_clusters])
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = [row_names[row_index]
                                for row_index in xrange(num_rows)
                                if row_membership[row_index][0] == cluster_num]
        submatrix = data_matrix.submatrix_by_name(
            row_names=current_cluster_rows)
        _, scores = scoring.compute_column_scores_submatrix(submatrix)
        cscores[:, cluster_num - 1] = -scores

    start_time = util.current_millis()
    column_members = [util.rorder(cscores[i], num_clusters_per_column)
                      for i in xrange(num_cols)]
    elapsed = util.current_millis() - start_time
    # elapsed is in milliseconds: divide (not modulo) to report seconds
    logging.debug("seed column members in %f s.", elapsed / 1000.0)
    return column_members
Example #2
0
    def __init__(self, id, organism, membership, ratios, seqtype, config_params=None):
        """Create a motif ScoringFunction for the sequence type `seqtype`.

        Delegates the common setup to scoring.ScoringFunctionBase, configures
        the MEME suite, creates the two run logs, fetches the scan sequences
        for all genes in `ratios` and builds the reverse map.

        Note: config_params is subscripted below ('MEME', 'num_iterations'),
        so the None default cannot actually be used.
        """
        scoring.ScoringFunctionBase.__init__(self, id, organism, membership,
                                             ratios, config_params=config_params)
        # attributes accessible by subclasses
        self.seqtype = seqtype
        self.__setup_meme_suite(config_params)

        meme_settings = config_params['MEME']
        total_iterations = config_params['num_iterations']
        self.num_motif_func = util.get_iter_fun(meme_settings, "nmotifs",
                                                total_iterations)

        # state carried over between iterations
        self.__last_motif_infos = None
        self.__last_iteration_result = {}
        self.all_pvalues = None
        self.last_result = None

        self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
        self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

        # sequences are looked up for the sorted list of all row names
        self.used_seqs = organism.sequences_for_genes_scan(
            sorted(ratios.row_names), seqtype=self.seqtype)

        logging.debug("building reverse map...")
        build_started = util.current_millis()
        self.reverse_map = self.__build_reverse_map(ratios)
        build_millis = util.current_millis() - build_started
        logging.debug("reverse map built in %d ms.", build_millis)

        self.__last_results = None  # caches the results of the previous meme run
Example #3
0
def compute_row_scores(membership, matrix, num_clusters, config_params):
    """Compute the row scores of every row of `matrix` for the clusters
    1, 2, .. num_clusters.

    Returns a DataMatrix with one row per row of `matrix` (same row names)
    and one column per cluster."""
    timer = util.current_millis()
    per_cluster_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, config_params)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version
    seconds = (util.current_millis() - timer) / 1000.0
    logging.debug("__compute_row_scores_for_clusters() in %f s.", seconds)

    # Rearrange the per-cluster score vectors into a table:
    # rows are indexed by gene, columns represent clusters
    timer = util.current_millis()
    score_table = np.zeros((matrix.num_rows, num_clusters))

    # columns of the table are 0-based, one per cluster
    for column in xrange(num_clusters):
        score_table[:, column] = per_cluster_scores[column]

    result = dm.DataMatrix(matrix.num_rows, num_clusters,
                           row_names=matrix.row_names,
                           values=score_table)
    seconds = (util.current_millis() - timer) / 1000.0
    logging.debug("made result matrix in %f s.", seconds)
    return result
Example #4
0
    def run_iteration(self, iteration, force=False):
        """Run a single cMonkey iteration

             Keyword arguments:
             iteration -- The iteration number to run
             force     -- Set to true to force recalculations (DEFAULT:FALSE)

             Computes row and column scores, updates the membership with
             them and, depending on the configured frequencies, writes
             results, statistics and debug dumps.
        """
        logging.info("Iteration # %d", iteration)
        iteration_result = {'iteration': iteration, 'score_means': {}}
        if force:
            rscores = self.row_scoring.compute_force(iteration_result)
        else:
            rscores = self.row_scoring.compute(iteration_result)

        # only the column scoring is timed here
        start_time = util.current_millis()
        if force:
            cscores = self.column_scoring.compute_force(iteration_result)
        else:
            cscores = self.column_scoring.compute(iteration_result)

        elapsed = util.current_millis() - start_time
        if elapsed > 0.0001:
            logging.debug("computed column_scores in %f s.", elapsed / 1000.0)

        self.membership().update(self.ratios, rscores, cscores,
                                 self['num_iterations'], iteration_result)

        # summary values for the debug log; defaults apply when the scoring
        # functions did not put anything into iteration_result
        mean_net_score = iteration_result.get('networks', 0.0)
        if 'motif-pvalue' in iteration_result:
            mean_mot_pvalues = iteration_result['motif-pvalue']
            mean_mot_pvalue = "".join(
                " '%s' = %f" % (seqtype, mean_mot_pvalues[seqtype])
                for seqtype in mean_mot_pvalues)
        else:
            mean_mot_pvalue = "NA"

        logging.debug('mean net = %s | mean mot = %s', str(mean_net_score), mean_mot_pvalue)

        # Reduce I/O: results are not written at all when minimize_io is set
        if not self['minimize_io']:
            if iteration == 1 or (iteration % self['result_freq'] == 0):
                self.write_results(iteration_result)

        # This should not be too much writing, so we can keep it OUT of minimize_io option...?
        if iteration == 1 or (iteration % self['stats_freq'] == 0):
            self.write_stats(iteration_result)
            self.update_iteration(iteration)

        if 'dump_results' in self['debug'] and (iteration == 1 or
                                                (iteration % self['debug_freq'] == 0)):
            # write complete result into a cmresults.tsv
            conn = self.__dbconn()
            path = os.path.join(self['output_dir'], 'cmresults-%04d.tsv.bz2' % iteration)
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile, iteration,
                                      self['num_clusters'], self['output_dir'])
Example #5
0
def set_config_general(config, params):
    """Process General section

    Copies the options of the 'General' section of `config` into the
    `params` dictionary. If a non-empty tmp_dir is configured, it is also
    installed as tempfile.tempdir.
    """
    # override directories
    tmp_dir = config.get('General', 'tmp_dir')
    if tmp_dir:
        tempfile.tempdir = tmp_dir

    params['output_dir'] = config.get('General', 'output_dir')
    params['cache_dir'] = config.get('General', 'cache_dir')
    params['tmp_dir'] = tmp_dir
    params['dbfile_name'] = config.get('General', 'dbfile_name')
    params['normalize_ratios'] = config.getboolean('General', 'normalize_ratios')
    params['num_iterations'] = config.getint("General", "num_iterations")
    params['start_iteration'] = config.getint("General", "start_iteration")
    params['multiprocessing'] = config.getboolean('General', 'use_multiprocessing')
    params['case_sensitive'] = config.getboolean('General', 'case_sensitive')
    # the helper takes the config object first, like the other
    # get_config_*() calls below
    params['num_cores'] = get_config_int(config, 'General', 'num_cores', None)
    params['postadjust'] = config.getboolean('General', 'postadjust')
    params['log_subresults'] = config.getboolean('General', 'log_subresults')
    params['add_fuzz'] = config.get('General', 'add_fuzz')

    # python can have large seeds, R, however has a 32 bit limit it seems
    params['random_seed'] = get_config_int(config, 'General', 'random_seed',
                                           util.current_millis() % 2147483647)
    params['stats_freq'] = config.getint('General', 'stats_frequency')
    params['result_freq'] = config.getint('General', 'result_frequency')
    params['debug_freq'] = config.getint('General', 'debug_frequency')

    # implicit parameters for compatibility
    params['use_operons'] = get_config_boolean(config, 'General', 'use_operons', True)
    params['use_string'] = get_config_boolean(config, 'General', 'use_string', True)
    params['checkratios'] = get_config_boolean(config, 'General', 'checkratios', False)
    params['organism_code'] = get_config_str(config, 'General', 'organism_code', None)
Example #6
0
    def do_compute(self, iteration_result, ref_matrix=None):
        """Compute the network scores for all genes and clusters.

        Every network is scored on its own and merged into the result
        matrix using the network's weight. The score means are stored in
        self.score_means. Neither iteration_result nor ref_matrix is read
        by this method.

        Returns the combined genes x clusters DataMatrix."""
        result = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())
        scores_by_network = {}
        for net in self.networks():
            logging.debug("Compute scores for network '%s', WEIGHT: %f",
                          net.name, net.weight)
            began = util.current_millis()
            cluster_scores = self.__compute_network_cluster_scores(net)
            scores_by_network[net.name] = cluster_scores
            self.__update_score_matrix(result, cluster_scores, net.weight)
            millis = util.current_millis() - began
            logging.debug("NETWORK '%s' SCORING TIME: %f s.",
                          net.name, (millis / 1000.0))

        # compute and store score means
        self.score_means = self.__update_score_means(scores_by_network)
        return result
Example #7
0
def get_col_density_scores(membership, col_scores):
    """Build the column density score matrix.

    The result has the same dimensions and names as col_scores. Its
    column for cluster k (1-based) is filled with the output of
    get_cc_scores() for that cluster. The bandwidth handed to
    get_cc_scores() is 1/100 of the score range, but at least 0.001."""
    num_clusters = membership.num_clusters()
    score_span = abs(col_scores.max() - col_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    cd_scores = dm.DataMatrix(col_scores.num_rows,
                              col_scores.num_columns,
                              col_scores.row_names,
                              col_scores.column_names)
    target = cd_scores.values

    started = util.current_millis()
    for column in xrange(num_clusters):
        # assign a whole column at once and let numpy do the copying;
        # clusters are 1-based, matrix columns are 0-based
        target[:, column] = get_cc_scores(membership, col_scores,
                                          bandwidth, column + 1)

    millis = util.current_millis() - started
    logging.debug("CC_SCORES IN %f s.", millis / 1000.0)
    return cd_scores
Example #8
0
def quantile_normalize_scores(matrices, weights=None):
    """Quantile normalize the score matrices against each other.

    The sorted values of all matrices are averaged rank by rank (weighted
    if `weights` is given), and the result matrices are built from these
    means by qm_result_matrices()."""

    logging.info("COMPUTING WEIGHTED MEANS...")
    timer = util.current_millis()

    # One sorted value vector per input matrix; after transposing, column j
    # holds the sorted values of matrix j
    sorted_vectors = [np.sort(m.values.flatten()) for m in matrices]
    flat_values = np.asarray(sorted_vectors).T

    millis = util.current_millis() - timer
    logging.info("flattened/sorted score matrices in %f s.", millis / 1000.0)

    timer = util.current_millis()
    if weights is None:
        tmp_mean = util.row_means(flat_values)
    else:
        # scale column j by weight j in one element-wise product and
        # normalize by the sum of the weights, leaving out NaN weights
        scaled = weights * flat_values
        usable_weights = np.ma.masked_array(weights, np.isnan(weights))
        scale = np.sum(usable_weights)
        tmp_mean = util.row_means(scaled) / scale
    millis = util.current_millis() - timer
    logging.info("weighted means in %f s.", millis / 1000.0)

    timer = util.current_millis()
    result = qm_result_matrices(matrices, tmp_mean)
    millis = util.current_millis() - timer
    logging.info("result matrices built in %f s.", millis / 1000.0)
    return result
Example #9
0
def get_row_density_scores(membership, row_scores):
    """Build the row density score matrix (getting density scores
    improves small clusters).

    The result has the same dimensions and names as row_scores. Its
    column for cluster k (1-based) is filled with the output of
    get_rr_scores() for that cluster. The bandwidth handed to
    get_rr_scores() is 1/100 of the score range, but at least 0.001."""
    num_clusters = membership.num_clusters()
    score_span = abs(row_scores.max() - row_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    rd_scores = dm.DataMatrix(row_scores.num_rows,
                              row_scores.num_columns,
                              row_scores.row_names,
                              row_scores.column_names)
    target = rd_scores.values

    started = util.current_millis()
    for column in xrange(num_clusters):
        # assign a whole column at once and let numpy do the copying;
        # clusters are 1-based, matrix columns are 0-based
        target[:, column] = get_rr_scores(membership, row_scores,
                                          bandwidth, column + 1)

    millis = util.current_millis() - started
    logging.debug("RR_SCORES IN %f s.", millis / 1000.0)
    return rd_scores
Example #10
0
    def update(self, matrix, row_scores, column_scores,
               num_iterations, iteration_result):
        """Top-level membership update.

        Runs, in this order: fuzzify, density score computation, size
        compensation, the row update and the column update. Each step is
        timed and reported on the debug log. In the last iteration the
        (potentially fuzzed) row scores are additionally pickled."""
        t0 = util.current_millis()
        row_scores, column_scores = fuzzify(self, row_scores, column_scores,
                                            num_iterations, iteration_result,
                                            self.__config_params['add_fuzz'])
        logging.debug("fuzzify took %f s.",
                      (util.current_millis() - t0) / 1000.0)

        # The post adjustment step needs the (potentially fuzzed) row
        # scores, so they are pickled, but only in the last iteration
        if iteration_result['iteration'] == num_iterations:
            with open(self.pickle_path(), 'wb') as pickle_file:
                pickle.dump(row_scores, pickle_file)

        t0 = util.current_millis()
        rd_scores, cd_scores = get_density_scores(self, row_scores,
                                                  column_scores)
        logging.debug("GET_DENSITY_SCORES() took %f s.",
                      (util.current_millis() - t0) / 1000.0)

        t0 = util.current_millis()
        compensate_size(self, matrix, rd_scores, cd_scores)
        logging.debug("COMPENSATE_SIZE() took %f s.",
                      (util.current_millis() - t0) / 1000.0)

        t0 = util.current_millis()
        update_for_rows(self, rd_scores, self.__config_params['multiprocessing'])
        logging.debug("update_for rdscores finished in %f s.",
                      (util.current_millis() - t0) / 1000.0)

        t0 = util.current_millis()
        update_for_cols(self, cd_scores, self.__config_params['multiprocessing'])
        logging.debug("update_for cdscores finished in %f s.",
                      (util.current_millis() - t0) / 1000.0)
Example #11
0
def set_config_general(config, params):
    """Process General section

    Copies the options of the "General" section of `config` into the
    `params` dictionary. If a non-empty tmp_dir is configured, it is also
    installed as tempfile.tempdir. The "command_line" option is optional
    and only copied when it can be read.
    """
    # override directories
    tmp_dir = config.get("General", "tmp_dir")
    if tmp_dir:
        tempfile.tempdir = tmp_dir

    # Only resumed or final runs should have a stored command line, so a
    # failed lookup is expected and ignored on purpose. Catch Exception
    # rather than everything, so that KeyboardInterrupt and SystemExit
    # are not swallowed here.
    try:
        params["command_line"] = config.get("General", "command_line")
    except Exception:
        pass
    params["output_dir"] = config.get("General", "output_dir")
    params["cache_dir"] = config.get("General", "cache_dir")
    params["tmp_dir"] = tmp_dir
    params["pipeline_file"] = config.get("General", "pipeline_file")
    params["dbfile_name"] = config.get("General", "dbfile_name")
    params["rsat_base_url"] = config.get("General", "rsat_base_url")
    params["rsat_features"] = config.get("General", "rsat_features")
    params["rsat_organism"] = config.get("General", "rsat_organism")
    params["rsat_dir"] = config.get("General", "rsat_dir")
    params["normalize_ratios"] = config.getboolean("General", "normalize_ratios")
    params["num_iterations"] = config.getint("General", "num_iterations")
    params["start_iteration"] = config.getint("General", "start_iteration")
    params["multiprocessing"] = config.getboolean("General", "use_multiprocessing")
    params["case_sensitive"] = config.getboolean("General", "case_sensitive")
    params["num_cores"] = get_config_int(config, "General", "num_cores", None)
    params["postadjust"] = config.getboolean("General", "postadjust")
    params["log_subresults"] = config.getboolean("General", "log_subresults")
    params["add_fuzz"] = config.get("General", "add_fuzz")

    # python can have large seeds, R, however has a 32 bit limit it seems
    params["random_seed"] = get_config_int(config, "General", "random_seed", util.current_millis() % 2147483647)
    params["stats_freq"] = config.getint("General", "stats_frequency")
    params["result_freq"] = config.getint("General", "result_frequency")
    params["debug_freq"] = config.getint("General", "debug_frequency")

    # implicit parameters for compatibility
    params["use_operons"] = get_config_boolean(config, "General", "use_operons", True)
    params["use_string"] = get_config_boolean(config, "General", "use_string", True)
    params["checkratios"] = get_config_boolean(config, "General", "checkratios", False)
    params["organism_code"] = get_config_str(config, "General", "organism_code", None)

    params["use_BSCM"] = get_config_boolean(config, "General", "use_BSCM", False)
    params["use_chi2"] = get_config_boolean(config, "General", "use_chi2", False)
Example #12
0
    def do_compute(self, iteration_result, ref_matrix):
        """Compute the set enrichment scores for all genes and clusters.

        For every set type, compute_cluster_score() is run for each cluster
        (through a process pool if multiprocessing is configured). The
        resulting scores are multiplied by the set type's weight and added
        to the result matrix. The best enriched set and its p-value per
        cluster are appended as one row to 'setEnrichment_set.csv' and
        'setEnrichment_pvalue.csv' in the output directory; all set types
        write to the same two files.

        The inputs of compute_cluster_score() are passed through module
        level globals, which are reset at the end. The canonical row
        name caches are computed once and kept.

        iteration_result -- dictionary, its 'iteration' entry labels the
                            rows written to the CSV files
        ref_matrix       -- matrix whose 10th percentile (NaNs ignored)
                            is used as the reference minimum score

        Returns the genes x clusters DataMatrix of accumulated scores.
        """
        global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS, CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
        logging.info("Compute scores for set enrichment...")
        start_time = util.current_millis()
        gene_names = self.gene_names()
        num_genes = len(gene_names)
        num_clusters = self.num_clusters()
        matrix = dm.DataMatrix(num_genes, num_clusters, gene_names)
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_SYNONYMS = self.organism.thesaurus()

        if CANONICAL_ROWNAMES is None:
            CANONICAL_ROWNAMES = {
                SET_SYNONYMS[n] if n in SET_SYNONYMS else n
                for n in self.ratios.row_names}

        if CANONICAL_ROW_INDEXES is None:
            CANONICAL_ROW_INDEXES = {}
            for index, row in enumerate(self.ratios.row_names):
                if row in SET_SYNONYMS:
                    CANONICAL_ROW_INDEXES[SET_SYNONYMS[row]] = index
                else:
                    CANONICAL_ROW_INDEXES[row] = index

        ref_min_score = np.nanpercentile(ref_matrix.values, 10.0)
        logging.info('REF_MIN_SCORE: %f', ref_min_score)

        set_filepath = os.path.join(self.config_params['output_dir'],
                                    'setEnrichment_set.csv')
        pval_filepath = os.path.join(self.config_params['output_dir'],
                                     'setEnrichment_pvalue.csv')
        iteration_label = str(iteration_result['iteration'])

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            cutoff = self.bonferroni_cutoff()
            score_params = [(cluster, cutoff, ref_min_score)
                            for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    results = pool.map(compute_cluster_score, score_params)
            else:
                results = [compute_cluster_score(p) for p in score_params]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            min_sets = []
            p_values = []
            weight = set_type.weight
            for cluster in xrange(1, num_clusters + 1):
                # store the best enriched set determined
                scores, min_set, min_pvalue = results[cluster - 1]
                min_sets.append(min_set)
                p_values.append(min_pvalue)

                for row in xrange(num_genes):
                    matrix.values[row][cluster - 1] += scores[row] * weight

            # Both files are created together; a header line with the
            # cluster numbers is only written when the set file does not
            # exist yet. The with-statement closes both files even if a
            # write fails.
            write_header = not os.path.exists(set_filepath)
            mode = 'w' if write_header else 'a'
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pv_file:
                if write_header:
                    header = ',' + ','.join(
                        [str(i) for i in xrange(1, num_clusters + 1)])
                    set_file.write(header)
                    pv_file.write(header)
                set_file.write('\n' + iteration_label + ',' +
                               ','.join([str(i) for i in min_sets]))
                pv_file.write('\n' + iteration_label + ',' +
                              ','.join([str(i) for i in p_values]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
        # cleanup
        SET_SET_TYPE = None
        SET_MATRIX = None
        SET_MEMBERSHIP = None
        SET_SYNONYMS = None

        return matrix
Example #13
0
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """This is the combining function, taking n result matrices and scalings.

    result_matrices -- list of score matrices; they are modified in place
                       (extreme values fixed, 0.99 quantile subtracted)
    score_scalings  -- one scaling factor per result matrix
    membership      -- cluster membership, only used when quantile
                       normalization is switched off
    iteration       -- current iteration number, decides whether the
                       score matrices are dumped in debug mode
    config_params   -- configuration dictionary

    Returns a DataMatrix holding the scaled sum of the (normalized) input
    matrices, named like the first matrix, or None if result_matrices is
    empty.
    """
    quantile_normalize = config_params['quantile_normalize']

    # Nothing to combine. Checking this up front also protects the
    # non-quantile path, which accesses result_matrices[0].
    if len(result_matrices) == 0:
        return None

    # debug mode: print scoring matrices before combining
    dump_scores = ('dump_scores' in config_params['debug']
                   and (iteration == 1 or
                        (iteration % config_params['debug_freq'] == 0)))

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        if dump_scores:
            funs = config_params['pipeline']['row-scoring']['args'][
                'functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                             compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(
                result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        # rsm collects the scores of the member rows of every cluster
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([
                mat.values[index_map[row], cluster - 1] for row in row_members
            ])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows,
                                    num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            # bring the remaining matrices to the scale of the first one
            # by matching their 0.01 quantiles
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values',
                        i)
                    # NOTE(review): needs at least 10 values and, unlike
                    # the other attempts, does not take the absolute value
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    start_time = util.current_millis()
    # assuming same format of all matrices
    combined_score = np.zeros(in_matrices[0].shape)
    for i, values in enumerate(in_matrices):
        combined_score += values * score_scalings[i]

    elapsed = util.current_millis() - start_time
    logging.debug("combined score in %f s.", elapsed / 1000.0)
    matrix0 = result_matrices[0]  # as reference for names
    return dm.DataMatrix(matrix0.num_rows,
                         matrix0.num_columns,
                         matrix0.row_names,
                         matrix0.column_names,
                         values=combined_score)
Example #14
0
    def do_compute(self, iteration_result, ref_matrix):
        """Compute the set enrichment scores for all genes and clusters.

        For every set type, compute_cluster_score() is run for each cluster
        (through a process pool if multiprocessing is configured). The
        resulting scores are multiplied by the set type's weight and added
        to the result matrix. The best enriched set and its p-value per
        cluster are appended as one row to 'setEnrichment_set.csv' and
        'setEnrichment_pvalue.csv' in the output directory; all set types
        write to the same two files.

        The inputs of compute_cluster_score() are passed through module
        level globals, which are reset at the end. The canonical row
        name caches are computed once and kept.

        iteration_result -- dictionary, its 'iteration' entry labels the
                            rows written to the CSV files
        ref_matrix       -- matrix whose minimum is used as the reference
                            minimum score

        Returns the genes x clusters DataMatrix of accumulated scores.
        """
        global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS, CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
        logging.info("Compute scores for set enrichment...")
        start_time = util.current_millis()
        gene_names = self.gene_names()
        num_genes = len(gene_names)
        num_clusters = self.num_clusters()
        matrix = dm.DataMatrix(num_genes, num_clusters, gene_names)
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_SYNONYMS = self.organism.thesaurus()

        if CANONICAL_ROWNAMES is None:
            CANONICAL_ROWNAMES = {
                SET_SYNONYMS[n] if n in SET_SYNONYMS else n
                for n in self.ratios.row_names}

        if CANONICAL_ROW_INDEXES is None:
            CANONICAL_ROW_INDEXES = {}
            for index, row in enumerate(self.ratios.row_names):
                if row in SET_SYNONYMS:
                    CANONICAL_ROW_INDEXES[SET_SYNONYMS[row]] = index
                else:
                    CANONICAL_ROW_INDEXES[row] = index

        ref_min_score = ref_matrix.min()
        logging.info('REF_MIN_SCORE: %f', ref_min_score)

        set_filepath = os.path.join(self.config_params['output_dir'],
                                    'setEnrichment_set.csv')
        pval_filepath = os.path.join(self.config_params['output_dir'],
                                     'setEnrichment_pvalue.csv')
        iteration_label = str(iteration_result['iteration'])

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            # compute the cutoff once per set type instead of once per
            # cluster
            cutoff = self.bonferroni_cutoff()
            score_params = [(cluster, cutoff, ref_min_score)
                            for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    results = pool.map(compute_cluster_score, score_params)
            else:
                results = [compute_cluster_score(p) for p in score_params]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            min_sets = []
            p_values = []
            weight = set_type.weight
            for cluster in xrange(1, num_clusters + 1):
                # store the best enriched set determined
                scores, min_set, min_pvalue = results[cluster - 1]
                min_sets.append(min_set)
                p_values.append(min_pvalue)

                for row in xrange(num_genes):
                    matrix.values[row][cluster - 1] += scores[row] * weight

            # Both files are created together; a header line with the
            # cluster numbers is only written when the set file does not
            # exist yet. The with-statement closes both files even if a
            # write fails.
            write_header = not os.path.exists(set_filepath)
            mode = 'w' if write_header else 'a'
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pv_file:
                if write_header:
                    header = ',' + ','.join(
                        [str(i) for i in xrange(1, num_clusters + 1)])
                    set_file.write(header)
                    pv_file.write(header)
                set_file.write('\n' + iteration_label + ',' +
                               ','.join([str(i) for i in min_sets]))
                pv_file.write('\n' + iteration_label + ',' +
                              ','.join([str(i) for i in p_values]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
        # cleanup
        SET_SET_TYPE = None
        SET_MATRIX = None
        SET_MEMBERSHIP = None
        SET_SYNONYMS = None

        return matrix
Example #15
0
    def compute_pvalues(self, iteration_result, num_motifs, force):
        """Compute motif scores.
        The result is a dictionary from cluster -> (feature_id, pvalue)
        containing a sparse gene-to-pvalue mapping for each cluster

        In order to influence the sequences
        that go into meme, the user can specify a list of sequence filter
        functions that have the signature
        (seqs, feature_ids, distance) -> seqs
        These filters are applied in the order they appear in the list.

        Parameters:
        iteration_result -- dictionary of the current iteration. Its
            'iteration' entry is read, and for every cluster the entries
            iteration_result[cluster]['motif-info'] and
            iteration_result[cluster]['pvalues'] are written
        num_motifs -- handed on unchanged to ComputeScoreParams
        force -- if true, every cluster is recomputed. Otherwise a cluster
            whose feature ids are equal to the ones stored by the previous
            call reuses the stored result

        Side effects: the last results and last motif infos kept on self
        are replaced/updated with the values of this call.
        """
        global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

        cluster_pvalues = {}
        min_cluster_rows_allowed = self.config_params['memb.min_cluster_rows_allowed']
        max_cluster_rows_allowed = self.config_params['memb.max_cluster_rows_allowed']
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

        # extract the sequences for each cluster, slow
        start_time = util.current_millis()
        # cluster_seqs() only receives (cluster, seqtype); the remaining
        # inputs are published as module globals (presumably read by
        # cluster_seqs() - it is defined outside this method)
        SEQUENCE_FILTERS = self.__sequence_filters
        ORGANISM = self.organism
        MEMBERSHIP = self.membership
        try:
            cluster_seqs_params = [(cluster, self.seqtype)
                                   for cluster in xrange(1, self.num_clusters() + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    seqs_list = pool.map(cluster_seqs, cluster_seqs_params)
            else:
                seqs_list = [cluster_seqs(p) for p in cluster_seqs_params]
        finally:
            # reset the globals even if the sequence extraction failed, so
            # that no stale references survive an exception
            SEQUENCE_FILTERS = None
            ORGANISM = None
            MEMBERSHIP = None
        logging.debug("prepared sequences in %d ms.", util.current_millis() - start_time)

        # Make the parameters, this is fast enough
        start_time = util.current_millis()
        params = {}
        for cluster in xrange(1, self.num_clusters() + 1):
            # Pass the previous run's seed if possible
            if self.__last_motif_infos is not None:
                previous_motif_infos = self.__last_motif_infos.get(cluster, None)
            else:
                previous_motif_infos = None

            # seqs_list is ordered by cluster number, starting with cluster 1
            seqs, feature_ids = seqs_list[cluster - 1]
            params[cluster] = ComputeScoreParams(iteration_result['iteration'], cluster,
                                                 feature_ids,
                                                 seqs,
                                                 self.used_seqs,
                                                 self.meme_runner(),
                                                 min_cluster_rows_allowed,
                                                 max_cluster_rows_allowed,
                                                 num_motifs,
                                                 previous_motif_infos,
                                                 self.config_params['output_dir'],
                                                 self.config_params['num_iterations'],
                                                 self.config_params['debug'])

        logging.debug("prepared MEME parameters in %d ms.",
                      util.current_millis() - start_time)

        # create motif result map if necessary
        for cluster in xrange(1, self.num_clusters() + 1):
            if cluster not in iteration_result:
                iteration_result[cluster] = {}

        # Optimization:
        # if the cluster hasn't changed since last time, reuse the last results
        # we do this by filtering out the parameters of the clusters that did not
        # change
        if not force and self.__last_results is not None:
            oldlen = len(params)
            params = {cluster: params[cluster]
                      for cluster in xrange(1, self.num_clusters() + 1)
                      if params[cluster].feature_ids != self.__last_results[cluster][0]}
            newlen = len(params)
            if oldlen - newlen > 0:
                logging.debug("%d clusters did not change !!!", oldlen - newlen)

        # compute and store motif results
        self.__last_motif_infos = {}
        if self.__last_results is None:
            self.__last_results = {}

        # results: cluster -> (pvalues, run_result), only for the clusters
        # that are still contained in params
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(compute_cluster_score, params.values())
            results = {r[0]: r[1:] for r in results}  # indexed by cluster
        else:
            results = {}
            for cluster in xrange(1, self.num_clusters() + 1):
                if cluster in params:
                    results[cluster] = compute_cluster_score(params[cluster])[1:]

        for cluster in xrange(1, self.num_clusters() + 1):
            if cluster in results:
                pvalues, run_result = results[cluster]
                self.__last_results[cluster] = (params[cluster].feature_ids,
                                                pvalues, run_result)
            else:
                # unchanged cluster: fall back to the stored result
                _, pvalues, run_result = self.__last_results[cluster]

            cluster_pvalues[cluster] = pvalues
            if run_result:
                self.__last_motif_infos[cluster] = run_result.motif_infos
            iteration_result[cluster]['motif-info'] = meme_json(run_result)
            iteration_result[cluster]['pvalues'] = pvalues

        return cluster_pvalues
Example #16
0
    def compute_pvalues(self, iteration_result, num_motifs, force):
        """Compute motif scores.
        The result is a dictionary from cluster -> (feature_id, pvalue)
        containing a sparse gene-to-pvalue mapping for each cluster

        In order to influence the sequences
        that go into meme, the user can specify a list of sequence filter
        functions that have the signature
        (seqs, feature_ids, distance) -> seqs
        These filters are applied in the order they appear in the list.

        Parameters:
        iteration_result -- dictionary of the current iteration. Its
            'iteration' entry is read, and for every cluster the entries
            iteration_result[cluster]['motif-info'] and
            iteration_result[cluster]['pvalues'] are written
        num_motifs -- handed on unchanged to ComputeScoreParams
        force -- if true, every cluster is recomputed. Otherwise a cluster
            whose feature ids are equal to the ones stored by the previous
            call reuses the stored result

        Side effects: the last results and last motif infos kept on self
        are replaced/updated with the values of this call.
        """
        global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

        cluster_pvalues = {}
        min_cluster_rows_allowed = self.config_params[
            'memb.min_cluster_rows_allowed']
        max_cluster_rows_allowed = self.config_params[
            'memb.max_cluster_rows_allowed']
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

        # extract the sequences for each cluster, slow
        start_time = util.current_millis()
        # cluster_seqs() only receives (cluster, seqtype); the remaining
        # inputs are published as module globals (presumably read by
        # cluster_seqs() - it is defined outside this method)
        SEQUENCE_FILTERS = self.__sequence_filters
        ORGANISM = self.organism
        MEMBERSHIP = self.membership
        try:
            cluster_seqs_params = [(cluster, self.seqtype)
                                   for cluster in xrange(
                                       1, self.num_clusters() + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    seqs_list = pool.map(cluster_seqs, cluster_seqs_params)
            else:
                seqs_list = [cluster_seqs(p) for p in cluster_seqs_params]
        finally:
            # reset the globals even if the sequence extraction failed, so
            # that no stale references survive an exception
            SEQUENCE_FILTERS = None
            ORGANISM = None
            MEMBERSHIP = None
        logging.debug("prepared sequences in %d ms.",
                      util.current_millis() - start_time)

        # Make the parameters, this is fast enough
        start_time = util.current_millis()
        params = {}
        for cluster in xrange(1, self.num_clusters() + 1):
            # Pass the previous run's seed if possible
            if self.__last_motif_infos is not None:
                previous_motif_infos = self.__last_motif_infos.get(
                    cluster, None)
            else:
                previous_motif_infos = None

            # seqs_list is ordered by cluster number, starting with cluster 1
            seqs, feature_ids = seqs_list[cluster - 1]
            params[cluster] = ComputeScoreParams(
                iteration_result['iteration'], cluster, feature_ids, seqs,
                self.used_seqs, self.meme_runner(), min_cluster_rows_allowed,
                max_cluster_rows_allowed, num_motifs, previous_motif_infos,
                self.config_params['output_dir'],
                self.config_params['num_iterations'],
                self.config_params['debug'])

        logging.debug("prepared MEME parameters in %d ms.",
                      util.current_millis() - start_time)

        # create motif result map if necessary
        for cluster in xrange(1, self.num_clusters() + 1):
            if cluster not in iteration_result:
                iteration_result[cluster] = {}

        # Optimization:
        # if the cluster hasn't changed since last time, reuse the last results
        # we do this by filtering out the parameters of the clusters that did not
        # change
        if not force and self.__last_results is not None:
            oldlen = len(params)
            params = {
                cluster: params[cluster]
                for cluster in xrange(1,
                                      self.num_clusters() + 1) if
                params[cluster].feature_ids != self.__last_results[cluster][0]
            }
            newlen = len(params)
            if oldlen - newlen > 0:
                logging.debug("%d clusters did not change !!!",
                              oldlen - newlen)

        # compute and store motif results
        self.__last_motif_infos = {}
        if self.__last_results is None:
            self.__last_results = {}

        # results: cluster -> (pvalues, run_result), only for the clusters
        # that are still contained in params
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(compute_cluster_score, params.values())
            results = {r[0]: r[1:] for r in results}  # indexed by cluster
        else:
            results = {}
            for cluster in xrange(1, self.num_clusters() + 1):
                if cluster in params:
                    results[cluster] = compute_cluster_score(
                        params[cluster])[1:]

        for cluster in xrange(1, self.num_clusters() + 1):
            if cluster in results:
                pvalues, run_result = results[cluster]
                self.__last_results[cluster] = (
                    params[cluster].feature_ids, pvalues, run_result)
            else:
                # unchanged cluster: fall back to the stored result
                _, pvalues, run_result = self.__last_results[cluster]

            cluster_pvalues[cluster] = pvalues
            if run_result:
                self.__last_motif_infos[cluster] = run_result.motif_infos
            iteration_result[cluster]['motif-info'] = meme_json(run_result)
            iteration_result[cluster]['pvalues'] = pvalues

        return cluster_pvalues
Example #17
0
    def run_iterations(self, start_iter=None, num_iter=None):
        """Run the iterations in range(start_iter, num_iter), followed by
        the optional post processing step.

        start_iter -- first iteration to run, defaults to
            config_params['start_iteration']
        num_iter -- exclusive upper bound of the iteration range, defaults
            to config_params['num_iterations'] + 1

        Nothing is run when config_params['interactive'] is set.
        Post processing only happens when config_params['postadjust'] is
        set; its results are stored under iteration num_iterations + 1.
        Always returns None.
        """
        if start_iter is None:
            start_iter = self.config_params['start_iteration']
        if num_iter is None:
            num_iter = self.config_params['num_iterations'] + 1

        if self.config_params['interactive']:  # stop here in interactive mode
            return

        # The post processing reports the last iteration of the range. This
        # is the value the loop variable ends with, but it is also defined
        # when the range is empty (e.g. start_iter >= num_iter), where using
        # the loop variable would fail with an unbound local.
        last_iteration = num_iter - 1

        for iteration in range(start_iter, num_iter):
            start_time = util.current_millis()
            # only the first iteration of a resumed run is forced
            force = self.config_params['resume'] and iteration == start_iter
            self.run_iteration(iteration, force=force)

            # garbage collection after everything in iteration went out of scope
            gc.collect()
            elapsed = util.current_millis() - start_time
            logging.debug("performed iteration %d in %f s.", iteration, elapsed / 1000.0)

        # run post processing after the last iteration. We store the results in
        # num_iterations + 1 to have a clean separation
        if self.config_params['postadjust']:
            logging.info("Postprocessing: Adjusting the clusters....")
            # run combiner using the weights of the last iteration

            rscores = self.row_scoring.combine_cached(self.config_params['num_iterations'])
            rd_scores = memb.get_row_density_scores(self.membership(), rscores)
            logging.info("Recomputed combined + density scores.")
            memb.postadjust(self.membership(), rd_scores)

            BSCM_obj = self.column_scoring.get_BSCM()
            if BSCM_obj is not None:
                # NOTE(review): the return value was assigned to a local that
                # was never read; presumably resplit_clusters() modifies the
                # membership in place - confirm
                BSCM_obj.resplit_clusters(self.membership(), cutoff=0.05)

            logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                         self.config_params['num_iterations'])
            iteration_result = {'iteration': self.config_params['num_iterations'] + 1,
                                'score_means': {}}

            combined_scores = self.row_scoring.compute_force(iteration_result)

            # write the combined scores for benchmarking/diagnostics
            with open(self.combined_rscores_pickle_path(), 'wb') as outfile:
                pickle.dump(combined_scores, outfile)

            self.write_results(iteration_result)
            self.write_stats(iteration_result)
            self.update_iteration(last_iteration)

            # default behaviour:
            # always write complete result into a cmresults.tsv for R/cmonkey
            # compatibility
            session = self.dbsession()
            path = os.path.join(self.config_params['output_dir'], 'cmresults-postproc.tsv.bz2')
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(session, outfile,
                                      self.config_params['num_iterations'] + 1,
                                      self.config_params['num_clusters'], self.config_params['output_dir'])

            # additionally: run tomtom on the motifs if requested
            # (both settings are compared against the string 'True')
            if (self.config_params['MEME']['global_background'] == 'True' and
                self.config_params['Postprocessing']['run_tomtom'] == 'True'):
                meme.run_tomtom(session, self.config_params['output_dir'], self.config_params['MEME']['version'])

        self.write_finish_info()
        logging.info("Done !!!!")
Example #18
0
    def run_iterations(self, start_iter=None, num_iter=None):
        """Run the iterations in range(start_iter, num_iter), followed by
        the optional post processing step.

        start_iter -- first iteration to run, defaults to
            self['start_iteration']
        num_iter -- exclusive upper bound of the iteration range, defaults
            to self['num_iterations'] + 1

        Nothing is run when config_params['interactive'] is set.
        Post processing only happens when self['postadjust'] is set; its
        results are stored under iteration num_iterations + 1.
        Always returns None.
        """
        if start_iter is None:
            start_iter = self['start_iteration']
        if num_iter is None:
            num_iter = self['num_iterations'] + 1

        if self.config_params['interactive']:  # stop here in interactive mode
            return

        # The post processing reports the last iteration of the range. This
        # is the value the loop variable ends with, but it is also defined
        # when the range is empty (e.g. start_iter >= num_iter), where using
        # the loop variable would fail with an unbound local.
        last_iteration = num_iter - 1

        for iteration in range(start_iter, num_iter):
            start_time = util.current_millis()
            # only the first iteration of a resumed run is forced
            force = self['resume'] and iteration == start_iter
            self.run_iteration(iteration, force=force)

            # garbage collection after everything in iteration went out of scope
            gc.collect()
            elapsed = util.current_millis() - start_time
            logging.debug("performed iteration %d in %f s.", iteration, elapsed / 1000.0)

            # memory profile: first iteration and then every 100th one
            if 'profile_mem' in self['debug'] and (iteration == 1 or iteration % 100 == 0):
                with open(os.path.join(self['output_dir'], 'memprofile.tsv'), 'a') as outfile:
                    self.write_mem_profile(outfile, iteration)

        # run post processing after the last iteration. We store the results in
        # num_iterations + 1 to have a clean separation
        if self['postadjust']:
            logging.info("Postprocessing: Adjusting the clusters....")
            # run combiner using the weights of the last iteration

            rscores = self.row_scoring.combine_cached(self['num_iterations'])
            rd_scores = memb.get_row_density_scores(self.membership(), rscores)
            logging.info("Recomputed combined + density scores.")
            memb.postadjust(self.membership(), rd_scores)

            BSCM_obj = self.column_scoring.get_BSCM()
            if BSCM_obj is not None:
                # NOTE(review): the return value was assigned to a local that
                # was never read; presumably resplit_clusters() modifies the
                # membership in place - confirm
                BSCM_obj.resplit_clusters(self.membership(), cutoff=0.05)

            logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                         self['num_iterations'])
            iteration_result = {'iteration': self['num_iterations'] + 1,
                                'score_means': {}}

            combined_scores = self.row_scoring.compute_force(iteration_result)

            # write the combined scores for benchmarking/diagnostics
            with open(self.combined_rscores_pickle_path(), 'wb') as outfile:
                pickle.dump(combined_scores, outfile)

            self.write_results(iteration_result)
            self.write_stats(iteration_result)
            self.update_iteration(last_iteration)

            # default behaviour:
            # always write complete result into a cmresults.tsv for R/cmonkey
            # compatibility
            conn = self.__dbconn()
            path = os.path.join(self['output_dir'], 'cmresults-postproc.tsv.bz2')
            with bz2.BZ2File(path, 'w') as outfile:
                debug.write_iteration(conn, outfile,
                                      self['num_iterations'] + 1,
                                      self['num_clusters'], self['output_dir'])
            # TODO: Why is conn never closed?  Where does it write to the db?

            # additionally: run tomtom on the motifs if requested
            # (both settings are compared against the string 'True')
            if (self['MEME']['global_background'] == 'True' and
                self['Postprocessing']['run_tomtom'] == 'True'):
                meme.run_tomtom(conn, self['output_dir'], self['MEME']['version'])

        self.write_finish_info()
        logging.info("Done !!!!")
Example #19
0
def combine(result_matrices, score_scalings, membership, iteration, config_params):
    """This is the combining function, taking n result matrices and scalings.

    Every matrix is first passed through fix_extreme_values() and
    subtract_with_quantile(0.99), then the matrices are brought onto a
    comparable scale and summed up, each weighted with its scaling.

    Parameters:
    result_matrices -- list of score matrices (objects offering values,
        row_names, column_names, num_rows, num_columns). Matrix 0 is
        assumed to be the gene expression score
    score_scalings -- weights, score_scalings[i] belongs to matrix i
    membership -- provides num_clusters() and rows_for_cluster(); only
        used when quantile normalization is switched off
    iteration -- current iteration, only used to decide on debug dumps
    config_params -- configuration; reads "quantile_normalize", "debug"
        and, when dumping scores, "debug_freq", "pipeline", "output_dir"

    Returns a dm.DataMatrix with the row/column names of matrix 0, or None
    if there are no result matrices.
    """
    quantile_normalize = config_params["quantile_normalize"]

    # nothing to combine: without this guard the non-quantile branch
    # fails on result_matrices[0]
    if len(result_matrices) == 0:
        return None

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if "dump_scores" in config_params["debug"] and (
            iteration == 1 or (iteration % config_params["debug_freq"] == 0)
        ):
            funs = config_params["pipeline"]["row-scoring"]["args"]["functions"]
            m.write_tsv_file(
                os.path.join(config_params["output_dir"], "score-%s-%04d.tsv" % (funs[i]["id"], iteration)),
                compressed=False,
            )

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        # rsm: the scores of the current cluster members only
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1] for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols, mat.row_names, mat.column_names, values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            # scale the remaining matrices so that their 1% quantile
            # matches the one of the row scores
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug("SPARSE SCORES - %d attempt 1: pick from sorted values", i)
                    # NOTE(review): unlike the other two candidates this
                    # value is not passed through abs(), so a negative entry
                    # flips the sign of the scaled scores - confirm intent
                    sorted_values = sorted(values.ravel())
                    # fewer than 10 entries: leave qqq at 0, so that
                    # attempt 2 takes over instead of an IndexError
                    if len(sorted_values) > 9:
                        qqq = sorted_values[9]
                if qqq == 0:
                    logging.debug("SPARSE SCORES - %d attempt 2: pick minimum value", i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug("SPARSE SCORES - %d not normalizing!", i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i, values in enumerate(in_matrices):
            combined_score += values * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(
            matrix0.num_rows, matrix0.num_columns, matrix0.row_names, matrix0.column_names, values=combined_score
        )
    else:
        return None