コード例 #1
0
ファイル: util_test.py プロジェクト: baliga-lab/cmonkey2
 def test_quantile(self):
     """Check util.quantile at the boundary and interior probabilities.

     Uses assertEqual (assertEquals is a deprecated alias, removed in
     Python 3.12).  Also verifies that an empty input yields NaN.
     """
     data = [1, 2, 3, 4, 5]
     self.assertEqual(1, util.quantile(data, 0))
     self.assertEqual(1.8, util.quantile(data, 0.2))
     self.assertEqual(2, util.quantile(data, 0.25))
     self.assertEqual(3, util.quantile(data, 0.5))
     self.assertEqual(4, util.quantile(data, 0.75))
     self.assertEqual(5, util.quantile(data, 1))
     # an empty data set has no quantiles -> NaN
     self.assertTrue(np.isnan(util.quantile([], 0.99)))
コード例 #2
0
ファイル: scoring.py プロジェクト: baliga-lab/cmonkey2
 def compute_substitution(cluster_column_scores):
     """Calculate the substitution value for missing column scores.

     Collects the score of every column that is a member of its
     cluster and returns the 0.95 quantile of those scores.  Relies on
     `num_clusters`, `membership` and `util` from the enclosing scope.

     cluster_column_scores: list indexed by cluster - 1; each entry is
     either None or a (colnames, scores) pair of parallel sequences.
     """
     membership_values = []
     for cluster in xrange(1, num_clusters + 1):
         # build a set once per cluster: O(1) membership tests in the
         # inner loop instead of O(len(columns)) list scans
         columns = set(membership.columns_for_cluster(cluster))
         column_scores = cluster_column_scores[cluster - 1]
         if column_scores is not None:
             colnames, scores = column_scores
             # iterate names and scores in lockstep instead of indexing
             for colname, score in zip(colnames, scores):
                 if colname in columns:
                     membership_values.append(score)
     return util.quantile(membership_values, 0.95)
コード例 #3
0
 def compute_substitution(cluster_column_scores):
     """Return the 0.95 quantile of the scores of all columns that are
     members of their cluster; used as a substitute for missing column
     scores.  Reads `num_clusters`, `membership` and `util` from the
     enclosing scope."""
     collected = []
     for cluster in xrange(1, num_clusters + 1):
         cluster_columns = membership.columns_for_cluster(cluster)
         scores_entry = cluster_column_scores[cluster - 1]
         if scores_entry is None:
             continue
         colnames, scores = scores_entry
         for idx, colname in enumerate(colnames):
             if colname in cluster_columns:
                 collected.append(scores[idx])
     return util.quantile(collected, 0.95)
コード例 #4
0
 def test_quantile(self):
     """Check util.quantile at the boundary and interior probabilities.

     Uses assertEqual (assertEquals is a deprecated alias, removed in
     Python 3.12).  Also verifies that an empty input yields NaN.
     """
     data = [1, 2, 3, 4, 5]
     self.assertEqual(1, util.quantile(data, 0))
     self.assertEqual(1.8, util.quantile(data, 0.2))
     self.assertEqual(2, util.quantile(data, 0.25))
     self.assertEqual(3, util.quantile(data, 0.5))
     self.assertEqual(4, util.quantile(data, 0.75))
     self.assertEqual(5, util.quantile(data, 1))
     # an empty data set has no quantiles -> NaN
     self.assertTrue(np.isnan(util.quantile([], 0.99)))
コード例 #5
0
 def quantile(self, probability):
     """Quantile over every value contained in this matrix.

     Flattens the underlying value array and delegates the actual
     computation to util.quantile."""
     flat_values = self.values.ravel()
     return util.quantile(flat_values, probability)
コード例 #6
0
ファイル: datamatrix.py プロジェクト: sdanzige/cmonkey-python
 def quantile(self, probability):
     """Return the quantile at `probability` computed over all the
     values stored in this matrix (via util.quantile on the
     flattened array)."""
     all_values = self.values.ravel()
     result = util.quantile(all_values, probability)
     return result
コード例 #7
0
ファイル: scoring.py プロジェクト: baliga-lab/cmonkey2
def combine(result_matrices, score_scalings, membership, iteration, config_params):
    """Combine n scoring matrices into a single score matrix.

    Each input matrix is first extreme-value-fixed and shifted by its
    0.99 quantile.  Depending on config_params["quantile_normalize"],
    the matrices are then either quantile-normalized together, or the
    first matrix (assumed to be the gene expression score) is
    standardized and the others are rescaled relative to it.  Finally
    the matrices are summed, each weighted by its entry in
    score_scalings.

    Returns a dm.DataMatrix with the combined scores (row/column names
    taken from the first matrix), or None when result_matrices is empty.
    """
    quantile_normalize = config_params["quantile_normalize"]

    # pre-process every matrix in place before combining
    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if "dump_scores" in config_params["debug"] and (
            iteration == 1 or (iteration % config_params["debug_freq"] == 0)
        ):
            funs = config_params["pipeline"]["row-scoring"]["args"]["functions"]
            m.write_tsv_file(
                os.path.join(config_params["output_dir"], "score-%s-%04d.tsv" % (funs[i]["id"], iteration)),
                compressed=False,
            )

    if quantile_normalize:
        # joint quantile normalization only makes sense with > 1 matrix
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        # rsm collects, over all clusters, the matrix-0 score of each
        # row that is a member of that cluster
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1] for row in row_members])
        # robust scale estimate: MAD, falling back to R-style stddev
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            # standardize matrix 0: (values - median) / robust scale
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols, mat.row_names, mat.column_names, values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # all scale estimates were 0: leave matrix 0 untouched
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            # rescale the remaining matrices so their 0.01 quantile
            # magnitude matches that of the standardized first matrix
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    # sparse scores: 0.01 quantile is 0, try the 10th
                    # smallest value instead
                    logging.debug("SPARSE SCORES - %d attempt 1: pick from sorted values", i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug("SPARSE SCORES - %d attempt 2: pick minimum value", i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    # no usable scale found -> leave values unscaled
                    logging.debug("SPARSE SCORES - %d not normalizing!", i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(
            matrix0.num_rows, matrix0.num_columns, matrix0.row_names, matrix0.column_names, values=combined_score
        )
    else:
        return None
コード例 #8
0
ファイル: util_test.py プロジェクト: baliga-lab/cmonkey2
 def test_quantile_nan(self):
     """util.quantile on data that contains a NaN entry"""
     values = [0.2, 0.1, np.nan, 0.3]
     result = util.quantile(values, 0.01)
     self.assertAlmostEqual(0.102, result)
コード例 #9
0
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """Combine n scoring matrices into a single score matrix.

    Each input matrix is first extreme-value-fixed and shifted by its
    0.99 quantile.  Depending on config_params['quantile_normalize'],
    the matrices are then either quantile-normalized together, or the
    first matrix (assumed to be the gene expression score) is
    standardized and the others are rescaled relative to it.  Finally
    the matrices are summed, each weighted by its entry in
    score_scalings.

    Returns a dm.DataMatrix with the combined scores (row/column names
    taken from the first matrix), or None when result_matrices is empty.
    """
    quantile_normalize = config_params['quantile_normalize']

    # pre-process every matrix in place before combining
    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug']
                and (iteration == 1 or
                     (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args'][
                'functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                             compressed=False)

    if quantile_normalize:
        # joint quantile normalization only makes sense with > 1 matrix
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(
                result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        # rsm collects, over all clusters, the matrix-0 score of each
        # row that is a member of that cluster
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([
                mat.values[index_map[row], cluster - 1] for row in row_members
            ])
        # robust scale estimate: MAD, falling back to R-style stddev
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            # standardize matrix 0: (values - median) / robust scale
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows,
                                    num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # all scale estimates were 0: leave matrix 0 untouched
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            # rescale the remaining matrices so their 0.01 quantile
            # magnitude matches that of the standardized first matrix
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    # sparse scores: 0.01 quantile is 0, try the 10th
                    # smallest value instead
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values',
                        i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    # no usable scale found -> leave values unscaled
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows,
                             matrix0.num_columns,
                             matrix0.row_names,
                             matrix0.column_names,
                             values=combined_score)
    else:
        return None
コード例 #10
0
 def test_quantile_nan(self):
     """quantile is computed on data containing a NaN value"""
     samples = [0.2, 0.1, np.nan, 0.3]
     self.assertAlmostEqual(0.102, util.quantile(samples, 0.01))