def get_samples(self, typ, init, prop_sd, p, data=None, save=False):
    """
    MCMC algorithm for sampling theta and q with log(sigma)
    parametrisation, from prior or posterior.
    -----------------------------------------------------------------------
    typ:     "prior" or "post".
    init:    Initialisation for MCMC algorithm.
    prop_sd: Proposal normal standard deviations for MCMC algorithm.
    p:       [p_1, p_2, p_3].
    data:    GEVData for posterior.
    """
    # Number of iterations
    N = 501000
    # Length of burn-in
    burnin = 1000

    if typ == "post":
        self.post["data"] = data
        # Transformation M -> M*
        prior = lambda X: self.pdf(X) * data.theta_annual_det(X)
        target = lambda X: util.logpost(X, prior, data)
    else:
        target = self.pdf

    # Transformation sigma -> log(sigma)
    new_target = util.log_transform(target, util.sig_trans,
                                    lambda X: X[1])
    mcmc_obj = util.MCMCSample(new_target, init, prop_sd, N, burnin)
    mcmc_obj.results(
        para_names=[r"$\tilde{\mu}$", r"$\log\tilde{\sigma}$",
                    r"$\tilde{\xi}$"],
        colour=self.colour,
        save=save,
        save_name="%s-%s-%s-%s" % (self.inst_name, *self.hyperpara, typ))

    if typ == "post":
        # Transformation M* -> M
        sample_theta = np.array(
            [data.theta_annual(X) for X in mcmc_obj.sample])
    else:
        sample_theta = mcmc_obj.sample
    getattr(self, typ)["theta"]["sample"] = sample_theta

    # Transformation log(sigma) -> sigma
    sample_theta[:, 1] = np.exp(sample_theta[:, 1])
    sample_q = np.array(
        [util.quantile(theta, 1.0 - p) for theta in sample_theta])
    getattr(self, typ)["q"]["sample"] = sample_q
    getattr(self, typ)["mcmc"] = mcmc_obj
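# A minimal sketch of the reparametrisation helpers used above, assuming
# util.sig_trans maps X = (mu, log(sigma), xi) back to (mu, sigma, xi) and
# util.log_transform adds the log-Jacobian of sigma = exp(log(sigma)); the
# shapes are inferred from the call site, not confirmed against util itself.
import numpy as np

def sig_trans(X):
    """Map (mu, log(sigma), xi) to (mu, sigma, xi)."""
    return np.array([X[0], np.exp(X[1]), X[2]])

def log_transform(log_target, trans, log_jacobian):
    """Change of variables for a log-scale target: evaluate at the
    back-transformed point and add the log-Jacobian term (here X[1])."""
    return lambda X: log_target(trans(X)) + log_jacobian(X)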
def compute_substitution(cluster_column_scores):
    """calculate substitution value for missing column scores"""
    membership_values = []
    for cluster in xrange(1, num_clusters + 1):
        columns = membership.columns_for_cluster(cluster)
        column_scores = cluster_column_scores[cluster - 1]
        if column_scores is not None:
            colnames, scores = column_scores
            for col in xrange(len(colnames)):
                if colnames[col] in columns:
                    membership_values.append(scores[col])
    return util.quantile(membership_values, 0.95)
def test_quantile(self):
    """tests the quantile function"""
    data = [1, 2, 3, 4, 5]
    self.assertEqual(1, util.quantile(data, 0))
    self.assertEqual(1.8, util.quantile(data, 0.2))
    self.assertEqual(2, util.quantile(data, 0.25))
    self.assertEqual(3, util.quantile(data, 0.5))
    self.assertEqual(4, util.quantile(data, 0.75))
    self.assertEqual(5, util.quantile(data, 1))
    self.assertTrue(np.isnan(util.quantile([], 0.99)))
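# The expected values above pin down util.quantile's semantics: linear
# interpolation between order statistics (R type 7, numpy's default) and
# NaN for empty input. A minimal re-implementation consistent with these
# tests (a sketch, not the project's actual code):
import numpy as np

def quantile(values, probability):
    """Type-7 quantile of the finite values; NaN if nothing remains."""
    values = np.asarray(values, dtype=float).ravel()
    values = values[~np.isnan(values)]  # see test_quantile_nan below
    if values.size == 0:
        return np.nan
    return np.percentile(values, probability * 100.0)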
def compute_substitution(cluster_column_scores):
    """calculate substitution value for missing column scores"""
    membership_values = []
    for cluster in xrange(1, num_clusters + 1):
        columns = membership.columns_for_cluster(cluster)
        column_scores = cluster_column_scores[cluster - 1]
        if column_scores is not None:
            for row in xrange(column_scores.num_rows):
                for col in xrange(column_scores.num_columns):
                    if column_scores.column_names[col] in columns:
                        membership_values.append(
                            column_scores.values[row][col])
    return util.quantile(membership_values, 0.95)
def __quantile_normalize_scores(cluster_row_scores, row_names,
                                membership, num_clusters):
    """quantile normalize the row scores in cluster_row_scores
    that are not NaN or +/-Inf and are in a row cluster membership
    """
    values_for_quantile = []
    for cluster in xrange(1, num_clusters + 1):
        row_scores_for_cluster = cluster_row_scores[cluster - 1]
        cluster_rows = membership.rows_for_cluster(cluster)
        if row_scores_for_cluster is not None:
            for row in xrange(len(row_scores_for_cluster)):
                score = row_scores_for_cluster[row]
                gene_name = row_names[row]
                if np.isfinite(score) and gene_name in cluster_rows:
                    values_for_quantile.append(score)
    return util.quantile(values_for_quantile, 0.95)
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """This is the combining function, taking n result matrices and
    scalings"""
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug']
                and (iteration == 1
                     or (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug('SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]  # 10th-smallest value
                if qqq == 0:
                    logging.debug('SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
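# The standardisation above is (values - median) / MAD, falling back to an
# R-style standard deviation when the MAD is zero. A sketch of the two
# helpers, assuming they mirror R's mad() (consistency constant 1.4826)
# and sd(); the actual util implementations may differ:
import numpy as np

def mad(values):
    """Median absolute deviation, scaled as in R's mad() default."""
    values = np.asarray(values, dtype=float)
    return 1.4826 * np.median(np.abs(values - np.median(values)))

def r_stddev(values):
    """Sample standard deviation (ddof=1), as R's sd() computes it."""
    return np.std(np.asarray(values, dtype=float), ddof=1)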
def test_quantile_nan(self):
    """tests the quantile function with NaN"""
    data = [0.2, 0.1, np.nan, 0.3]
    self.assertAlmostEqual(0.102, util.quantile(data, 0.01))
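# Expected value check: dropping the NaN leaves [0.1, 0.2, 0.3] after
# sorting; with type-7 interpolation the 0.01 quantile sits at index
# h = (3 - 1) * 0.01 = 0.02, giving 0.1 + 0.02 * (0.2 - 0.1) = 0.102.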
def quantile(self, probability):
    """returns the result of the quantile function over all
    contained values"""
    return util.quantile(self.values.ravel(), probability)
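# Hypothetical usage of the wrapper above; the DataMatrix constructor
# signature is taken from the combine() snippets in this section and the
# quantile semantics from the tests (dm and np assumed imported):
m = dm.DataMatrix(2, 2, ['r1', 'r2'], ['c1', 'c2'],
                  values=np.array([[1.0, 2.0], [3.0, 4.0]]))
m.quantile(0.5)  # -> 2.5, the median of the flattened values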
def combine(result_matrices, score_scalings, membership, quantile_normalize):
    """This is the combining function, taking n result matrices and
    scalings"""
    for m in result_matrices:
        m.fix_extreme_values()

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.info("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row]][cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.info("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                #print "qqq(%d) = %f" % (i, qqq)
                if qqq == 0:
                    logging.error("very sparse score !!!")
                # NOTE: qqq can still be 0 here; the later versions of
                # combine() in this section guard this division
                values = values / qqq * abs(rs_quant)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.info("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
# Chooses optimal value of M
data = data.optimal_M(theta[2])

# Plots fit of annual maxima
data.set_obs_in_year(365)
theta_annual = data.theta_annual(theta)
data.fit_GEV(theta=theta_annual, save=save_all)

# Constructing priors #-------------------------------------------------------#
#=============================================================================#
p = np.array([0.1, 0.01, 0.001])
pi = priors.all_priors(p, qu=util.quantile(theta, 1.0 - p),
                       var=[27] * 3, name=study_name)

# MCMC sampling #-------------------------------------------------------------#
#=============================================================================#
for i in range(4):
    if pi[i].prior["proper"]:
        pi[i].get_samples(
            "prior",
            [
                [25.0, 2.0, 0.2],  # k = 3, I copula
                None,
                [25.0, 2.0, 0.0],  # k = 3, ME copula
                None
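# In the extreme-value snippets, util.quantile(theta, 1.0 - p) is a GEV
# quantile (return level) rather than an empirical quantile: theta is
# (mu, sigma, xi) and 1 - p the non-exceedance probability. A sketch of
# the assumed form (the standard GEV inverse CDF, not the project's code):
import numpy as np

def gev_quantile(theta, prob):
    """Return level z with F(z) = prob for GEV parameters theta."""
    mu, sigma, xi = theta
    y = -np.log(prob)                  # prob = 1 - p
    if np.isclose(xi, 0.0):
        return mu - sigma * np.log(y)  # Gumbel limit as xi -> 0
    return mu + sigma * (y ** (-xi) - 1.0) / xi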
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """This is the combining function, taking n result matrices and
    scalings"""
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug']
                and (iteration == 1
                     or (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.warning('SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]  # 10th-smallest value
                if qqq == 0:
                    logging.warning('SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.warning('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
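# subtract_with_quantile(0.99) is called but never defined in this section.
# A speculative sketch of what it might do, judging by the name alone:
# shift the matrix by its own 0.99 quantile so only the top 1% of scores
# stay positive (purely an assumption, not the project's actual method):
def subtract_with_quantile(self, probability):
    self.values -= util.quantile(self.values.ravel(), probability)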