def pointbiserialr_dataframe(df, x, y, columns, p=0.05):
    """
    Parameters:
    -----------
    df      : the dataframe source of data                      : DataFrame : :
    x       : column with binary data                           : str       : :
    y       : list of columns with numeric data                 : list[str] : :
    columns : labels for the rows of the result, one per y item : list[str] : :
    p       : p-value threshold for significance                : float     : :

    Description:
    ------------
    Generates a list of point-biserial r coefficients and accompanying
    p-values for a **binary** variable and numeric variables. This correlation
    test assumes that the binary variable is _naturally_ binary, _not_
    artificially binary, i.e. pass/fail.

    Null Hypothesis:
    ----------------
    The variables are independent.

    Returns:
    --------
    A dataframe with one row per entry of `columns` and three columns: the
    point-biserial r coefficient (a float from -1 to 1), the corresponding
    p-value, and the significance of the p-value as the string "True" or
    "False". Both the coefficient and p-value are rounded to 5 decimal
    places; significance is judged on the rounded p-value.
    """
    pbr_coef = []
    pbr_pval = []
    for col in y:
        # Run the test once per column and reuse both outputs.
        result = pointbiserialr(x=df[x], y=df[col])
        pbr_coef.append(round(result[0], 5))
        pbr_pval.append(round(result[1], 5))

    pval_sig = ["True" if pval < p else "False" for pval in pbr_pval]

    pbr_dataframe = pd.DataFrame(
        [pbr_coef, pbr_pval, pval_sig],
        index=["Coefficient.", "P-Value", "Significant"],
        columns=columns).T
    return pbr_dataframe
def correlation_test():
    """Print several equivalent correlation measures for the Ivan survey data.

    Reads 'data/Ivan_common.csv' and prints the row count followed by
    Pearson, point-biserial, t-test and phi/Matthews statistics for a few
    hard-coded column pairs. Returns nothing.
    """
    fp = 'data/Ivan_common.csv'
    df = pd.read_csv(fp)
    print(len(df))

    print(stats.pearsonr(df['coast_dist'], df['elevation']))

    ###########################################################################
    # categorical vs continuous
    # point-biserial, https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm#POINTB
    # https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
    # same as using stats.pearsonr
    print(stats.pointbiserialr(df['evac'], df['coast_dist']))
    print(stats.pearsonr(df['coast_dist'], df['evac']))

    # t-test has the same p-value as point-biserial,
    # http://web.pdx.edu/~newsomj/da1/ho_correlation%20t%20phi.pdf
    evac_yes = df[df['evac'] == 1]['coast_dist']
    evac_no = df[df['evac'] == 0]['coast_dist']
    print(stats.ttest_ind(evac_yes, evac_no))

    ###########################################################################
    # categorical vs categorical
    # phi coefficient, special case of Cramer's V
    # https://en.wikipedia.org/wiki/Phi_coefficient
    # phi is computed using chi-square statistic
    # https://en.wikipedia.org/wiki/Matthews_correlation_coefficient, same as phi coefficient
    # same as using stats.pearsonr
    print(metrics.matthews_corrcoef(df['ht_mobile'], df['evac']))
    print(stats.pearsonr(df['ht_mobile'], df['evac']))
def create_categorical_feature_regression(y, fraction=0.2, seed=None, verbose=False):
    """
    Create synthetic categorical column, strongly correlated with regression target.

    The target is discretized into 5 equal-width ordinal bins with
    KBinsDiscretizer, so every value of the new column is the bin index of the
    corresponding target value. Then a fraction of values is permuted, to
    reduce the correlation. Point biserial correlation is used to measure
    association.

    Parameters
    ---------
    y : np.ndarray, target vector
    fraction : float (default=0.2), fraction of values to be permuted to reduce the correlation
    seed : int (default=None), random seed that can be specified to obtain deterministic behaviour
    verbose : bool (default=False), when True, print correlation before and after the shuffling

    Returns
    ----------
    new_column : np.ndarray, new feature vector
    corr : float, correlation of new feature vector with target vector
    """
    if seed is not None:
        # Seeds NumPy's global generator, which all draws below use.
        np.random.seed(seed)

    binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    new_column = binner.fit_transform(y.reshape(-1, 1)).ravel()

    if verbose:
        initial_corr, initial_p = pointbiserialr(new_column, y)
        print(
            f'Initial new feature - target point biserial correlation, without shuffling: {round(initial_corr, 3)}, p: {round(initial_p, 3)}'
        )

    n_samples = len(y)
    n_permuted = int(fraction * n_samples)
    # Choose which samples to permute
    indices = np.random.choice(range(n_samples), n_permuted, replace=False)
    # Find the new order of these samples
    new_order = np.random.permutation(len(indices))
    new_column[indices] = new_column[indices][new_order]

    corr, p = pointbiserialr(new_column, y)
    if verbose:
        print(
            f'New feature - target point biserial correlation, after shuffling: {round(corr, 3)}, p: {round(p, 3)}'
        )

    return new_column, corr
def create_numerical_feature_classification(y, a=10, b=5, fraction=0.2, seed=None, verbose=False):
    """
    Create synthetic numerical column, strongly correlated with binary classification target.
    Each value is calculated according to the formula:
        v = y * a + random(-b, b)
    So it is the scaled target value with some noise.
    Then a fraction of values is permuted, to reduce the correlation.

    Point biserial correlation is used to measure association.

    Parameters
    ---------
    y : np.ndarray, target vector
    a : int or float (default=10), scaling factor in a formula above
    b : int or float (default=5), value that determines the range of noise to be added
    fraction : float (default=0.2), fraction of values to be permuted to reduce the correlation
    seed : int (default=None), random seed that can be specified to obtain deterministic behaviour
    verbose : bool (default=False), when True, print correlation before and after the shuffling

    Returns
    ----------
    new_column : np.ndarray, new feature vector
    corr : float, correlation of new feature vector with target vector
    """
    if seed is not None:
        # Seeds NumPy's global generator, which all draws below use.
        np.random.seed(seed)

    new_column = y * a + np.random.uniform(low=-b, high=b, size=len(y))

    if verbose:
        initial_corr, initial_p = pointbiserialr(new_column, y)
        print(
            f'Initial new feature - target point biserial correlation, without shuffling: {round(initial_corr, 3)}, p: {round(initial_p, 3)}'
        )

    # Choose which samples to permute
    indices = np.random.choice(range(len(y)), int(fraction * len(y)), replace=False)
    # Find the new order of these samples
    shuffled_indices = np.random.permutation(len(indices))
    new_column[indices] = new_column[indices][shuffled_indices]

    corr, p = pointbiserialr(new_column, y)
    if verbose:
        # Report the p-value of the shuffled column, not the pre-shuffle one.
        print(
            f'New feature - target point biserial correlation, after shuffling: {round(corr, 3)}, p: {round(p, 3)}'
        )

    return new_column, corr
def compute_test():
    """Print the point-biserial correlation for two hard-coded samples.

    The same numeric sequence is correlated with two different boolean
    sequences; one correlation is printed per line.
    """
    bug_counts = [5, 5, 25, 25, 30, 30, 80, 80]
    score_flag_cases = (
        [False, False, True, True, True, True, True, True],
        [False, False, False, False, False, False, True, True],
    )
    for score_flags in score_flag_cases:
        outcome = pointbiserialr(bug_counts, score_flags)
        print(outcome.correlation)
def calc_corr(x, y, x_datatype, y_datatype, x_label, y_label):
    """
    Calculate the correlation of two arrays x and y

    Parameters
    ----------
    x : array with vals
    y : array with vals
    x_datatype: binary, discrete or continous
    y_datatype: binary, discrete or continous
    x_label: unused, kept for interface compatibility
    y_label: unused, kept for interface compatibility

    Returns
    -------
    dict with pval, rval and method. All three are None when the datatype
    combination is not recognised. For 'pointbiserial' the p-value is a
    string in scientific notation, for 'cramers_v' it is the string 'None',
    and for 'pearson' it is the numeric p-value.
    """
    from scipy import stats

    rval = None
    pval = None
    method = None

    categorical = ['binary', 'discrete']
    x_cat = x_datatype in categorical
    y_cat = y_datatype in categorical
    x_cont = x_datatype == 'continous'
    y_cont = y_datatype == 'continous'

    if (y_cat and x_cont) or (y_cont and x_cat):
        # Point biserial correlation; run the test once and reuse both outputs.
        result = stats.pointbiserialr(x, y)
        rval = result[0]
        method = 'pointbiserial'
        # format for scientific notation
        pval = "{:.2e}".format(result[1])
    elif y_cat and x_cat:
        # Cramér's V on the contingency table of the two categorical variables.
        rval = cramers_v(pd.crosstab(x, y).to_numpy())
        pval = 'None'  # not yet implemented
        method = 'cramers_v'
    elif y_cont and x_cont:
        result = stats.pearsonr(x, y)
        rval = result[0]
        pval = result[1]
        method = 'pearson'

    return {'rval': rval, 'pval': pval, 'method': method}
def compute_group_importance(pipelines, scores, up_to_k=5):
    """Rank groups of primitives by how strongly their joint use tracks score.

    For every combination of 1..up_to_k primitives, the product of the
    selected columns of the primitive matrix is correlated (point-biserial)
    with `scores`; a NaN correlation counts as 0. Only groups of two or more
    primitives whose absolute importance is strictly greater than that of
    every proper sub-group are kept.

    Returns a list of {'importance', 'group'} dicts sorted by descending
    absolute importance.
    """
    # presumably a 0/1 usage matrix (pipelines x primitives) - confirm against
    # extract_primitive_matrix
    primitive_matrix, primitives = extract_primitive_matrix(pipelines)
    position_of = {name: pos for pos, name in enumerate(primitives)}

    importances = {}
    for group_size in range(1, up_to_k + 1):
        for group in combinations(primitives, group_size):
            col_positions = np.array([position_of[name] for name in group])
            used_all = np.prod(primitive_matrix[:, col_positions], axis=1)
            corr, _ = pointbiserialr(used_all, scores)
            importances[frozenset(group)] = 0 if np.isnan(corr) else corr

    # keeping only the ones for which importance is greater than those of its
    # component parts
    kept = []
    for group, importance in importances.items():
        if len(group) <= 1:
            continue
        proper_subgroups = chain.from_iterable(
            combinations(group, take) for take in range(1, len(group)))
        dominated = any(
            abs(importances[frozenset(subgroup)]) >= abs(importance)
            for subgroup in proper_subgroups)
        if not dominated:
            kept.append({'importance': importance, 'group': list(group)})

    return sorted(kept, key=lambda entry: abs(entry['importance']), reverse=True)
def correlation(data):
    """Rank features by correlation with 'income' and plot selection scores.

    Every column other than 'income' is correlated with 'income': Spearman
    when the column has at most two distinct values, point-biserial
    otherwise. Features are sorted by absolute correlation, then a decision
    tree is cross-validated (10 folds) on growing prefixes of that ranking
    and the mean scores are plotted.

    Returns a Bunch with the raw correlations (data_corr), the feature names
    (data_par), the sorted correlation frame (data_df_corr) and the target
    column used in the last cross-validation round (target).

    Raises ValueError when fewer than two features are available, because no
    cross-validation round can be run.
    """
    param = []
    corr_values = []
    abs_corr = []
    for c in data.columns:
        if c == "income":
            continue
        # Check if binary or continuous
        if len(data[c].unique()) <= 2:
            corr = spearmanr(data['income'], data[c])[0]
        else:
            corr = pointbiserialr(data['income'], data[c])[0]
        param.append(c)
        corr_values.append(corr)
        abs_corr.append(abs(corr))

    # Create dataframe for visualization
    param_df = pd.DataFrame({
        'correlation': corr_values,
        'parameter': param,
        'abs_corr': abs_corr
    })
    # Sort by absolute correlation
    param_df = param_df.sort_values(by=['abs_corr'], ascending=False)
    # Set parameter name as index
    param_df = param_df.set_index('parameter')

    scores = []
    target = None
    for i in range(1, len(param_df)):
        # Top i + 1 features, in order of correlation importance
        new_df = data[param_df.index[0:i + 1].values]
        X = new_df.iloc[:, 1::]
        # NOTE(review): 'income' is excluded from param_df, so the first
        # column here is the highest-ranked feature, not 'income'. This looks
        # unintended - confirm which column the classifier should predict.
        target = new_df.iloc[:, 0]
        clf = DecisionTreeClassifier()
        scoreCV = cross_val_score(clf, X, target, cv=10)
        scores.append(np.mean(scoreCV))

    if target is None:
        raise ValueError(
            "correlation() needs at least two feature columns besides 'income'")

    plt.figure(figsize=(15, 5))
    plt.plot(range(1, len(scores) + 1), scores, '.-')
    plt.axis("tight")
    plt.title('Feature Selection', fontsize=14)
    plt.xlabel('# Features', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.grid()

    return Bunch(data_corr=corr_values,
                 data_par=param,
                 data_df_corr=param_df.copy(),
                 target=target.copy())
def bin_accuracy_scores_prob(y_true, y_prob):
    """
    A function to calculate accuracy measures for probabilistic responses with sklearn and scipy.
    Function written by Osian Roberts.

    :param y_true: binary class labels, where 0 is absence and 1 is presence.
    :param y_prob: probability of presence scores e.g., generated by a species distribution model.
    :returns: a list containing two arrays - metrics = names of test metrics. scores = test scores for each metric.

    Input problems (wrong dimensionality, unequal sizes, labels that are not
    0/1 after casting to uint8) terminate via SystemExit.

    Useful reference:
    https://machinelearningmastery.com/how-to-score-probability-predictions-in-python
    """
    import numpy

    # check inputs:
    if not isinstance(y_true, numpy.ndarray):
        y_true = numpy.array(y_true)
    if not isinstance(y_prob, numpy.ndarray):
        y_prob = numpy.array(y_prob)

    shape_checks = (
        (y_true.ndim != 1,
         'ERROR: the true labels are not in a 1D array.'),
        (y_prob.ndim != 1,
         'ERROR: the probability of presence values are not in a 1D array.'),
        (y_true.size != y_prob.size,
         'ERROR: unequal number of binary labels and probabilities.'),
    )
    for failed, message in shape_checks:
        if failed:
            raise SystemExit(message)

    # ensure that y_true contains binary labels (i.e. 0 or 1 values):
    y_true = y_true.astype('uint8')
    has_both_labels = numpy.min(y_true) == 0 and numpy.max(y_true) == 1
    if not has_both_labels:
        raise SystemExit('ERROR: the true labels are not binary (zero or one values).')

    # Imported only after validation so bad input fails before anything heavy loads.
    from sklearn.metrics import (average_precision_score, brier_score_loss,
                                 log_loss, roc_auc_score)
    from scipy.stats import pointbiserialr

    # Area under the receiver operating curve. A score of 0.5 shows the model
    # is unable to discriminate between presence and absence.
    roc_auc = roc_auc_score(y_true, y_prob)

    # Area under the precision-recall curve. Perfect model = 1.0.
    average_precision = average_precision_score(y_true, y_prob)

    # Quadratic loss: the mean squared error between predicted probabilities
    # and the true presence-absence (binary) labels. A perfect model scores 0.0.
    brier_score = brier_score_loss(y_true, y_prob)

    # Logarithmic loss penalises false positives/negatives more heavily than
    # the Brier score. A perfect model scores 0.0; there is no upper bound.
    log_loss_score = log_loss(y_true, y_prob)

    # The point biserial correlation coefficient, range -1 to 1.
    # Quantifies the correlation between a binary and continuous variable.
    r = pointbiserialr(y_true, y_prob)[0]

    metrics = ['Test AUC', 'Point-Biserial r', 'Av. Precision', 'Brier Score', 'Log-Loss Score']
    raw_scores = [roc_auc, r, average_precision, brier_score, log_loss_score]
    scores = numpy.array(raw_scores).round(decimals=6)
    return [metrics, scores]
def slicing_analysis(df, n, scoring_columns=None):
    """Plot metric/label correlation against lexical similarity per slice.

    The dataframe is split row-wise into `n` consecutive slices. For every
    slice the mean of the 'rouge1_stopwords:False' column is recorded and,
    for each metric in `scoring_columns`, the point-biserial correlation
    between 'Label' and that metric. One scatter plot per metric is shown.

    :param df: dataframe with 'rouge1_stopwords:False', 'Label' and metric columns
    :param n: number of slices, passed to np.array_split
    :param scoring_columns: iterable of metric column names; None means no
        metrics, in which case nothing is plotted
    """
    metrics = [] if scoring_columns is None else list(scoring_columns)
    df_splits = np.array_split(df, n)

    # One correlation score per slice, per metric
    pbs_scores = {metric: [] for metric in metrics}
    # Average lexical similarity (ROUGE-1 column) of each slice
    lexical_sim_all = []

    for chunk in df_splits:
        lexical_sim_all.append(chunk['rouge1_stopwords:False'].mean())
        for metric in metrics:
            score = stats.pointbiserialr(chunk['Label'], chunk[metric])[0]
            pbs_scores[metric].append(score)

    # Plot the correlation scores as an effect of lexical similarity
    for metric in metrics:
        sbs.set_style(style='darkgrid')
        sbs.scatterplot(x=lexical_sim_all, y=pbs_scores[metric])
        plt.xlabel('Lexical Similarity (ROUGE-1 with stopwords)')
        plt.ylabel(metric)
        plt.show()
def pointBiserialCorrelation(x, y):
    '''
    Calculates the point-biserial correlation coefficients between the
    NUMERICAL features and the CATEGORICAL label, prints them, and writes
    them to "point-biserial-corrs.csv" via writeFrameToCsv.

    :param x (pandas.DataFrame): The numerical features
    :param y (pandas.DataFrame): The categorical label
    '''
    rows = []
    for column in x:
        # calculate the coefficients
        res = stats.pointbiserialr(x[column], y)
        rows.append({
            "Feature": column,
            "Coefficient": res[0],
            "p-value": res[1],
            # additionally the variance of the feature
            "Variance": x[column].var(),
        })
        print(f"{column}: Correlation = {res[0]}, pvalue = {res[1]}")

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    corrs = pd.DataFrame(
        rows, columns=["Feature", "Coefficient", "p-value", "Variance"])

    # write the results to the file
    writeFrameToCsv(corrs, "point-biserial-corrs.csv")
def get_corr_coef(self, method):
    """
    Compute correlation coefficients between attribute columns and return the
    strongly correlated pairs.

    :param method: which coefficient to use (Pearson, Spearman, Kendall as
        accepted by DataFrame.corr); additionally 'coef_determination'
        (squared Pearson) and 'pointbiserialr'
    :return: list of (i, j, value) tuples, e.g. (1, 2, 0.9) means attributes
        1 and 2 have a coefficient of 0.9. Pairs are reported when the
        coefficient exceeds 0.8, or when its square exceeds 0.64 for
        'coef_determination'.
    """
    use_r_squared = method == 'coef_determination'
    if use_r_squared:
        method = 'pearson'

    column_pairs = [(i, j)
                    for i in range(self.attr_num)
                    for j in range(i + 1, self.attr_num)]
    attr_relate = []

    if method == 'pointbiserialr':
        for i, j in column_pairs:
            coef, _ = stats.pointbiserialr(self.dfattr.iloc[:, i],
                                           self.dfattr.iloc[:, j])
            if coef > 0.8:
                attr_relate.append((i, j, coef))
        return attr_relate

    corr_mat = self.dfattr.corr(method=method)
    for i, j in column_pairs:
        coef = corr_mat.iloc[i, j]
        squared = coef * coef
        if not squared > 0.64:
            continue
        if use_r_squared:
            attr_relate.append((i, j, squared))
        elif coef > 0.8:
            attr_relate.append((i, j, coef))
    return attr_relate
def correlation():
    """Print correlations of selected columns with 'Business_Sourced'.

    Reads 'dataset/train_new.csv', prints its summary statistics, then for
    each listed column prints the Spearman correlation (columns with at most
    12 distinct values) or the point-biserial correlation (all others) with
    'Business_Sourced'. Finally prints the covariance list, which stays
    empty because covariance collection is disabled.
    """
    df = pd.read_csv("dataset/train_new.csv")
    print(df.describe())

    param = []
    corr_values = []
    abs_corr = []
    covariance = []
    columns = [
        "Applicant_Gender", "App_age", "Applicant_Occupation",
        "Applicant_Qualification", "Manager_age", "Manager_Status",
        "Manager_Gender", "Manager_Business", "Manager_Business2",
        "Manager_Num_Application"
    ]
    for c in columns:
        # Check if low-cardinality or continuous
        if len(df[c].unique()) <= 12:
            corr = spearmanr(df['Business_Sourced'], df[c])[0]
            print("spear", c, corr)
        else:
            corr = pointbiserialr(df['Business_Sourced'], df[c])[0]
            print("point", c, corr)
        param.append(c)
        corr_values.append(corr)
        abs_corr.append(abs(corr))
    print(covariance)
def pointbiserialcorr(s1, s2):
    """
    Calculate the mean point biserial correlation of the RTDs of the two
    given solvers on all instances of the experiment. Only consider values
    where the statistical significance is large enough
    (p-value < alpha = 0.05).

    Returns 0 when no instance shows a significant difference.
    """
    from scipy import stats

    alpha = 0.05  # level of statistical significant difference
    significant_rs = []
    for instance in instance_ids:
        res1 = solver_config_results[s1.idSolverConfig][instance]
        res2 = solver_config_results[s2.idSolverConfig][instance]
        # Membership indicator: 1 for results of s1, 0 for results of s2
        membership = [1] * len(res1) + [0] * len(res2)
        ranked_data = list(rankdata(res1 + res2))
        r, p = stats.pointbiserialr(membership, ranked_data)
        # only take instances with significant differences into account
        if p < alpha:
            significant_rs.append(r)

    if not significant_rs:
        return 0  # s1 == s2
    return sum(significant_rs, 0.0) / len(significant_rs)  # mean difference
def pointbiserialr(self, dataset, specName=0):
    """Correlate every feature column with the factorized `specName` column.

    The `specName` column is encoded as integer codes starting at 1 and
    stored in a 'code' column. Note that this column is added to the
    caller's `dataset` in place. Each remaining column is paired with
    'code', rows with missing values are dropped, and the point-biserial
    correlation is computed. NaN and infinite results are skipped.

    :return: (cats, [corrs, sizes]) where cats are the kept column names,
        corrs is ['correlation', r1 * 100, ...] and sizes is
        ['size', n1, ...] (row counts after dropping missing values).
    """
    dataset['code'] = pd.factorize(dataset[specName])[0] + 1
    dataset = dataset.drop([specName], axis=1)

    sizes, corrs, cats = ['size'], ['correlation'], []
    feature_cols = [col for col in dataset.columns if col != 'code']
    for col in feature_cols:
        pair = dataset.filter([col, 'code'], axis=1).dropna()
        features = pair.iloc[:, 0].values
        labels = pair.iloc[:, 1].values
        coefficient = stats.pointbiserialr(features, labels).correlation
        if not math.isfinite(coefficient):
            continue
        sizes.append(len(features))
        corrs.append(coefficient * 100)
        cats.append(col)
    return cats, [corrs, sizes]
def test(self, use_saved_embeddings=True):
    """Evaluate the saved ADEM model on the configured test data.

    Loads the model, obtains the test embeddings, optionally applies PCA,
    rescales predictions and labels with (value - 1) / 4, writes the
    predictions for positive and negative examples to text files in the
    experiment folder, and prints accuracy, the confusion matrix and the
    point-biserial correlation between labels and predictions.
    """
    self.load('%s/adem_model.pkl' % self.config['exp_folder'])
    test_fname_embeddings = '%s/test_%s_embeddings.pkl' % (
        self.config['vhrd_data'], self.config['mode'])
    test_x, test_y = self.get_vhrd_embeddings(
        self.config['test_data'], self.config['mode'],
        test_fname_embeddings, use_saved_embeddings)
    if self.config['use_pca']:
        test_x = self._apply_pca(test_x)

    raw_outputs = np.array(self._get_outputs(test_x))
    predictions = (raw_outputs - 1) / 4.0
    test_y = (np.array(test_y) - 1) / 4

    # NOTE(review): only the negative file name carries the mode prefix -
    # confirm whether the positive file should match.
    positive_path = os.path.join(self.config['exp_folder'],
                                 'positive_probs.npy')
    np.savetxt(positive_path, predictions[test_y == 1])
    negative_path = os.path.join(
        self.config['exp_folder'],
        '{}_negative_probs.npy'.format(self.config['mode']))
    np.savetxt(negative_path, predictions[test_y == 0])

    acc = accuracy_score(test_y, predictions > 0.5)
    print('Accuracy: ', acc)
    matrix = confusion_matrix(test_y, predictions > 0.5)
    print('confusion_matrix: ', matrix)
    pbc, pval = pointbiserialr(test_y, predictions)
    print('PBC: ', pbc, 'p-value: ', pval)
    sys.stdout.flush()
    return
def assign_functions(k, clust, splits, act, dtm, lexicon, list_lens=range(5, 26)):
    """Build, per cluster, the list of words most associated with it.

    For each cluster 1..k the training-set activation of its structures is
    averaged into a centroid, every word in `lexicon` is correlated
    (point-biserial) with that centroid, and the positively correlated
    words are kept in descending order, at most max(list_lens) of them.

    :param k: number of clusters (cluster labels are 1..k)
    :param clust: dataframe with "CLUSTER" and "STRUCTURE" columns
    :param splits: mapping with a "train" entry of row labels
    :param act: dataframe of activations, indexed by the same row labels,
        with one column per structure
    :param dtm: document-term dataframe, indexed by the same row labels,
        with one column per word
    :param lexicon: iterable of words (columns of `dtm`)
    :param list_lens: candidate list lengths; only the maximum is used here
    :return: dataframe with "CLUSTER", "TOKEN" and "R" columns (empty when
        k is 0)
    """
    from scipy.stats import pointbiserialr

    max_len = max(list_lens)
    train = splits["train"]
    frames = []
    for i in range(k):
        structures = list(clust.loc[clust["CLUSTER"] == i + 1, "STRUCTURE"])
        centroid = np.mean(act.loc[train, structures], axis=1)
        R = pd.Series(
            [pointbiserialr(dtm.loc[train, word], centroid)[0] for word in lexicon],
            index=lexicon)
        R = R[R > 0].sort_values(ascending=False).iloc[:max_len]
        frames.append(pd.DataFrame({
            # Sized by the tokens actually kept, which can be fewer than
            # max_len when few words correlate positively.
            "CLUSTER": [i + 1] * len(R),
            "TOKEN": R.index,
            "R": R.values
        }))

    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(frames)
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a point-biserial test between the single explanatory and the
    single explained variable of `combined_data`.

    The explained variable is selected from `dataset` once per category of
    the explanatory variable, and the first two selections are passed to
    scipy's pointbiserialr. The first prediction (unwrapped when it is a
    list) is attached to the returned TestResult; dof is always None.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x, y = xs[0], ys[0]

    # One selection of the explained variable per category of x
    data = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{category}'"])
        for category, _ in x.metadata[categories].items()
    ]

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    t_stat, p_val = stats.pointbiserialr(data[0], data[1])
    return TestResult(name=pointbiserial_name,
                      test_statistic=t_stat,
                      p_value=p_val,
                      prediction=prediction,
                      dof=None,
                      alpha=combined_data.alpha)
def best_relationship_class(self):
    """Find the variable most strongly correlated with the class label.

    The label is the last column of `data_loader.cleaned`; the candidate
    variables are all but the last column of `data_loader.scaled`. Strength
    is judged by the absolute point-biserial correlation, so strong negative
    relationships count as much as positive ones. NaN correlations are never
    selected unless no finite one exists, in which case variable 0 is
    reported.

    :return: (relationships_array, best_row, best_index, best_name) where
        relationships_array has one (correlation, p-value) row per variable
        and best_row is the row of the selected variable.
    :raises ValueError: when there are no variables to compare.
    """
    label = self.data_loader.cleaned[:, -1]
    variables_data = self.data_loader.scaled[:, :-1]
    relationships_array = np.array([
        stats.pointbiserialr(variable, label) for variable in variables_data.T
    ])
    variables_names = self.data_loader.columns[:-1]

    if len(relationships_array) == 0:
        raise ValueError('no variables available to correlate with the class')

    best_var = 0
    best_abs = -1.0
    for var, row in enumerate(relationships_array):
        # Compare magnitudes with magnitudes; the signed value is only
        # used for reporting.
        if abs(row[0]) > best_abs:
            best_abs = abs(row[0])
            best_var = var
    variable_max = relationships_array[best_var]
    max_value = variable_max[0]

    print('{} presents a Point Biserial Correlation of {} with the Class'
          ' variable. (p-value={})'.format(variables_names[best_var],
                                           max_value, str(variable_max[1])))
    return (relationships_array, variable_max, best_var,
            variables_names[best_var])
def _compute_correlative_all(x, y, xa, ya, method):
    """Pick and run the correlation measure that fits the dtypes of x and y.

    x, y are pd.Series; xa, ya are the preprocessed numpy arrays. `method`
    is only consulted when both variables are continuous.

    :return: (r, pval, outliers); outliers is an empty list unless the
        'shepherd' or 'skipped' method produced some.
    :raises ValueError: unknown method for a continuous pair.
    :raises TypeError: dtype combination that no branch handles.
    """
    outliers = []

    if _both_continuous(x, y):
        # Lambdas keep name resolution lazy: only the chosen estimator is touched.
        continuous_methods = {
            "pearson": lambda: pearsonr(xa, ya),
            "spearman": lambda: spearmanr(xa, ya),
            "kendall": lambda: kendalltau(xa, ya),
            "percbend": lambda: percbend(xa, ya),
            "shepherd": lambda: shepherd(xa, ya),
            "skipped": lambda: skipped(xa, ya, method="spearman"),
        }
        if method not in continuous_methods:
            raise ValueError("Method not recognized.")
        outcome = continuous_methods[method]()
        if method in ("shepherd", "skipped"):
            r, pval, outliers = outcome
        else:
            r, pval = outcome
    elif _both_integers(x, y):
        # handle the integer-integer use case.
        r, pval = spearmanr(xa, ya)
    elif _continuous_categorical(x, y):
        # correlation ratio [0, 1]
        r, pval = corr_ratio(xa, ya)
    elif _categorical_continuous(x, y):
        # correlation ratio [0, 1], arguments swapped
        r, pval = corr_ratio(ya, xa)
    elif _both_categorical(x, y):
        # both categories (strings): kramer's v for
        # categorical-categorical [0, 1]
        r, pval = kramers_v(x, y, True)
    elif _continuous_bool(x, y):
        # argument order matters here
        r, pval = pointbiserialr(xa, ya.astype(np.uint8))
    elif _bool_continuous(x, y):
        # argument order matters here
        r, pval = pointbiserialr(xa.astype(np.uint8), ya)
    elif _both_bool(x, y):
        # use spearman
        r, pval = spearmanr(xa.astype(np.uint8), ya.astype(np.uint8))
    else:
        raise TypeError(
            "columns '{}':{} to '{}':{} combination not accepted for `bicorr`."
            .format(x.name, x.dtype, y.name, y.dtype))

    assert not np.isnan(r), "Correlation returned NaN. Check your data."
    return r, pval, outliers
def main():
    """Select features by correlation, fit a logistic regression, report.

    Reads the breast-cancer table (last column is the label), keeps the
    features with a positive point-biserial correlation whose p-value is
    below THR, draws a Spearman heatmap of those features, filters them
    again by Spearman p-value, fits a liblinear logistic regression on an
    80/20 split, then prints the selected features, BIC, metric table,
    true labels and predictions before showing the plot and exiting.
    """
    dataset = pd.read_csv("/Users/rathi/Downloads/BreastCancer.txt")
    feature_frame = dataset.iloc[:, :-1]
    X = feature_frame.values
    X_labels = feature_frame.columns.values
    y = dataset.iloc[:, -1].values

    correlation_coef = []
    indexes = []
    X_feat_labels = []
    for i, feature_label in enumerate(X_labels):
        pbsr = pointbiserialr(X[:, i], y)
        # Select features with correlation > 0 and pvalue < THR
        if not (pbsr.correlation > 0 and pbsr.pvalue < THR):
            continue
        indexes.append(i)
        X_feat_labels.append(feature_label)
        correlation_coef.append(pbsr.correlation)

    X_feat = X[:, indexes]
    corr_matrix = pd.DataFrame(
        X_feat, columns=X_feat_labels).corr(method='spearman')
    sns.heatmap(corr_matrix)

    # Second filter: keep features whose Spearman p-value is below THR
    indexes_final = []
    X_feat_labels_final = []
    for i in indexes:
        coef, pvalue = spearmanr(X[:, i], y)
        if pvalue < THR:
            X_feat_labels_final.append(X_labels[i])
            indexes_final.append(i)

    X_feat_final = X[:, indexes_final]
    X_train, X_test, y_train, y_test = train_test_split(
        X_feat_final, y, test_size=0.2, random_state=0)

    model = LogisticRegression(solver='liblinear', max_iter=50)
    model.fit(X_train, y_train)
    positive_prob = model.predict_proba(X_test)[:, 1]
    preds = np.where(positive_prob > THR, 1, 0)

    df = pd.DataFrame(X_feat_final, columns=X_feat_labels_final)
    metric_values = [
        accuracy_score(y_test, preds),
        recall_score(y_test, preds),
        precision_score(y_test, preds),
        roc_auc_score(y_test, preds)
    ]
    df2 = pd.DataFrame(
        data=metric_values,
        index=["accuracy", "recall", "precision", "roc_auc_score"])

    log_likelihood = -log_loss(y_test, preds)
    r, c = df.shape
    bic = calculate_bic(log_likelihood, r, c - 1)

    pprint(X_feat_labels_final)
    print("BIC:\t\t", bic)
    print(df2)
    pprint(y_test)
    pprint(preds)
    plt.show()
    exit(0)
def test_pointbiserial():
    """Check pointbiserialr against a known coefficient to 5 decimals."""
    # copied from mstats tests removing nans
    binary = [1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
              0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]
    continuous = [14.8, 13.8, 12.4, 10.1, 7.1, 6.1, 5.8, 4.6, 4.3, 3.5, 3.3,
                  3.2, 3.0, 2.8, 2.8, 2.5, 2.4, 2.3, 2.1, 1.7, 1.7, 1.5, 1.3,
                  1.3, 1.2, 1.2, 1.1, 0.8, 0.7, 0.6, 0.5, 0.2, 0.2, 0.1]
    expected_r = 0.36149
    observed_r = stats.pointbiserialr(binary, continuous)[0]
    assert_almost_equal(observed_r, expected_r, 5)
def PointBiserial(x, y):
    """Point-biserial correlation of x and y after pairwise deletion.

    Returns a dict with the coefficient ('r'), 'df' (Count(x) - 1), the
    p-value ('prob') and two unformatted report templates ('quote' as HTML,
    'quotetxt' as plain text).
    """
    # NOTE(review): df is Count(x) - 1; a correlation is usually reported
    # with n - 2 degrees of freedom - confirm the intent.
    x, y = PairwiseDeletion(x, y)
    r, prob = stats.pointbiserialr(x, y)
    return {
        'r': r,
        'df': Count(x) - 1,
        'prob': prob,
        'quote': "<b>Quote: </b> <i>r</i> (%d) = %.3f, <i>p</i> = %1.4f<br />",
        'quotetxt': "Quote: r (%d) = %.3f, p = %1.4f\n",
    }
def validation(data_iter, net, save_scores=False, delta=0.8):
    '''
    Evaluate `net` on `data_iter` and report loss and accuracy.

    Each batch is (qbatch, rbatch, qlength, rlength, label) as numpy arrays.
    Scores and labels are collected across batches; the point-biserial
    correlation and the confusion matrix (threshold 0.5) are printed.

    :param data_iter: iterable of batches
    :param net: model called as net(qbatch, qlength, rbatch, rlength)
    :param save_scores: when True, write scores and labels to text files
        under args.exp_dir
    :param delta: unused, kept for interface compatibility
    :return: (mean BCE loss per batch rounded to 4 decimals, accuracy at
        threshold 0.5)
    '''
    score_list = []
    label_list = []
    net.eval()
    losses, batch_num = 0, 0
    criterion = nn.BCELoss()

    # Evaluation only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for batch in data_iter:
            qbatch, rbatch, qlength, rlength, label = batch
            qbatch = torch.from_numpy(qbatch)
            rbatch = torch.from_numpy(rbatch)
            qlength = torch.from_numpy(qlength)
            rlength = torch.from_numpy(rlength)
            label = torch.from_numpy(label).float()
            if torch.cuda.is_available():
                qbatch, rbatch = qbatch.cuda(), rbatch.cuda()
                qlength, rlength = qlength.cuda(), rlength.cuda()
                label = label.cuda()
            qbatch = qbatch.transpose(0, 1)
            rbatch = rbatch.transpose(0, 1)

            scores = net(qbatch, qlength, rbatch, rlength)
            loss = criterion(scores, label)

            score_list.extend(scores.detach().cpu().numpy().tolist())
            label_list.extend(label.detach().cpu().numpy().tolist())
            batch_num += 1
            losses += loss.item()

    score_list = np.array(score_list)
    label_list = np.array(label_list)
    pbc, pval = pointbiserialr(label_list, score_list)
    # Accuracy is computed once over all collected scores.
    acc = accuracy_score(label_list, score_list >= 0.5)
    print('PBC: {}, pval: {}'.format(pbc, pval))

    if save_scores:
        np.savetxt(args.exp_dir + '/test_' + args.mode + '_scores.txt', score_list)
        np.savetxt(args.exp_dir + '/test_' + args.mode + '_labels.txt', label_list)

    predicted = (score_list >= 0.5).astype(np.int32)
    c_matrix = confusion_matrix(label_list, predicted)
    print('confusion_matrix = ', c_matrix)
    return round(losses / (batch_num), 4), acc
def data_info():
    """
    Function for inspecting the data set.

    Prints general information, the first rows and the target counts of
    `pulsar_data`, saves a correlation heatmap to "Figures/heatmap.png",
    prints the mutual-information gain of the eight features (in two groups
    of four) and the point-biserial correlation of each feature with the
    target, then shows the figure.
    """
    # Set font sizes for plotting
    plt.rcParams.update({
        "font.size": 16,
        "legend.fontsize": "medium",
        "xtick.labelsize": 15,
        "ytick.labelsize": 15,
        "axes.titlesize": 22
    })

    pulsar_data.info()
    print(pulsar_data.head())  # Print top 5 entries of the data set
    print("Value|Count")
    print(pulsar_data["Target"].value_counts())  # Count number of targets in data

    f, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("Heatmap of the data set features")
    sns.heatmap(pulsar_data.corr(),
                annot=True,
                linecolor="blue",
                fmt=".2f",
                ax=ax)
    plt.tight_layout()
    plt.savefig("Figures/heatmap.png")

    # Information gain of the features
    cols = list(feature.columns)
    infos = mutual_info_classif(X, np.ravel(y), random_state=42)
    info_gain_int = pd.DataFrame({cols[k]: [infos[k]] for k in range(4)})
    info_gain_curve = pd.DataFrame({cols[k]: [infos[k]] for k in range(4, 8)})
    print("Information gain of the features:")
    print(info_gain_int)
    print(info_gain_curve)

    # Point-biserial correlation, linear correlation between the variables
    # for dichotomous target variable
    for column_no in range(8):
        print(pointbiserialr(np.ravel(y), X[:, column_no]))
    plt.show()
def proportion_base_atom(df: pd.DataFrame, base_atom_pref: List) -> List:
    """Return, per column, the share of rows set, and print its correlation
    with `base_atom_pref`.

    For every integer column label i in 0..len(df.columns)-1 except 9 and
    19, the proportion is df[i].sum() / df[i].count(). The lengths of both
    sequences and their point-biserial correlation are printed.

    :param df: dataframe whose columns are labelled 0..n-1
    :param base_atom_pref: one value per kept column
    :return: the list of proportions
    """
    # NOTE(review): columns 9 and 19 are skipped by a hard-coded rule whose
    # reason is not visible here.
    proportion = [
        df[i].sum() / df[i].count() for i in range(len(df.columns))
        if i != 9 and i != 19
    ]
    print(len(proportion))
    print(len(base_atom_pref))
    print(pointbiserialr(proportion, base_atom_pref))
    # The debugging `assert False` is gone so the declared return is reachable.
    return proportion
def continuous_significance(df, significant_pval=0.01):
    """Lazily yield (p-value, column) for continuous columns that are
    significantly related to 'pred_state'.

    Infinite values are treated as missing, rows missing any continuous
    column are dropped, and each continuous column is tested against
    'pred_state' with the point-biserial correlation. Only pairs whose
    p-value is at most `significant_pval` pass the filter.
    """
    cleaned = df.copy().replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.dropna(subset=list(continuous_columns(cleaned)))

    def pvalue_with_name(column):
        p_value = stats.pointbiserialr(cleaned["pred_state"], cleaned[column])[1]
        return (p_value, column)

    def is_significant(pair):
        return pair[0] <= significant_pval

    return filter(is_significant,
                  map(pvalue_with_name, continuous_columns(cleaned)))
def PointBiserial(x, y):
    """Point-biserial correlation of x and y after pairwise deletion.

    Returns a dict holding 'r', 'df' (Count(x) - 1), 'prob' and the two
    unformatted report templates 'quote' (HTML) and 'quotetxt' (plain).
    """
    # NOTE(review): df is Count(x) - 1; a correlation is usually reported
    # with n - 2 degrees of freedom - confirm the intent.
    paired_x, paired_y = PairwiseDeletion(x, y)
    coefficient, p_value = stats.pointbiserialr(paired_x, paired_y)
    result = dict(r=coefficient, df=Count(paired_x) - 1, prob=p_value)
    result.update(
        quote="<b>Quote: </b> <i>r</i> (%d) = %.3f, <i>p</i> = %1.4f<br />",
        quotetxt="Quote: r (%d) = %.3f, p = %1.4f\n",
    )
    return result
def plot_scatters(self, df, show=True):
    """Draw a grid of scatter plots: one row per factor, one column per
    feature in `self.scatter_cols`.

    Factors are the columns whose name starts with 'NMF', 'ICA' or 'PCA'.
    Each panel is titled with the correlation between the raw feature and
    factor values (point-biserial for 'WGD' and 'Which', Pearson otherwise)
    and its p-value; '***' marks p <= 0.01. The figure is saved as a PDF
    when `self.dataset_tag` is set and shown when `show` is true.
    """
    binary_feats = ['WGD', 'Which']
    factors = [col for col in df.columns if col[:3] in ['NMF', 'ICA', 'PCA']]
    df = self.normalise_component_columns(df, factors)
    rows, cols = len(factors), len(self.scatter_cols)
    plt.figure(figsize=(2.5*cols, 2*rows))

    panels = [(factor, feat) for factor in factors for feat in self.scatter_cols]
    for panel_no, (factor, feat) in enumerate(panels, start=1):
        plt.subplot(rows, cols, panel_no)
        is_binary = feat in binary_feats

        if feat == 'Mutational_load':
            # Remove an outlier which messes up the scaling
            outlier = 127424
            below_outlier = df[feat] < outlier
            x, y = df[feat][below_outlier], df[factor][below_outlier]
        elif is_binary:
            # Values are 0 or 1, so jitter slightly
            jitter = np.random.uniform(-0.1, 0.1, len(df))
            x, y = df[feat] + jitter, df[factor]
        else:
            x, y = df[feat], df[factor]
        plt.scatter(x, y, c=self.colours[factor[:3]])

        # Labels only on the left and bottom plots
        if panel_no > (rows - 1) * cols:
            plt.xlabel(feat, size=16)
        if panel_no % cols == 1:
            plt.ylabel(factor, size=16)
        # No scales - there's no space and it would not be informative
        plt.xticks([])
        plt.yticks([])

        # Correlation uses the raw columns, without jitter or outlier removal
        raw_x = df[feat].values
        raw_y = df[factor].values
        # Binary feature: point-biserial; otherwise Pearson
        corr_fn = stats.pointbiserialr if is_binary else stats.pearsonr
        r, p_val = corr_fn(raw_x, raw_y)
        star = '***' if p_val <= 0.01 else ''
        plt.title('r=%4.2f, p=%0.3f %s' % (r, p_val, star))

    if self.dataset_tag:
        figpath = self.plots_dir + 'genomic_feature_scatters_%s.pdf' % self.dataset_tag
        print("Saving figure to", figpath)
        plt.savefig(figpath, bbox_inches='tight')
    if show:
        plt.show()
def corr_categorical_with_wages(df, wages):
    """Point-biserial correlation of every column of `df` with `wages`.

    Returns a dataframe indexed by 'wages___<column>' (sorted), with the
    columns "correlation" and "pvalues".
    """
    correlations = {
        "wages" + '___' + col: stats.pointbiserialr(wages, df[col].values)
        for col in df.columns.tolist()
    }
    results = pd.DataFrame.from_dict(correlations, orient="index")
    results.columns = ["correlation", "pvalues"]
    return results.sort_index()
def pointbiserial(a, bs, weather_var):
    """Correlate `a` with each weather variable found in the records `bs`.

    :param a: sequence of values, one per record
    :param bs: sequence of mappings, each holding every variable in
        `weather_var`
    :param weather_var: iterable of variable names
    :return: dataframe indexed by variable with 'p-value' and 'r-value'
        columns
    """
    def p_and_r(variable):
        series = [record[variable] for record in bs]
        r, p = stats.pointbiserialr(a, series)
        return [p, r]

    weather_dict = {variable: p_and_r(variable) for variable in weather_var}
    return pd.DataFrame.from_dict(weather_dict,
                                  orient='index',
                                  columns=['p-value', 'r-value'])
def SpearmanCorr(self, values, labels):
    """Print Spearman and point-biserial correlations of `values` against
    the positions 0..12.

    Nothing is printed unless `self.mode` is 'birth' or 'data'. `values`
    must therefore hold 13 entries. `labels` is unused.
    """
    if self.mode not in ('birth', 'data'):
        return

    if self.mode == 'birth':
        print('-------- Organized with birth date --------')
    else:
        print('-------- Organized with # of data --------')

    positions = np.arange(0, 13, step=1)

    # Each test is run once and both outputs are reused.
    spearman = spearmanr(values, positions)
    print('Spearman Rank: ', round(spearman[0], 4))
    print('P_Value: ', round(spearman[1], 4))

    # Calculating Point Biserial Rank
    biserial = pointbiserialr(values, positions)
    print('Point Biserial Rank: ', round(biserial[0], 4))
    print('P_value: ', round(biserial[1], 4))
    return
def get_PBS_corr_from_cols(df, target_col, cont_cols, thresh=0):
    """Rank columns by point-biserial correlation with ``target_col``.

    Parameters:
    -----------
    df : DataFrame holding ``target_col`` and every column in ``cont_cols``.
    target_col : name of the column passed first to ``pointbiserialr``.
    cont_cols : iterable of column names to correlate with the target.
    thresh : rows are kept only when ``abs(corr) > thresh``.

    Returns:
    --------
    A DataFrame with the columns ``index`` (the column name) and ``corr``,
    sorted by ``corr`` in descending order.
    """
    coefficients = {
        name: pointbiserialr(df[target_col], df[name])[0]
        for name in cont_cols
    }
    table = pd.Series(coefficients, name='corr').reset_index()
    # Filter on magnitude, but sort on the signed coefficient.
    strong_enough = table['corr'].abs() > thresh
    return table[strong_enough].sort_values('corr', ascending=False)
def correlation(df):
    """Correlate every column of ``df`` with its ``Survived`` column.

    Columns with at most two distinct values are scored with ``spearmanr``;
    all others with ``pointbiserialr`` (whose coefficient is also printed).
    The covariance of each column with ``Survived`` is taken from
    ``np.cov``.  The results are collected in a DataFrame indexed by column
    name and sorted by absolute correlation (descending), which is handed to
    ``parameter_grid`` together with ``df`` and then printed.

    Parameters:
    -----------
    df : DataFrame containing a ``Survived`` column.

    Returns:
    --------
    None; the function only prints and calls ``parameter_grid``.
    """
    columns = df.columns.values
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(columns)

    names = []
    coefficients = []
    magnitudes = []
    covariances = []
    target = df['Survived']

    for name in columns:
        feature = df[name]
        # Check if binary or continuous
        if len(feature.unique()) <= 2:
            coef = spearmanr(target, feature)[0]
        else:
            coef = pointbiserialr(target, feature)[0]
            print(coef)
        # Row 0 is Survived and row 1 the feature, so [0][1] is their covariance.
        covar = np.cov(np.vstack((target, feature)))

        names.append(name)
        coefficients.append(coef)
        magnitudes.append(abs(coef))
        covariances.append(covar[0][1])

    print(covariances)

    # Create dataframe for visualization
    param_df = pd.DataFrame({'correlation': coefficients, 'parameter': names,
                             'abs_corr': magnitudes, 'covariance': covariances})
    # Sort by absolute correlation
    param_df = param_df.sort_values(by=['abs_corr'], ascending=False)
    # Set parameter name as index
    param_df = param_df.set_index('parameter')

    parameter_grid(param_df, df)
    print(param_df)
def get_sort_abs_cor(data):
    """Return absolute correlations with ``Survived``, largest first.

    Columns with at most two distinct values are scored with ``spearmanr``
    (correlation between categorical variables); all other columns are
    scored with ``pointbiserialr`` (categorical versus continuous).

    Parameters:
    -----------
    data : DataFrame containing a ``Survived`` column.

    Returns:
    --------
    A Series named ``abs_cor`` indexed by column name and sorted in
    descending order.  The first column of ``data`` is dropped from the
    result.
    """
    names = data.columns.values
    coefficients = []
    for name in names:
        is_two_valued = len(data[name].unique()) <= 2
        measure = spearmanr if is_two_valued else pointbiserialr
        coefficients.append(measure(data['Survived'], data[name])[0])

    table = pd.DataFrame({'Correlation': coefficients})
    table.index = names
    table['abs_cor'] = table['Correlation'].abs()
    # Skip the first row, i.e. the first column of ``data``.
    remaining = table.iloc[1:, :]
    return remaining['abs_cor'].sort_values(ascending=False)
def _as_floats(values):
    """Return ``values`` converted to a concrete list of floats.

    A real list (rather than a lazy ``map`` object) is needed so that SciPy
    receives an array-like on Python 3 as well as on Python 2.
    """
    return [float(value) for value in values]


def _append_test_results(test, args, cols, sample_one, sample_two, b_samples):
    """Run the ``scipy.stats`` routine named ``test`` and append its output to ``cols``.

    Parameters:
    -----------
    test : name of the statistical test; unknown names append nothing.
    args : parsed command-line namespace supplying the keyword options.
    cols : list of output fields for the current row; extended in place.
    sample_one, sample_two : raw string values of the first / second sample.
    b_samples : sample groups built from ``--sample_cols`` (``None`` when the
        option was not given).
    """
    def first():
        return _as_floats(sample_one)

    def second():
        return _as_floats(sample_two)

    mf, nf = args.mf, args.nf
    # Both limits equal to 0 mean "no limits".  Compare by value: an identity
    # test (``is 0``) is never true for the floats argparse produces.
    no_limits = nf == 0 and mf == 0
    limits = (mf, nf)
    inclusive = (args.inclusive1, args.inclusive2)
    has_second = args.sample_two_cols is not None

    if test == "describe":
        size, min_max, mean, uv, bs, bk = stats.describe(first())
        cols.extend([size, min_max, mean, uv, bs, bk])
    elif test in ("mode", "kurtosistest", "normaltest", "skewtest"):
        # One sample in, a pair of values out.
        value, extra = getattr(stats, test)(first())
        cols.extend([value, extra])
    elif test in ("nanmean", "nanmedian", "variation", "tiecorrect"):
        # One sample in, one value out.
        cols.append(getattr(stats, test)(first()))
    elif test == "itemfreq":
        for row in stats.itemfreq(first()):
            cols.append(",".join(map(str, row)))
    elif test == "boxcox_llf":
        # An omitted --imbda is evaluated at lambda 0.
        lmbda = 0 if args.imbda is None else args.imbda
        cols.append(stats.boxcox_llf(lmbda, first()))
    elif test == "rankdata":
        cols.append(stats.rankdata(first(), method=args.md))
    elif test == "nanstd":
        cols.append(stats.nanstd(first(), bias=args.bias))
    elif test == "anderson":
        A2, critical, sig = stats.anderson(first(), dist=args.dist)
        cols.append(A2)
        cols.extend(critical)
        cols.append(",")
        cols.extend(sig)
    elif test == "binom_test":
        cols.append(stats.binom_test(first(), n=args.n, p=args.p))
    elif test in ("gmean", "hmean"):
        cols.append(getattr(stats, test)(first(), dtype=args.dtype))
    elif test == "kurtosis":
        cols.append(stats.kurtosis(first(), axis=args.axis, fisher=args.fisher, bias=args.bias))
    elif test == "moment":
        cols.append(stats.moment(first(), n=args.n))
    elif test == "skew":
        cols.append(stats.skew(first(), bias=args.bias))
    elif test == "sem":
        cols.append(stats.sem(first(), ddof=args.ddof))
    elif test == "zscore":
        cols.extend(stats.zscore(first(), ddof=args.ddof))
    elif test == "signaltonoise":
        cols.append(stats.signaltonoise(first(), ddof=args.ddof))
    elif test == "percentileofscore":
        cols.append(stats.percentileofscore(first(), score=args.score, kind=args.kind))
    elif test == "bayes_mvs":
        c_mean, c_var, c_std = stats.bayes_mvs(first(), alpha=args.alpha)
        cols.extend([c_mean, c_var, c_std])
    elif test == "sigmaclip":
        c, c_low, c_up = stats.sigmaclip(first(), low=args.m, high=args.n)
        cols.extend([c, c_low, c_up])
    elif test == "kstest":
        d, p_value = stats.kstest(
            first(), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
        )
        cols.extend([d, p_value])
    elif test == "chi2_contingency":
        chi2, p, dof, ex = stats.chi2_contingency(
            first(), correction=args.correction, lambda_=args.lambda_
        )
        cols.extend([chi2, p, dof, ex])
    elif test in ("tmean", "tvar", "tstd", "tsem"):
        trimmed = getattr(stats, test)
        if no_limits:
            cols.append(trimmed(first()))
        else:
            cols.append(trimmed(first(), limits, inclusive))
    elif test == "tmin":
        if mf == 0:
            lowest = stats.tmin(first())
        else:
            lowest = stats.tmin(first(), lowerlimit=mf, inclusive=args.inclusive)
        cols.append(lowest)
    elif test == "tmax":
        if nf == 0:
            highest = stats.tmax(first())
        else:
            highest = stats.tmax(first(), upperlimit=nf, inclusive=args.inclusive)
        cols.append(highest)
    elif test == "scoreatpercentile":
        if no_limits:
            scores = stats.scoreatpercentile(
                first(), second(), interpolation_method=args.interpolation
            )
        else:
            scores = stats.scoreatpercentile(
                first(), second(), limits, interpolation_method=args.interpolation
            )
        cols.extend(scores)
    elif test == "relfreq":
        if no_limits:
            rel, low_range, binsize, ex = stats.relfreq(first(), args.b)
        else:
            rel, low_range, binsize, ex = stats.relfreq(first(), args.b, limits)
        cols.extend(rel)
        cols.extend([low_range, binsize, ex])
    elif test == "binned_statistic":
        if no_limits:
            st, b_edge, b_n = stats.binned_statistic(
                first(), second(), statistic=args.statistic, bins=args.b
            )
        else:
            st, b_edge, b_n = stats.binned_statistic(
                first(), second(), statistic=args.statistic, bins=args.b, range=limits
            )
        cols.extend([st, b_edge, b_n])
    elif test == "threshold":
        if no_limits:
            clipped = stats.threshold(first(), newval=args.new)
        else:
            clipped = stats.threshold(first(), mf, nf, newval=args.new)
        cols.extend(clipped)
    elif test == "trimboth":
        cols.extend(stats.trimboth(first(), proportiontocut=args.proportiontocut))
    elif test == "trim1":
        cols.extend(stats.trim1(first(), proportiontocut=args.proportiontocut, tail=args.tail))
    elif test in ("histogram", "cumfreq"):
        binned = getattr(stats, test)
        if no_limits:
            counts, low_range, binsize, ex = binned(first(), args.b)
        else:
            counts, low_range, binsize, ex = binned(first(), args.b, limits)
        cols.extend([counts, low_range, binsize, ex])
    elif test == "boxcox_normmax":
        if no_limits:
            ma = stats.boxcox_normmax(first())
        else:
            ma = stats.boxcox_normmax(first(), limits, method=args.method)
        cols.append(ma)
    elif test == "boxcox":
        if args.imbda is None:
            # No lambda supplied: SciPy estimates it and returns three values.
            box, ma, ci = stats.boxcox(first(), alpha=args.alpha)
            cols.extend([box, ma, ci])
        else:
            cols.append(stats.boxcox(first(), args.imbda, alpha=args.alpha))
    elif test == "histogram2":
        cols.extend(stats.histogram2(first(), second()))
    elif test in ("ranksums", "ansari", "pearsonr", "pointbiserialr", "ks_2samp"):
        # Two samples in, (statistic, p-value) out.
        statistic, p_value = getattr(stats, test)(first(), second())
        cols.extend([statistic, p_value])
    elif test == "ttest_1samp":
        t, prob = stats.ttest_1samp(first(), second())
        cols.extend(t)
        cols.extend(prob)
    elif test == "linregress":
        slope, intercept, r_value, p_value, stderr = stats.linregress(first(), second())
        cols.extend([slope, intercept, r_value, p_value, stderr])
    elif test == "mannwhitneyu":
        mw_stats_u, p_value = stats.mannwhitneyu(
            first(), second(), use_continuity=args.mwu_use_continuity
        )
        cols.extend([mw_stats_u, p_value])
    elif test == "zmap":
        cols.extend(stats.zmap(first(), second(), ddof=args.ddof))
    elif test == "ttest_ind":
        t, p_value = stats.ttest_ind(first(), second(), equal_var=args.equal_var)
        cols.extend([t, p_value])
    elif test in ("ttest_rel", "mood"):
        statistic, p_value = getattr(stats, test)(first(), second(), axis=args.axis)
        cols.extend([statistic, p_value])
    elif test == "shapiro":
        W, p_value, a = stats.shapiro(first(), second(), args.reta)
        cols.extend([W, p_value])
        cols.extend(a)
    elif test == "kendalltau":
        k, p_value = stats.kendalltau(
            first(), second(), initial_lexsort=args.initial_lexsort
        )
        cols.extend([k, p_value])
    elif test == "entropy":
        cols.append(stats.entropy(first(), second(), base=args.base))
    elif test == "spearmanr":
        if has_second:
            rho, p_value = stats.spearmanr(first(), second())
        else:
            rho, p_value = stats.spearmanr(first())
        cols.extend([rho, p_value])
    elif test == "wilcoxon":
        if has_second:
            T, p_value = stats.wilcoxon(
                first(), second(), zero_method=args.zero_method, correction=args.correction
            )
        else:
            T, p_value = stats.wilcoxon(
                first(), zero_method=args.zero_method, correction=args.correction
            )
        cols.extend([T, p_value])
    elif test == "chisquare":
        if has_second:
            chisq, p_value = stats.chisquare(first(), second(), ddof=args.ddof)
        else:
            chisq, p_value = stats.chisquare(first(), ddof=args.ddof)
        cols.extend([chisq, p_value])
    elif test == "power_divergence":
        if has_second:
            stat, p_value = stats.power_divergence(
                first(), second(), ddof=args.ddof, lambda_=args.lambda_
            )
        else:
            stat, p_value = stats.power_divergence(
                first(), ddof=args.ddof, lambda_=args.lambda_
            )
        cols.extend([stat, p_value])
    elif test == "theilslopes":
        if has_second:
            mpe, met, lo, up = stats.theilslopes(first(), second(), alpha=args.alpha)
        else:
            mpe, met, lo, up = stats.theilslopes(first(), alpha=args.alpha)
        cols.extend([mpe, met, lo, up])
    elif test == "combine_pvalues":
        if has_second:
            stat, p_value = stats.combine_pvalues(first(), method=args.med, weights=second())
        else:
            stat, p_value = stats.combine_pvalues(first(), method=args.med)
        cols.extend([stat, p_value])
    elif test == "obrientransform":
        for row in stats.obrientransform(*b_samples):
            cols.append(",".join(map(str, row)))
    elif test in ("f_oneway", "kruskal", "friedmanchisquare", "bartlett"):
        # Any number of sample groups in, (statistic, p-value) out.
        statistic, p_value = getattr(stats, test)(*b_samples)
        cols.extend([statistic, p_value])
    elif test in ("fligner", "levene"):
        statistic, p_value = getattr(stats, test)(
            *b_samples, center=args.center, proportiontocut=args.proportiontocut
        )
        cols.extend([statistic, p_value])
    elif test == "median_test":
        stat, p_value, m, table = stats.median_test(
            *b_samples, ties=args.ties, correction=args.correction, lambda_=args.lambda_
        )
        cols.extend([stat, p_value, m, table])
        for row in table:
            cols.append(",".join(map(str, row)))


def main():
    """Apply one ``scipy.stats`` routine to every row of a tab-separated file.

    Command-line options select the input and output files, the test
    (``--test_id``), the 1-based columns that form the samples
    (``--sample_one_cols``, ``--sample_two_cols``, or ``--sample_cols`` for
    ``;``-separated groups) and the keyword options forwarded to SciPy.
    Each input row is written to the output file followed by the values the
    test returned for that row.

    Passing ``0`` for both ``--mf`` and ``--nf`` disables the limits of the
    trimmed/binned statistics; omitting ``--imbda`` lets ``boxcox`` estimate
    lambda itself.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    # The default must be the boolean False: the string "False" is truthy and
    # would switch lexsort on even when the flag is absent.
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    # None marks "option omitted".  The previous integer default was detected
    # with ``is 0``, which depends on CPython's small-integer caching.
    parser.add_argument(
        "--imbda",
        type=float,
        default=None,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")
    args = parser.parse_args()

    # Context managers close both files even when a test raises.
    with open(args.outfile, "w+") as outfile:
        sample_groups = None
        if args.sample_cols is not None:
            sample_groups = [
                [int(index) for index in group.split(",")]
                for group in args.sample_cols.split(";")
            ]
        one_cols = []
        if args.sample_one_cols is not None:
            one_cols = args.sample_one_cols.split(",")
        two_cols = []
        if args.sample_two_cols is not None:
            two_cols = args.sample_two_cols.split(",")

        with open(args.infile) as source:
            for line in source:
                cols = line.strip().split("\t")
                b_samples = None
                if sample_groups is not None:
                    b_samples = columns_to_values(sample_groups, line)
                # Column numbers on the command line are 1-based.
                sample_one = [cols[int(index) - 1] for index in one_cols]
                sample_two = [cols[int(index) - 1] for index in two_cols]
                _append_test_results(
                    args.test_id.strip(), args, cols, sample_one, sample_two, b_samples
                )
                outfile.write("%s\n" % "\t".join(map(str, cols)))
def calc_biserial_correlation_coefficient(self):
    """
    Store the point biserial correlation between ``self.y_true`` and
    ``self.y_pred`` in ``self.scores``.

    The point biserial correlation measures the relationship between a
    binary variable and a continuous variable. The complete result of
    ``pointbiserialr`` (coefficient and p-value) is saved under the key
    'Biserial Correlation Coefficient'.
    """
    outcome = pointbiserialr(self.y_true, self.y_pred)
    self.scores['Biserial Correlation Coefficient'] = outcome
''' from sklearn.metrics import mean_squared_error from sklearn.metrics import log_loss, matthews_corrcoef, precision_recall_curve, auc, roc_curve, confusion_matrix, hinge_loss, accuracy_score, classification_report, f1_score, fbeta_score, hamming_loss, jaccard_similarity_score, precision_recall_fscore_support, precision_score, recall_score, zero_one_loss, average_precision_score, roc_auc_score from scipy.stats import scoreatpercentile, pointbiserialr,ks_2samp,pearsonr, spearmanr from sklearn.metrics.regression import mean_absolute_error, r2_score import matplotlib.pyplot as plt from sklearn.preprocessing import binarize import numpy as np x_true = np.array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0]) y_true = np.array([1.0, 0.01, 1.0, 0.012, 0.42, 0.021, 0.56, 0.011, 0.091, 0.0215, 0.001, 1]) #y_true = np.array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1]) print ks_2samp(x_true, binarize(y_true, scoreatpercentile(y_true, 30))[0]) print pointbiserialr(y_true, x_true) print scoreatpercentile(y_true, 30) def validate(y_true, y_pred): print 'Kolmogorov-Smirnov test = ', ks_2samp(y_true, y_pred) print 'mean_squared_error = ', mean_squared_error(y_true, y_pred) print 'mean_absolute_error = ', mean_absolute_error(y_true, y_pred) print 'r2_score = ', r2_score(y_true, y_pred) """TBD compute the log-loss to consider boolean""" print "log_loss = " + str(log_loss(y_true, y_pred)) #Log loss, aka logistic loss or cross-entropy loss. precision, recall, thresholds = precision_recall_curve(y_true, y_pred) #Compute precision-recall pairs for different probability thresholds
# Append the dummy-variable frames to train_df column-wise (axis=1), then
# drop the source columns listed below.
# presumably the dummies_* frames are encodings of those columns - verify
# where they are built.
train_df = pd.concat([train_df,dummies_Sex,dummies_Embarked,dummies_Pclass,dummies_Title], axis=1)
train_df = train_df.drop(["Sex","Embarked","Pclass","Title","Name"], axis=1)
# NOTE(review): set_index returns a new frame by default and the result is
# discarded here, so train_df keeps its index and PassengerId stays a column
# (and is therefore scored in the loop below) - confirm whether an
# assignment was intended.
train_df.set_index(['PassengerId'])
columns = train_df.columns.values
# Parallel lists, one entry per column: name, correlation with "Survived",
# and the absolute value of that correlation.
param = []
correlation = []
abs_corr = []
for c in columns:
    # Check if binary or continuous: at most two distinct values is treated
    # as binary and scored with spearmanr, anything else with pointbiserialr.
    # [0] keeps the coefficient and drops the p-value.
    if len(train_df[c].unique()) <= 2:
        corr = spearmanr(train_df["Survived"], train_df[c])[0]
    else:
        corr = pointbiserialr(train_df["Survived"], train_df[c])[0]
    param.append(c)
    correlation.append(corr)
    abs_corr.append(abs(corr))
# Create dataframe for visualization
param_df = pd.DataFrame({"correlation":correlation, "parameter":param, "abs_corr":abs_corr})
# Sort by absolute correlation (largest first)
param_df = param_df.sort_values(by=["abs_corr"],ascending=False)
# Set parameter name as index
param_df = param_df.set_index("parameter")
# Python 2 print statement.
print param_df
# Empty accumulators; they are not filled anywhere in this section.
scoresCV = []
scores = []
print '================='
df = df.set_index(['PassengerId']) # 计算相关系数 columns = df.columns.values param=[] correlation=[] abs_corr=[] for c in columns: #Check if binary or continuous if len(df[c].unique())<=2: corr = spearmanr(df['Survived'],df[c])[0] else: corr = pointbiserialr(df['Survived'],df[c])[0] param.append(c) correlation.append(corr) abs_corr.append(abs(corr)) #Create dataframe for visualization param_df=pd.DataFrame({'correlation':correlation,'parameter':param, 'abs_corr':abs_corr}) #Sort by absolute correlation param_df=param_df.sort_values(by=['abs_corr'], ascending=False) #Set parameter name as index param_df=param_df.set_index('parameter') ''' # 利用DecisionTree进行特征选择