Example #1
def pointbiserialr_dataframe(df, x, y, columns, p=0.05):
    """
    Parameters:
    -----------
    df      : the dataframe source of data                 : DataFrame : :
    x       : column with binary data                      : str       : :
    y       : list of columns with numeric data             : list of str : :
    columns : list of column names for the output dataframe : list of str : :
    p       : p-value threshold for significance            : float       : :

    Description:
    ------------
    Generates a list of point-biserial r coefficients and accompanying p-values for a **binary** variable and numeric variables.
    This correlation test assumes that the binary variable is _naturally_ binary, _not_ artificially dichotomized (e.g., a pass/fail cutoff applied to a continuous score).

    Null Hypothesis:
    ----------------
    The variables are independent.

    Returns:
    --------
    A dataframe with three columns: the point-biserial r coefficient (a float from -1 to 1), the corresponding p-value, and the significance
    of the p-value.  Both the coefficient and p-value are rounded to 5 decimal places.
    """
    pbr_coef = [round(pointbiserialr(x=df[x], y=df[i])[0], 5) for i in y]
    pbr_pval = [round(pointbiserialr(x=df[x], y=df[i])[1], 5) for i in y]
    pval_sig = ["True" if i < p else "False" for i in pbr_pval]
    pbr_dataframe = pd.DataFrame(
        [pbr_coef, pbr_pval, pval_sig],
        index=["Coefficient.", "P-Value", "Significant"],
        columns=columns).T
    return pbr_dataframe
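
A minimal usage sketch of the function above; the DataFrame and column names are invented for illustration, and the imports mirror what the function already assumes.

import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    "passed": rng.integers(0, 2, 100),        # binary variable
    "hours_studied": rng.normal(5, 2, 100),   # numeric variable
    "attendance": rng.uniform(0, 1, 100),     # numeric variable
})

print(pointbiserialr_dataframe(demo,
                               x="passed",
                               y=["hours_studied", "attendance"],
                               columns=["Hours Studied", "Attendance"]))
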
def correlation_test():
    fp = 'data/Ivan_common.csv'
    df = pd.read_csv(fp)
    print(len(df))
    print(stats.pearsonr(df['coast_dist'], df['elevation']))

    ###############################################################################
    # categorical vs continuous
    # point-biserial, https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm#POINTB
    # https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
    # same as using stats.pearsonr
    print(stats.pointbiserialr(df['evac'], df['coast_dist']))
    print(stats.pearsonr(df['coast_dist'], df['evac']))

    # t-test has the same p-value as point-biserial,
    # http://web.pdx.edu/~newsomj/da1/ho_correlation%20t%20phi.pdf
    evac_yes = df[df['evac'] == 1]['coast_dist']
    evac_no = df[df['evac'] == 0]['coast_dist']
    print(stats.ttest_ind(evac_yes, evac_no))

    ###############################################################################
    # categorical vs categorical
    # phi coefficient, special case of Cramer's V
    # https://en.wikipedia.org/wiki/Phi_coefficient
    # phi is computed using chi-square statistic
    # https://en.wikipedia.org/wiki/Matthews_correlation_coefficient, same as phi coefficient
    # same as using stats.pearsonr
    print(metrics.matthews_corrcoef(df['ht_mobile'], df['evac']))
    print(stats.pearsonr(df['ht_mobile'], df['evac']))
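
The equivalences noted in the comments above can be checked on synthetic data; the variable names below are made up for the sketch.

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
evac = rng.integers(0, 2, 200)                   # binary variable coded 0/1
coast_dist = rng.normal(10, 3, 200) - 2 * evac   # continuous variable

r_pb, p_pb = stats.pointbiserialr(evac, coast_dist)
r_pe, p_pe = stats.pearsonr(evac, coast_dist)
t, p_t = stats.ttest_ind(coast_dist[evac == 1], coast_dist[evac == 0])

print(np.isclose(r_pb, r_pe))  # point-biserial r equals Pearson r on a 0/1 variable
print(np.isclose(p_pb, p_t))   # point-biserial p-value equals the pooled t-test p-value
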
Example #3
def create_categorical_feature_regression(y,
                                          fraction=0.2,
                                          seed=None,
                                          verbose=False):
    """
    Create synthetic categorical column, strongly correlated with regression target.
    The target is discretized into 5 ordinal bins (KBinsDiscretizer with the 'uniform' strategy),
    so each value is the bin index of the corresponding target value.
    Then a fraction of values is permuted, to reduce the correlation.

    Point biserial correlation is used to measure association.
    Parameters
    ---------
        y : np.ndarray, target vector
        fraction : float (default=0.2), fraction of values to be permuted to reduce the correlation
        seed : int (default=None), random seed that can be specified to obtain deterministic behaviour
        verbose : bool (default=False), when True, print correlation before and after the shuffling

    Returns
    ----------
        new_column : np.ndarray, new feature vector
        corr : float, correlation of new feature vector with target vector
    """
    if seed is not None:
        np.random.seed(seed)

    discretizer = KBinsDiscretizer(n_bins=5,
                                   encode='ordinal',
                                   strategy='uniform')
    new_column = discretizer.fit_transform(y.reshape(-1, 1))
    new_column = new_column.ravel()

    if verbose:
        corr, v = pointbiserialr(new_column, y)
        print(
            f'Initial new feature - target point biserial correlation, without shuffling: {round(corr, 3)}, p: {round(v, 3)}'
        )

    # Choose which samples to permute
    indices = np.random.choice(range(len(y)),
                               int(fraction * len(y)),
                               replace=False)

    # Find the new order of these samples
    shuffled_indices = np.random.permutation(len(indices))
    new_column[indices] = new_column[indices][shuffled_indices]
    corr, p = pointbiserialr(new_column, y)
    if verbose:
        print(
            f'New feature - target point biserial correlation, after shuffling: {round(corr, 3)}, p: {round(p, 3)}'
        )

    return new_column, corr
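
A brief usage sketch with a synthetic regression target (values are illustrative only, and the imports used by the function above are assumed to be in place).

import numpy as np

y = np.random.default_rng(0).normal(loc=50, scale=10, size=500)
new_col, corr = create_categorical_feature_regression(y, fraction=0.2, seed=0, verbose=True)
print(np.unique(new_col), round(corr, 3))  # five ordinal bin labels and the post-shuffle correlation
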
Example #4
def create_numerical_feature_classification(y,
                                            a=10,
                                            b=5,
                                            fraction=0.2,
                                            seed=None,
                                            verbose=False):
    """
    Create synthetic numerical column, strongly correlated with binary classification target.
    Each value is calculated according to the formula:
        v = y * a + random(-b, b)
        So it is the scaled target value with some noise.
    Then a fraction of values is permuted, to reduce the correlation.

    Point biserial correlation is used to measure association.
    Parameters
    ---------
        y : np.ndarray, target vector
        a : int or float (default=10), scaling factor in a formula above
        b : int or float (default=5), value that determines the range of noise to be added
        fraction : float (default=0.2), fraction of values to be permuted to reduce the correlation
        seed : int (default=None), random seed that can be specified to obtain deterministic behaviour
        verbose : bool (default=False), when True, print correlation before and after the shuffling

    Returns
    ----------
        new_column : np.ndarray, new feature vector
        corr : float, correlation of new feature vector with target vector
    """
    if seed is not None:
        np.random.seed(seed)

    new_column = y * a + np.random.uniform(low=-b, high=b, size=len(y))
    if verbose:
        corr, v = pointbiserialr(new_column, y)
        print(
            f'Initial new feature - target point biserial correlation, without shuffling: {round(corr, 3)}, p: {round(v, 3)}'
        )

    # Choose which samples to permute
    indices = np.random.choice(range(len(y)),
                               int(fraction * len(y)),
                               replace=False)

    # Find the new order of these samples
    shuffled_indices = np.random.permutation(len(indices))
    new_column[indices] = new_column[indices][shuffled_indices]
    corr, p = pointbiserialr(new_column, y)
    if verbose:
        print(
            f'New feature - target point biserial correlation, after shuffling: {round(corr, 3)}, p: {round(p, 3)}'
        )

    return new_column, corr
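
And the corresponding sketch for this classification variant, again with made-up data.

import numpy as np

y = np.random.default_rng(0).integers(0, 2, 500)   # binary classification target
new_col, corr = create_numerical_feature_classification(y, a=10, b=5,
                                                        fraction=0.2, seed=0,
                                                        verbose=True)
print(round(corr, 3))
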
Example #5
def compute_test():
    r_is_bug_included_c = [5, 5, 25, 25, 30, 30, 80, 80]
    r_scores_c = [False, False, True, True, True, True, True, True]
    c_r = pointbiserialr(r_is_bug_included_c, r_scores_c).correlation
    print(c_r)

    r_is_bug_included_c = [5, 5, 25, 25, 30, 30, 80, 80]
    r_scores_c = [False, False, False, False, False, False, True, True]
    c_r = pointbiserialr(r_is_bug_included_c, r_scores_c).correlation
    print(c_r)
Example #6
def calc_corr(x, y, x_datatype, y_datatype, x_label, y_label):
    """
    Calculate the correlation of two arrays x and y

    Parameters
    ----------
    x : array of values
    y : array of values
    x_datatype : 'binary', 'discrete' or 'continuous'
    y_datatype : 'binary', 'discrete' or 'continuous'
    x_label, y_label : labels for x and y (not used in the calculation)
    
    Returns
    -------
    dict with pval, rval and method

    """

    from scipy import stats

    rval = None
    pval = None
    method = None

    if (y_datatype in ['binary', 'discrete'] and x_datatype
            == 'continuous') or (y_datatype == 'continuous'
                                 and x_datatype in ['binary', 'discrete']):

        # Calculate a point biserial correlation coefficient
        rval, pval = stats.pointbiserialr(x, y)
        method = 'pointbiserial'

        # format for scientific notation
        pval = "{:.2e}".format(pval)

    if (y_datatype in ['binary', 'discrete']
            and x_datatype in ['binary', 'discrete']):
        # Calculate Cramér's V
        rval = cramers_v(pd.crosstab(x, y).to_numpy())
        pval = 'None'  # not yet implemented
        method = 'cramers_v'

    if y_datatype == 'continuous' and x_datatype == 'continuous':
        rval, pval = stats.pearsonr(x, y)
        method = 'pearson'

    return {'rval': rval, 'pval': pval, 'method': method}
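
calc_corr relies on a cramers_v helper that is not shown above; a plausible stand-in (the bias-corrected form, which may differ from the original helper) is sketched here.

import numpy as np
from scipy import stats

def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for a contingency table (plausible stand-in)."""
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bergsma bias correction of phi² and of the table dimensions
    phi2_corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    r_corr = r - (r - 1) ** 2 / (n - 1)
    k_corr = k - (k - 1) ** 2 / (n - 1)
    denom = min(k_corr - 1, r_corr - 1)
    return np.sqrt(phi2_corr / denom) if denom > 0 else 0.0
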
Example #7
def compute_group_importance(pipelines, scores, up_to_k=5):
    primitive_matrix, primitives = extract_primitive_matrix(pipelines)
    column_idx_map = {p: idx for idx, p in enumerate(primitives)}
    importances = {}
    for k in range(1, up_to_k + 1):
        for selected_columns in combinations(primitives, k):
            selected_columns_idx = np.array(
                [column_idx_map[c] for c in selected_columns])
            sub_matrix = primitive_matrix[:, selected_columns_idx]
            used_all = np.prod(sub_matrix, axis=1)
            importance, _ = pointbiserialr(used_all, scores)
            importance = 0 if np.isnan(importance) else importance
            importances[frozenset(selected_columns)] = importance

    # keeping only the ones for which importance is greater than those of its component parts
    kept = []
    for selected_columns in importances.keys():
        if (len(selected_columns) > 1):
            to_add = True
            for subgroup in chain(*[
                    list(combinations(selected_columns, take))
                    for take in range(1, len(selected_columns))
            ]):
                if abs(importances[frozenset(subgroup)]) >= abs(
                        importances[selected_columns]):
                    to_add = False
                    break
            if to_add:
                kept.append({
                    'importance': importances[selected_columns],
                    'group': list(selected_columns)
                })
    return sorted(kept, key=lambda x: abs(x['importance']), reverse=True)
def correlation(data):

    col_names = data.columns
    param = []
    correlation = []
    abs_corr = []

    for c in col_names:
        #Check if binary or continuous
        if c != "income":
            if len(data[c].unique()) <= 2:
                corr = spearmanr(data['income'], data[c])[0]
            else:
                corr = pointbiserialr(data['income'], data[c])[0]
            param.append(c)
            correlation.append(corr)
            abs_corr.append(abs(corr))

    #Create dataframe for visualization
    param_df = pd.DataFrame({
        'correlation': correlation,
        'parameter': param,
        'abs_corr': abs_corr
    })

    #Sort by absolute correlation
    param_df = param_df.sort_values(by=['abs_corr'], ascending=False)

    #Set parameter name as index
    param_df = param_df.set_index('parameter')

    scoresCV = []
    scores = []

    for i in range(1, len(param_df)):
        new_df = data[
            param_df.index[0:i +
                           1].values]  #sorting DF by correlation importance
        X = new_df.iloc[:, 1::]
        target = new_df.iloc[:, 0]
        clf = DecisionTreeClassifier()
        scoreCV = cross_val_score(clf, X, target, cv=10)
        scores.append(np.mean(scoreCV))

    plt.figure(figsize=(15, 5))
    plt.plot(range(1, len(scores) + 1), scores, '.-')
    plt.axis("tight")
    plt.title('Feature Selection', fontsize=14)
    plt.xlabel('# Features', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.grid()

    new_df = data[param_df.index[1:i + 1].values]
    new_df.shape
    X = new_df.iloc[:, 1::]

    return Bunch(data_corr=correlation,
                 data_par=param,
                 data_df_corr=param_df.copy(),
                 target=target.copy())
Example #9
def bin_accuracy_scores_prob(y_true, y_prob):
    """
    A function to calculate accuracy measures for probabilistic responses with sklearn and scipy.
    Function written by Osian Roberts.

    :param y_true: binary class labels, where 0 is absence and 1 is presence.
    :param y_prob: probability of presence scores e.g., generated by a species distribution model.

    :returns: a list containing two arrays - metrics = names of test metrics. scores = test scores for each metric.

    Useful reference:
    https://machinelearningmastery.com/how-to-score-probability-predictions-in-python
    """
    import numpy
    # check inputs:
    if not isinstance(y_true, numpy.ndarray):
        y_true = numpy.array(y_true)
    if not isinstance(y_prob, numpy.ndarray):
        y_prob = numpy.array(y_prob)
    if y_true.ndim != 1:
        raise SystemExit('ERROR: the true labels are not in a 1D array.')
    if y_prob.ndim != 1:
        raise SystemExit('ERROR: the probability of presence values are not in a 1D array.')
    if y_true.size != y_prob.size:
        raise SystemExit('ERROR: unequal number of binary labels and probabilities.')

    # ensure that y_true contains binary labels (i.e. 0 or 1 values):
    y_true = y_true.astype('uint8')
    if numpy.min(y_true) != 0 or numpy.max(y_true) != 1:
        raise SystemExit('ERROR: the true labels are not binary (zero or one values).')

    from sklearn.metrics import roc_auc_score
    # calculates area under the receiver operating curve score.
    # A score of 0.5 shows the model is unable to discriminate between presence and absence.
    roc_auc = roc_auc_score(y_true, y_prob)

    from sklearn.metrics import average_precision_score
    # calculates area under the precision-recall curve. Perfect model = 1.0.
    average_precision = average_precision_score(y_true, y_prob)

    from sklearn.metrics import brier_score_loss
    # This is a quadratic loss function that calculates the mean squared error between
    # predicted probabilities and the true presence-absence (binary) labels.
    # A perfect model (no false positives/negatives) has a score of 0.0; lower is better.
    brier_score = brier_score_loss(y_true, y_prob)

    from sklearn.metrics import log_loss
    # This is a logarithmic loss function that more heavily penalises false positives/negatives than the Brier score.
    # A model with no false positives/negatives has a score of 0.0. There is no upper bound.
    log_loss_score = log_loss(y_true, y_prob)

    from scipy.stats import pointbiserialr
    # The point biserial correlation coefficient, range -1 to 1.
    # Quantifies the correlation between a binary and continuous variable.
    r = pointbiserialr(y_true, y_prob)[0]

    metrics = ['Test AUC', 'Point-Biserial r', 'Av. Precision', 'Brier Score', 'Log-Loss Score']
    scores = numpy.array([roc_auc, r, average_precision, brier_score, log_loss_score]).round(decimals=6)
    del roc_auc, r, average_precision, brier_score, log_loss_score, y_true, y_prob
    return [metrics, scores]
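
A minimal illustration of the function above with made-up labels and probabilities.

import numpy as np

y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
y_prob = np.array([0.1, 0.3, 0.8, 0.7, 0.2, 0.9, 0.6, 0.4, 0.75, 0.15])
metric_names, metric_scores = bin_accuracy_scores_prob(y_true, y_prob)
for name, score in zip(metric_names, metric_scores):
    print(name, score)
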
def slicing_analysis(df, n, scoring_columns=None):

    if scoring_columns is None:
        scoring_columns = []

    df_splits = np.array_split(df, n)

    pbs_scores = {}
    
    # Container for n scores indicating the average lexical similarity for each sub-dataframe
    # Rouge-1 with stopwords included as the measure of lexical similarity between the two sentences
    lexical_sim_all = []
    lexical_sim_label_0 = []
    lexical_sim_label_1 = []

    for df in df_splits:

        lexical_sim_all.append(df['rouge1_stopwords:False'].mean())
        lexical_sim_label_0.append(df['rouge1_stopwords:False'].loc[df['Label'] == 0].mean())
        lexical_sim_label_1.append(df['rouge1_stopwords:False'].loc[df['Label'] == 1].mean())

        for metric in scoring_columns:
            score = stats.pointbiserialr(df['Label'], df[metric])[0]
            if metric not in pbs_scores:

                pbs_scores[metric] = []

            pbs_scores[metric].append(score)


    # Prints the plots for the analysis of correlation scores as an effect of lexical similarity
    for metric in scoring_columns:
        sbs.set_style(style = 'darkgrid')
        sbs.scatterplot(x = lexical_sim_all, y = pbs_scores[metric])
        plt.xlabel('Lexical Similarity (ROUGE-1 with stopwords)')
        plt.ylabel(metric)
        plt.show()
Example #11
def pointBiserialCorrelation(x, y):
    '''
    Calculates the point-biserial correlation coefficients between the NUMERICAL features and the CATEGORICAL label
    :param x (pandas.DataFrame): The numerical features
    :param y (pandas.DataFrame): The categorical label
    '''

    corrs = pd.DataFrame(
        columns=["Feature", "Coefficient", "p-value", "Variance"])
    for column in x:
        # calculate the coefficients
        res = stats.pointbiserialr(x[column], y)
        # calculate additionally the variance of the features
        var = x[column].var()
        coefs = {
            "Feature": column,
            "Coefficient": res[0],
            "p-value": res[1],
            "Variance": var
        }
        corrs = pd.concat([corrs, pd.DataFrame([coefs])], ignore_index=True)
        print(f"{column}: Correlation = {res[0]}, pvalue = {res[1]}")

    # write the results to the file
    writeFrameToCsv(corrs, "point-biserial-corrs.csv")
Example #12
    def get_corr_coef(self, method):
        """
        计算维度间的相关系数,返回相关系数较高的特征对
        :param method: 何种相关系数(Pearson,Spearman,Kendall)
        新增判定系数、点二列相关
        :return:
        """
        flag = 0
        attr_relate = []  # record pairs of highly correlated features and their coefficient, e.g. (1, 2, 0.9) means features 1 and 2 have a correlation of 0.9
        if method == 'coef_determination':
            method = 'pearson'
            flag = 1
        if method == 'pointbiserialr':
            for i in range(self.attr_num):
                for j in range(i + 1, self.attr_num):
                    temp, _ = stats.pointbiserialr(self.dfattr.iloc[:, i], self.dfattr.iloc[:, j])
                    if temp > 0.8:
                        attr_relate.append((i, j, temp))
            return attr_relate

        pearson_corr_mat = self.dfattr.corr(method=method)

        for i in range(self.attr_num):
            for j in range(i + 1, self.attr_num):
                temp = pearson_corr_mat.iloc[i, j]
                if temp * temp > 0.64:
                    if flag > 0:
                        attr_relate.append((i, j, temp * temp))
                    elif temp > 0.8:
                        attr_relate.append((i, j, temp))
        return attr_relate
def correlation():
    df =  pd.read_csv("dataset/train_new.csv")
    # df = df.dropna(axis=0,how="any")
    print(df.describe())
    # print(df.head())
    param=[]
    correlation=[]
    abs_corr=[]
    covariance = []
    columns = ["Applicant_Gender","App_age","Applicant_Occupation","Applicant_Qualification","Manager_age","Manager_Status","Manager_Gender","Manager_Business","Manager_Business2","Manager_Num_Application"]
    for c in columns:
        #Check if binary or continuous

        if len(df[c].unique())<=12:
            corr = spearmanr(df['Business_Sourced'],df[c])[0]
            print "spear",c,corr
            y = df['Business_Sourced']
            x = df[c]
            X = np.vstack((y,x))
            covar = np.cov(X)
        else:
            corr = pointbiserialr(df['Business_Sourced'],df[c])[0]
            print "point",c,corr
            y = df['Business_Sourced']
            x = df[c]
            X = np.vstack((y,x))
            covar = np.cov(X)
        param.append(c)
        correlation.append(corr)
        abs_corr.append(abs(corr))
        # covariance.append(covar[0][1])
    print(covariance)
Example #14
    def pointbiserialcorr(s1, s2):
        """ Calculate the mean point biserial correlation of the RTDs of
            the two given solvers on all instances of the experiment.
            Only consider values where the statistical significance is large
            enough (p-value < alpha = 0.05)
        """
        from scipy import stats

        alpha = 0.05 # level of statistical significant difference
        d = 0.0
        num = 0
        for i in instance_ids:
            res1 = solver_config_results[s1.idSolverConfig][i]
            res2 = solver_config_results[s2.idSolverConfig][i]
            ranked_data = list(rankdata(res1 + res2))

            r, p = stats.pointbiserialr([1] * len(res1) + [0] * len(res2), ranked_data)
            # only take instances with significant differences into account
            if p < alpha:
                #print str(s1), str(s2), str(i), r, p
                d += r
                num += 1

        if num > 0:
            return d / num # return mean difference
        else:
            return 0 # s1 == s2
Example #15
    def pointbiserialr(self, dataset, specName=0):
        dataset['code'] = pd.factorize(dataset[specName])[0] + 1
        dataset = dataset.drop([specName], axis=1)

        sizes = ['size']
        corrs = ['correlation']
        cats = []

        for col in dataset.columns:
            if col != 'code':
                dataset2 = dataset.filter([col, 'code'], axis=1)
                dataset2 = dataset2.dropna()

                features = dataset2.iloc[:, 0].values
                labels = dataset2.iloc[:, 1].values

                result = stats.pointbiserialr(features, labels)

                if math.isfinite(result.correlation):
                    sizes.append(len(features))
                    corrs.append(result.correlation * 100)
                    cats.append(col)

        return cats, [corrs, sizes]
	def test(self, use_saved_embeddings=True):

		self.load('%s/adem_model.pkl' % self.config['exp_folder'])
		test_fname_embeddings = '%s/test_%s_embeddings.pkl' % (self.config['vhrd_data'], self.config['mode'])
		test_x, test_y = self.get_vhrd_embeddings(self.config['test_data'], self.config['mode'], test_fname_embeddings, use_saved_embeddings)
		if self.config['use_pca']:
			test_x = self._apply_pca(test_x)
		
		predictions = (np.array(self._get_outputs(test_x))-1)/4.0
		test_y = (np.array(test_y)-1)/4

		predictions_positive = predictions[test_y==1]
		predictions_negative = predictions[test_y==0]
		
		np.savetxt(os.path.join(self.config['exp_folder'], 'positive_probs.npy'), predictions_positive)
		np.savetxt(os.path.join(self.config['exp_folder'], '{}_negative_probs.npy'.format(self.config['mode'])), predictions_negative)

		acc = accuracy_score(test_y, predictions>0.5)
		print ('Accuracy: ',acc)
		matrix = confusion_matrix(test_y, predictions>0.5)
		print ('confusion_matrix: ', matrix)
		pbc, pval = pointbiserialr(test_y, predictions)
		print ('PBC: ', pbc, 'p-value: ', pval)
		sys.stdout.flush()

		return
Example #17
def assign_functions(k,
                     clust,
                     splits,
                     act,
                     dtm,
                     lexicon,
                     list_lens=range(5, 26)):

    from scipy.stats import pointbiserialr

    lists = pd.DataFrame()
    for i in range(k):
        structures = list(clust.loc[clust["CLUSTER"] == i + 1, "STRUCTURE"])
        centroid = np.mean(act.loc[splits["train"], structures], axis=1)
        R = pd.Series([
            pointbiserialr(dtm.loc[splits["train"], word], centroid)[0]
            for word in lexicon
        ],
                      index=lexicon)
        R = R[R > 0].sort_values(ascending=False)[:max(list_lens)]
        R = pd.DataFrame({
            "CLUSTER": [i + 1 for l in range(max(list_lens))],
            "TOKEN": R.index,
            "R": R.values
        })
        lists = pd.concat([lists, R])

    return lists
Example #18
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None
    t_stat, p_val = stats.pointbiserialr(data[0], data[1])
    dof = None
    test_result = TestResult(name=pointbiserial_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
Example #19
    def pointbiserialcorr(s1, s2):
        """ Calculate the mean point biserial correlation of the RTDs of
            the two given solvers on all instances of the experiment.
            Only consider values where the statistical significance is large
            enough (p-value < alpha = 0.05)
        """
        from scipy import stats

        alpha = 0.05  # level of statistical significant difference
        d = 0.0
        num = 0
        for i in instance_ids:
            res1 = solver_config_results[s1.idSolverConfig][i]
            res2 = solver_config_results[s2.idSolverConfig][i]
            ranked_data = list(rankdata(res1 + res2))

            r, p = stats.pointbiserialr([1] * len(res1) + [0] * len(res2),
                                        ranked_data)
            # only take instances with significant differences into account
            if p < alpha:
                #print str(s1), str(s2), str(i), r, p
                d += r
                num += 1

        if num > 0:
            return d / num  # return mean difference
        else:
            return 0  # s1 == s2
    def best_relationship_class(self):
        label = self.data_loader.cleaned[:, -1]
        variables_data = self.data_loader.scaled[:, :-1]
        relationships_array = np.array([
            stats.pointbiserialr(variable, label)
            for variable in variables_data.T
        ])
        variables_names = self.data_loader.columns[:-1]

        max_value = 0
        best_var = 0
        variable_max = []
        for var, rel in enumerate(relationships_array):
            # keep the variable with the largest correlation in absolute value
            if len(variable_max) == 0 or abs(rel[0]) > abs(max_value):
                max_value = rel[0]
                variable_max = rel
                best_var = var
        print('{} presents a Point Biserial Correlation of {} with the Class'
              ' variable. (p-value={})'.format(variables_names[best_var],
                                               max_value,
                                               str(variable_max[1])))

        return (relationships_array, variable_max, best_var,
                variables_names[best_var])
Example #21
def _compute_correlative_all(x, y, xa, ya, method):
    outliers = []
    # where x, y are pd.Series, xa, ya are preprocessed numpy arrays
    if _both_continuous(x, y):
        if method == "pearson":
            r, pval = pearsonr(xa, ya)
        elif method == "spearman":
            r, pval = spearmanr(xa, ya)
        elif method == "kendall":
            r, pval = kendalltau(xa, ya)
        elif method == "percbend":
            r, pval = percbend(xa, ya)
        elif method == "shepherd":
            r, pval, outliers = shepherd(xa, ya)
        elif method == "skipped":
            r, pval, outliers = skipped(xa, ya, method="spearman")
        else:
            raise ValueError("Method not recognized.")

    elif _both_integers(x, y):
        # handle the integer-integer use case.
        r, pval = spearmanr(xa, ya)
    # if they're both categories (strings), then use kramers_v
    elif _continuous_categorical(x, y):
        # correlation ratio [0, 1]
        r, pval = corr_ratio(xa, ya)
    elif _categorical_continuous(x, y):
        # correlation ratio [0, 1]
        r, pval = corr_ratio(ya, xa)
    elif _both_categorical(x, y):
        # kramer's v for categorical-categorical [0, 1]
        r, pval = kramers_v(x, y, True)
    elif _continuous_bool(x, y):
        # sort them into order, it matters
        r, pval = pointbiserialr(xa, ya.astype(np.uint8))
    elif _bool_continuous(x, y):
        # sort them into order, it matters
        r, pval = pointbiserialr(xa.astype(np.uint8), ya)
    elif _both_bool(x, y):
        # use spearman
        r, pval = spearmanr(xa.astype(np.uint8), ya.astype(np.uint8))
    else:
        raise TypeError(
            "columns '{}':{} to '{}':{} combination not accepted for `bicorr`."
            .format(x.name, x.dtype, y.name, y.dtype))
    assert not np.isnan(r), "Correlation returned NaN. Check your data."
    return r, pval, outliers
Example #22
def main():
    dataset = pd.read_csv("/Users/rathi/Downloads/BreastCancer.txt")
    X = dataset.iloc[:, :-1].values
    X_labels = dataset.iloc[:, :-1].columns.values
    y = dataset.iloc[:, -1].values

    correlation_coef = []
    indexes = []
    X_feat_labels = []
    for i in range(len(X_labels)):
        pbsr = pointbiserialr(X[:, i], y)
        # print(pbsr)
        # if pbsr.pvalue >= 0.5:
        # Select features with correlation > 0 and pvalue < 0.5
        if pbsr.correlation > 0 and pbsr.pvalue < THR:
            indexes.append(i)
            X_feat_labels.append(X_labels[i])
        correlation_coef.append(pbsr.correlation)
    X_feat = X[:, indexes]
    corr_matrix = pd.DataFrame(X_feat,
                               columns=X_feat_labels).corr(method='spearman')
    sns.heatmap(corr_matrix)
    indexes_final = []
    X_feat_labels_final = []
    for i in indexes:
        coef, pvalue = spearmanr(X[:, i], y)
        # print(coef, pvalue)
        if pvalue < THR:
            X_feat_labels_final.append(X_labels[i])
            indexes_final.append(i)
    X_feat_final = X[:, indexes_final]
    X_train, X_test, y_train, y_test = train_test_split(X_feat_final,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    model = LogisticRegression(solver='liblinear', max_iter=50)
    # model = LogisticRegression(solver='lbfgs', class_weight="balanced")
    model.fit(X_train, y_train)
    preds = np.where(model.predict_proba(X_test)[:, 1] > THR, 1, 0)
    df = pd.DataFrame(X_feat_final, columns=X_feat_labels_final)
    df2 = pd.DataFrame(
        data=[
            accuracy_score(y_test, preds),
            recall_score(y_test, preds),
            precision_score(y_test, preds),
            roc_auc_score(y_test, preds)
        ],
        index=["accuracy", "recall", "precision", "roc_auc_score"])
    log_likelihood = -log_loss(y_test, preds)
    r, c = df.shape
    bic = calculate_bic(log_likelihood, r, c - 1)
    pprint(X_feat_labels_final)
    print("BIC:\t\t", bic)
    print(df2)
    # print(df)
    pprint(y_test)
    pprint(preds)
    plt.show()
    exit(0)
Example #23
def test_pointbiserial():
    # copied from mstats tests removing nans
    x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,
         0,0,0,0,1]
    y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0,
         2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1,
         0.8,0.7,0.6,0.5,0.2,0.2,0.1]
    assert_almost_equal(stats.pointbiserialr(x, y)[0], 0.36149, 5)
Example #24
def PointBiserial(x, y):
    x, y = PairwiseDeletion(x,y)
    r, prob = stats.pointbiserialr(x, y)
    df = Count(x)-1
    result = {'r':r, 'df':df, 'prob':prob}
    result['quote'] = "<b>Quote: </b> <i>r</i> (%d) = %.3f, <i>p</i> = %1.4f<br />"
    result['quotetxt'] = "Quote: r (%d) = %.3f, p = %1.4f\n"
    return result
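
PointBiserial depends on PairwiseDeletion and Count helpers that are not shown here; the stand-ins below are plausible sketches of what they do (the originals may differ).

import numpy as np

def PairwiseDeletion(x, y):
    """Drop positions where either value is missing (stand-in for the original helper)."""
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    keep = ~(np.isnan(x) | np.isnan(y))
    return x[keep], y[keep]

def Count(x):
    """Number of non-missing observations (stand-in for the original helper)."""
    return int(np.count_nonzero(~np.isnan(np.asarray(x, dtype=float))))
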
Example #25
def validation(data_iter, net, save_scores=False, delta=0.8):
    ''' 
    Calculate validation loss and accuracy; also report the point-biserial correlation between labels and scores.
    '''
    score_list=[]
    label_list=[]

    net.eval()
    losses, batch_num, acc, acc_num = 0, 0, 0, 0
    # criterion = nn.CrossEntropyLoss()
    criterion = nn.BCELoss()
    for batch_idx, batch in enumerate((data_iter)):
        qbatch, rbatch, qlength, rlength, label = batch
        qbatch = torch.from_numpy(qbatch)
        rbatch = torch.from_numpy(rbatch)
        qlength = torch.from_numpy(qlength)
        rlength = torch.from_numpy(rlength)
        label = torch.from_numpy(label).float()
        batch_size = len(qlength)
                
        if torch.cuda.is_available():
            qbatch, rbatch = qbatch.cuda(), rbatch.cuda()
            qlength, rlength = qlength.cuda(), rlength.cuda()
            label = label.cuda()
            
        qbatch = qbatch.transpose(0, 1)
        rbatch = rbatch.transpose(0, 1)
                    
        scores = net(qbatch, qlength, rbatch, rlength)    # [2 * B]
        loss = criterion(scores, label)
        
        score_list.extend(scores.cpu().data.numpy().tolist())
        label_list.extend(label.cpu().data.numpy().tolist())

        s = scores >= 0.5
        acc += torch.sum(s.float() == label).item()
        
        acc_num += batch_size
        
        batch_num += 1
        losses += loss.item()

    score_list = np.array(score_list)
    label_list = np.array(label_list)

    pbc, pval = pointbiserialr(label_list, score_list)
    acc = accuracy_score(label_list, score_list >=0.5)
    print ('PBC: {}, pval: {}'.format(pbc, pval))

    if save_scores:
        np.savetxt( args.exp_dir + '/test_' + args.mode +'_scores.txt' ,score_list)
        np.savetxt( args.exp_dir + '/test_' + args.mode +'_labels.txt' ,label_list)

        predicted = (score_list >=0.5).astype(np.int32)
        c_matrix = confusion_matrix(label_list,predicted)
        print ('confusion_matrix = ',c_matrix)
    
    return round(losses / (batch_num), 4), acc
Example #26
def data_info():
    """
    Function for inspecting the data set.
    """
    # Set font sizes for plotting
    fonts = {
        "font.size": 16,
        "legend.fontsize": "medium",
        "xtick.labelsize": 15,
        "ytick.labelsize": 15,
        "axes.titlesize": 22
    }
    plt.rcParams.update(fonts)

    pulsar_data.info()
    print(pulsar_data.head())  # Print top 5 entries of the data set
    print("Value|Count")
    print(pulsar_data["Target"].value_counts())  # Count number of targets in data

    f, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("Heatmap of the data set features")
    sns.heatmap(pulsar_data.corr(),
                annot=True,
                linecolor="blue",
                fmt=".2f",
                ax=ax)
    #sns.set(font_scale=5)
    plt.tight_layout()
    plt.savefig("Figures/heatmap.png")

    # Information gain of the features
    cols = list(feature.columns)
    infos = mutual_info_classif(X, np.ravel(y), random_state=42)
    info_gain_int = {
        cols[0]: [infos[0]],
        cols[1]: [infos[1]],
        cols[2]: [infos[2]],
        cols[3]: [infos[3]]
    }
    info_gain_int = pd.DataFrame(info_gain_int)
    info_gain_curve = {
        cols[4]: [infos[4]],
        cols[5]: [infos[5]],
        cols[6]: [infos[6]],
        cols[7]: [infos[7]]
    }
    info_gain_curve = pd.DataFrame(info_gain_curve)
    print("Information gain of the features:")
    print(info_gain_int)
    print(info_gain_curve)

    # Point-biserial correlation, linear correlation between the variables for
    # dichotomous target variable
    for i in range(8):
        print(pointbiserialr(np.ravel(y), X[:, i]))

    plt.show()
Example #27
def proportion_base_atom(df: pd.DataFrame, base_atom_pref: List) -> List:
    proportion = [
        df[i].sum() / df[i].count() for i in range(len(df.columns))
        if i != 9 and i != 19
    ]
    print(len(proportion))
    print(len(base_atom_pref))
    print(pointbiserialr(proportion, base_atom_pref))
    assert False
Example #28
def continuous_significance(df, significant_pval=0.01):

    df = df.copy()
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=list(continuous_columns(df)))
    return filter(
        lambda y: y[0] <= significant_pval,
        map(lambda x: (stats.pointbiserialr(df["pred_state"], df[x])[1], x),
            continuous_columns(df)))
Example #29
def PointBiserial(x, y):
    x, y = PairwiseDeletion(x, y)
    r, prob = stats.pointbiserialr(x, y)
    df = Count(x) - 1
    result = {'r': r, 'df': df, 'prob': prob}
    result[
        'quote'] = "<b>Quote: </b> <i>r</i> (%d) = %.3f, <i>p</i> = %1.4f<br />"
    result['quotetxt'] = "Quote: r (%d) = %.3f, p = %1.4f\n"
    return result
Example #30
    def plot_scatters(self, df, show=True):
        factors = [col for col in df.columns if col[:3] in ['NMF', 'ICA', 'PCA']]
        df = self.normalise_component_columns(df, factors)

        rows, cols = len(factors), len(self.scatter_cols)
        plt.figure(figsize=(2.5*cols, 2*rows))
        i = 0
        for factor in factors:
            for feat in self.scatter_cols:
                i += 1
                plt.subplot(rows, cols, i)

                if feat == 'Mutational_load':
                    # Remove an outlier which messes up the scaling
                    outlier = 127424
                    x = df[feat][df[feat] < outlier]
                    y = df[factor][df[feat] < outlier]
                elif feat in ['WGD', 'Which']:
                    # WGD is 0 or 1, so jitter slightly
                    jitter = np.random.uniform(-0.1, 0.1, len(df))
                    x = df[feat]+jitter
                    y = df[factor]
                else:
                    x, y = df[feat], df[factor]
                plt.scatter(x, y, c=self.colours[factor[:3]])

                # Labels only on the left and bottom plots
                if i > (rows - 1) * cols:
                    plt.xlabel(feat, size=16)
                if i % cols == 1:
                    plt.ylabel(factor, size=16)

                # No scales - there's no space and it would not be informative
                plt.xticks([])
                plt.yticks([])

                # Calculate and show correlation coefficients and p-values
                # Must get x and y again to avoid jitter and outlier changes
                x = df[feat].values
                y = df[factor].values
                if feat in ['WGD', 'Which']:
                    # Binary value, so use point-biserial correlation
                    r, p_val = stats.pointbiserialr(x, y)
                else:
                    r, p_val = stats.pearsonr(x, y)
                star = '***' if p_val <= 0.01 else ''
                annotation = 'r=%4.2f, p=%0.3f %s' % (r, p_val, star)
                plt.title(annotation)

        if self.dataset_tag:
            figpath = self.plots_dir + 'genomic_feature_scatters_%s.pdf' % self.dataset_tag
            print("Saving figure to", figpath)
            plt.savefig(figpath, bbox_inches='tight')

        if show:
            plt.show()
Example #31
def corr_categorical_with_wages(df, wages):
    correlations = {}
    columns = df.columns.tolist()
    for col in columns:
        correlations["wages" + '___' + col] = stats.pointbiserialr(
            wages, df[col].values)
    results = pd.DataFrame.from_dict(correlations, orient="index")
    results.columns = ["correlation", "pvalues"]
    results.sort_index(inplace=True)
    return results
Example #32
def pointbiserial(a, bs, weather_var):
    weather_dict = {}
    for variable in weather_var:
        b = [j[variable] for j in bs]
        r, p = stats.pointbiserialr(a, b)
        weather_dict[variable] = [p, r]
    result_df = pd.DataFrame.from_dict(weather_dict,
                                       orient='index',
                                       columns=['p-value', 'r-value'])
    return result_df
    def SpearmanCorr(self, values, labels):

        if self.mode == 'birth' or self.mode == 'data':
            if self.mode == 'birth':
                print('--------  Organized with birth date --------')
            elif self.mode == 'data':
                print('--------  Organized with # of data --------')

            coef = spearmanr(values, np.arange(0, 13, step=1))[0]
            p_value = spearmanr(values, np.arange(0, 13, step=1))[1]
            print('Spearman Rank: ', round(coef, 4))
            print('P_Value: ', round(p_value, 4))

            # Calculating Point Biserial Rank
            coef = pointbiserialr(values, np.arange(0, 13, step=1))[0]
            p_value = pointbiserialr(values, np.arange(0, 13, step=1))[1]
            print('Point Biserial Rank: ', round(coef, 4))
            print('P_value: ', round(p_value, 4))

            return
def get_PBS_corr_from_cols(df, target_col, cont_cols, thresh=0):
    res = {}
    for col in cont_cols:
        correlation, pval = pointbiserialr(df[target_col], df[col])
        res[col] = correlation
    inter = pd.Series(res, name='corr').reset_index()
    inter['abs_corr'] = inter['corr'].abs()
    inter = inter[inter['abs_corr'] > thresh]
    fin_res = inter.sort_values('corr', ascending=False)
    fin_res = fin_res.drop(columns=['abs_corr'])
    return fin_res
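
A short usage sketch of the function above; the column names are invented for illustration.

import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr

rng = np.random.default_rng(1)
frame = pd.DataFrame({
    "churned": rng.integers(0, 2, 200),      # binary target
    "tenure": rng.normal(24, 6, 200),        # continuous feature
    "monthly_fee": rng.normal(60, 15, 200),  # continuous feature
})
print(get_PBS_corr_from_cols(frame, "churned", ["tenure", "monthly_fee"], thresh=0.0))
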
Example #35
def correlation(df):
    columns = df.columns.values
    print(columns)
    param=[]
    correlation=[]
    abs_corr=[]
    covariance = []

    # cor = np.array(df)
    # # print cor
    # x = cor[:,1:]
    # # print x
    # y = np.array(cor[:,0])
    # # print y
    # X = np.vstack((y,x))
    # print np.cov(X)

    for c in columns:
        #Check if binary or continuous
        if len(df[c].unique())<=2:
            corr = spearmanr(df['Survived'],df[c])[0]
            y = df['Survived']
            x = df[c]
            X = np.vstack((y,x))
            covar = np.cov(X)
        else:
            corr = pointbiserialr(df['Survived'],df[c])[0]
            print(corr)
            y = df['Survived']
            x = df[c]
            X = np.vstack((y,x))
            covar = np.cov(X)
        param.append(c)
        correlation.append(corr)
        abs_corr.append(abs(corr))
        covariance.append(covar[0][1])
    print(covariance)

    #Create dataframe for visualization
    param_df=pd.DataFrame({'correlation':correlation,'parameter':param, 'abs_corr':abs_corr,'covariance':covariance})

    #Sort by absolute correlation
    param_df=param_df.sort_values(by=['abs_corr'], ascending=False)

    #Set parameter name as index
    param_df=param_df.set_index('parameter')

    parameter_grid(param_df,df)

    print(param_df)
Example #36
def get_sort_abs_cor(data):
    columns=data.columns.values
    correlation=[]
    # spearmanr: correlation between two categorical variables
    # pointbiserialr: correlation between a categorical and a continuous variable
    for i in columns:
        if len(data[i].unique())<=2:
            correlation.append(spearmanr(data['Survived'],data[i])[0])
        else:
            correlation.append(pointbiserialr(data['Survived'],data[i])[0])
        
    cor=pd.DataFrame({'Correlation':correlation})
    cor.index=columns
    cor['abs_cor']=cor.Correlation.apply(lambda x:abs(x))
    cor=cor.iloc[1:,:]
    sort_abs_cor=cor.abs_cor.sort_values(ascending=False)
    
    return sort_abs_cor
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            cols.extend(s)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            cols.extend(rel)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            cols.extend(o)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            cols.extend(o)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            cols.extend(t1)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            cols.extend(h2)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            cols.extend(t)
            cols.extend(prob)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            cols.extend(z)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            cols.extend(a)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for row in ob:
                elements = ",".join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for row in table:
                elements = ",".join(map(str, row))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
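
The dispatcher above maps each test_id to the corresponding scipy.stats call and appends the returned statistics to a tab-separated output row. Below is a minimal sketch of that pattern, reduced to the pointbiserialr and pearsonr branches; the run_test helper, the sample lists, and the printed output are illustrative assumptions, not the tool's actual interface.

from scipy import stats

def run_test(test_id, sample_one, sample_two):
    # Sketch of the dispatch pattern: pick a scipy.stats call by name and
    # collect its results as output columns.
    cols = []
    if test_id.strip() == "pointbiserialr":
        # point-biserial r between a binary sample and a continuous sample
        r, p_value = stats.pointbiserialr(list(map(float, sample_one)),
                                          list(map(float, sample_two)))
        cols.extend([r, p_value])
    elif test_id.strip() == "pearsonr":
        cor, p_value = stats.pearsonr(list(map(float, sample_one)),
                                      list(map(float, sample_two)))
        cols.extend([cor, p_value])
    return cols

# Hypothetical input: a binary indicator and a continuous measurement.
binary = ["1", "0", "0", "1", "1", "0"]
continuous = ["2.1", "0.4", "0.7", "3.3", "2.8", "0.2"]
print("\t".join(map(str, run_test("pointbiserialr", binary, continuous))))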
Example #38
0
def calc_biserial_correlation_coefficient(self):
    """Calculates a point-biserial correlation coefficient and the associated p-value.
    The point-biserial correlation is used to measure the relationship between a binary variable and a continuous variable.
    """
    self.scores['Biserial Correlation Coefficient'] = pointbiserialr(self.y_true, self.y_pred)
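
Example #38 stores the whole (correlation, p-value) tuple returned by pointbiserialr under a single key. A small sketch of the same idea, assuming hypothetical y_true / y_pred arrays and a plain dict for the scores, with the coefficient and the p-value kept as separate entries:

import numpy as np
from scipy.stats import pointbiserialr

# Hypothetical data: binary ground truth vs. continuous predictions.
y_true = np.array([1, 0, 0, 1, 1, 0])
y_pred = np.array([0.9, 0.2, 0.4, 0.8, 0.7, 0.1])

scores = {}
r, p = pointbiserialr(y_true, y_pred)  # returns (correlation, p-value)
scores['Biserial Correlation Coefficient'] = r
scores['Biserial Correlation p-value'] = p
print(scores)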
Example #39
0
'''
from sklearn.metrics import mean_squared_error
from sklearn.metrics import (log_loss, matthews_corrcoef, precision_recall_curve, auc, roc_curve,
                             confusion_matrix, hinge_loss, accuracy_score, classification_report,
                             f1_score, fbeta_score, hamming_loss, jaccard_similarity_score,
                             precision_recall_fscore_support, precision_score, recall_score,
                             zero_one_loss, average_precision_score, roc_auc_score)
from scipy.stats import scoreatpercentile, pointbiserialr, ks_2samp, pearsonr, spearmanr
from sklearn.metrics.regression import mean_absolute_error, r2_score
import matplotlib.pyplot as plt 

from sklearn.preprocessing import binarize
import numpy as np

x_true = np.array([1,   0,      0,      0,      1,       1,     0,      0,      0,      0,      1,     0])
y_true = np.array([1.0, 0.01,   1.0,    0.012,  0.42,    0.021, 0.56,   0.011,  0.091,  0.0215, 0.001, 1])
#y_true = np.array([0,   0,      0,      0,      1,      1, 0, 0, 0, 0, 1, 1])

print ks_2samp(x_true, binarize(y_true, scoreatpercentile(y_true, 30))[0])
print pointbiserialr(y_true, x_true)
print scoreatpercentile(y_true, 30)


def validate(y_true, y_pred):
    
    print 'Kolmogorov-Smirnov test = ', ks_2samp(y_true, y_pred)
    print 'mean_squared_error = ', mean_squared_error(y_true, y_pred)
    print 'mean_absolute_error = ', mean_absolute_error(y_true, y_pred)
    print 'r2_score = ', r2_score(y_true, y_pred)
    
    """TBD compute the log-loss to consider boolean"""
    
    print "log_loss = " + str(log_loss(y_true, y_pred)) #Log loss, aka logistic loss or cross-entropy loss.
    
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)    #Compute precision-recall pairs for different probability thresholds
train_df = pd.concat([train_df,dummies_Sex,dummies_Embarked,dummies_Pclass,dummies_Title], axis=1)
train_df = train_df.drop(["Sex","Embarked","Pclass","Title","Name"], axis=1)
train_df = train_df.set_index(['PassengerId'])


columns = train_df.columns.values
param = []
correlation = []
abs_corr = []

for c in columns:
    # Check if binary or continuous
    if len(train_df[c].unique()) <= 2:
        corr = spearmanr(train_df["Survived"], train_df[c])[0]
    else:
        corr = pointbiserialr(train_df["Survived"], train_df[c])[0]
    param.append(c)
    correlation.append(corr)
    abs_corr.append(abs(corr))

# Create dataframe for visualization
param_df = pd.DataFrame({"correlation":correlation, "parameter":param, "abs_corr":abs_corr})
# Sort by absolute correlation
param_df = param_df.sort_values(by=["abs_corr"],ascending=False)
# Set parameter name as index
param_df = param_df.set_index("parameter")
print param_df

scoresCV = []
scores = []
print '================='
df = df.set_index(['PassengerId'])


# Compute correlation coefficients
columns = df.columns.values
param=[]
correlation=[]
abs_corr=[]

for c in columns:
    #Check if binary or continuous
    if len(df[c].unique())<=2:
        corr = spearmanr(df['Survived'],df[c])[0]
    else:
        corr = pointbiserialr(df['Survived'],df[c])[0]
    param.append(c)
    correlation.append(corr)
    abs_corr.append(abs(corr))

#Create dataframe for visualization
param_df=pd.DataFrame({'correlation':correlation,'parameter':param, 'abs_corr':abs_corr})
#Sort by absolute correlation
param_df=param_df.sort_values(by=['abs_corr'], ascending=False)
#Set parameter name as index
param_df=param_df.set_index('parameter')



'''
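
The block disabled with triple quotes above ranks features by their association with Survived, using spearmanr for binary columns and pointbiserialr for continuous ones. A self-contained sketch of that ranking loop follows; the small synthetic DataFrame stands in for the real training data, and its column names and values are illustrative assumptions.

import pandas as pd
from scipy.stats import pointbiserialr, spearmanr

# Synthetic stand-in for the Titanic training frame (illustrative only).
df = pd.DataFrame({
    'Survived': [1, 0, 0, 1, 1, 0, 0, 1],
    'IsFemale': [1, 0, 0, 1, 0, 0, 1, 1],
    'Fare': [71.3, 7.9, 8.1, 53.1, 8.0, 8.5, 26.0, 13.0],
})

rows = []
for c in df.columns.drop('Survived'):
    # Binary columns: Spearman rank correlation; continuous columns: point-biserial.
    if df[c].nunique() <= 2:
        corr = spearmanr(df['Survived'], df[c])[0]
    else:
        corr = pointbiserialr(df['Survived'], df[c])[0]
    rows.append({'parameter': c, 'correlation': corr, 'abs_corr': abs(corr)})

param_df = pd.DataFrame(rows).sort_values(by='abs_corr', ascending=False).set_index('parameter')
print(param_df)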
# Feature selection using a DecisionTree