Code example #1
File: bdd.py Project: valeoai/BEEF
 def end_epoch(self):
     self.all_gts['course'] = np.concatenate(self.all_gts['course'])
     self.all_preds['course'] = np.concatenate(self.all_preds['course'])
     self.all_gts['accel'] = np.concatenate(self.all_gts['accel'])
     self.all_preds['accel'] = np.concatenate(self.all_preds['accel'])
     course_correl = dcor.distance_correlation(self.all_gts['course'], self.all_preds['course'])
     accel_correl = dcor.distance_correlation(self.all_gts['accel'], self.all_preds['accel'])
     Logger().log_value('%s_epoch.course_correl' % self.mode, course_correl, should_print=True)
     Logger().log_value('%s_epoch.accel_correl' % self.mode, accel_correl, should_print=True)
     self.all_gts = {'course': [], 'accel': []}
     self.all_preds = {'course': [], 'accel': []}
Code example #2
def calculate_if(X, thresh=0.8):
    variables = list(X.columns)
    variables.sort()
    variables_keep = []
    variables_drop = []
    variables_remaining = variables.copy()
    for var in variables:
        X_remaining = X[variables_remaining]
        if var in variables_drop:
            continue
        print(f'target variable: {var}')
        idx = X_remaining.columns.get_loc(var)
        x_i = X_remaining.iloc[:, idx].values
        k_vars = X_remaining.shape[1]
        mask = np.arange(k_vars) != idx
        x_noti = X_remaining.iloc[:, mask]
        ds = [
            dcor.distance_correlation(x_i, x_noti_i)
            for x_noti_i in x_noti.T.values
        ]
        remaining_variables = x_noti.columns
        for d, remaining_var in zip(ds, remaining_variables):
            if d >= thresh:
                variables_drop.append(remaining_var)
                variables_remaining.remove(remaining_var)
                print(
                    f'dropping {remaining_var} with dcor {np.round(d, 2)}')
        variables_remaining.remove(var)
        variables_keep.append(var)
    X = X[variables_keep]
    print(f'dropped {len(variables_drop)} variables, kept {X.shape[1]}')
    return X, variables_drop
Code example #3
def distance_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    from scipy.linalg import expm

    tickers = df.columns.tolist()
    df_dcor = pd.DataFrame(index=tickers, columns=tickers)

    k = 0
    for i in tickers:
        v_i = np.array(list(df.loc[:, i].values))  # stack values (handles object-dtype cells)

        for j in tickers[k:]:
            v_j = np.array(list(df.loc[:, j].values))
            dcor_val = dcor.distance_correlation(v_i, v_j)
            df_dcor.at[i, j] = dcor_val
            df_dcor.at[j, i] = dcor_val

        k += 1

    # dcor_matrix = matrix_power(df_dcor.to_numpy(), 2)
    dcor_matrix = expm(df_dcor.to_numpy().astype(float))  # .at fills with object dtype, so cast first
    df_expdcor = pd.DataFrame(dcor_matrix, index=df_dcor.index, columns=df_dcor.columns)

    return df_expdcor
Code example #4
def get_percentiles_by_bins(ss_column, corr_column, invalid_lines, n_bins=500):
    bin_vals = [[] for i in range(n_bins)]
    #bin_ids  = [[] for i in range(n_bins)]
    for i in range(len(ss_column)):
        if i not in invalid_lines:
            ss_val   = ss_column[i]
            corr_val = corr_column[i]
            corr_bin = int(round(corr_val*n_bins,0))
            if corr_bin >= n_bins:
                corr_bin = n_bins - 1
            #bin_ids[corr_bin].append(i)
            bin_vals[corr_bin].append(ss_val)

    x               = []
    medians         = []
    top_quantile    = []
    bottom_quantile = []
    for i in range(n_bins):
        if len(bin_vals[i]) > 0:
            x1 = i/n_bins
            x2 = (i+1)/n_bins
            x.append((x1,x2))

            bin_vals[i].sort()
            top_quantile.append(np.percentile(bin_vals[i], 75))
            medians.append(np.percentile(bin_vals[i], 50))
            bottom_quantile.append(np.percentile(bin_vals[i], 25))
        # else:
        #     top_quantile.append(0.0)
        #     medians.append(0.0)
        #     bottom_quantile.append(0.0)
    corr = dcor.distance_correlation(np.array([a for a,b in x]), np.array(medians))

    return x, top_quantile, medians, bottom_quantile, corr
Code example #5
File: gpdc_torch.py Project: vishalbelsare/tigramite
    def _get_dcorr(self, array_resid):
        """Return distance correlation coefficient.

        The variables are transformed to uniform marginals using the empirical
        cumulative distribution function beforehand. Here the null distribution
        is not analytically available, but can be precomputed with the function
        generate_and_save_nulldists(...) which saves a \*.npz file containing
        the null distribution for different sample sizes. This file can then be
        supplied as null_dist_filename.

        Parameters
        ----------
        array_resid : array-like
            data array must be of shape (2, T)

        Returns
        -------
        val : float
            Distance correlation coefficient.
        """
        # Remove ties before applying transformation to uniform marginals
        # array_resid = self._remove_ties(array_resid, verbosity=4)
        x_vals, y_vals = self._trafo2uniform(array_resid)
        val = dcor.distance_correlation(x_vals, y_vals, method='AVL')
        return val
Code example #6
def corr_distance(sd_log):
    # long runtime: O(n_features^2) pairwise distance correlations
    import dcor
    data = sd_log.data
    feat_names = sd_log.columns.tolist()
    df_dcor = pd.DataFrame(index=feat_names, columns=feat_names)

    k = 0
    for feat_i in feat_names:
        v1 = data.loc[:, feat_i].to_numpy()

        for feat_j in feat_names[k:]:
            v2 = data.loc[:, feat_j].to_numpy()

            rez = dcor.distance_correlation(v1, v2)

            df_dcor.at[feat_i, feat_j] = rez
            df_dcor.at[feat_j, feat_i] = rez

        k += 1
    return df_dcor
Code example #7
def df_distance_correlation(df, stocks):

    #initializes an empty DataFrame
    df_dcor = pd.DataFrame(index=stocks, columns=stocks)

    # initializes a counter at zero
    k = 0

    # iterates over the time series of each stock
    for i in stocks:

        # stores the ith time series as a vector
        v_i = df.loc[:, i].values

        # iterates over the time series of each stock, starting from index k
        for j in stocks[k:]:

            # stores the jth time series as a vector
            v_j = df.loc[:, j].values

            # computes the dcor coefficient between the ith and jth vectors
            dcor_val = dcor.distance_correlation(v_i, v_j)

            # stores the dcor value at entry (i, j) of the DataFrame
            df_dcor.at[i, j] = dcor_val

            # mirrors the value at entry (j, i)
            df_dcor.at[j, i] = dcor_val

        # increments counter by 1
        k+=1

    # returns a DataFrame of dcor values for every pair of stocks
    return df_dcor
Code example #8
File: biased.py Project: Remit/autoscaling-simulator
    def _compute_correlation(self, metrics_vals_1: pd.Series,
                             metrics_vals_2: pd.Series, lag: int):

        return dcor.distance_correlation(
            metrics_vals_1.astype(float),
            metrics_vals_2.shift(lag).fillna(0).astype(float),
            exponent=self.exponent)
Code example #9
from typing import Tuple

def calc_filtered_dist_corr(X: np.ndarray, Y: np.ndarray) -> Tuple[float, float, float]:
    """Bootstrap a 95% interval around the distance correlation of X and Y."""
    N, n, frac = 1000, Y.shape[0], 1.0
    dist_corr_list = []
    for _ in range(N):
        # resample with replacement and recompute the statistic
        indices = np.random.choice(n, size=int(frac * n), replace=True)
        dist_corr_list.append(dcor.distance_correlation(X[indices], Y[indices]))
    low_dist_corr = np.percentile(dist_corr_list, 2.5)
    up_dist_corr = np.percentile(dist_corr_list, 97.5)
    dist_corr = dcor.distance_correlation(X, Y)
    return dist_corr, low_dist_corr, up_dist_corr
Code example #10
File: cka.py Project: mrhossain/corrsim
def dcorr(X, Y):
    """
    Computes the distance correlation (dCorr)
    between word embedding matrices X and Y.
    :param X: word embedding matrix with shape (k x D)
    :param Y: word embedding matrix with shape (l x D)
    :return: distance correlation between X and Y
    """
    return dcor.distance_correlation(X.T, Y.T)
Code example #11
File: comparison_python.py Project: Tzq2doc/sunnies
def aidc(x, y):
    cov_y = np.cov(y)
    cov_x = np.cov(x.T)

    if cov_x.shape == ():  # 0-d covariance: x has a single column
        inv_cov_x = 1.0 / cov_x
        x_trans = np.dot(x, np.sqrt(inv_cov_x))
    else:
        inv_cov_x = np.linalg.inv(cov_x)
        x_trans = np.dot(x, scipy.linalg.sqrtm(inv_cov_x))
    inv_cov_y = 1 / cov_y
    y_trans = np.dot(y, np.sqrt(inv_cov_y))
    return dcor.distance_correlation(x_trans, y_trans)
Code example #12
File: shapley_helpers.py Project: Tzq2doc/sunnies
def AIDC(X, Y):
    cov_y = numpy.cov(Y)
    cov_x = numpy.cov(X.T)

    if cov_x.shape == ():  # 0-d covariance: X has a single column
        inv_cov_x = 1.0 / cov_x
        X_trans = numpy.dot(X, numpy.sqrt(inv_cov_x))
    else:
        inv_cov_x = numpy.linalg.inv(cov_x)
        X_trans = numpy.dot(X, scipy.linalg.sqrtm(inv_cov_x))

    inv_cov_y = 1 / cov_y
    Y_trans = numpy.dot(Y, numpy.sqrt(inv_cov_y))
    return dcor.distance_correlation(Y_trans, X_trans)
Code example #13
def corr_distance2(sd_log):
    # long runtime
    import dcor
    data = sd_log.data
    feat_names = sd_log.columns.tolist()
    df_dcor = pd.DataFrame(index=feat_names, columns=feat_names)

    k = 0
    for feat_i in feat_names:
        v1 = data.loc[:, feat_i].to_numpy()

        for feat_j in feat_names[k:]:
            v2 = data.loc[:, feat_j].to_numpy()

            rez = dcor.distance_correlation(v1, v2)

            df_dcor.at[feat_i, feat_j] = float(rez)
            df_dcor.at[feat_j, feat_i] = float(rez)

        k += 1

    # plot as heatmap
    fig, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(df_dcor,
                cmap=sns.diverging_palette(220, 10, as_cmap=True),
                square=True,
                cbar_kws={'shrink': .9},
                ax=ax,
                annot=True,
                linewidths=0.1,
                vmax=1.0,
                linecolor='white',
                annot_kws={'fontsize': 12})
    plt.title("Distance Correlation Among Features")
    plt.show()

    return df_dcor
Code example #14
File: shapley_helpers.py Project: Tzq2doc/sunnies
def CF(x, y, team, cf_name):
    """
    Available characteristic functions:
        dcor: distance correlation between y and x
        r2:   coefficient of determination between y and x
        aidc: affinely invariant distance correlation
        hsic: Hilbert-Schmidt independence criterion
    """
    x = x[:, team]

    if len(team) == 0:
        return 0.0

    if cf_name.lower() == "dcor":
        return dcor.distance_correlation(y, x)

    elif cf_name.lower() == "r2":
        det_C_xy = numpy.linalg.det(numpy.corrcoef(x.T, y))
        if len(team) == 1:
            det_C_x = 1
        else:
            det_C_x = numpy.linalg.det(numpy.corrcoef(x.T))

        # ------------------------------------
        # For debugging R2 in Julia
        #print(f"team={team}")
        #print(1 - det_C_xy/det_C_x)
        # ------------------------------------

        return (1 - det_C_xy / det_C_x)

    elif cf_name.lower() == "aidc":
        return dcor.distance_correlation_af_inv(y, x)
        #return AIDC(x, y)

    elif cf_name.lower() == "hsic":
        return dHSIC(x, y)

    else:
        raise NameError(
            "I don't know the characteristic function {0}".format(cf_name))
Code example #15
File: dist_corr.py Project: quanttrade/Hedgecraft
def dist_corr():
    df = dtr.detrend()
    df.dropna(inplace=True)

    # store the column names as a list
    col_names = df.columns.tolist()

    df_dcor = pd.DataFrame(index=col_names, columns=col_names)

    k = 0
    for i in col_names:

        v1 = df.loc[:, i].values
        for j in col_names[k:]:

            v2 = df.loc[:, j].values
            rez = dcor.distance_correlation(v1, v2)
            df_dcor.at[i, j] = rez
            df_dcor.at[j, i] = rez
        k += 1

    return df_dcor
Code example #16
def colwise_partial_distcorr(df, col1: str, partial: str = None):
    import dcor
    pdc_list = []
    dc_list = []
    for col2 in df.columns:
        dc = dcor.distance_correlation(x=df[col1], y=df[col2])
        dc_list.append(dc)
        if partial is not None:
            pdc = dcor.partial_distance_correlation(x=df[col1],
                                                    y=df[col2],
                                                    z=df[partial])
            pdc_list.append(pdc)

    result_df = pd.DataFrame()
    result_df['distance_corr'] = dc_list
    if partial is not None:  # otherwise pdc_list is empty and the column lengths mismatch
        result_df['partial_distance_corr'] = pdc_list
    result_df['col1'] = col1
    result_df['col2'] = df.columns
    result_df['partial'] = partial
    return result_df
Code example #17
def test_model(result_model, attrs, test_y, test_x_vars):
    '''Tests a trained model and returns its score, RMSE, distance correlation, and a 2-D histogram of predicted vs. true values'''
    result_y = []
    score = 0.0
    rmse = 0.0
    if "poly" in attrs:
        poly_func = attrs["poly"]
        x_ = poly_func.fit_transform(test_x_vars)
        result_y = result_model.predict(x_)
        score = result_model.score(x_, test_y)
        rmse = np.sqrt(mean_squared_error(test_y, result_y))
    else:
        result_y = result_model.predict(test_x_vars)
        score = result_model.score(test_x_vars, test_y)
        rmse = np.sqrt(mean_squared_error(test_y, result_y))
    correlation = dcor.distance_correlation(np.array(test_y),
                                            np.array(result_y))
    frequencies, bins_x, bins_y = np.histogram2d(
        result_y,
        test_y,
        bins=[np.linspace(0.0, 1.0, 100),
              np.linspace(0.0, 1.0, 100)])
    return score, rmse, correlation, frequencies
Code example #18
def compute_correlation_strength():
    """
    Computes correlation strengths between pairs of attributes.
    Works on the currently loaded dataset.
    GET parameters:
        - "ids": list of embedding IDs to consider, passed as ids=1,2,3,...
          If "ids" is not specified, all embeddings are taken into account.
    :return: JSON (orient='index') of pairwise distance correlations.
    """

    df: pd.DataFrame = app.config["EMBEDDING_METADATA"]["original"].drop(
        ["num_records"], axis=1)
    ids: list = request.args.get("ids")
    ids = list(map(int, ids.split(","))) if ids is not None else None

    if ids is not None:
        df = df.iloc[ids]

    # todo (remove, generate data cleanly) Hack: Rename target_domain_performance and n_components here.
    return df.rename(columns={
        "target_domain_performance": "rdp",
        "separability_metric": "separability"
    }).corr(method=dcor.distance_correlation).to_json(
        orient='index')
Code example #19
File: invariance_test.py Project: Tzq2doc/sunnies
# --- Data
#X = numpy.array([numpy.linspace(-1, 1, N) for _ in range(D)]).T
X = numpy.array([numpy.random.uniform(-1, 1, N) for _ in range(D)]).T
TWO_D = 2 * numpy.array(range(D))
Y = numpy.matmul(numpy.multiply(X, X), TWO_D)
# ---

# --- Transform data
M = numpy.array([numpy.random.uniform(-10, 10, D) for _ in range(D)])
E = numpy.array([numpy.random.uniform(-10, 10, N) for _ in range(D)]).T  # noise term; renamed from N so the sample count is not clobbered
X_TRANS1 = numpy.matmul(X, M)
X_TRANS2 = numpy.matmul(X, M) + E

print("Distance correlation:")
print(dcor.distance_correlation(Y, X))
print("Unbiased dcor:")
print(numpy.sqrt(dcor.u_distance_correlation_sqr(Y, X)))


print("AIDC original X:")
print(AIDC(X, Y))
print("AIDC built-in X:")
print(dcor.distance_correlation_af_inv(Y, X))
print("AIDC X = M*X:")
print(AIDC(X_TRANS1, Y))
Code example #20
def cal_dCor(x, y):
    #https://github.com/vnmabus/dcor
    return dcor.distance_correlation(x, y, method='AVL')
Code example #21
def _test_umap(xdata, random_state, **kwargs):
    transformer = UMAP(random_state=random_state, **kwargs)
    x = transformer.fit_transform(xdata)
    return x, random_state, distance_correlation(xdata, x)
Code example #22
    # summarize
    # print('MAE: %.3f' % results.best_score_)
    print('\n' + 'For: ' + tickers[i18] + ', Lasso Config: %s' % results.best_params_)

    scores = np.absolute(scores)
    print('For: ' + tickers[i18] + ', Lasso Mean(STD) MAE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

for i19,column in enumerate(lei_cei_lag_predictor_data):
    temp_list = lei_cei_lag_predictor_data[tickers[i19]].to_list()
    temp_df = pd.DataFrame({tickers[i19]:temp_list})
    temp_df = temp_df.set_index(lei_cei_lag_data_returns_df.index)
    temp_dcor_df = pd.concat([lei_cei_lag_data_returns_df.dropna(), temp_df.dropna()], axis=1)
    temp_X = temp_dcor_df.iloc[:, np.arange(0, temp_dcor_df.shape[1]-1, 1).tolist()]
    temp_Y = temp_dcor_df.iloc[:, -1]
    print("For security: " + tickers[i19] + ", distance coerelation is: " +
          str(round(dcor.distance_correlation(temp_X,temp_Y),2)))

for i20,column in enumerate(lei_cei_lag_predictor_data):
    temp_list = lei_cei_lag_predictor_data[tickers[i20]].to_list()
    temp_df = pd.DataFrame({tickers[i20]:temp_list})
    temp_df = temp_df.set_index(lei_cei_lag_data_returns_df.index)
    temp_dcov_df = pd.concat([lei_cei_lag_data_returns_df.dropna(), temp_df.dropna()], axis=1)
    temp_X = temp_dcov_df.iloc[:, np.arange(0, temp_dcov_df.shape[1]-1, 1).tolist()]
    temp_Y = temp_dcov_df.iloc[:, -1]
    print("For security: " + tickers[i20] + ", distance covariance is: " +
          str(round(dcor.distance_covariance(temp_X,temp_Y),2)))

# K-Means of 11 ETF returns
kmeans = KMeans(n_clusters=3).fit(lei_cei_lag_predictor_data)
centroids = kmeans.cluster_centers_
print(centroids)
Code example #23
    from scipy.stats import pearsonr, spearmanr, kendalltau

    for feat in bio_cols:
        print(feat)

        print("Pearson's = {}".format(
            pearsonr(biochemistry_data[feat],
                     questionnaire_data["HYPERTENSION"])[0]))
        print("Spearman's = {}".format(
            spearmanr(biochemistry_data[feat],
                      questionnaire_data["HYPERTENSION"])[0]))
        print("Kendall's Tau = {}\n".format(
            kendalltau(biochemistry_data[feat],
                       questionnaire_data["HYPERTENSION"])[0]))
        print("Distance Correlation = {}".format(
            dcor.distance_correlation(biochemistry_data[feat],
                                      questionnaire_data["HYPERTENSION"])))
        print("Energy Distance = {}\n".format(
            dcor.energy_distance(biochemistry_data[feat],
                                 questionnaire_data["HYPERTENSION"])))

#Plot correlation matrix using Pearson's correlation measure
if False:
    import seaborn as sns
    cols = list(biochemistry_data.columns)
    corr_matrix = np.corrcoef(biochemistry_data[cols].values.T)
    print(corr_matrix)
    plt.figure(1, figsize=(12, 18))
    sns.set(font_scale=1.0)
    heat_map = sns.heatmap(corr_matrix,
                           cbar=False,
                           annot=True,
Code example #24
def CV(X,
       y,
       d,
       m,
       method="ft",
       nolamb=50,
       nofold=10,
       NoB=5,
       NoC=20,
       NoW=2,
       spX=False,
       standard=False):
    """
    Estimate B using the best lambda with cross-validation
    Args:
        X: covariates
        y: outcome
        d: structural dimension
        m: number of transforms
        method: "ft" or "sir"
        nolamb: number of lambda candidates
        nofold: number of cross-validation folds
        NoB: number of iterations over B within ADMM
        NoC: number of iterations over C
        NoW: number of weight updates
        spX: sparse X or not
        standard: standardize X or not
    Returns:
        B: estimate
        covxx: covariance matrix of X
        lambcv: best lambda
        index of the minimum mean CV loss
    """

    ## generate lambda candidate
    lambmax = 1  #np.max(sdr0.M)/10
    lambmin = lambmax / 1000 if method == 'sir' else lambmax / 10
    lambseq = np.exp(np.linspace(np.log(lambmin), np.log(lambmax), num=nolamb))

    kf = KFold(n_splits=nofold)
    cvloss = np.zeros((nofold, nolamb))
    k = 0
    for train_index, test_index in kf.split(X):
        print('Fold-', k)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #print("TRAIN:", (X_train.shape), "TEST:", (X_test.shape))

        for i in range(nolamb):
            Btrain = estimate(
                X_train, y_train, d, m, lambseq[i], method, NoB, NoC, NoW, spX,
                standard
            )[0]  # estimate(X, y, d, m, lamb, method = "ft", NoB = 5, NoC = 20, NoW=2, spX=False, standard=False)
            if np.linalg.cond(Btrain) < 1 / sys.float_info.epsilon:
                sigs = np.cov(X_train.T)
                Btrain = Btrain @ la.inv(la.sqrtm(Btrain.T @ sigs @ Btrain))
                cvloss[k, i] = 1 - dcor.distance_correlation(
                    X_test @ Btrain, y_test) / d
            else:
                cvloss[k, i] = 100
        k = k + 1
    l_mean = np.mean(cvloss, axis=0)
    lambcv = lambseq[np.argmin(l_mean)]
    B, covxx, err = estimate(X, y, d, m, lambcv, method, NoB, NoC, NoW, spX,
                             standard)
    return B, covxx, lambcv, np.argmin(l_mean)
Code example #25

brand_city = ['Chanel_Milan','Chanel_London','Chanel_Paris','Chanel_NY', 'Dior_Milan','Dior_London','Dior_Paris','Dior_NY']

dcor_df = pd.DataFrame(data = 0, columns=brand_city, index=brand_city)
spearman_df = pd.DataFrame(data = 0, columns=brand_city, index=brand_city)

brand_city_signal = [y_Chanel_milan, y_Chanel_london, y_Chanel_paris, y_Chanel_ny, y_Dior_milan, y_Dior_london, y_Dior_paris, y_Dior_ny]

for i in range(len(brand_city)):
    for j in range(len(brand_city)):
        dcor_df.iloc[i, j] = dcor.distance_correlation(np.array(brand_city_signal[i]), np.array(brand_city_signal[j]))
        rho, p_val = spearmanr(np.array(brand_city_signal[i]), np.array(brand_city_signal[j]))
        spearman_df.iloc[i, j] = np.abs(rho)  # take |rho| only; applying np.abs to the (rho, p) pair was fragile


dcor_df.to_csv('dcor_brand_city.csv')
spearman_df.to_csv('spearman_brand_city_abs.csv')


# Visualize the covariance matrix using a heatmap
sns.heatmap(spearman_df, annot=True, cmap='Reds')

# Display the heatmap
plt.show()


Code example #26
def ascores(X, y):
    '''
    Compute association scores between each column of X and y.

    Parameters
    ----------
    X:     numeric dataframe to compute association measure with y
    y:     series containing target values

    Returns
    -------
    Dataframe with the following association scores:
        
    pearson:    pearson correlation
    kendall:    kendall correlation
    spearman:   spearman correlation
    mic:        maximal information coefficient
    dcor:       distance correlation 
            
    
    Example 
    -------
    import exploretransform as et
    df, X, y = et.loadboston()
    X = X.select_dtypes('number')
    et.ascores(X, y)
    
              pearson   kendall  spearman       mic      dcor
    lon      0.322947  0.278908  0.420940  0.379753  0.435849
    lat      0.006826  0.013724  0.021420  0.234796  0.167030
    crim     0.389582  0.406992  0.562982  0.375832  0.528595
    zn       0.360386  0.340738  0.438768  0.290145  0.404253
    indus    0.484754  0.420263  0.580004  0.414140  0.543948
    nox      0.429300  0.398342  0.565899  0.442515  0.523653
    rm       0.696304  0.485182  0.635092  0.461610  0.711034
    age      0.377999  0.391067  0.551747  0.414676  0.480248
    dis      0.249315  0.313745  0.446392  0.316136  0.382746
    tax      0.471979  0.418005  0.566999  0.336899  0.518158
    ptratio  0.505655  0.397146  0.554168  0.371628  0.520320
    b        0.334861  0.126766  0.186011  0.272469  0.385468
    lstat    0.740836  0.671445  0.857447  0.615427  0.781028


    '''
    # Convert any ints to float for dcor calculation
    if len(X.select_dtypes(int).columns) > 0:
        for col in X.select_dtypes(int).columns:
            X.loc[:, col] = X[col].astype('float')

    r = pd.DataFrame()
    mine = MINE(alpha=0.6, c=15)

    for col in X.columns:
        mine.compute_score(X[col], y)
        r.loc[col, 'pearson'] = abs(stats.pearsonr(X[col], y)[0])
        r.loc[col, 'kendall'] = abs(stats.kendalltau(X[col], y)[0])
        r.loc[col, 'spearman'] = abs(stats.spearmanr(X[col], y)[0])
        r.loc[col, 'mic'] = mine.mic()
        r.loc[col, 'dcor'] = distance_correlation(X[col], y)

    return r
Code example #27
def main(gpath):

    rma_df = pd.read_csv("../data-sources/mistry2017/rma")
    rma_df['raw_gene'] = rma_df['Unnamed: 0']

    print(np.sum(rma_df['raw_gene'].str.endswith('_at')))

    print(list(rma_df['raw_gene']))

    annot_df = pd.read_csv(
        "../data-sources/mistry2017/DIPtoAffy_with_additionalAnnotations.tsv",
        sep='\t')
    print(annot_df.columns)
    print(annot_df.head())

    pid_unitprot = defaultdict(set)
    for r in annot_df.itertuples():
        input_parts = r[-1].split(';')
        output_parts = r[-3].split(';')

        for input_part in input_parts:
            for output_part in output_parts:
                if output_part != '' and input_part != '':
                    pid_unitprot[input_part].add(
                        res.get_unified_name(output_part))

    idmap = {}
    for k, vals in pid_unitprot.items():
        if len(vals) > 1:
            print("%s -> %s" % (k, vals))
        elif len(vals) == 1:
            idmap[k] = list(vals)[0]

    print(idmap)
    print(len(idmap))

    ix = rma_df['raw_gene'].isin(idmap)
    rma_df = rma_df[ix].copy()
    rma_df['gene'] = [idmap[g] for g in rma_df['raw_gene']]
    expr_cols = [c for c in rma_df.columns if c.startswith('BT')]

    G = nx.read_gpickle(gpath)

    nodes = sorted(G.nodes())
    node_ix = dict(zip(nodes, range(len(nodes))))

    data = np.array(rma_df[expr_cols])

    F = np.zeros((len(nodes), len(nodes)))
    for i in range(rma_df.shape[0]):
        node_i = rma_df.iloc[i]['gene']
        if node_i not in node_ix:
            continue
        for j in range(i + 1, rma_df.shape[0]):
            node_j = rma_df.iloc[j]['gene']
            if node_j in node_ix:
                node_i_idx = node_ix[node_i]
                node_j_idx = node_ix[node_j]
                F[node_i_idx, node_j_idx] = dcor.distance_correlation(
                    data[i, :], data[j, :])
                F[node_j_idx, node_i_idx] = F[node_i_idx, node_j_idx]
        print("Finished %d" % i)

    #print(np.sum(F))
    output_path = "../generated-data/pairwise_features/%s_rma_dcor" % (
        os.path.basename(gpath))

    np.save(output_path, F)
Code example #28
def dc(reads1, reads2):
    return dcor.distance_correlation(reads1, reads2)
Code example #29
def ts_corr_network(data, corr_param='pcor', prune=0.35):

    if corr_param == "dcor":

        col_names = data.columns.tolist()
        data_dcor = pd.DataFrame(index=col_names, columns=col_names)

        k = 0
        for i in col_names:

            v_i = data.loc[:, i].values
            for j in col_names[k:]:

                v_j = data.loc[:, j].values
                dcor_val = dcor.distance_correlation(v_i, v_j)
                data_dcor.at[i, j] = dcor_val
                data_dcor.at[j, i] = dcor_val
            k += 1

        # converts the dataframe to a matrix (need this to generate the graph from the networkx package)
        dcor_matrix = data_dcor.values.astype("float")
        sim_matrix = 1 - dcor_matrix

        nodes = data_dcor.index.values

        # transforms the similarity matrix into a weighted graph
        G = nx.from_numpy_matrix(sim_matrix)

        # relabel nodes as the stock names
        G = nx.relabel_nodes(G, lambda x: nodes[x])

        # prints the edges with their corresponding weights
        G.edges(data=True)

        # copy correlation network
        H = G.copy()

        # remove self-edges from H (required for graph-theoretic analyses)
        for (u, v) in G.edges:
            if u == v:
                H.remove_edge(u, v)

        if prune is not None:
            # removes weakly correlated edges from H
            for (u, v, wt) in G.edges.data("weight"):
                if wt >= 1 - prune:
                    H.remove_edge(u, v)

        return H

    if corr_param == "pcor":

        pcor_matrix = data.iloc[:, 1:].corr()
        nodes = pcor_matrix.index.values

        pcor_matrix = np.asmatrix(pcor_matrix)
        sim_matrix = 1 - abs(pcor_matrix)

        G = nx.from_numpy_matrix(sim_matrix)
        G = nx.relabel_nodes(G, lambda x: nodes[x])
        G.edges(data=True)

        H = G.copy()

        for (u, v) in G.edges:
            if u == v:
                H.remove_edge(u, v)

        if prune is not None:
            for (u, v, wt) in G.edges.data("weight"):
                if wt >= 1 - prune:
                    H.remove_edge(u, v)

        return H
Code example #30
        csv_path, dropping_col, seeds[i], Normalize_output)

    print(
        '-----------------------------------Feature Selection-----------------------------------'
    )
    if with_fs_sel:
        f_sel, sp_df = spearsman_FS(Input_TR,
                                    Output_TR,
                                    threshold=threshold_clip,
                                    rank_num=rank_clip,
                                    FSmode="rank")
    else:
        f_sel = Input_TR.columns.values.tolist()

    run_info[str(i)]['Selected Features'] = f_sel
    dcor_index_TR = dcor.distance_correlation(Input_TR.loc[:, f_sel],
                                              Output_TR)
    dcor_index_TE = dcor.distance_correlation(Input_TE.loc[:, f_sel],
                                              Output_TE)
    run_info[str(i)]['dCorr_score_TR'] = dcor_index_TR
    run_info[str(i)]['dCorr_score_TE'] = dcor_index_TE
    print('dcor_index_TR: ', dcor_index_TR)
    print('dcor_index_TE: ', dcor_index_TE)

    scaled_Input_TR, scaler_TR_df = scale_data(Input_TR)
    scaled_Input_TE, scaler_TE_df = scale_data(Input_TE)

    # Prepare TR and TE
    X_TR = scaled_Input_TR[f_sel].values
    Y_TR = Output_TR.values
    X_TE = scaled_Input_TE[f_sel].values
    Y_TE = Output_TE.values