Example #1
def calcular_mi(variable, df):
    """Calcula Mutual Information para las variables independientes utilizando sklearn.feature_selection.mutual_info_regression para un DataFrame
    FUNCIÓN SOLO VALIDA PARA EL DATAFRAME DEL TRABAJO ORIGINAL

    Parameters:
        variable (String): Variable sobre la cual se calcula mi
        df (pandas.DataFrame): Dataframe

    Returns:
        pandas.Series: Serie con los resultados de mi
    """
    variables = [
        'Industry Sector', 'Application Group', 'Development Type',
        'Development Platform', 'Language Type',
        'Primary Programming Language', 'Functional Size',
        'Adjusted Function Points', 'Project Elapsed Time',
        '1st Data Base System', 'Used Methodology'
    ]
    X = df.loc[:, variables]
    y = df.loc[:, variable].values
    mi = mutual_info_regression(X, y, n_neighbors=1)
    mi = pd.Series(mi, index=X.columns).sort_values(ascending=False)
    return mi
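A minimal usage sketch (not from the original project: the DataFrame contents and the 'Summary Work Effort' target are invented here), with the imports the function relies on:

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

cols = ['Industry Sector', 'Application Group', 'Development Type',
        'Development Platform', 'Language Type',
        'Primary Programming Language', 'Functional Size',
        'Adjusted Function Points', 'Project Elapsed Time',
        '1st Data Base System', 'Used Methodology']
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((100, len(cols))), columns=cols)
df['Summary Work Effort'] = 10 * df['Functional Size'] + rng.random(100)

print(calcular_mi('Summary Work Effort', df))  # 'Functional Size' should rank first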
Example #2
def get_collinear_cols_intradf(df1, df2, threshold=0.6, method="pearson", verbose=True):
    """
    Return the columns whose values in the two dataframes are collinear,
    i.e. whose association score exceeds `threshold`.
    """
    from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
    id_like = {"id", "user_id", "label", "target"}
    columns1 = list(set(df1.columns) - id_like)
    columns2 = list(set(df2.columns) - id_like)
    assert set(columns1) == set(columns2), "the columns in the two dataframes must be identical"
    df1 = df1[columns1]
    df2 = df2[columns2]
    methods = ["pearson", "kendall", "spearman", "mi_classif", "mi_regression"]
    assert method in methods, "method should be in {}".format(methods)
    cols_to_select = []
    for col in columns1:
        if method in ["pearson", "kendall", "spearman"]:
            cor = df1[col].corr(df2[col], method=method)
        elif method == "mi_classif":
            cor = float(mutual_info_classif(df1[[col]].values, df2[col].values)[0])
        else:  # "mi_regression"
            cor = float(mutual_info_regression(df1[[col]].values, df2[col].values)[0])
        if cor > threshold:
            cols_to_select.append(col)
        if verbose:
            print("column {}: {} {}".format(col, method, cor))

    if verbose:
        print('{} of {} columns exceed threshold {}'.format(len(cols_to_select), len(columns1), threshold))
    return cols_to_select
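A quick call sketch with two synthetic frames whose columns match (all names invented): 'f1' is nearly identical across the frames and should clear the default Pearson threshold, while 'f2' is independent noise.

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
shared = rng.random(500)
df_a = pd.DataFrame({'id': range(500), 'f1': shared, 'f2': rng.random(500)})
df_b = pd.DataFrame({'id': range(500), 'f1': shared + 0.01 * rng.random(500),
                     'f2': rng.random(500)})

print(get_collinear_cols_intradf(df_a, df_b))  # expected: ['f1']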
Example #3
def get_mi_estimate(K, m, n, r, l, SNR, H, A, filename):

    TxNet, RxNet, Normalize, V, U, R = ae_load_model(K, filename, m, n, r, l, SNR, H, A)
    L1 = 10    # number of sampling rounds
    L2 = 1000  # samples per round

    X_data = np.zeros((L1 * L2, K))
    Y_data = np.zeros((L1 * L2, K))

    MI = np.zeros(K)

    for j in range(L1):
        _, SignalIn, _, Noise = prepare_data(K, m, n, r, V, U, R, SNR, L2)
        _, Rx = get_output(TxNet, RxNet, Normalize, K, SignalIn, Noise, H)

        for k in range(K):

            # index with L2 rather than a hardcoded 1000 so the two stay in sync
            for t1 in range(L2):
                X_data[L2 * j + t1, k] = np.array(SignalIn[k])[t1, 0]
                Y_data[L2 * j + t1, k] = np.array(Rx[k])[t1, 0]

    for k in range(K):

        X = X_data[:, k].reshape(-1, 1)
        Y = np.squeeze(Y_data[:, k])
        M = mutual_info_regression(X, Y)
        MI[k] = M[0]  # mutual_info_regression returns a length-1 array here

    return MI
Example #4
def select_data(df_fewNA, num_features=50):
    """ Write selected_variables.txt with the name of all important features, 
        determined with mutual information algorithm.
    
    Parameters
    ----------
    df_fewNA: Output of read_database.clean_data.
    num_features: Maximum number of important variables to output.
   
    Returns
    -------
    selected_variables: Dataframe with the names of all important features
        and their importance weights.
    """
    # Feature selection
    covs = df_fewNA.drop(["NY_GDP_MKTP_KD_ZG", "residuals"], axis=1)
    Y = df_fewNA[['residuals']]
    info = mutual_info_regression(covs, np.ravel(Y))
    df_varimp = pd.DataFrame(data={'name': covs.columns, 'varimp': info})
    # Keep the top `num_features` variables
    selected_variables = df_varimp.sort_values(by="varimp",
                                               ascending=False)[0:num_features]
    selected_variables['name'] = selected_variables['name'].str.replace(
        '_', '.')
    selected_variables['name'].to_csv(
        path_or_buf='./utils/selected_variables.txt',
        header=True,
        index=False,
        sep='\t',
        mode='a')
    return selected_variables
Example #5
def select_feature(x_train, x_test, y_train):
    """
    This function reduces the number of features from the existing
    count (greater than 10,000) to something manageable.
    Based on experience with feature selection in homework 1, we do
    not expect the selection to result in improved performance, but
    we do expect a reduction in run-time.

    Run time without feature selection (seconds):
        GPA      : 320.58
        Grit     : 280.71
        Hardship : 288.05
        gpa      : 37.22

    Note: code taken as is from homework 1 submission.
    """

    # feature selection via the mutual information criterion
    MIC = feature_selection.mutual_info_regression(x_train, y_train)
    # get most descriptive features (here called good features)
    good_features = []
    scores = []
    for k in range(len(MIC)):
        scores.append(MIC[k])
        if MIC[k] > 0.1:  # Criteria for deciding that feature should be included
            good_features.append(k)
    # Adapt the training and testing matrices to good features
    x_train = x_train[:, good_features]
    x_test = x_test[:, good_features]
    print(len(good_features))
    return x_train, x_test, scores
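A self-contained sketch of a call (random data, invented shapes): only column 0 carries signal here, so it should be the feature retained by the 0.1 MI cutoff.

import numpy as np
from sklearn import feature_selection

rng = np.random.default_rng(0)
x_train = rng.random((200, 20))
y_train = 2 * x_train[:, 0] + rng.normal(0, 0.1, 200)
x_test = rng.random((50, 20))

x_train_sel, x_test_sel, scores = select_feature(x_train, x_test, y_train)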
Example #6
def test_mir(fn_in, fn_out='select_features_mir.tsv'):
    print('\nselect_features test_mir:')
    # np.random.seed(0)
    # data = np.random.rand(1000, 3)
    # y = data[:, 0] + np.sin(6 * np.pi * data[:, 1]) + 0.1 * np.random.randn(1000)

    df = utl.create_df(fn_in)
    data = df.values.astype('float32')
    col_names = list(df)

    X = data
    n = X.shape[1]
    with open(fn_out, 'w') as fout:
        for col in range(n):
            print(n, col, col_names[col])

            y = data[:, col]
            mir = mutual_info_regression(X, y, n_neighbors=7)
            mir /= np.max(mir)

            line = str(col) + '\t' + '\t'.join(
                [format(x, '.3f') for x in mir.tolist()]) + '\t' + col_names[col]
            print(line)
            fout.write(line + '\n')

            # normalized F-test for linearity
            # https://scikit-learn.org/stable/auto_examples/feature_selection/plot_f_test_vs_mi.html#sphx-glr-auto-examples-feature-selection-plot-f-test-vs-mi-py
            # f_test, p_val = f_regression(X, y)
            # f_test /= np.max(f_test)
            # print("f_score: \t", f_test)
    return
Example #7
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X,
                                       y,
                                       discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
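Typical call pattern (synthetic data): the discrete-feature mask is derived from the dtypes, which is how this helper is usually fed.

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

rng = np.random.default_rng(0)
X = pd.DataFrame({'rooms': rng.integers(1, 8, 300),   # discrete feature
                  'area': 200 * rng.random(300)})     # continuous feature
y = 3 * X['area'] + 10 * X['rooms'] + rng.normal(0, 5, 300)

print(make_mi_scores(X, y, discrete_features=(X.dtypes == np.int64)))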
Example #8
    def mutual_information(self):
        """
        Calculates the mutual information for each distribution and appends them to a global variable I

        Returns
        --------
        List[float, List[String,]]
            - A list of the mutual information paired with its respective distribution
        """
        _, names, _ = self.traces[0]
        size = len(names)
        mutual_information = [{} for _ in range(size)]
        for i in range(size):
            for trace, names, info in self.traces:
                discrete = ("int" in str(names[i]))
                alice = trace[names[i]]
                try:
                    output = trace["Output"]
                except KeyError:
                    pos = int(str(names[i]).split("_")[-1])
                    if pos < 10:
                        output = trace[f"Output_{pos}"]
                    else:
                        continue
                I_ao = mutual_info_regression([[j] for j in alice], output, discrete_features=discrete)[0]
                while (len(info) == 1):
                    info = info[0] # Used to unwrap the inner information in case of subtypes such as List[List[Tuple[...]]]
                if isinstance(info, tuple) or (isinstance(info, list) and isinstance(info[0], list)):
                    info = info[i]
                if info[0] in mutual_information[i]:
                    mutual_information[i][info[0]].append((I_ao,info))
                else:
                    mutual_information[i][info[0]] = [(I_ao, info)]
        self.I = mutual_information
        return mutual_information
Example #9
 def miregression_selected(self, x, y):
     '''
     Mutual-information feature screening for regression models
     (sklearn implementation).
     Parameters
     ----------
     x: 2-D DataFrame of features
     y: 1-D Series response variable

     Returns
     -------
     xSelected: list, [(Mi, colName), ......]

     Others
     ------
     The method's derivation is still to be worked out.
     '''
     x_remaining = list(x.columns)
     selected = []
     if self.api == "sklearn":
         Mi = mutual_info_regression(x, y)
         for m, col in zip(Mi, x_remaining):
             selected.append((m, col))
     elif self.api == "self":
         pass
     else:
         raise ValueError("api must be 'sklearn' or 'self' !")
     selected.sort(reverse=True)
     return selected
Example #10
def mutualInformation(ds, threshold=1):
    """Calculate mutual information for all features; return the list of features with dependencies, dropping them from ds."""
    logging.info("***start mutualInformation")

    features_to_drop = []
    for column in ds.columns.drop(LabelName):

        if column in dictFeatures:
            miMat = mutual_info_regression(ds.drop([LabelName, column],
                                                   axis=1),
                                           ds[column],
                                           discrete_features='auto',
                                           n_neighbors=3,
                                           copy=True,
                                           random_state=None)
        else:
            miMat = mutual_info_classif(ds.drop([LabelName, column], axis=1),
                                        ds[column],
                                        discrete_features='auto',
                                        n_neighbors=3,
                                        copy=True,
                                        random_state=None)
        #print(column, miMat)
        if any(mi > threshold for mi in miMat):
            ds.drop(column, axis=1, inplace=True)
            features_to_drop.append(column)

    print("mutualInformation:\n", features_to_drop)
    return features_to_drop
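The function leans on two module-level names, LabelName and dictFeatures. A sketch of the surrounding setup (all names and data invented); the duplicated integer column gives one of the pair an MI near ln 2 ≈ 0.69, so it should be flagged:

import logging
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

LabelName = 'label'                      # assumed module-level constant
dictFeatures = {'f_cont': 'continuous'}  # features scored with the regression variant

rng = np.random.default_rng(0)
ds = pd.DataFrame({'f_cont': rng.random(200),
                   'f_int': rng.integers(0, 2, 200),
                   'label': rng.random(200)})
ds['f_copy'] = ds['f_int']  # duplicated feature

print(mutualInformation(ds, threshold=0.5))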
Example #11
    def get_filtered_data_frame_columns(df: pd.DataFrame,
                                        mrmr=False,
                                        features_left_cnt=10):
        if features_left_cnt >= len(df.columns) - 1:
            return df.columns

        if mrmr and len(df.columns) - features_left_cnt < 10:
            import pymrmr
            return [df.columns.values[0]] + pymrmr.mRMR(
                df, 'MID', features_left_cnt)
        else:
            data = df.to_numpy()
            correlations = feature_selection.mutual_info_regression(
                data[:, 1:], data[:, 0])
            threshold = sorted(correlations, reverse=True)[features_left_cnt]

            columns = []
            for i, col in enumerate(df.columns[1:]):
                if len(columns) < features_left_cnt and correlations[i] > threshold:
                    columns.append(col)
            # a second pass admits ties at the threshold until the quota is filled
            for i, col in enumerate(df.columns[1:]):
                if len(columns) < features_left_cnt and correlations[i] == threshold:
                    columns.append(col)

            return [df.columns.values[0]] + columns
Example #12
    def printMetrics(self):

        print()
        print("=================================================")
        print("=========== METRICS =============================")
        print("Features (put in Pandas df): ", self.feature_names)
        print('Mean squared error: %.2f' %
              mean_squared_error(self.y_test, self.y_pred))
        print("Explained variance score: ",
              explained_variance_score(self.y_test, self.y_pred))

        mi = mutual_info_regression(self.x_train, self.y_train)
        mi = mi / np.max(mi)

        fr, pval = f_regression(self.x_train, self.y_train,
                                center=True)  # center?

        print("Mutual Information: ", mi)
        #fr = fr / np.max(fr)
        print("f_regression: ", fr)
        print("pval: ", pval)
        print("R2 score: ", self.r2_score)
        print("=========== END METRICS =========================")
        print("=================================================")
        print()
Example #13
 def predict_Cond_Entropy(self, X):
     from ..MMI.IC.AIC import TableEntropy
     from ..utils import mseEntropy, varEntropy, unifEntropy, ShannonEntropy
     n_var = X.shape[1]
     numCond = 2**(n_var - 1)
     cond_ent = np.zeros((n_var, numCond))
     for Resp in range(n_var):
         cond_ent[Resp, 0] = ShannonEntropy(X[:, Resp])
         for sI in range(1, numCond):
             subset = TableEntropy.subsetVector(n_var - 1, sI)
             subset = np.array(subset)
             cond = []
             for element in subset:
                 if element >= Resp:
                     element += 1
                 cond.append(int(element))
             # self.savefig()
             cond_ent[Resp, sI] = cond_ent[Resp, 0] - mutual_info_regression(
                 X=X[:, cond],
                 y=X[:, Resp],
                 discrete_features=self.discrete_features,
                 n_neighbors=self.n_neighbors,
                 random_state=self.random_state)[0]
     return cond_ent
Example #14
def get_top_1500(filtered_matrix, all_pmi, all_pairs):
    X = np.array(filtered_matrix).transpose()  # feature matrix
    # build the PMI target array, aligned with all_pairs
    Y = np.array([all_pmi[pair] for pair in all_pairs])
    assert len(X) == len(Y)
    # mir
    start = time.time()
    mi = mutual_info_regression(X, Y)
    # summarize the selection of the attributes
    print("time taken for mi")
    print(time.time() - start)

    tupled_mi = []
    for idx, score in enumerate(mi):
        if not np.isnan(score):
            tupled_mi.append((idx, score))
    # sort by the MI score (second tuple element), descending
    tupled_mi.sort(reverse=True, key=operator.itemgetter(1))
    if len(tupled_mi) > 4000:
        tupled_mi = tupled_mi[:4000]
    return(tupled_mi)
Example #15
def fun_extractERPfeatsMultivar(erpSynch, erpSynchFilt, Fs):
    """fun_extractERPfeatsMultivar(erpSynch,erpSynchFilt,Fs)
    extract multivariate features from L channels
    #erpSynch is a LxN matrix --> L channels with N samples each, from which synchrony measures are taken (PLV and correlation)
    #detrend and normalize erpSynch (f***s up PLV values in some cases if it's not detrended)
    #erpSynchFilt is the filtered version of erpSynch, for calculating the PLV
    """
    from itertools import combinations
    from sklearn.feature_selection import mutual_info_regression
    erpSynch = (erpSynch.T - np.mean(erpSynch, axis=1)).T
    erpSynch = (erpSynch.T / np.std(erpSynch, axis=1)).T
    erpSynchFilt = (erpSynchFilt.T - np.mean(erpSynchFilt, axis=1)).T
    featsOut = dict()
    #for all channel pair combinations
    featsOut["combinations"] = list(combinations(range(erpSynch.shape[0]), 2))
    #featsOut["Corr"] = np.zeros(len(featsOut["combinations"]),erpSynch.shape[1])
    #featsOut["PLV"] = np.zeros(len(featsOut["combinations"]),erpSynch.shape[1])
    #*** Coupling Measures ***
    #correlation  (max value)
    featsOut["Corr"] = [
        np.max(signal.correlate(erpSynch[ki[0], :], erpSynch[ki[1], :])) /
        erpSynch.shape[1] for ki in featsOut["combinations"]
    ]

    featsOut["CorrCoefs"] = np.corrcoef(erpSynch)

    featsOut["MI"] = [
        mutual_info_regression(erpSynch[ki[0], :].reshape(-1, 1),
                               erpSynch[ki[1], :])
        for ki in featsOut["combinations"]
    ]

    #PLV
    phases = np.array([
        np.angle(signal.hilbert(erpSynchFilt[ki, :]))
        for ki in range(erpSynchFilt.shape[0])
    ])
    featsOut["PLV"] = [
        np.abs(
            np.sum(np.exp(1j * (phases[ki[0]] - phases[ki[1]]))) /
            phases.shape[1]) for ki in featsOut["combinations"]
    ]
    featsOut["PLVphase"] = [
        np.mean(np.unwrap(phases[ki[0]] - phases[ki[1]], axis=0))
        for ki in featsOut["combinations"]
    ]

    #Coherence
    featsOut["Coh"] = np.zeros([len(featsOut["combinations"])])
    iind = 0
    for ki in featsOut["combinations"]:
        Wxy, Cxy = signal.coherence(erpSynch[ki[0], :],
                                    erpSynch[ki[1], :],
                                    Fs,
                                    nperseg=128)
        featsOut["Coh"][iind] = np.mean(Cxy[0:11])
        iind += 1

    return featsOut
Example #16
def nmi(X, y):
    """
    Normalized mutual information between each column of X and y,
    i.e. each MI score divided by the maximum score.
    :param X: feature matrix, shape (n_samples, n_features)
    :param y: target vector, shape (n_samples,)
    """
    mi = mutual_info_regression(X, y)
    return mi / mi.max()
Example #17
 def get_params(self, X, y):
     self.m, self.n = X.shape
     self.r = y.shape[1] if len(y.shape) > 1 else 1
     if self.sim == 'corr':
         self.Q = np.abs(get_corr_matrix(X, fill=1))
         self.b = np.sum(np.abs(get_corr_matrix(X, y)), axis=1)[:, np.newaxis]
     elif self.sim == 'info':
         self.Q = np.ones([X.shape[1], X.shape[1]])
         self.b = np.zeros((X.shape[1], 1))
         for j in range(X.shape[1]):  # was an undefined `n_features`
             self.Q[:, j] = sklfs.mutual_info_regression(X, X[:, j])
         if len(y.shape) == 1:
             self.b = sklfs.mutual_info_regression(X, y)[:, np.newaxis]
         else:
             for y_ in y.T:  # iterate over targets (columns), not samples
                 self.b += sklfs.mutual_info_regression(X, y_)[:, np.newaxis]
     self.Q, self.lamb_min = shift_spectrum(self.Q)
Example #18
def getMI(df, LABEL):
    X = df.iloc[:, 0:-1]
    y = df[LABEL].values.flatten()
    mi = mutual_info_regression(X, y)
    mi /= np.max(mi)
    miDF = pd.DataFrame({'feature': X.columns.values})
    miDF = miDF.assign(MI=mi)
    return miDF
Example #19
def run_chrom(chrom, list_of_list, atac_chunk, rna_counts, inboth, masterdict):

    mis = []
    rhos = []

    masterdict[chrom] = []

    for k in range(0, len(list_of_list[0])):

        print('                        ')
        print(k)

        gene_idx = list_of_list[3][k][0]
        idxs = list_of_list[2][k]

        nonzero_rna = np.argwhere(rna_counts[:, gene_idx].todense() != 0)
        nonzero_atac = np.argwhere(atac_chunk[:, idxs].todense() != 0)

        print(np.size(nonzero_rna))
        print(np.size(nonzero_atac))

        inboth = np.intersect1d(nonzero_atac, nonzero_rna)

        cutoff = 3
        if np.size(inboth) < cutoff:
            print(np.size(inboth))
            print("Too few for a gene")
            mis.append('-')
            rhos.append('-')
            continue

        # dense cell-by-peak and cell-by-gene blocks for the shared cells
        atac_dense = np.asarray(atac_chunk[inboth][:, idxs].todense())
        rna_dense = np.asarray(rna_counts[inboth, gene_idx].todense()).reshape(-1, 1)

        try:
            mi = mutual_info_regression(atac_dense,
                                        rna_dense.ravel(),
                                        random_state=2,
                                        n_neighbors=3,
                                        discrete_features=False)
        except Exception:
            print("went here")
            mi = 0

        mis.append(mi)

        # correlate every peak with the gene; the last row of the correlation
        # matrix holds the gene-vs-peak coefficients
        rho = np.corrcoef(np.hstack((atac_dense, rna_dense)).transpose())[-1, :]

        rhos.append(rho)

        # n = np.size(atac_dense.transpose())
        # adjusted_rho = rho * hyp2f1(1 / 2, 1 / 2, (n - 1) / 2, 1 - rho ** 2)
        # rhos.append(adjusted_rho)
        print("GOOD GENE!")

    masterdict[chrom].append([mis, rhos])
    return
Example #20
def feature_importance_regression(features, target, n_neighbors=3, random_state=None):

    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, bool])  # np.bool was removed from NumPy

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # Pearson correlation
        pearson = np.array([stats.pearsonr(feature, target) for _, feature in cont.items()])
        cont_imp['pearson_r'] = pearson[:, 0]
        cont_imp['pearson_r_p_value'] = pearson[:, 1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(cont, target, discrete_features=False,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # F-test
        f_tests = defaultdict(dict)

        for feature in disc.columns:
            groups = [target[idxs] for idxs in disc.groupby(feature).groups.values()]
            statistic, p_value = stats.f_oneway(*groups)
            f_tests[feature]['f_statistic'] = statistic
            f_tests[feature]['f_p_value'] = p_value

        f_tests_df = pd.DataFrame.from_dict(f_tests, orient='index')
        disc_imp['f_statistic'] = f_tests_df['f_statistic']
        disc_imp['f_p_value'] = f_tests_df['f_p_value']

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(disc, target, discrete_features=True,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
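An illustrative call with a small mixed-type frame (names invented for this sketch); the first imports are the ones the function body above relies on.

from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import feature_selection

rng = np.random.default_rng(1)
features = pd.DataFrame({
    'size': 100 * rng.random(200),      # float -> Pearson r + MI
    'group': rng.integers(0, 3, 200),   # int   -> one-way F-test + MI
})
target = 2 * features['size'] + 5 * features['group'] + rng.normal(size=200)

cont_imp, disc_imp = feature_importance_regression(features, target, random_state=0)
print(cont_imp)
print(disc_imp)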
Example #21
def mrmrfilter(data,label,num):
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import mutual_info_regression

    np.random.seed(0)
    D1 = data.shape[0]
    D2 = data.shape[1]
    # get the mutual information within features (upper triangle; symmetrised below)
    MI_ff = np.zeros((D2, D2))
    for i in range(D2):
        print('progress: {:.0%} of rows'.format(i / D2))
        for j in np.arange(i, D2):
            MI_ff[i, j] = mutual_info_regression(data.values[:, i].reshape(D1, 1),
                                                 data.values[:, j], n_neighbors=5)[0]


    for i in range(D2-1):
        for j in np.arange(i,D2):
            MI_ff[j,i] = MI_ff[i,j]


    # get the mutual information between features and label
    MI_fl = mutual_info_regression(data.values, label.values[:, 0], n_neighbors=5)

    # greedily pick the best `num` features
    candidate = np.zeros((D2, 1))     # a 1 marks the feature as selected
    for i in range(num):
        if i == 0:
            k = (np.where(MI_fl == np.max(MI_fl)))[0]
            candidate[k] = 1
        else:
            mrmr = np.zeros((D2, 1))
            for j in range(D2):
                # relevance minus redundancy; the 1000 * candidate[j] term is a
                # penalty that keeps already-selected features from being re-picked
                mrmr[j] = (1 - candidate[j]) * (MI_fl[j] / i - np.sum(np.dot(MI_ff[j, :], candidate) / i**2)) - 1000 * candidate[j]
            k = (np.where(mrmr == np.max(mrmr, axis=0)))[0]
            candidate[k] = 1

    data_num = np.zeros((D1,num))
    count = 0
    for i in range(D2):
        if candidate[i] == 1:
            data_num[:,count] = data.iloc[:,i]
            count += 1

    return data_num
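A toy invocation (random data, so the selection itself is meaningless; it only exercises the interface). The pairwise MI loop is quadratic in the number of features (D2), so keep the feature count small when experimenting:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.random((60, 8)))
label = pd.DataFrame(rng.random((60, 1)))

reduced = mrmrfilter(data, label, num=3)
print(reduced.shape)  # (60, 3): the values of the three selected columns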
Example #22
def loadscaledata(print_mi=False):
    x_all,y_all = loaddata()
    Xscaler = preprocessing.StandardScaler(copy=False).fit(x_all)
    Yscaler = preprocessing.StandardScaler(copy=False).fit(y_all)
    #Xscaler = preprocessing.MinMaxScaler((0,64),copy=False).fit(X_train)
    #Yscaler = preprocessing.MinMaxScaler((0,64),copy=False).fit(Y_train)
    x_all = Xscaler.transform(x_all)
    y_all = Yscaler.transform(y_all)

    if print_mi:
        mi_tof = mutual_info_regression(x_all,y_all[:,0])
        mi_tof /= np.max(mi_tof)
        print('mi for tof time\t',mi_tof)
        mi_pos = mutual_info_regression(x_all,y_all[:,1])
        mi_pos /= np.max(mi_pos)
        print('mi for y_position',mi_pos)

    return x_all,y_all,Xscaler,Yscaler
Example #23
def plot_regression_categorical(X, target_col, types=None, **kwargs):
    """Plots for categorical features in regression.

    Creates box plots of target distribution for important categorical
    features. Relevant features are identified using mutual information.

    For high cardinality categorical variables (variables with many categories)
    only the most frequent categories are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    """
    types = _check_X_target_col(X, target_col, types, task="regression")

    if types is None:
        types = detect_types(X)
    features = X.loc[:, types.categorical]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)
    if features.shape[1] == 0:
        return
    features = features.astype('category')
    show_top = _get_n_top(features, "categorical")

    # can't use OrdinalEncoder because we might have mix of int and string
    ordinal_encoded = features.apply(lambda x: x.cat.codes)
    target = X[target_col]
    f = mutual_info_regression(ordinal_encoded,
                               target,
                               discrete_features=np.ones(ordinal_encoded.shape[1],
                                                         dtype=bool))
    top_k = np.argsort(f)[-show_top:][::-1]

    # large number of categories -> taller plot
    row_height = 3 if X.nunique().max() <= 5 else 5
    fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
    plt.suptitle("Categorical Feature vs Target")
    for i, (col_ind, ax) in enumerate(zip(top_k, axes.ravel())):
        col = features.columns[col_ind]  # use the ranked feature, matching the title
        X_new = _prune_category_make_X(X, col, target_col)
        medians = X_new.groupby(col)[target_col].median()
        order = medians.sort_values().index
        sns.boxplot(x=target_col, y=col, data=X_new, order=order, ax=ax)
        ax.set_title("MI={:.2E}".format(f[col_ind]))
        # shorten long ticks and labels
        _short_tick_names(ax)

    for j in range(i + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[j].set_axis_off()
Example #24
def _mutual_info(reference, query):
    # the kNN-based MI estimate is not symmetric in its arguments. We don't need
    # to normalise the vectors; the estimate is invariant under scaling.
    from sklearn.feature_selection import mutual_info_regression

    weights = np.zeros((query.shape[1], reference.shape[1]))
    for i, target in enumerate(query.T):
        weights[i, :] = mutual_info_regression(reference, target)

    return weights
Example #25
def calc_lowf_mutual_info(data, channels):

    x_dr = data[DIFF_REAL].to_numpy()
    x_di = data[DIFF_IMAG].to_numpy()
    x_ar = data[ABS_REAL].to_numpy()
    x_ai = data[ABS_IMAG].to_numpy()

    mi_dr = []
    mi_di = []
    mi_ar = []
    mi_ai = []

    for ch in channels:

        y = data[ch].to_numpy().reshape(-1, )

        mi_dr.append(mutual_info_regression(x_dr, y))
        mi_di.append(mutual_info_regression(x_di, y))
        mi_ar.append(mutual_info_regression(x_ar, y))
        mi_ai.append(mutual_info_regression(x_ai, y))

    # now we can average each MI curve over the upper frequency bins (index 6 onward)
    mi_dr_av = [np.average(mi[6:]) for mi in mi_dr]
    mi_di_av = [np.average(mi[6:]) for mi in mi_di]
    mi_ar_av = [np.average(mi[6:]) for mi in mi_ar]
    mi_ai_av = [np.average(mi[6:]) for mi in mi_ai]

    mutual_info = np.concatenate((mi_dr_av, mi_di_av, mi_ar_av, mi_ai_av))

    return mutual_info
Example #26
def do_mutual_information(weights, previous, config, **kwargs):
    assert weights.shape[0] == weights.shape[1]
    kernels = previous.shape[0]
    for i in range(kernels):
        for j in range(kernels):
            # print(f'Score kernel {i} and {j}')
            weights[i, j] = feature_selection.mutual_info_regression(
                previous[i].flatten().reshape(-1, 1), previous[j].flatten())[0]
    print(f'Kernel mean mutual information {np.mean(weights)}')
    return weights
Example #27
def mutual_information_univariate_selection(X, y):
    """Given data instances (X) and their corresponding targets (y),
    this method indicates which features of X are most correlated with y
    according to a mutual information based univariate feature selection
    """
    mi = mutual_info_regression(X, y)
    mi /= np.max(mi)
    dict_mi = {i: j for i, j in enumerate(mi)}
    print('ranked features -- mutual information',
          sorted(dict_mi, key=dict_mi.get, reverse=True))
Example #28
 def MI_features_selection(self, y, x):
     mi = mutual_info_regression(x,
                                 y,
                                 discrete_features='auto',
                                 n_neighbors=3,
                                 copy=True,
                                 random_state=None)
     #n_mi = np.argsort(np.sum(mi, axis = 1))[-n:][::-1]
     n_mi = mi.argsort()[-self.m:][::-1]
     return n_mi
Example #29
def continuous_entropy(ys):
    """Estimate the continuous entropy of each attribute as the estimated
    mutual information of the attribute with itself.
    Args:
        ys: np.array num_points x num_attributes
    """
    num_factors = ys.shape[1]
    h = np.zeros(num_factors)
    for j in tqdm(range(num_factors)):
        h[j] = mutual_info_regression(ys[:, j].reshape(-1, 1), ys[:, j])[0]
    return h
Example #30
def count_difference(array_1, array_2, show_img=False):
    if show_img:
        plt.clf()
        plt.title("Porównywanie krzywych spektralnych, obliczanie różnicy")
        plt.plot(array_1, label="Array 1")
        plt.plot(array_2, label="Array 2")
        plt.legend()
        plt.axis('tight')
        plt.show()

    X = np.arange(len(array_1))

    mi1 = mutual_info_regression(X.reshape(-1, 1), array_1)
    mi2 = mutual_info_regression(X.reshape(-1, 1), array_2)

    result = np.square(mi1 - mi2)

    # mi1 /= np.max(mi1)
    return result * 1000
Example #31
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, mutual_info_regression


np.random.seed(0)
X = np.random.rand(1000, 3)
y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000)

f_test, _ = f_regression(X, y)
f_test /= np.max(f_test)

mi = mutual_info_regression(X, y)
mi /= np.max(mi)

plt.figure(figsize=(15, 5))
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.scatter(X[:, i], y, edgecolor='black', s=20)
    plt.xlabel("$x_{}$".format(i + 1), fontsize=14)
    if i == 0:
        plt.ylabel("$y$", fontsize=14)
    plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]),
              fontsize=16)
plt.show()