Ejemplo n.º 1
0
def main():
    hsic_lasso = HSICLasso()
    #out_list = ['c'+str(i) for i in range(1,51)]
    #print (out_list)
    hsic_lasso.input("./user_data_new.csv",
                     output_list=[
                         'c1', 'c2', 'c3', 'c4', 'c5,', 'c6', 'c7', 'c8', 'c9',
                         'c10'
                     ])
    # ,'c11', 'c12', 'c13', 'c14', 'c15,', 'c16', 'c17', 'c18', 'c19', 'c20','c21', 'c22', 'c23', 'c24', 'c25,', 'c26', 'c27', 'c28', 'c29', 'c30'])
    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()
    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    #hsic_lasso.plot_path()
    print(hsic_lasso.get_features())
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt=str('%.5f'), encoding='utf-8')
Ejemplo n.º 2
0
    def HSICLasso(self):

        df_ = self.data.copy()
        cols = list(df_.columns)[:-1] + ['class']
        df_.columns = cols

        hsic_lasso = HSICLasso()
        hsic_lasso.input(self.X_train.values, self.Y_train.values)

        if self.type == CLASSIFICATION:
            hsic_lasso.classification(self.num_top_features)
        elif self.type == REGRESSION:
            hsic_lasso.regression(self.num_top_features)

        feats = [
            df_.columns[int(val) - 1] for val in hsic_lasso.get_features()
        ]

        for feat, imp in zip(feats, hsic_lasso.get_index_score()):
            features_[feat] = imp
        self.report_feature_importance(features_,
                                       self.num_top_features,
                                       label="HSICLasso")
Ejemplo n.º 3
0
    def hsic_lasso_matric(self,
                          data,
                          n_jobs=2,
                          n_sample=False,
                          frac_sample=False):
        '''Calculate hsic lasso (subtract correlation between explanatory variables).
        Since the correlation coefficient matrix is not symmetric, it is viewed in the row direction.
        The correlation between variable 0 and the other variable is stored as the component on the 0th row,
        and the correlation between variable 1 and the other variable is stored as the component on the first row.
        
        n_jobs : (int) Indicates the number of cores to be calculated. -1 for GPU.
        data: (numpy or pandas) A data frame that contains all explanatory and objective variables
        n_sample : (int) How much random sampling to do. False if not.
        If a numerical value is entered, sampling is performed using that number of rows.
        frac_sample: [0 ~ 1] (float) Sampled as a percentage of the number of rows. Not used at the same time as n_sample.
        '''
        data = copy(data)
        data = pd.DataFrame(data).dropna()
        # Sampling when n_sample contains a numerical value
        if not n_sample:
            if not frac_sample:
                # n_sample=False, frac_sample=False
                pass
            else:
                # n_sample=False, frac_sample=int
                data = data.sample(frac=frac_sample, replace=True)
        else:

            if not frac_sample:
                # n_sample=int, frac_sample=False
                data = data.sample(n=n_sample, replace=True)
            else:
                # n_sample=int, frac_sample=int
                raise ValueError(
                    'Please enter a value for `frac` OR `n`, not both')

        data = check_array(data, accept_sparse="csc",
                           dtype=float)  # Convert to numpy.ndarray
        n_col = data.shape[1]
        hsic_array = np.empty((0, n_col - 1), float)
        for i in range(n_col):
            X = np.delete(data, obj=i, axis=1)
            y = data[:, i]

            # Calculation of hsic_lasso
            hsic_lasso = HSICLasso()
            hsic_lasso.input(X, y)
            hsic_lasso.regression(num_feat=X.shape[1],
                                  discrete_x=False,
                                  n_jobs=n_jobs)
            # hsic_lasso only appears in descending order of score, so sort
            hsic_ = np.array(
                [hsic_lasso.get_index(),
                 hsic_lasso.get_index_score()])
            hsic_ = hsic_.T  # Transpose because it is difficult to use
            # Since there are not enough scores that came out, add 0.0 to the index to complement
            lack_set = set([x for x in range(X.shape[1])]) - set(hsic_[:, 0])
            for lack in lack_set:
                lack_list = np.array([[lack, 0.0]])
                hsic_ = np.append(hsic_, lack_list, axis=0)
            hsic_ = hsic_[np.argsort(hsic_[:, 0])]  # Sort by index
            hsic_array = np.append(hsic_array,
                                   hsic_[:, 1].reshape(1, -1),
                                   axis=0)
        # Since it does not include the correlation component with itself, add 1.0
        n_row = hsic_array.shape[0]
        for i in range(n_row):
            insert_i = (n_row + 1) * i
            hsic_array = np.insert(hsic_array, insert_i, 1.0)
        self.hsic_lasso = hsic_array.reshape(n_row, -1)
        return self.hsic_lasso