Example 1
 def best_rp_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl, y_train)
     X_test_transformed = rp.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
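Note that the snippet above re-ranks the test components by the test set's own kurtosis, so column k of the saved train and test files may not correspond to the same projection. A minimal sketch of keeping the two splits consistent by reusing the training-set ranking (same variable names as above, before the top-2 slicing):

order = kurtosis(X_train_transformed).argsort()[::-1]   # rank on the training data only
X_train_top2 = X_train_transformed[:, order[:2]]        # apply the identical column order
X_test_top2 = X_test_transformed[:, order[:2]]          # ... to the test projection as well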
Example 2
    def ica_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)
        
        ##
        ## ICA
        ##
        ica = FastICA(n_components=X_train_scl.shape[1])
        X_ica = ica.fit_transform(X_train_scl)
        
        ##
        ## Plots
        ##
        ph = plot_helper()

        kurt = kurtosis(X_ica)
        print(kurt)
        
        title = 'Kurtosis (FastICA) for ' + data_set_name
        name = data_set_name.lower() + '_ica_kurt'
        filename = './' + self.out_dir + '/' + name + '.png'
        
        ph.plot_simple_bar(np.arange(1, len(kurt)+1, 1),
                           kurt,
                           np.arange(1, len(kurt)+1, 1).astype('str'),
                           'Feature Index',
                           'Kurtosis',
                           title,
                           filename)
Example 3
def processing(df):
    dummies_df = pd.get_dummies(df["City Group"])

    def add_CG(name):
        return "CG_" + name

    dummies_df = dummies_df.rename(columns=add_CG)
    # print dummies_df.head()
    df = pd.concat([df, dummies_df.iloc[:, 0]], axis=1)

    dummies_df = pd.get_dummies(df["Type"])

    def add_Type(name):
        return "Type_" + name

    dummies_df = dummies_df.rename(columns=add_Type)
    df = pd.concat([df, dummies_df.iloc[:, 0:3]], axis=1)

    # try to put in age as a column
    def add_Age(string):
        age = datetime.datetime.now() - datetime.datetime.strptime(string, "%m/%d/%Y")
        return age.days

    df["Age"] = df["Open Date"].map(add_Age)
    df = df.drop(["Id", "Open Date", "City", "City Group", "Type", "revenue"], axis=1)
    # scaler = StandardScaler().fit(df)
    scaler = RobustScaler().fit(df)
    df = scaler.transform(df)

    # print df.head()
    return df
Example 4
File: tpot.py Project: vsolano/tpot
    def _robust_scaler(self, input_df):
        """Uses Scikit-learn's RobustScaler to scale the features using statistics that are robust to outliers

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to scale

        Returns
        -------
        scaled_df: pandas.DataFrame {n_samples, n_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the scaled features

        """
        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        if len(training_features.columns.values) == 0:
            return input_df.copy()

        # The scaler must be fit on only the training data
        scaler = RobustScaler()
        scaler.fit(training_features.values.astype(np.float64))
        scaled_features = scaler.transform(input_df.drop(['class', 'group', 'guess'], axis=1).values.astype(np.float64))

        for col_num, column in enumerate(input_df.drop(['class', 'group', 'guess'], axis=1).columns.values):
            input_df.loc[:, column] = scaled_features[:, col_num]

        return input_df.copy()
Example 5
def scale_feature_matrix(feature_M, linear=False, outliers=False):
    from sklearn.preprocessing import StandardScaler, RobustScaler
    import numpy as np
    
    binary_fields = [col for col in feature_M.columns if len(set(feature_M[col])) == 2]
            
    if outliers:
        # Center on the median and scale by the IQR (robust to outliers)
        scaler_obj = RobustScaler()
        print('centering around median')

    else:
        # Scale to zero mean & unit variance
        scaler_obj = StandardScaler()
        print('centering around mean')

    print('found these binaries')
    print('-' * 10)
    print('\n'.join(binary_fields))

        
    X_scaled = scaler_obj.fit_transform(feature_M.drop(binary_fields, axis=1))
    X_scaled_w_cats = np.c_[X_scaled, feature_M[binary_fields].values]
    
    return X_scaled_w_cats, scaler_obj
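The same split (robust-scale the continuous columns, pass the binary flags through untouched) can also be written with scikit-learn's ColumnTransformer; a minimal sketch under the same assumptions about feature_M:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler

def scale_with_column_transformer(feature_M):
    # columns with exactly two distinct values are treated as binary flags
    binary_fields = [col for col in feature_M.columns if feature_M[col].nunique() == 2]
    continuous_fields = [col for col in feature_M.columns if col not in binary_fields]

    ct = ColumnTransformer([('robust', RobustScaler(), continuous_fields)],
                           remainder='passthrough')  # binary columns are appended unscaled
    return ct.fit_transform(feature_M), ct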
Example 6
def num_scaler(d_num,t_num):
    scl = RobustScaler()
    scl.fit(d_num)
    d_num = scl.transform(d_num)
    t_num = scl.transform(t_num)
    
    return d_num, t_num
Example 7
 def best_ica_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ica = FastICA(n_components=X_train_scl.shape[1])
     X_train_transformed = ica.fit_transform(X_train_scl, y_train)
     X_test_transformed = ica.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/wine_ica_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example 8
 def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     
     ks = []
     for i in range(1000):
         ##
         ## Random Projection
         ##
         rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
         rp.fit(X_train_scl)
         X_train_rp = rp.transform(X_train_scl)
         
         ks.append(kurtosis(X_train_rp))
         
     mean_k = np.mean(ks, 0)
         
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     title = 'Kurtosis (Randomized Projection) for ' + data_set_name
     name = data_set_name.lower() + '_rp_kurt'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                        mean_k,
                        np.arange(1, len(mean_k)+1, 1).astype('str'),
                        'Feature Index',
                        'Kurtosis',
                        title,
                        filename)
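GaussianRandomProjection draws a new random matrix on every fit, so the per-component kurtosis changes from run to run; that is why the loop above averages 1000 draws. If the average needs to be reproducible, each draw can be seeded (a small variation on the loop above, not the original code):

ks = []
for i in range(1000):
    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1], random_state=i)
    ks.append(kurtosis(rp.fit_transform(X_train_scl)))
mean_k = np.mean(ks, 0)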
Example 9
 def nn_wine_orig(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     self.part4.nn_analysis(X_train_scl, X_test_scl, y_train, y_test, 'Wine', 'Neural Network Original')
Example 10
def standardize_columns(data):
    """
    We decided to standardize the weather features due to outliers.
    """
    columns_to_standardize = ['temp', 'atemp', 'humidity', 'windspeed']
    robust_scaler = RobustScaler()

    for column in columns_to_standardize:
        # sklearn scalers expect 2-D input, so pass a one-column frame
        data[[column]] = robust_scaler.fit_transform(data[[column]])
    return data
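Because RobustScaler computes its median and IQR per column anyway, refitting it inside the loop is equivalent to a single call over all four columns; the one-call form (a sketch, not the project's code) is:

scaler = RobustScaler()
data[columns_to_standardize] = scaler.fit_transform(data[columns_to_standardize])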
Example 11
 def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     scores = []
     train_scores = []
     rng = range(1, X_train_scl.shape[1]+1)
     for i in rng:
         lda = LinearDiscriminantAnalysis(n_components=i)
         cv = KFold(X_train_scl.shape[0], 3, shuffle=True)
         
         # cross validation
         cv_scores = []
         for (train, test) in cv:
             lda.fit(X_train_scl[train], y_train[train])
             score = lda.score(X_train_scl[test], y_train[test])
             cv_scores.append(score)
         
         mean_score = np.mean(cv_scores)
         scores.append(mean_score)
         
         # train score
         lda = LinearDiscriminantAnalysis(n_components=i)
         lda.fit(X_train_scl, y_train)
         train_score = lda.score(X_train_scl, y_train)
         train_scores.append(train_score)
         
         print(i, mean_score)
         
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (LDA) for ' + data_set_name
     name = data_set_name.lower() + '_lda_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(rng,
                    [scores, train_scores],
                    [None, None],
                    ['cross validation score', 'training score'],
                    cm.viridis(np.linspace(0, 1, 2)),
                    ['o', '*'],
                    title,
                    'n_components',
                    'Score',
                    filename)
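The KFold call above uses the pre-0.18 sklearn.cross_validation signature (n_samples, n_folds) and iterates over the splitter directly. With the current sklearn.model_selection API the same cross-validation loop would read as follows (a sketch; note that LDA caps n_components at n_classes - 1, so the upper end of rng may need trimming):

from sklearn.model_selection import KFold

cv = KFold(n_splits=3, shuffle=True)
cv_scores = []
for train_idx, test_idx in cv.split(X_train_scl):
    lda = LinearDiscriminantAnalysis(n_components=i)
    lda.fit(X_train_scl[train_idx], y_train[train_idx])
    cv_scores.append(lda.score(X_train_scl[test_idx], y_train[test_idx]))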
Example 12
def demensionReduction(numFeatures,cateFeatures):
    """

    :param numFeatures:
    :param labels:
    :return:
    """
    scaler = RobustScaler()
    scaledFeatures = scaler.fit_transform(numFeatures)
    pca = PCA(n_components=5)
    reducedFeatures = pca.fit_transform(scaledFeatures)
    allFeatures = np.concatenate((reducedFeatures,cateFeatures),axis=1)
    return allFeatures
Example 13
def test_robustscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.RobustScaler
    # with sklearn.preprocessing.RobustScaler

    robustscalerr = RobustScalerR()
    robustscalerr.fit(np.concatenate(trajs))

    robustscaler = RobustScaler()
    robustscaler.fit(trajs)

    y_ref1 = robustscalerr.transform(trajs[0])
    y1 = robustscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example 14
 def best_lda_cluster_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## K-Means
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)
     X_test_transformed = km.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
     
     ##
     ## GMM
     ##
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     gmm.fit(X_train_scl)
     # GaussianMixture has no transform(); use its soft cluster assignments instead
     X_train_transformed = gmm.predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example 15
def transform_dataframe(dataframe):

    """
    Function to read a dataframe and scale every column robustly
    (centering on the median and scaling by the IQR)

    Parameters:
        dataframe : Input pandas dataframe
    Input types: pd.DataFrame
    Output types: pd.DataFrame

    """
    cols = [col for col in dataframe.columns]
    robust_scaler = RobustScaler()
    scaled = robust_scaler.fit_transform(dataframe[cols])
    # write the scaled values back into the original columns
    dataframe[cols] = scaled
    return dataframe
Example 16
    def scale(self,columns,categorical_cols,apply_list,target_column):
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler()

        if apply_list:
            numerical_cols = columns
        else:
            numerical_cols = []
            for col in self.dataset.columns.values:
                if col not in categorical_cols:
                    numerical_cols.append(col)
                else:
                    pass
        # We don't want to scale the target variable, as it is already binary.
        # The target column uses the same value as target_value from Split Data section
        # in the settings popup.
        numerical_cols.remove(target_column)
        # Scale, fit and transform all the numerical columns
        scaled_data = scaler.fit_transform(self.dataset[numerical_cols])
        self.dataset[numerical_cols] = scaled_data
        return self.dataset
Example 17
 def best_lda_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     lda = LinearDiscriminantAnalysis(n_components=2)
     X_train_transformed = lda.fit_transform(X_train_scl, y_train)
     X_test_transformed = lda.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/nba_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example 18
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation
    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)
    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_
    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)
    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)
    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
Example 19
 def best_pca_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     pca = PCA(n_components=3)
     X_train_transformed = pca.fit_transform(X_train_scl, y_train)
     X_test_transformed = pca.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_pca_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example 20
    def __init__(self, *args, scale=False, center=False, **kwargs):
        """
        A machine learned model.  Beyond :class:`revscoring.Model`, this
        "Learned" model implements
        :func:`~revscoring.scoring.models.Learned.fit` and
        :func:`~revscoring.scoring.models.Learned.cross_validate`.
        """
        super().__init__(*args, **kwargs)
        self.trained = None
        if scale or center:
            self.scaler = RobustScaler(with_centering=center,
                                       with_scaling=scale)
        else:
            self.scaler = None

        self.params.update({
            'scale': scale,
            'center': center
        })
Example 21
       u'minutes', u'movie', u'movies', u'music', u'need', u'new',
       u'nolan', u'old', u'opinion', u'original', u'oscar', u'overall',
       u'people', u'perfect', u'performance', u'performances', u'picture',
       u'place', u'played', u'plot', u'point', u'pretty', u'probably',
       u'quite', u'read', u'real', u'really', u'reason', u'right', u'role',
       u'said', u'saw', u'say', u'scene', u'scenes', u'score', u'screen',
       u'script', u'second', u'seeing', u'seen', u'sense', u'set',
       u'shows', u'simply', u'special', u'special effects', u'star',
       u'star wars', u'start', u'story', u'sure', u'takes', u'thats',
       u'theres', u'thing', u'things', u'think', u'thought', u'time',
       u'times', u'trilogy', u'true', u'truly', u'trying', u'understand',
       u'use', u'used', u'violence', u'want', u'war', u'wars', u'wasnt',
       u'watch', u'watched', u'watching', u'way', u'wife', u'wonderful',
       u'work', u'world', u'worth', 'year_tfidf', u'years', u'young']
X_prescale = X[features_to_scale]
X_scaled = RobustScaler().fit_transform(X_prescale)
X_scaled = pd.DataFrame(X_scaled, columns = features_to_scale, index = X_prescale.index)
X_final_scaled = X_scaled.join(X[features_to_not_scale])

X_final_scaled.info()
X.info()
#Train Test Split the scaled data

X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_final_scaled, y, test_size = .2, random_state = 31)

#So what is the baseline prediction?
print(y.mean())
y.value_counts()

baseline_not10 = (1-y[y== 10].count()/float(y.count()))
Example 22
class HousePrices(object):
    seq2 = pd.Series(np.arange(2))

    #Static class models.
    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    ENet = make_pipeline(RobustScaler(),
                         ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
                                 gamma=0.0468,
                                 learning_rate=0.05,
                                 max_depth=3,
                                 min_child_weight=1.7817,
                                 n_estimators=2200,
                                 reg_alpha=0.4640,
                                 reg_lambda=0.8571,
                                 subsample=0.5213,
                                 silent=1,
                                 random_state=7,
                                 nthread=-1)
    GBoost = GradientBoostingRegressor(n_estimators=3000,
                                       learning_rate=0.05,
                                       max_depth=4,
                                       max_features='sqrt',
                                       min_samples_leaf=15,
                                       min_samples_split=10,
                                       loss='huber',
                                       random_state=5)
    model_lgb = lgb.LGBMRegressor(objective='regression',
                                  num_leaves=5,
                                  learning_rate=0.05,
                                  n_estimators=720,
                                  max_bin=55,
                                  bagging_fraction=0.8,
                                  bagging_freq=5,
                                  feature_fraction=0.2319,
                                  feature_fraction_seed=9,
                                  bagging_seed=9,
                                  min_data_in_leaf=6,
                                  min_sum_hessian_in_leaf=11)
    KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

    #Constructor
    def __init__(self, trainData, testData):
        self.trainData = trainData
        self.testData = testData

    def dataImport(self):
        self.train = pd.read_csv(self.trainData)
        self.test = pd.read_csv(self.testData)
        self.train_Id = self.train['Id']
        self.test_Id = self.test['Id']
        self.train.drop("Id", axis=1, inplace=True)
        self.test.drop("Id", axis=1, inplace=True)

    def display(self):
        print(len(self.train.columns))
        fig, ax = plt.subplots()
        ax.scatter(x=self.train['GrLivArea'], y=self.train['SalePrice'])
        plt.ylabel('SalePrice', fontsize=13)
        plt.xlabel('GrLivArea', fontsize=13)
        #plt.show()

        # corrmat = self.train.corr()
        # f, ax = plt.subplots(figsize=(12, 9))
        # sns.heatmap(self.corrmat, vmax=.8, square=True);
        plt.show()

        # sns.distplot(self.train['SalePrice'] , fit=norm);

        # # Get the fitted parameters used by the function
        # (mu, sigma) = norm.fit(self.train['SalePrice'])
        # print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

        # #Now plot the distribution
        # plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best')
        # plt.ylabel('Frequency')
        # plt.title('SalePrice distribution')

        # #Get also the QQ-plot
        # fig = plt.figure()
        # res = stats.probplot(self.train['SalePrice'], plot=plt)
        # plt.show()

        # f, ax = plt.subplots(figsize=(15, 12))
        # plt.xticks(rotation='90')
        # sns.barplot(x=self.all_data_na.index, y=self.all_data_na)
        # plt.xlabel('Features', fontsize=15)
        # plt.ylabel('Percent of missing values', fontsize=15)
        # plt.title('Percent missing data by feature', fontsize=15)

        #plt.show()

    def removeOutliers(self):
        self.train = self.train.drop(
            self.train[(self.train['GrLivArea'] > 4000)
                       & (self.train['SalePrice'] < 300000)].index)

    def preProcess(self):
        self.removeOutliers()

        self.train['SalePrice'] = np.log1p(self.train['SalePrice'])
        self.ntrain = self.train.shape[0]
        self.ntest = self.test.shape[0]
        self.y_train = self.train.SalePrice.values
        self.all_data = pd.concat(
            (self.train, self.test)).reset_index(drop=True)
        self.all_data.drop(['SalePrice'], axis=1, inplace=True)
        print("all_data size is : {}".format(self.all_data.shape))

        self.all_data_na = (self.all_data.isnull().sum() /
                            len(self.all_data)) * 100
        self.all_data_na = self.all_data_na.drop(
            self.all_data_na[self.all_data_na == 0].index).sort_values(
                ascending=False)[:30]
        self.missing_data = pd.DataFrame({'Missing Ratio': self.all_data_na})

        self.preprocessCategoricalColumns()
        self.preProcessNumericalColumns()

    def preprocessCategoricalColumns(self):
        #Converting PoolQC column to categorical and then using a probability distribution to fill the None values.

        print("Total Number of values ", self.all_data['PoolQC'].shape[0])
        print("Number of Null Values", self.all_data['PoolQC'].isna().sum())

        #
        #				PoolQC
        #
        #

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["PoolQC"] = self.all_data.PoolQC.fillna("None")
        self.all_data['PoolQC'] = pd.Categorical(self.all_data.PoolQC)

        # (2) Finding probabilities of each occurrence

        print("Before filling :")
        print(self.all_data['PoolQC'].value_counts())

        self.poolQC_probabilities = [
            0.98, 0.006666667, 0.006666667, 0.006666667
        ]
        self.poolQC_Values = ['None', 'Gd', 'Fa', 'Ex']
        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['PoolQC'] == 'None'].index

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices,
                           65] = np.random.choice(self.poolQC_Values,
                                                  len(self.indices),
                                                  p=self.poolQC_probabilities)

        print("After filling :")
        print(self.all_data.PoolQC.value_counts())

        ############################################################################################

        #
        #				MiscFeature
        #
        #
        #Number of Missing values in MiscFeature
        self.all_data.MiscFeature.isna().sum(
        )  #  1404 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["MiscFeature"] = self.all_data['MiscFeature'].fillna(
            "None")
        self.all_data['MiscFeature'] = pd.Categorical(
            self.all_data['MiscFeature'])
        self.all_data.MiscFeature = self.all_data.MiscFeature.astype(
            'category')

        # print("Before Filling :")
        # print(self.all_data['MiscFeature'].value_counts())

        # (2) Finding probabilities of each occurrence
        print(self.all_data['MiscFeature'].value_counts())
        self.MiscFeature_probabilities = [
            0.962962963, 0.033607682, 0.001371742, 0.001371742, 0.000685871
        ]
        self.MiscFeature_Values = ['None', 'Shed', 'Othr', 'Gar2', 'TenC']

        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['MiscFeature'] ==
                                     'None'].index
        #Find the column index so as to use 'iloc'   . 56 is the col
        np.argwhere(self.all_data.columns == 'MiscFeature')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices, 56] = np.random.choice(
            self.MiscFeature_Values,
            len(self.indices),
            p=self.MiscFeature_probabilities)

        # print("After filling")
        # print(self.all_data["MiscFeature"].value_counts())

        ############################################################################################

        #
        #				Alley
        #
        #

        #Number of Missing values in Alley
        self.all_data['Alley'].isna().sum()  #  1367 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["Alley"] = self.all_data['Alley'].fillna("None")
        self.all_data['Alley'] = pd.Categorical(self.all_data['Alley'])

        # (2) Finding probabilities of each occurrence

        print("Before filling :")
        print(self.all_data['Alley'].value_counts())

        # Count of 'None' : 1367
        # Count of 'Grvl' : 50
        # Count of 'Pave' : 41

        self.Alley_probabilities = [0.937585734, 0.034293553, 0.028120713]
        self.Alleyy_Values = ['None', 'Grvl', 'Pave']

        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['Alley'] == 'None'].index
        #Find the column index so as to use 'iloc'   . 3 is the col
        np.argwhere(self.all_data.columns == 'Alley')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices,
                           3] = np.random.choice(self.Alleyy_Values,
                                                 len(self.indices),
                                                 p=self.Alley_probabilities)
        print("gg")
        self.all_data['Alley'].value_counts()

        print("After filling :")
        print(self.all_data['Alley'].value_counts())

        ###########################################################################################

        #
        #				Fence
        #
        #

        #Number of Missing values in Fence
        self.all_data['Fence'].isna().sum()  #  1177 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["Fence"] = self.all_data['Fence'].fillna("None")
        self.all_data['Fence'] = pd.Categorical(self.all_data['Fence'])

        # (2) Finding probabilities of each occurrence

        print("Before filling :")
        print(self.all_data['Fence'].value_counts())

        # Count of 'None' : 1177
        # Count of 'MnPrv' : 157
        # Count of 'GdPrv' : 59
        # Count of 'GdWo' : 54
        # Count of 'MnWw' : 11

        self.Fence_probabilities = [
            0.807270233, 0.107681756, 0.040466392, 0.037037037, 0.007544582
        ]
        self.Fence_Values = ['None', 'MnPrv', 'GdPrv', 'GdWo', 'MnWw']
        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['Fence'] == 'None'].index
        #Find the column index so as to use 'iloc'   . 25 is the col
        np.argwhere(self.all_data.columns == 'Fence')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices,
                           25] = np.random.choice(self.Fence_Values,
                                                  len(self.indices),
                                                  p=self.Fence_probabilities)

        print("After filling :")
        print(self.all_data['Fence'].value_counts())

        #########################################################################################

        #
        #				FirePlaceQu
        #
        #

        #Number of Missing values in FireplaceQu
        self.all_data['FireplaceQu'].isna().sum(
        )  #  690 Null values in this column

        #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN
        # as one of the values  in the categorical column.

        # (1) Filling NaN with None values and make  the column categorical
        self.all_data["FireplaceQu"] = self.all_data['FireplaceQu'].fillna(
            "None")
        self.all_data['FireplaceQu'] = pd.Categorical(
            self.all_data['FireplaceQu'])

        # (2) Finding probabilities of each occurrence
        print("Before filling :")
        print(self.all_data['FireplaceQu'].value_counts())

        # Count of 'None' : 690
        # Count of 'Gd' : 378
        # Count of 'TA' : 313
        # Count of 'Fa' : 33
        # Count of 'Ex' : 24
        # Count of 'Po' : 20

        self.FireplaceQu_probabilities = [
            0.473251029, 0.259259259, 0.214677641, 0.022633745, 0.016460905,
            0.013717421
        ]
        self.FireplaceQu_Values = ['None', 'Gd', 'TA', 'Fa', 'Ex', 'Po']

        #We need to replace only the 'None' type. Generating a sample from probability distribution
        self.indices = self.all_data[self.all_data['FireplaceQu'] ==
                                     'None'].index

        #Find the column index so as to use 'iloc'   . 26 is the col
        np.argwhere(self.all_data.columns == 'FireplaceQu')

        # (3) Use a distribution to fill out "None" values now.
        self.all_data.iloc[self.indices, 26] = np.random.choice(
            self.FireplaceQu_Values,
            len(self.indices),
            p=self.FireplaceQu_probabilities)

        print("After filling :")
        print(self.all_data['FireplaceQu'].value_counts())

        ###########################################################################################

        #
        #				LotFrontage
        #
        #
        '''
		Assuming houses belonging to the same Neighborhood will have similar LotFrontage, we groupby Neighborhood
		and then take mean for each locality. Then we substitute the missing values of a particular Neighborhood with
		the mean of that Neighborhood
		'''

        self.lotFrontage_df = self.all_data[['Neighborhood',
                                             'LotFrontage']].copy()
        self.groupby_Neighborhood = self.lotFrontage_df.groupby('Neighborhood')

        self.indices = self.all_data[self.all_data['LotFrontage'].isna()].index

        self.mean_Neighborhood = self.groupby_Neighborhood.mean()
        self.mean_Neighborhood.head()

        for i in self.indices:
            self.locality = self.all_data.iloc[i, 59]
            self.value = self.mean_Neighborhood.get_value(
                self.locality, 'LotFrontage')
            self.all_data.iloc[i, 49] = self.value

        ###########################################################################################

        #
        #
        #	 (6)GarageYrBlt (7) GarageArea (8) GarageCar
        #
        #   (9)GarageType (10) GarageFinish (11) GarageQual (12)GarageCond

        for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
            self.all_data[col] = self.all_data[col].fillna(0)

        self.all_data['GarageType'] = self.all_data['GarageType'].fillna(
            'None')
        self.all_data['GarageFinish'] = self.all_data['GarageFinish'].fillna(
            'None')
        self.all_data['GarageQual'] = self.all_data['GarageQual'].fillna(
            'None')
        self.all_data['GarageCond'] = self.all_data['GarageCond'].fillna(
            'None')

        for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                    'BsmtFullBath', 'BsmtHalfBath'):
            self.all_data[col] = self.all_data[col].fillna(0)

        for col in ('BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                    'BsmtQual'):
            self.all_data[col] = self.all_data[col].fillna('None')

        #############################################################################################

        #
        #
        #	 Electrical , Exterior1st,Exterior2nd,SaleType,KitchenQual
        #
        #

        #Electrical has only 1 Null value, hence replacing it by the most frequently occurring value, i.e. the mode of the column

        self.all_data['Electrical'] = self.all_data['Electrical'].fillna(
            self.all_data['Electrical'].mode()[0])

        #Similarly for Exterior1st, Exterior2nd,SaleType and KitchenQual
        self.all_data['Exterior1st'] = self.all_data['Exterior1st'].fillna(
            self.all_data['Exterior1st'].mode()[0])
        self.all_data['Exterior2nd'] = self.all_data['Exterior2nd'].fillna(
            self.all_data['Exterior2nd'].mode()[0])
        self.all_data['KitchenQual'] = self.all_data['KitchenQual'].fillna(
            self.all_data['KitchenQual'].mode()[0])
        self.all_data['SaleType'] = self.all_data['SaleType'].fillna(
            self.all_data['SaleType'].mode()[0])

        ##############################################################################################

        #
        #
        #
        #    'MasVnrArea','MasVnrType' and other columns
        #
        #

        self.indices = self.all_data[self.all_data['MasVnrArea'] == 0].index

        self.all_data['MasVnrArea'] = self.all_data['MasVnrArea'].fillna(0)
        self.all_data['MasVnrType'] = self.all_data['MasVnrType'].fillna(
            'None')
        self.all_data = self.all_data.drop(['Utilities'], axis=1)

        self.all_data["Functional"] = self.all_data["Functional"].fillna("Typ")
        self.all_data['MSSubClass'] = self.all_data['MSSubClass'].fillna(
            "None")

        ##############################################################################################

        # Hence there are no remaining columns with missing values.

        # MSSubClass is categorical as only a certain set of numbers are appearing. Hence converting it to categorical

        # OverallCond is categorical as only a certain set of numbers are appearing. Hence converting it to categorical

        self.all_data['MSSubClass'].unique()
        #array([ 20, 180,  60,  80,  50,  75,  30,  70,  90, 120,  45, 190,  85,  160,  40])

        self.all_data['MSSubClass'] = self.all_data['MSSubClass'].apply(str)

        self.all_data['OverallCond'].unique()
        #array([6, 5, 7, 8, 3, 4, 9, 2, 1])

        self.all_data['OverallCond'] = self.all_data['OverallCond'].apply(str)

        #Unlike YearBuilt, YrSold takes only a small set of values, so convert it to categorical.
        self.all_data['YrSold'].unique()
        #array([2008, 2006, 2010, 2007, 2009])

        self.all_data['YrSold'] = self.all_data['YrSold'].astype(str)

        #Similarly for MonthSold ie MoSold
        self.all_data['MoSold'].unique()
        #array([ 5,  6,  3,  4, 12,  7,  8, 11,  1, 10,  2,  9])

        self.all_data['MoSold'] = self.all_data['MoSold'].astype(str)

        # Linear regression works only on columns with numeric values, so use LabelEncoder to convert
        # the categorical columns to numeric values

        #Set of columns which have categorical values:

        self.columns = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual',
                        'GarageCond', 'ExterQual', 'ExterCond', 'HeatingQC',
                        'PoolQC', 'KitchenQual', 'BsmtFinType1',
                        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
                        'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive',
                        'Street', 'Alley', 'CentralAir', 'MSSubClass',
                        'OverallCond', 'YrSold', 'MoSold')

        for column in self.columns:
            self.lbl = LabelEncoder()
            self.lbl.fit(list(self.all_data[column].values))
            self.all_data[column] = self.lbl.transform(
                list(self.all_data[column].values))

        # skewness = skewness[abs(skewness) > 0.75]
        # print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

        # from scipy.special import boxcox1p
        # self.skewed_features = skewness.index
        # lam = 0.15
        # for feat in self.skewed_features:
        #     #all_data[feat] += 1
        #     self.all_data[feat] = boxcox1p(self.all_data[feat], self.lam)

        # This will map the labels of categorical data to 0,1,2,3 etc.
        self.all_data = pd.get_dummies(self.all_data)

    def preProcessNumericalColumns(self):
        #These features are positively correlated with the salePrice hence creating new features by
        #taking 3 polynomials square, cube and square root

        # Taking the top 10 correlated values.

        # OverallQual    0.817315
        # GrLivArea      0.715624
        # GarageCars     0.687771
        # GarageArea     0.662332
        # TotalBsmtSF    0.637558
        # 1stFlrSF       0.608198
        # FullBath       0.582020
        # YearBuilt      0.572574

        # Total square footage matters, so add a TotalSF feature
        self.all_data[
            'TotalSF'] = self.all_data['TotalBsmtSF'] + self.all_data[
                '1stFlrSF'] + self.all_data['2ndFlrSF']

        self.all_data["OverallQual-s2"] = self.all_data["OverallQual"]**2
        self.all_data["OverallQual-s3"] = self.all_data["OverallQual"]**3
        self.all_data["OverallQual-Sq"] = np.sqrt(self.all_data["OverallQual"])

        self.all_data["GrLivArea-s2"] = self.all_data["GrLivArea"]**2
        self.all_data["GrLivArea-s3"] = self.all_data["GrLivArea"]**3
        self.all_data["GrLivArea-Sq"] = np.sqrt(self.all_data["GrLivArea"])

        self.all_data["GarageCars-s2"] = self.all_data["GarageCars"]**2
        self.all_data["GarageCars-s3"] = self.all_data["GarageCars"]**3
        self.all_data["GarageCars-Sq"] = np.sqrt(self.all_data["GarageCars"])

        self.all_data["GarageArea-s2"] = self.all_data["GarageArea"]**2
        self.all_data["GarageArea-s3"] = self.all_data["GarageArea"]**3
        self.all_data["GarageArea-Sq"] = np.sqrt(self.all_data["GarageArea"])

        self.all_data["TotalBsmtSF-s2"] = self.all_data["TotalBsmtSF"]**2
        self.all_data["TotalBsmtSF-s3"] = self.all_data["TotalBsmtSF"]**3
        self.all_data["TotalBsmtSF-Sq"] = np.sqrt(self.all_data["TotalBsmtSF"])

        self.all_data["1stFlrSF-s2"] = self.all_data["1stFlrSF"]**2
        self.all_data["1stFlrSF-s3"] = self.all_data["1stFlrSF"]**3
        self.all_data["1stFlrSF-Sq"] = np.sqrt(self.all_data["1stFlrSF"])

        self.all_data["FullBath-s2"] = self.all_data["FullBath"]**2
        self.all_data["FullBath-s3"] = self.all_data["FullBath"]**3
        self.all_data["FullBath-Sq"] = np.sqrt(self.all_data["FullBath"])

        self.all_data["YearBuilt-s2"] = self.all_data["YearBuilt"]**2
        self.all_data["YearBuilt-s3"] = self.all_data["YearBuilt"]**3
        self.all_data["YearBuilt-Sq"] = np.sqrt(self.all_data["YearBuilt"])

        self.all_data["TotalSF-s2"] = self.all_data["TotalSF"]**2
        self.all_data["TotalSF-s3"] = self.all_data["TotalSF"]**3
        self.all_data["TotalSF-Sq"] = np.sqrt(self.all_data["TotalSF"])

        self.train = self.all_data[:1020]
        self.test = self.all_data[1020:]

        self.all_data.to_csv('./all.csv')

    #Validation function

    def rmsle_cv(self, model):
        #self.n_folds = 5
        self.kf = KFold(5, shuffle=True,
                        random_state=42).get_n_splits(self.train.values)
        self.rmse = np.sqrt(-cross_val_score(model,
                                             self.train.values,
                                             self.y_train,
                                             scoring="neg_mean_squared_error",
                                             cv=self.kf))
        return (self.rmse)

    #Lasso. Best alpha : 0.0005 / 91% accuracy
    def lasso_model(self):
        self.lasso_m = Lasso()
        self.alpha = [0.0005, 0.0003, 0.0007]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.lasso_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.lasso = self.grid_search.best_estimator_
        # #self.lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
        # #self.score = self.rmsle_cv(self.lasso)
        # self.score = self.rmsle_cv(HousePrices.lasso)
        # print("\nLasso score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # ElasticNet. Best Alpha : 0.001  / 91% accuracy.
    def elasticNet(self):
        self.enet_m = ElasticNet()
        self.alpha = [0.0005, 0.0007, 0.001]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.enet_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.enet_m = self.grid_search.best_estimator_

        # #self.ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        # self.score = self.rmsle_cv(HousePrices.ENet)
        # print("ElasticNet score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    #Kernel Ridge regression. Best alpha : .0005 / 79% accuracy
    def kernelRegression(self):
        self.krr_m = KernelRidge()
        self.alpha = [0.0005, 0.0007, 0.001, 0.0006, 0.0001]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.krr_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.krr_m = self.grid_search.best_estimator_

        # #self.KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        # self.score = self.rmsle_cv(HousePrices.KRR)
        # print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    #GradientBoosting. Best alpha : .00065 / 89% accuracy
    def gradientBoosting(self):
        self.gboost_m = GradientBoostingRegressor()
        self.alpha = [0.00068, 0.00065, 0.00066]
        self.param_grid = dict(alpha=self.alpha)
        self.grid_search = GridSearchCV(self.gboost_m,
                                        self.param_grid,
                                        scoring="r2",
                                        cv=10)
        self.grid_result = self.grid_search.fit(self.train, self.y_train)
        print("Best: %f using %s" %
              (self.grid_result.best_score_, self.grid_result.best_params_))
        self.gboost_m = self.grid_search.best_estimator_

        # #self.GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10,loss='huber', random_state =5)
        # self.score = self.rmsle_cv(HousePrices.GBoost)
        # print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std()))

    # XgbRegressor.Best alpha : .0005 / 79% accuracy
    def xgbRegressor(self):
        #self.model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,learning_rate=0.05, max_depth=3,min_child_weight=1.7817, n_estimators=2200,reg_alpha=0.4640, reg_lambda=0.8571,subsample=0.5213, silent=1,random_state =7, nthread = -1)
        self.score = self.rmsle_cv(HousePrices.model_xgb)
        print("Xgboost score: {:.4f} ({:.4f})\n".format(
            self.score.mean(), self.score.std()))

    # LgbRegressor. Best alpha : .0005 / 79% accuracy
    def lgbRegressor(self):
        #model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,learning_rate=0.05, n_estimators=720,max_bin = 55, bagging_fraction = 0.8,bagging_freq = 5, feature_fraction = 0.2319,feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
        self.score = self.rmsle_cv(HousePrices.model_lgb)
        print("LgbRegressor score: {:.4f} ({:.4f})\n".format(
            self.score.mean(), self.score.std()))

    def rmsle(self, y, y_pred):
        return np.sqrt(mean_squared_error(y, y_pred))

    def stackingModels(self):
        #Lasso
        self.lasso_stacking = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        #ElasticNet
        self.ENet_stacking = make_pipeline(
            RobustScaler(),
            ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        #Kernel Ridge regression
        self.KRR_stacking = KernelRidge(alpha=0.6,
                                        kernel='polynomial',
                                        degree=2,
                                        coef0=2.5)
        #GBoost
        self.GBoost_stacking = GradientBoostingRegressor(n_estimators=3000,
                                                         learning_rate=0.05,
                                                         max_depth=4,
                                                         max_features='sqrt',
                                                         min_samples_leaf=15,
                                                         min_samples_split=10,
                                                         loss='huber',
                                                         random_state=5)

        #Lgb
        self.lgb_stacking = lgb.LGBMRegressor(objective='regression',
                                              num_leaves=5,
                                              learning_rate=0.05,
                                              n_estimators=720,
                                              max_bin=55,
                                              bagging_fraction=0.8,
                                              bagging_freq=5,
                                              feature_fraction=0.2319,
                                              feature_fraction_seed=9,
                                              bagging_seed=9,
                                              min_data_in_leaf=6,
                                              min_sum_hessian_in_leaf=11)

        #Stacking
        self.stacked_averaged_models = StackingAveragedModels(
            base_models=(self.ENet_stacking, self.GBoost_stacking,
                         self.KRR_stacking),
            meta_model=self.lasso_stacking)

        self.score = self.rmsle_cv(self.stacked_averaged_models)
        print("Stacking Averaged models score: {:.4f} ({:.4f})".format(
            self.score.mean(), self.score.std()))

        self.stacked_averaged_models.fit(self.train.values, self.y_train)
        self.stacked_train_pred = self.stacked_averaged_models.predict(
            self.train.values)
        self.stacked_pred = np.expm1(
            self.stacked_averaged_models.predict(self.test.values))
        print("RMSE of stacked ")
        print(self.rmsle(self.y_train, self.stacked_train_pred))
Example 23




band_names = ['theta', 'alpha', 'low-beta', 'beta', 'high-beta']
bands = [(4, 8), (8, 12), (12, 16), (16, 20), (20, 24)]
band_dict = dict(zip(band_names, bands))
bands_to_analyse = band_names[:3]


subj_dir = '/home/nikolai/_Work/predict_alpha/!each data/Dasha'

raw, channels, fs = load_p4_data(subj_dir)
for day in []:
    scaler = RobustScaler()
    scaler.fit(raw.loc[raw.day == day, 'p4'])
    scaler.fit(raw.loc[raw.day == day, 'p4'])

for band in bands_to_analyse:
    exp_sm = ExponentialSmoother(0.99)
    env_detector = ButterBandEnvelopeDetector(band_dict[band], fs, exp_sm, 3)
    raw[band] = env_detector.apply(raw['p4'])
    for day in []:
        #raw.loc[raw.day == day, band] -= raw.loc[(raw.day == day) & (raw.block_name == 'fon'), band].quantile(0.05)
        #print('mode', raw.loc[(raw.day == day) & (raw.block_name == 'fon'), band].mode())
        raw.loc[raw.day == day, band] /= raw.loc[(raw.day == day), band].quantile(0.01)

plt.plot(raw.loc[(raw.day == 1) & (raw.block_name == 'fon'), 'p4'], label='day1')
plt.plot(raw.loc[(raw.day == 2) & (raw.block_name == 'fon'), 'p4'], label='day2')
plt.legend()
Example 24
# --------------
#  MINMAXSCALER |
# --------------
# Scaling features to lie between a given minimum and maximum value, often between 0 and 1
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

print("\nMinMaxScalar:" "\n=============" "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
#  ROBUSTSCALER |
# --------------
# This removes the median and scales the data according to the quantile range (25th-75th percentile by default)
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(X_train)
X_test = robust_scaler.transform(X_test)

print("\nRobustScalar:" "\n=============" "\nX_train:", X_train)
print('\nX_test:', X_test)
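As a quick check of what the comment above describes: RobustScaler's default transform is (x - median) / IQR per column, with the IQR taken between the 25th and 75th percentiles. A minimal hand-rolled version (columns with zero IQR are handled specially by sklearn and would divide by zero here; the input name is illustrative):

import numpy as np

X = np.asarray(X_unscaled, dtype=float)        # whatever array is passed to fit_transform
med = np.median(X, axis=0)
iqr = np.percentile(X, 75, axis=0) - np.percentile(X, 25, axis=0)
manual = (X - med) / iqr                       # matches robust_scaler.fit_transform(X) up to float error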

# --------------
#  NORMALIZER   |
# --------------
# Normalize samples individually to unit norm
# Each sample (each row of the data matrix) with at least one non-zero component is rescaled
# independently of other samples so that its norm (l1 or l2) equals 1
normalizer_scaler = Normalizer()
X_train = normalizer_scaler.fit_transform(X_train)
X_test = normalizer_scaler.transform(X_test)
Example 25
    def fit(self, X, y=None):
        self.scaler = RobustScaler()
        self.scaler.fit(X[['thinking_time', 'actual_interval']])

        return self
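Only the fit half of this custom transformer appears here; the matching transform step typically applies the stored scaler to the same two columns and returns the frame. A minimal sketch of that missing half (an assumption, not the original project's code):

    def transform(self, X):
        X = X.copy()
        X[['thinking_time', 'actual_interval']] = self.scaler.transform(
            X[['thinking_time', 'actual_interval']])
        return X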
Example 26
df_train.rename(columns={'Overall.Stage':'Overall_Stage'}, inplace=True)
df_test.rename(columns={'Overall.Stage':'Overall_Stage'}, inplace=True)

public_data = df_train.drop(['Histology', 'Surv_time_months', 'OS', 'deadstatus.event','Overall_Stage'], axis=1)
PA_data = df_test.drop(['Histology', 'Surv_time_months', 'OS', 'deadstatus.event','Overall_Stage'], axis=1)

public_labels = df_train.Histology
PA_labels = df_test.Histology

encoder = LabelEncoder()

#Scalers

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
scalers_to_test = [StandardScaler(), RobustScaler(), MinMaxScaler(), None]

df = pd.DataFrame()

# Designate distributions to sample hyperparameters from 
R = np.arange(0.1, 10, 0.2) 
n_features_to_test = np.arange(1, 11)


for i in range(1, 21):

       #Train test split
       X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3, 
       stratify=public_labels, random_state=i*500)

       #Vectorize the labels
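The snippet breaks off before showing how scalers_to_test, R and n_features_to_test are used. A common pattern (an assumption here, with placeholder estimators, not the original code) is to grid-search over the scaler step of a Pipeline together with the other hyperparameters:

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest()),
                 ('clf', SVC())])
param_grid = {'scaler': scalers_to_test,          # None (or 'passthrough') skips scaling
              'selector__k': n_features_to_test,
              'clf__C': R}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, encoder.fit_transform(y_train))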
Example 27
    def generate_batch_data(self, raw_data, name, timesteps=24):
        # if "data_processed"==name or "wind_direction" == name or "wind_speed" == name:
        #     raw_data = self.one_hot_encoding(raw_data)
        raw_data = pd.DataFrame(raw_data)
        value = raw_data.values
        print('feature ------------ ', name.upper())

        if self.scaler_type == 'standard':
            scaler = StandardScaler()
        if self.scaler_type == 'robust':
            scaler = RobustScaler()
        if self.scaler_type == 'min_max':
            scaler = MinMaxScaler(feature_range=(0, 1))

        scaler = scaler.fit(value)
        normalized = scaler.transform(value)
        data = normalized

        print('Max: %f, Min: %f, Format: %d*%d' %
              (np.amax(data), np.amin(data), data.shape[0], data.shape[1]))
        # data = pd.DataFrame(data)
        # print(data)

        if name != 'target':
            input_serise = data[:(len(data) - 24 * 11)]
            x_batches = np.array([])
        else:
            target_serise = self.shift(data, -(timesteps)).astype(np.float32)
            y_batches = np.array([])

        # check if file exists
        if (self.scaler_type is None):
            seq_file_name = "test_np_processed_" + name + "_" + str(
                timesteps) + "_" + str(self.pca) + "_" + str(
                    self.normal) + ".npz"
        else:
            seq_file_name = "test_np_" + self.scaler_type + "_processed_" + name + "_" + str(
                timesteps) + "_" + str(self.pca) + "_" + str(
                    self.normal) + ".npz"

        if os.path.isfile("data_log/" + seq_file_name):
            npzfile = np.load("data_log/" + seq_file_name)
            if name != 'target':
                input_batches = npzfile['arr_0']
                ret = input_batches
            else:
                target_batches = npzfile['arr_0']
                ret = target_batches
            return ret, scaler
        else:
            for i in range(783):
                try:
                    if name != 'target':
                        x_batches = np.append(
                            x_batches,
                            input_serise[i * 11:(i + timesteps) * 11].reshape(
                                -1, timesteps, 11))
                    else:
                        y_batches = np.append(
                            y_batches, target_serise[i:i + timesteps].reshape(
                                -1, timesteps))
                except ValueError:
                    break

            if name != 'target':
                x_batches = x_batches.reshape(-1, timesteps, 11)
                np.savez("data_log/" + seq_file_name, x_batches)
                return x_batches, scaler
            else:
                y_batches = y_batches.reshape(-1, timesteps)
                np.savez("data_log/" + seq_file_name, y_batches)
                return y_batches, scaler
Esempio n. 28
0
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from keras.utils import np_utils
ss = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()
rs = RobustScaler()
pca = PCA(n_components=3)

### 1. Data
x, y = load_iris(return_X_y=True)
print(x.shape)  # (150, 4)
print(y.shape)  # (150,)

## 1-1. Split the data
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=77)
print(x_train.shape)  # (120, 4)
print(x_test.shape)  # (30, 4)
print(y_train.shape)  # (120,)
Esempio n. 29
0
    result[col] = abs((result[col].apply(hash))%2**(16))

del result['unique_id']
#%% handle missing value
print ("handle missing data")
result.fillna(result.mean(),inplace=True)



#%% data preprocessing
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler


standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

X_train = robust_scaler.fit_transform(result)
X_train1 = standard_scaler.fit_transform(result)
#%% performance


def performence(clf,train,label,clfName):
    re = cross_validation.ShuffleSplit(train.shape[0],n_iter=10,test_size =0.25,random_state =43)
    
    aucList = []
    accuracyList = []
    for train_index, test_index in re:
        clf.fit(train.iloc[train_index,:],y.iloc[train_index])
        pre_y = clf.predict_proba(train.iloc[test_index,:])  # probability used to compute the AUC
        aucList.append(roc_auc_score(y.iloc[test_index],pre_y[:,1]))
Esempio n. 30
0
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)

score = rmsle_cv(lgb)
lgb.fit(train, y_train)
y_train_pred = np.expm1(lgb.predict(train))
print(f"lightgbm score: {score.mean():.4f} ({score.std():.4f})")
LGB = np.expm1(lgb.predict(test))

lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0004, random_state=1))
score = rmsle_cv(lasso)
print(f"Lasso score: {score.mean():.4f} ({score.std():.4f})")

reg = lasso.fit(train, y_train)
predictions = lasso.predict(test)
LASSO = np.expm1(predictions)

svr = make_pipeline(RobustScaler(), SVR(
    C=20,
    epsilon=0.008,
    gamma=0.0003,
))
score = rmsle_cv(svr)
print(f"SVR score: {score.mean():.4f} ({score.std():.4f})")
Esempio n. 31
0
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

n_folds = 5


def fold_cv(model):
    kf = KFold(n_folds, shuffle=True,
               random_state=42).get_n_splits(train_dummy.values)
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y_values, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)


lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.001, random_state=1))
ENet = make_pipeline(RobustScaler(),
                     ElasticNet(alpha=0.005, l1_ratio=0.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
GBoost = GradientBoostingRegressor(n_estimators=3000,
                                   learning_rate=0.05,
                                   max_depth=4,
                                   max_features='sqrt',
                                   min_samples_leaf=15,
                                   min_samples_split=10,
                                   loss='huber',
                                   random_state=5)
mode_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
                            gamma=0.0468,
                            learning_rate=0.05,
                            max_depth=3,
var_dums = pd.get_dummies(all_data["Variety"])
all_data = all_data.drop(columns="Variety")
all_data = pd.concat([all_data, var_dums], axis=1)

all_data = all_data.drop(columns="Site ID")
all_data = all_data.dropna()
all_data = all_data[all_data["Assessment Score"] != '*']

#split features and target
Y = all_data["Assessment Score"]
X = all_data.drop(columns="Assessment Score")

#scale features
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X)
X = transformer.transform(X)

Y = np.array(Y)
Y[Y == ''] = 0.0
Y = Y.astype(float)

#make dense network model
import neural_net

NeuralNet = neural_net.NeuralNet

#crop_score_model = NeuralNet(X, Y, 6, 256, "r", 20)

#check accuracy
from sklearn.metrics import mean_squared_error
Esempio n. 33
0
    if CD == 1:
        NCH = NK
        SSHP0 = (SNFL, SNBETA, SNS, NT, NCH)
        SSHP1 = (SNFL*SNBETA*SNS, NT, NCH)
        SSHP2 = (SNFL*SNBETA*SNS, NT*NCH)
    if CD == 3:
        NKS = np.int32(np.sqrt(NK))
        NCH = 1
        SSHP0 = (SNFL, SNBETA, SNS, NT, NKS, NKS, NCH)
        SSHP1 = (SNFL*SNBETA*SNS, NT, NKS, NKS, NCH)
        SSHP2 = (SNFL*SNBETA*SNS, NT*NKS*NKS*NCH)

    # scaler dictionary
    SCLRS = {'minmax':MinMaxScaler(feature_range=(0, 1)),
             'standard':StandardScaler(),
             'robust':RobustScaler(),
             'tanh':TanhScaler()}

    try:
        SCDAT = np.load(CWD+'/results/%s.%d.%d.%d.%d.%d.%s.%d.dmp.sc.npy' \
                        % (NAME, NT, NK, CD, SNI, SNS, SCLR, SEED)).reshape(*SSHP1)
        if VERBOSE:
            print('scaled selected classification samples loaded from file')
            print(100*'-')
    except:
        if SCLR == 'none':
            SCDAT = CDAT.reshape(*SSHP1)
        else:
            SCDAT = SCLRS[SCLR].fit_transform(CDAT.reshape(*SSHP2)).reshape(*SSHP1)
        np.save(CWD+'/results/%s.%d.%d.%d.%d.%d.%s.%d.dmp.sc.npy' % (NAME, NT, NK, CD, SNI, SNS, SCLR, SEED), SCDAT.reshape(*SSHP0))
        if VERBOSE:
Esempio n. 34
0
    data['descriptor'] = data['smi'].swifter.apply(descriptors.QEDScore)
elif conf['descriptor'] == 'SYBA_score':
    syba = SybaClassifier()
    syba.fitDefaultScore()
    data['descriptor'] = data['smi'].swifter.apply(syba.predict)
else:
    print('Descriptor not recognised')

X = np.stack(data['descriptor'].values)
Y = np.stack(data['activity'].values)

if isinstance(X[0], float):
    X = np.array([[i] for i in X])

if conf['descriptor'] == 'features':
    scaler = RobustScaler()
    X = scaler.fit_transform(X)

study = optuna.create_study(direction="maximize")
objective = Objective(X, Y, conf)
study.optimize(objective, n_trials=conf['n_trials'])

with open(best_params_file, 'w') as outfile:
    json.dump(study.best_params, outfile)
    
df = study.trials_dataframe()
df.to_csv(out_df_file)

with open(best_value_file, 'w') as outfile:
    outfile.write("Best Trial Value: {}".format(study.best_value))
testy,testx=imdb_bag_of_word_libs.loadFeatsText('./exp/ivectors_imdb_test_NGMM_2048_W_2_DIM_200/feats.txt')

print 'done in',time.time()-ts,len(x),len(y)

y=imdb_bag_of_word_libs.kaldiID_2_LB(y)
print y[0],x[0]


x=np.array(x)
y=np.array(y)



trainx,trainy=x,y

robust_scaler = RobustScaler()
trainx=robust_scaler.fit_transform(trainx)
evalx=robust_scaler.transform(testx)
clf= LinearDiscriminantAnalysis()
clf.fit(trainx,trainy)
predictValue=clf.predict(evalx)

sdict=dict()
ptrue=list()
for id,score in zip(testy,predictValue):
    sdict[id]=score
    #print id,score
    truevalue=int(id.split('_')[2])
    if truevalue>=5:
        ptrue.append('1')
    else:
Esempio n. 36
0
athena = pd.read_csv('./data/UA_AthenaData.csv')

# Drop the encrypted phone number (LineNumber), and the Call category (As labeled by data team)
athena = athena.drop(['LineNumber', 'CallCategory'], axis=1)

# Split into subgroups, as training on the entire dataset breaks my computer
group = np.array_split(athena, 4)

# Iterate through each group
for i in range(len(group)):
    print('======= GROUP {} ======'.format(i))
    subdata = group[i]

    ## Scale the data (RobustScaler centers on the median and scales by the IQR):
    print('Scaling Data')
    scaler = RobustScaler().fit(athena)
    subdata = pd.DataFrame(scaler.transform(subdata),
                           columns=subdata.columns, index=subdata.index)

    ## Reduce data for clustering
    print('Reducing dimensions')
    model = umap.UMAP(n_neighbors=20, min_dist=0.15, metric='braycurtis')
    data_2d = model.fit_transform(subdata)

    print('Clustering Data')
    cluster = DBSCAN(eps=3, min_samples=2).fit(subdata)

    print('Configuring data to clusters')
    subdata['PCA1'] = data_2d[:, 0]
    subdata['PCA2'] = data_2d[:, 1]
    cluster.labels_[cluster.labels_ > 0] = 1
    subdata['cluster'] = cluster.labels_
#%% 
#aa = X.groupby('VisitNumber').groups   
#X_new = pd.DataFrame(columns = X.keys())    
#for key in aa.keys():
#    X_new = X_new.append(X.iloc[aa[key],:].mean(),ignore_index=True)    
#%%    


#%%
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler


standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

X_train = robust_scaler.fit_transform(aa)
X_train1 = standard_scaler.fit_transform(aa)


#%% for the test data

X_test = testData

for col in colName:
    X_test[col] = abs((X_test[col].apply(hash))%2**(16))
    
#%%    
print ("handle missing data")
X_test.fillna(X_test.mean(),inplace=True)
Esempio n. 38
0
print(test_length)
upper_test = test_length + timesteps * 2
testset_length = test_length - upper_train
print(testset_length)
print(upper_train, upper_test, len(df_data_1))

# construct test set

#subsetting
df_data_1_test = df_data_1[upper_train:upper_test]
test_set_y = np.nan_to_num(df_data_1_test['user_ts'].values)
test_set = np.nan_to_num(df_data_1_test.loc[:, :].values)

#scaling
sc = RobustScaler(with_centering=True,
                  with_scaling=True,
                  quantile_range=(25.0, 75.0))
scaled_test_values = sc.fit_transform(np.float64(test_set))
scaled_test_values_y = np.sign(test_set_y.reshape(-1, 1))
#scaled_test_values_y = sc.fit_transform(np.float64(test_set_y.reshape(-1,1)))

#scaled_test_values = np.tanh(np.float64(test_set))
#scaled_test_values_y = np.tanh(np.float64(test_set_y.reshape(-1,1)))

#creating input data
x_test = []
y_test = []
for i in range(timesteps, testset_length + timesteps):
    x_test.append(scaled_test_values[i - timesteps:i, :])
    y_test.append(
        scaled_test_values_y[i:timesteps +
Esempio n. 39
0
def get_evoked_feats(f_list,
                     stim_chan,
                     sig_chan,
                     pre_win=1.,
                     post_win=1.5,
                     thresh=3,
                     t_thresh=0.1):

    all_evoked_burst = None
    IBI = []
    all_evoked_onset = []
    all_prev_onset = []
    stim_lockout_s = 1.

    for f in f_list:
        dat = pyabf.ABF(f)
        stim_id = abf.get_channel_id_by_label(dat, stim_chan)
        sig_id = abf.get_channel_id_by_label(dat, sig_chan)
        sr = dat.dataRate

        scl = RobustScaler()
        Y_cat = cat_sweeps(dat, sig_chan).T.ravel()
        scl.fit(Y_cat[:, np.newaxis])

        for ii in range(dat.sweepCount):
            dat.setSweep(ii, stim_id)
            stim_samp = rlab_signal.binary_onsets(dat.sweepY, 4.)[0]
            dat.setSweep(ii, sig_id)
            #             if sr == 10000:
            #                 print('Downsampling')
            #                 y = dat.sweepY
            #                 y = scipy.signal.decimate(y, 10)
            #                 sr = sr / 10
            #             else:
            #                 y = dat.sweepY
            y = dat.sweepY
            stim_lockout = int(stim_lockout_s * sr)
            yscl = scl.transform(y[:, np.newaxis]).ravel()
            yscl_NN = yscl - np.min(yscl)
            onsets, offsets = burst.detect_burst(yscl,
                                                 sr,
                                                 thresh=thresh,
                                                 t_thresh=t_thresh)
            # onsets, offsets = burst.rm_endpoint_bursts(yscl, onsets, offsets, pre_win * sr, post_win * sr)

            # Get the threshold crossing time of the bursts that happened within a time window of the evoked
            # Used to get the evoked burst shape
            try:
                evoked_onset_idx = np.where(
                    onsets > (stim_samp - int(pre_win / 9. * sr)))[0][0]
                next_onset_idx = evoked_onset_idx + 1
                prev_onset_idx = evoked_onset_idx - 1
                evoked_onset = onsets[evoked_onset_idx]
            except:
                IBI.append(np.nan)
                all_prev_onset.append(np.nan)
                all_evoked_onset.append(np.nan)
                evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1
                                         ]) * np.nan
                if all_evoked_burst is None:
                    all_evoked_burst = evoked_burst
                else:
                    all_evoked_burst = np.concatenate(
                        [all_evoked_burst, evoked_burst], axis=1)

                continue
                # evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1]) * np.nan

            if next_onset_idx > len(onsets) - 1:
                next_onset = np.nan
            else:
                next_onset = onsets[next_onset_idx]

            if prev_onset_idx < 0:
                prev_onset = np.nan
            else:
                prev_onset = onsets[prev_onset_idx]

            # Get the threshold crossing of the second burst after stim (good for IBI)

            if evoked_onset < int(stim_samp + stim_lockout):
                evoked_burst = burst.get_aligned_bursts(
                    yscl_NN, [evoked_onset], int(pre_win * sr),
                    int(post_win * sr))
                IBI.append(next_onset - evoked_onset)
                all_evoked_onset.append(evoked_onset)
                all_prev_onset.append(prev_onset)
            else:
                IBI.append(np.nan)
                all_prev_onset.append(np.nan)
                all_evoked_onset.append(np.nan)
                evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1
                                         ]) * np.nan

            if all_evoked_burst is None:
                all_evoked_burst = evoked_burst
            else:
                all_evoked_burst = np.concatenate(
                    [all_evoked_burst, evoked_burst], axis=1)
    evoked_onset = np.array(all_evoked_onset) / sr
    prev_onset = np.array(all_prev_onset) / sr
    IBI = np.array(IBI) / sr

    return (all_evoked_burst, evoked_onset, prev_onset, IBI)
Esempio n. 40
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sgmcmc_ssm.models.gauss_hmm import GaussHMMSampler
from tqdm import tqdm

np.random.seed(12345)

# Load and Scale Data
from scipy.io import loadmat
ion_data = loadmat('data/alamethicin.mat')

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
observations = scaler.fit_transform(ion_data['originaldata'][1095:-3000])
filtered_observations = scaler.transform(ion_data['filtereddata'])
T = len(observations)

# Plot Data
fig, ax = plt.subplots(1, 1)
ax.plot(np.arange(T)[::50], observations[::50], '-', label='scaled data')
ax.plot(np.arange(T)[::50],
        filtered_observations[::50],
        '-',
        label='scaled filtered data')
ax.set_title('Scaled Ion Data')
ax.set_xlabel('Time')
ax.set_ylabel('Voltage (Scaled)')
ax.legend()
Esempio n. 41
0
print(len(train), len(test))
print('--------------------------------------------------')
print('\n')

df_data['MONTH'] = [d.strftime('%m') for d in df_data.index]

from sklearn.preprocessing import RobustScaler

f_columns = [
    '2_prev', '3_prev', '4_prev', '5_prev', '6_prev', '7_prev', '8_prev',
    '9_prev', '10_prev', '11_prev', '12_prev', 'MONTH', 'HOUR', 'WEEKDAY',
    'WEEKEND', 'Demand Forecast', 'SPOT Market Volume', 'Wind Forecast',
    'RoR Forecast', 'Yuk Tahmin Planı (MWh)', 'Market Clearing Price'
]

f_transformer = RobustScaler()
cnt_transformer = RobustScaler()

f_transformer = f_transformer.fit(train[f_columns].to_numpy())
cnt_transformer = cnt_transformer.fit(train[['NetOrder']])

train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['NetOrder'] = cnt_transformer.transform(train[['NetOrder']])

test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['NetOrder'] = cnt_transformer.transform(test[['NetOrder']])
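
# Because 'NetOrder' was scaled with cnt_transformer, any prediction made in the scaled
# space has to be mapped back with cnt_transformer.inverse_transform before it is
# reported in the original units. Round-trip check on a toy column (made-up values):
import numpy as np
toy_net_order = np.array([[100.0], [250.0], [400.0]])
print(np.allclose(toy_net_order,
                  cnt_transformer.inverse_transform(cnt_transformer.transform(toy_net_order))))  # True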


def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
Esempio n. 42
0
# In[12]:

# Use Z-score standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print("MAE=", mean_squared_error(Y_test, Y_est))

# In[13]:

# Robust scaling
scaler2 = RobustScaler()
X_train_scaled = scaler2.fit_transform(X_train)
X_test_scaled = scaler2.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print("MAE=", mean_squared_error(Y_test, Y_est))

# In[14]:

# Apply a non-linear correction (square root) to a specific feature
non_linear_feat = 5
X_train_new_feat = np.sqrt(X_train[:, non_linear_feat])
X_test_new_feat = np.sqrt(X_test[:, non_linear_feat])

X_train_new_feat.shape = (X_train_new_feat.shape[0], 1)
Esempio n. 43
0
def elasticReg(data, test):
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.001, l1_ratio=.8, random_state=3))

    stackedVerData = data.copy()

    Y_res = data['AdjustedPrice']
    data = data.drop(['AdjustedPrice', 'Id'], axis=1)
    ENet.fit(data, Y_res)

    score = scoreTest(ENet, data, Y_res)
    print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    lasso.fit(data, Y_res)

    score = scoreTest(lasso, data, Y_res)
    print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    predId = test['Id']
    test = test.drop(['Id'], axis=1)

    y_pred_Enet = ENet.predict(test)
    y_pred_Lasso = lasso.predict(test)

    ENetS = make_pipeline(RobustScaler(), ElasticNet(alpha=0.001, l1_ratio=.8, random_state=3))
    lassoS = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    # KRS = KernelRidge(alpha=0.8, kernel="polynomial")
    # GBRS = GradientBoostingRegressor(loss="huber", n_estimators=5000, learning_rate=0.001)
    GBRS = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state =5)
    KRS = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

    # YResStacked = pd.DataFrame(stackedVerData['AdjustedPrice'])
    # YResStacked.insert(0, 'Id', range(0, len(YResStacked)))
    # YResStacked.set_index(YResStacked.columns[0], inplace=True)

    # dataStacked = stackedVerData.drop(['AdjustedPrice', 'Id'], axis = 1)
    # dataStacked.insert(0, 'Id', range(0, len(dataStacked)))
    # dataStacked.set_index(dataStacked.columns[0], inplace=True)
    # YResStacked = stackedVerData['AdjustedPrice'].values
    YResStacked = pd.DataFrame(stackedVerData['AdjustedPrice'])
    dataStacked = stackedVerData.drop(['AdjustedPrice', 'Id'], axis=1)

    # print(dataStacked.shape)
    # print(YResStacked.shape)
    # print(type(dataStacked))
    # print(type(YResStacked))
    # print(np.any(np.isnan(dataStacked)))
    # print(np.any(np.isnan(YResStacked)))
    # print(np.all(np.isfinite(dataStacked)))
    # print(np.all(np.isfinite(YResStacked)))
    # exit()

    # exit()
    averageStackedModel = StackingAveragedModel(base_models=(ENet, KRS, GBRS), meta_model=lasso)
    averageStackedModel.fit(dataStacked, YResStacked)
    y_pred_stacked = averageStackedModel.predict(test.values)
    # print('FIT')
    # score = scoreTest(averageStackedModel, data, Y_res)
    # print("\nAVGStacked score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    saveSubmission(predId, y_pred_stacked)
Esempio n. 44
0
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False,
         []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive,
                                omit_kwargs):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
Esempio n. 45
0
eeg_dataset.head()

X = eeg_dataset[['alpha', 'betha', 'delta', 'gamma', 'theta']].values
y = eeg_dataset[['class']].values.ravel()

# Split the data

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=.7,
                                                    test_size=.3,
                                                    random_state=25)

# Feature scaling
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

scaler.fit(x_train)
x_train = scaler.transform(x_train)

x_test = scaler.transform(x_test)

# Model architecture
max_features = 512

model = Sequential()
model.add(Embedding(max_features, output_dim=64))
model.add(LSTM(64))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))
Esempio n. 46
0
dff = df.loc[(df.Date >= '4/29/2019') & (df.Date <= '5/3/2019')].copy()
dff['Timestamp'] = dff['Date'] + ' ' + dff['Time']
dff['Timestamp'] = pd.to_datetime(dff['Timestamp'])
dff = dff.sort_values(by=['Timestamp'])

X = dff.loc[dff.index, ['Position', 'Count']].to_numpy()
#y_full = dff.loc[dff.index,['Count']].to_numpy()
y_full = Y_full[:len(X)]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling', StandardScaler().fit_transform(X)),
    ('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
     RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
     QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)

# plasma does not exist in matplotlib < 1.5
axes[0].legend(fontsize=13)
axes[0].set_xlabel('Redshift (distance)', fontsize=18)

axes[1].set_xlim([18, 32])
axes[1].set_title('Magnitude distributions (5 filters)', fontsize=18)
axes[1].legend(fontsize=13)
axes[1].set_xlabel('Magnitudes (higher == fainter)', fontsize=18)

axes[2].set_xlim([-6, 3])
axes[2].set_title('Distributions of the log-error', fontsize=18)
axes[2].legend(fontsize=11)
axes[2].set_xlabel('Log of error', fontsize=18)

df_train.head()

RS = RobustScaler()

### The training features ### 
feat_train = ['g', 'log_g_err', 'r', 'log_r_err', 'i', 'log_i_err',
              'z', 'log_z_err', 'y', 'log_y_err']

### The features for the validation set, ###
### each galaxy has 5 distinct features, 1 for each exposure time ###
feat_SN_1 = ['g_SN_1', 'log_g_err_SN_1', 'r_SN_1', 'log_r_err_SN_1',
             'i_SN_1', 'log_i_err_SN_1', 'z_SN_1', 'log_z_err_SN_1',
             'y_SN_1', 'log_y_err_SN_1']

feat_SN_2 = ['g_SN_2', 'log_g_err_SN_2', 'r_SN_2', 'log_r_err_SN_2',
             'i_SN_2', 'log_i_err_SN_2', 'z_SN_2', 'log_z_err_SN_2',
             'y_SN_2', 'log_y_err_SN_2']
Esempio n. 48
0
    if 'Unnamed' in col:
        del test[col]
        
train.to_csv(path_or_buf= filepath + "/trainfinal.csv", index=False)
test.to_csv(path_or_buf= filepath + "/testfinal.csv", index=False)
print("Exported")
train = []
test = []

#Obtaining the columns required for training the model
train = pd.read_csv(filepath + "/trainfinal.csv")
test = pd.read_csv(filepath + "/testfinal.csv")
cols = [c for c in train.columns if c not in ['is_churn','msno']]

#Pre-processing the file with Robust Scaler
scaler = RobustScaler()
scaler.fit(train[cols])
train_x = scaler.transform(train[cols])
test_x = scaler.transform(test[cols])
train_y = train['is_churn']
print("Pre-processing completed")

#Training Random Forest Classifier
model = RandomForestClassifier(n_estimators = 50)
model.fit(train_x,train_y)
print("Training Completed")

#Predicting the test data with the trained model
predictions = model.predict(test_x)

#Exporting the msno and predicted values to a csv file
Esempio n. 49
0
from sklearn.svm import SVR
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler


path = "/Users/xiaofeifei/I/Kaggle/Benz/"
train = pd.read_csv(path+'train_start.csv')
# test = pd.read_csv(path+'test_start.csv')

y = train["y"]

train = train.drop(["y"], axis = 1)

# # poly
svm = SVR(kernel='rbf', C=1.0, epsilon=0.05)

a= RobustScaler()
train = a.fit_transform(train,y)
kr = GridSearchCV(SVR(kernel='rbf', C=1.0, epsilon=0.05), cv=5, n_jobs = 6,verbose=1,scoring='r2',
                  param_grid={"C": [20,30],
                              "epsilon": [0.02,0.03,0.05,0.07]})

kr.fit(train, y)
print kr.best_params_
print kr.best_score_
print kr.best_estimator_

# {'epsilon': 0.01, 'C': 30}
# 0.536811148843
Esempio n. 50
0
    def __init__(
        self,
        X_train: np.array,
        y_train: np.array,
        X_valid: np.array,
        y_valid: np.array,
        n: int = 10,
        eval_method: str = "kfold",
        scaler: str = "standard",
        modelpath: str = "models",
        max_evals: int = 250,
        voting: str = "hard",
        calibrate: str = "sigmoid",
        timeout: int = 600,
        max_workers: int = 16,
        experiment: Experiment = None,
    ):  # pylint:disable=too-many-arguments

        self.x = X_train
        self.y = y_train
        self.x_valid = X_valid
        self.y_valid = y_valid
        self.experiment = experiment

        # We make sure that everything is logged on comet
        assert isinstance(experiment, Experiment)

        assert len(self.x) == len(self.y)
        assert len(self.x_valid) == len(self.y_valid)

        self.n = n
        self.eval_method = eval_method
        if scaler == "robust":
            self.scalername = "robust"
            self.scaler = RobustScaler()
        elif scaler == "standard":
            self.scalername = "standard"
            self.scaler = StandardScaler()
        elif scaler == "minmax":
            self.scalername = "minmax"
            self.scaler = MinMaxScaler()

        self.x = self.scaler.fit_transform(self.x)
        self.x_valid = self.scaler.transform(self.x_valid)

        classcounter = dict(Counter(self.y))
        trainlogger.info("the classdistribution is %s", classcounter)
        classes_to_keep = []
        for oxidationstate, count in classcounter.items():
            if count > MIN_SAMPLES:
                classes_to_keep.append(oxidationstate)
            else:
                trainlogger.warning(
                    "will drop class %s since it has not enough examples",
                    oxidationstate,
                )

        selected_idx = np.where(np.isin(self.y, classes_to_keep))[0]
        self.x = self.x[selected_idx]
        self.y = self.y[selected_idx]

        self.max_evals = max_evals
        self.voting = voting
        self.timeout = timeout
        self.timings = []
        self.modelpath = modelpath
        self.mix_ratios = {"rand": 0.15, "tpe": 0.7, "anneal": 0.15}
        self.max_workers = max_workers
        self.calibrate = calibrate
        self.classes = [1, 2, 3, 4, 5, 6, 7, 8]

        self.y = self.y.astype(int)
        self.y_valid = self.y_valid.astype(int)

        trainlogger.info("intialized training class")
Esempio n. 51
0
                                                        random_state=6)
hbo2_x, hbo2_x_test, hbo2_y, hbo2_y_test = train_test_split(hbo2_x,
                                                            hbo2_y,
                                                            test_size=0.1,
                                                            random_state=6)
ca_x, ca_x_test, ca_y, ca_y_test = train_test_split(ca_x,
                                                    ca_y,
                                                    test_size=0.1,
                                                    random_state=6)
na_x, na_x_test, na_y, na_y_test = train_test_split(na_x,
                                                    na_y,
                                                    test_size=0.1,
                                                    random_state=6)

# scaling
scaler = RobustScaler()
# scaler = MinMaxScaler()

hhb_x = scaler.fit_transform(hhb_x)
hhb_x_test = scaler.transform(hhb_x_test)
x_pred_hhb = scaler.transform(x_pred_hhb)

hbo2_x = scaler.fit_transform(hbo2_x)
hbo2_x_test = scaler.transform(hbo2_x_test)
x_pred_hbo2 = scaler.transform(x_pred_hbo2)

ca_x = scaler.fit_transform(ca_x)
ca_x_test = scaler.transform(ca_x_test)
x_pred_ca = scaler.transform(x_pred_ca)

na_x = scaler.fit_transform(na_x)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3, stratify=public_labels, random_state=1)

#Encode the labels

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
train_labels_encoded = encoder.fit_transform(y_train)
test_labels_encoded = encoder.transform(y_test)


#Scalers

from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
scalers_to_test = [StandardScaler(), RobustScaler()]


# Designate distributions to sample hyperparameters from 
C_range = np.array([9.78736006e+00, 2.23814334e+01, 1.00000000e-04, 1.00000000e-04,
       1.74371223e+01, 1.00000000e-04, 2.96832303e-01, 1.06931597e+01,
       8.90706391e+00, 1.75488618e+01, 1.49564414e+01, 1.06939267e+01,
       1.00000000e-04, 7.94862668e+00, 3.14271995e+00, 1.00000000e-04,
       1.41729905e+01, 8.07236535e+00, 4.54900806e-01, 1.00000000e-04,
       1.00000000e-04, 1.99524074e+00, 4.68439119e+00, 1.00000000e-04,
       1.16220405e+01, 1.00000000e-04, 1.00000000e-04, 1.03972709e+01,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
       1.25523737e+01, 1.00000000e-04, 1.66095249e+01, 8.07308186e+00,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
       2.08711336e+01, 1.64441230e+00, 1.15020554e+01, 1.00000000e-04,
       1.81035130e+00, 1.17786194e+01, 1.00000000e-04, 1.03111446e+01,
X_train = np.vstack([X1, X2])

X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_test = np.hstack([[-1]*n_datapoints, [1]*n_datapoints])
X_test = np.vstack([X1, X2])

X_train[0, 0] = -1000  # a fairly large outlier


# Scale data
standard_scaler = StandardScaler()
Xtr_s = standard_scaler.fit_transform(X_train)
Xte_s = standard_scaler.transform(X_test)

robust_scaler = RobustScaler()
Xtr_r = robust_scaler.fit_transform(X_train)
Xte_r = robust_scaler.transform(X_test)


# Plot data
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(X_train[:, 0], X_train[:, 1],
              color=np.where(Y_train > 0, 'r', 'b'))
ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[0].set_title("Unscaled data")
ax[1].set_title("After standard scaling (zoomed in)")
ax[2].set_title("After robust scaling (zoomed in)")
# for the scaled data, we zoom in to the data center (outlier can't be seen!)
for a in ax[1:]:
Esempio n. 54
0
test.drop('Id', axis=1, inplace=True)

x = train.drop('SalePrice', axis=1)  #Drop Target feature from train.
y = train['SalePrice']
test = test.drop('SalePrice', axis=1)

#known outliers(some from author notes and some from notebook guides)
outliers = [30, 88, 462, 631, 1322]
x = x.drop(x.index[outliers])
y = y.drop(y.index[outliers])

x = x.drop('MSSubClass_150', axis=1)
test = test.drop('MSSubClass_150', axis=1)

#RobustScaler rescales the data using statistics (the median and IQR) that are robust to outliers.
sc = RobustScaler()
x = sc.fit_transform(x)
test = sc.transform(test)
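
#A quick illustration of the "robust to outliers" point above, on a toy column with one
#extreme value (made-up numbers, unrelated to this dataset): StandardScaler squashes the
#inliers together because the outlier inflates the standard deviation, while RobustScaler
#(median/IQR) keeps their spread.
import numpy as np
from sklearn.preprocessing import StandardScaler
toy = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])
print(StandardScaler().fit_transform(toy).ravel()[:4])  # all close to -0.5
print(RobustScaler().fit_transform(toy).ravel()[:4])    # [-1.  -0.5  0.   0.5]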

#Train
model = Lasso(alpha=0.0005, random_state=1)  #other alphas were tried too .
model.fit(x, y)

#Predict
pred = model.predict(test)
predFinal = np.exp(pred)  #Revert the log.

#Data export
output = pd.DataFrame({'Id': test2.Id, 'SalePrice': predFinal})
output.to_csv('submission.csv', index=False)
output.head()
Esempio n. 55
0
new

# In[204]:

pd.concat([new, data],
          axis=1).drop('number', axis=1).rename(columns={'number1': 'number'})

# In[218]:

data1 = pd.get_dummies(data)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# In[219]:

scaler = RobustScaler()
scaler.fit(data1)
scaler.transform(data1).shape
pd.DataFrame(scaler.transform(data1))

# In[212]:

data1

# In[214]:

pd.DataFrame(scaler.transform(data1))

# In[221]:

train.isnull().sum().sum()
 def __init__(self, attribs=None, scaler=RobustScaler()):
     self.attribs = attribs
     self.scaler = scaler
devtest='./exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev='./exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train='./exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'



trainy,trainx=imdb_bag_of_word_libs.loadFeatsText(train)
trainy=imdb_bag_of_word_libs.kaldiID_2_LB(trainy)
evaly,evalx=imdb_bag_of_word_libs.loadFeatsText(dev)
evaly=imdb_bag_of_word_libs.kaldiID_2_LB(evaly)

evaly2,evalx2=imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2=imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)


robust_scaler = RobustScaler()
trainx=robust_scaler.fit_transform(trainx)
evalx=robust_scaler.transform(evalx)

clf= LinearDiscriminantAnalysis() #
clf.fit(trainx,trainy)
predictValue=clf.predict(evalx)

print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEV)

evalx2=robust_scaler.transform(evalx2)
predictValue=clf.predict(evalx2)


print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEVTEST)
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9597368421052632
exported_pipeline = make_pipeline(
    RobustScaler(),
    LogisticRegression(C=25.0, dual=True, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# # Min-Max Scaler $\frac{x_i - min(x)}{max(x) - min(x)}$

# In[5]:

mms = MinMaxScaler()
views['minmax'] = mms.fit_transform(views[['views']])
views


# In[6]:

(vw[0] - np.min(vw)) / (np.max(vw) - np.min(vw))


# # Robust Scaler $\frac{x_i - median(x)}{IQR_{(1,3)}(x)}$

# In[7]:

rs = RobustScaler()
views['robust'] = rs.fit_transform(views[['views']])
views


# In[8]:

quartiles = np.percentile(vw, (25., 75.))
iqr = quartiles[1] - quartiles[0]
(vw[0] - np.median(vw)) / iqr

Esempio n. 60
0
def run_basic_svm(X_train,
                  y_train,
                  selected_features,
                  scorers,
                  refit_scorer_name,
                  subset_share=0.1,
                  n_splits=10,
                  parameters=None):
    '''Run an extensive grid search over all parameters to find the best parameters for an SVM classifier.
    The search is done only on a subset of the training data (default subset_share=0.1).
    Input is the training data, the candidate feature subsets, the scorers and the name of the refit scorer.'''

    #Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  #Minimal subset is 300 samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        number_of_samples = subset_share * X_train.shape[0]

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:  #If no parameters have been defined, then do full definition
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
        # Main set of parameters for the grid search run 1: Select scaler, sampler and kernel for the problem
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            NearMiss(version=1),
            EditedNearestNeighbours(),
            AllKNN(),
            CondensedNearestNeighbour(random_state=0),
            InstanceHardnessThreshold(random_state=0,
                                      estimator=LogisticRegression(
                                          solver='lbfgs', multi_class='auto')),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['linear', 'sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]  # Only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['rbf'],
                'svm__gamma':
                [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2,
                 1e3]  # Only relevant in rbf, default='auto'=1/n_features
            }
        ]

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # parameters is a list of grids, so add the strategy to every grid
            for param_set in parameters:
                param_set['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([('imputer',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='median')),
                          ('scaler', StandardScaler()),
                          ('sampling', modelutil.Nosampler()),
                          ('feat', modelutil.ColumnExtractor(cols=None)),
                          ('svm', SVC())])

    print("Pipeline: ", pipe_run1)

    print("Stratified KFold={} used.".format(n_splits))
    skf = StratifiedKFold(n_splits=n_splits)

    params_run1 = parameters
    grid_search_run1 = GridSearchCV(pipe_run1,
                                    params_run1,
                                    verbose=1,
                                    cv=skf,
                                    scoring=scorers,
                                    refit=refit_scorer_name,
                                    return_train_score=True,
                                    iid=True,
                                    n_jobs=-1).fit(X_train_subset,
                                                   y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1,
                                                   params_run1,
                                                   refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1
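
A hypothetical call to run_basic_svm might look like the sketch below; the scorer, the candidate feature subsets and the training data are illustrative assumptions, not values taken from this script.

from sklearn.metrics import make_scorer, f1_score

# Illustrative inputs (assumptions, not from this script)
scorers = {'f1_macro': make_scorer(f1_score, average='macro')}
candidate_features = [[0, 1, 2], [0, 1, 2, 3, 4]]  # column subsets passed to feat__cols

grid_search, params, pipe, results = run_basic_svm(
    X_train, y_train,
    selected_features=candidate_features,
    scorers=scorers,
    refit_scorer_name='f1_macro',
    subset_share=0.2,
    n_splits=5)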