def best_rp_nba(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl, y_train)
    X_test_transformed = rp.transform(X_test_scl)

    ## top 2
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed_sorted = X_train_transformed[:, i]
    X_train_transformed = X_train_transformed_sorted[:, 0:2]

    kurt = kurtosis(X_test_transformed)
    i = kurt.argsort()[::-1]
    X_test_transformed_sorted = X_test_transformed[:, i]
    X_test_transformed = X_test_transformed_sorted[:, 0:2]

    # save
    filename = './' + self.save_dir + '/nba_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def ica_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## ICA
    ##
    ica = FastICA(n_components=X_train_scl.shape[1])
    X_ica = ica.fit_transform(X_train_scl)

    ##
    ## Plots
    ##
    ph = plot_helper()

    kurt = kurtosis(X_ica)
    print(kurt)

    title = 'Kurtosis (FastICA) for ' + data_set_name
    name = data_set_name.lower() + '_ica_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_simple_bar(np.arange(1, len(kurt)+1, 1),
                       kurt,
                       np.arange(1, len(kurt)+1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)
def processing(df):
    dummies_df = pd.get_dummies(df["City Group"])

    def add_CG(name):
        return "CG_" + name

    dummies_df = dummies_df.rename(columns=add_CG)
    # print dummies_df.head()
    df = pd.concat([df, dummies_df.iloc[:, 0]], axis=1)

    dummies_df = pd.get_dummies(df["Type"])

    def add_Type(name):
        return "Type_" + name

    dummies_df = dummies_df.rename(columns=add_Type)
    df = pd.concat([df, dummies_df.iloc[:, 0:3]], axis=1)

    # try to put in age as a column
    def add_Age(string):
        age = datetime.datetime.now() - datetime.datetime.strptime(string, "%m/%d/%Y")
        return age.days

    df["Age"] = df["Open Date"].map(add_Age)

    df = df.drop(["Id", "Open Date", "City", "City Group", "Type", "revenue"], axis=1)

    # scaler = StandardScaler().fit(df)
    scaler = RobustScaler().fit(df)
    df = scaler.transform(df)
    # print df.head()
    return df
def _robust_scaler(self, input_df):
    """Uses Scikit-learn's RobustScaler to scale the features using statistics
    that are robust to outliers

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to scale

    Returns
    -------
    scaled_df: pandas.DataFrame {n_samples, n_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the scaled features

    """
    training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

    if len(training_features.columns.values) == 0:
        return input_df.copy()

    # The scaler must be fit on only the training data
    scaler = RobustScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(input_df.drop(['class', 'group', 'guess'], axis=1).values.astype(np.float64))

    for col_num, column in enumerate(input_df.drop(['class', 'group', 'guess'], axis=1).columns.values):
        input_df.loc[:, column] = scaled_features[:, col_num]

    return input_df.copy()
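# A minimal stand-alone sketch of the same idea as _robust_scaler above (the toy
# DataFrame and column names here are hypothetical, not from the original project):
# fit RobustScaler on the 'training' rows only, then apply it to every row so the
# held-out rows never influence the scaling statistics.
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

toy_df = pd.DataFrame({
    'feat1': [1.0, 2.0, 3.0, 100.0],
    'feat2': [10.0, 11.0, 9.0, 12.0],
    'class': [0, 1, 0, 1],
    'group': ['training', 'training', 'training', 'testing'],
    'guess': [0, 0, 0, 0],
})
feature_cols = ['feat1', 'feat2']

scaler = RobustScaler()
scaler.fit(toy_df.loc[toy_df['group'] == 'training', feature_cols].values.astype(np.float64))
toy_df.loc[:, feature_cols] = scaler.transform(toy_df[feature_cols].values.astype(np.float64))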
def scale_feature_matrix(feature_M, linear=False, outliers=False):
    from sklearn.preprocessing import StandardScaler, RobustScaler
    import numpy as np

    binary_fields = [col for col in feature_M.columns if len(set(feature_M[col])) == 2]

    if outliers:
        # Robust scaling: centre on the median and scale by the interquartile range
        scaler_obj = RobustScaler()
        print('centering around median')
    else:
        # Standard scaling: zero mean and unit variance
        scaler_obj = StandardScaler()
        print('centering around mean')

    print('found these binaries')
    print('-' * 10)
    print('\n'.join(binary_fields))

    # Scale only the continuous columns, then re-attach the untouched binary columns
    X_scaled = scaler_obj.fit_transform(feature_M.drop(binary_fields, axis=1))
    X_scaled_w_cats = np.c_[X_scaled, feature_M[binary_fields].to_numpy()]  # .as_matrix() was removed in recent pandas
    return X_scaled_w_cats, scaler_obj
def num_scaler(d_num, t_num):
    scl = RobustScaler()
    scl.fit(d_num)
    d_num = scl.transform(d_num)
    t_num = scl.transform(t_num)
    return d_num, t_num
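# Usage sketch for num_scaler (the arrays below are made up for illustration):
# the scaler is fit on the first argument only, so the second argument -- e.g. a
# test split -- is scaled with the training medians and IQRs.
import numpy as np

train_num = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 300.0]])
test_num = np.array([[1.5, 15.0]])
train_scaled, test_scaled = num_scaler(train_num, test_num)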
def best_ica_wine(self): dh = data_helper() X_train, X_test, y_train, y_test = dh.get_wine_data() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ica = FastICA(n_components=X_train_scl.shape[1]) X_train_transformed = ica.fit_transform(X_train_scl, y_train) X_test_transformed = ica.transform(X_test_scl) ## top 2 kurt = kurtosis(X_train_transformed) i = kurt.argsort()[::-1] X_train_transformed_sorted = X_train_transformed[:, i] X_train_transformed = X_train_transformed_sorted[:,0:2] kurt = kurtosis(X_test_transformed) i = kurt.argsort()[::-1] X_test_transformed_sorted = X_test_transformed[:, i] X_test_transformed = X_test_transformed_sorted[:,0:2] # save filename = './' + self.save_dir + '/wine_ica_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_ica_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_ica_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_ica_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)

    ks = []
    for i in range(1000):
        ##
        ## Random Projection
        ##
        rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
        rp.fit(X_train_scl)
        X_train_rp = rp.transform(X_train_scl)

        ks.append(kurtosis(X_train_rp))

    mean_k = np.mean(ks, 0)

    ##
    ## Plots
    ##
    ph = plot_helper()

    title = 'Kurtosis (Randomized Projection) for ' + data_set_name
    name = data_set_name.lower() + '_rp_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                       mean_k,
                       np.arange(1, len(mean_k)+1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)
def nn_wine_orig(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    self.part4.nn_analysis(X_train_scl, X_test_scl, y_train, y_test, 'Wine', 'Neural Network Original')
def standardize_columns(data):
    """
    We robust-scale the weather features because they contain outliers.
    """
    columns_to_standardize = ['temp', 'atemp', 'humidity', 'windspeed']
    robust_scaler = RobustScaler()  # renamed: this is a RobustScaler, not a MinMaxScaler
    for column in columns_to_standardize:
        # scikit-learn expects a 2D array, so select the column as a one-column frame
        data[column] = robust_scaler.fit_transform(data[[column]]).ravel()
    return data
def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name): scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ## ## Plots ## ph = plot_helper() scores = [] train_scores = [] rng = range(1, X_train_scl.shape[1]+1) for i in rng: lda = LinearDiscriminantAnalysis(n_components=i) cv = KFold(X_train_scl.shape[0], 3, shuffle=True) # cross validation cv_scores = [] for (train, test) in cv: lda.fit(X_train_scl[train], y_train[train]) score = lda.score(X_train_scl[test], y_train[test]) cv_scores.append(score) mean_score = np.mean(cv_scores) scores.append(mean_score) # train score lda = LinearDiscriminantAnalysis(n_components=i) lda.fit(X_train_scl, y_train) train_score = lda.score(X_train_scl, y_train) train_scores.append(train_score) print(i, mean_score) ## ## Score Plot ## title = 'Score Summary Plot (LDA) for ' + data_set_name name = data_set_name.lower() + '_lda_score' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_series(rng, [scores, train_scores], [None, None], ['cross validation score', 'training score'], cm.viridis(np.linspace(0, 1, 2)), ['o', '*'], title, 'n_components', 'Score', filename)
def demensionReduction(numFeatures, cateFeatures):
    """
    :param numFeatures: numeric feature matrix to scale and reduce
    :param cateFeatures: categorical features, appended unchanged
    :return: the 5 PCA components concatenated with the categorical features
    """
    scaler = RobustScaler()
    scaledFeatures = scaler.fit_transform(numFeatures)
    pca = PCA(n_components=5)
    reducedFeatures = pca.fit_transform(scaledFeatures)
    allFeatures = np.concatenate((reducedFeatures, cateFeatures), axis=1)
    return allFeatures
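# A small usage sketch (the random arrays are illustrative only): at least five
# numeric columns are needed because of the fixed PCA(n_components=5) above, and
# the categorical block is passed through untouched.
import numpy as np

rng = np.random.default_rng(0)
num_part = rng.normal(size=(50, 8))          # 8 numeric features
cat_part = rng.integers(0, 2, size=(50, 3))  # 3 one-hot style columns
combined = demensionReduction(num_part, cat_part)
print(combined.shape)  # (50, 8): 5 PCA components + 3 categorical columns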
def test_robustscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.RobustScaler
    # with sklearn.preprocessing.RobustScaler
    robustscalerr = RobustScalerR()
    robustscalerr.fit(np.concatenate(trajs))

    robustscaler = RobustScaler()
    robustscaler.fit(trajs)

    y_ref1 = robustscalerr.transform(trajs[0])
    y1 = robustscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
def best_lda_cluster_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)

    ##
    ## GMM
    ##
    # The original reused the k-means object here; use the GMM instead.
    # GaussianMixture has no transform(), so the per-cluster posterior
    # probabilities serve as the transformed features.
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def transform_dataframe(dataframe):
    """
    Function to read a dataframe and scale every column with RobustScaler
    (centering on the median and scaling by the interquartile range)

    Parameters:
        dataframe : Input pandas dataframe
    Input types: pd.DataFrame
    Output types: pd.DataFrame
    """
    cols = [col for col in dataframe.columns]
    robust_scaler = RobustScaler()
    scaled = robust_scaler.fit_transform(dataframe[cols])
    # write the scaled values back into the original columns
    # (the original assigned the scaled array to `dataframe.columns`, which is a bug)
    dataframe[cols] = scaled
    return dataframe
def scale(self, columns, categorical_cols, apply_list, target_column):
    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()

    if apply_list:
        numerical_cols = columns
    else:
        numerical_cols = []
        for col in self.dataset.columns.values:
            if col not in categorical_cols:
                numerical_cols.append(col)
            else:
                pass

    # We don't want to scale the target variable, as it is already binary.
    # The target column uses the same value as target_value from Split Data section
    # in the settings popup.
    numerical_cols.remove(target_column)

    # Scale, fit and transform all the numerical columns
    scaled_data = scaler.fit_transform(self.dataset[numerical_cols])
    self.dataset[numerical_cols] = scaled_data

    return self.dataset
def best_lda_nba(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    lda = LinearDiscriminantAnalysis(n_components=2)
    X_train_transformed = lda.fit_transform(X_train_scl, y_train)
    X_test_transformed = lda.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/nba_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation

    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)

    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_

    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)

    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)

    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
def best_pca_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    pca = PCA(n_components=3)
    X_train_transformed = pca.fit_transform(X_train_scl, y_train)
    X_test_transformed = pca.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_pca_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_pca_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_pca_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_pca_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def __init__(self, *args, scale=False, center=False, **kwargs):
    """
    A machine learned model.  Beyond :class:`revscoring.Model`, these
    "Learned" models implement
    :func:`~revscoring.scoring.models.Learned.fit` and
    :func:`~revscoring.scoring.models.Learned.cross_validate`.
    """
    super().__init__(*args, **kwargs)
    self.trained = None

    if scale or center:
        self.scaler = RobustScaler(with_centering=center,
                                   with_scaling=scale)
    else:
        self.scaler = None

    self.params.update({
        'scale': scale,
        'center': center
    })
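# Side note (plain scikit-learn, independent of the class above): with_centering and
# with_scaling simply toggle the median subtraction and the IQR division, as this
# tiny illustrative example shows.
import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0], [2.0], [3.0], [100.0]])
print(RobustScaler(with_centering=True, with_scaling=False).fit_transform(X).ravel())
print(RobustScaler(with_centering=False, with_scaling=True).fit_transform(X).ravel())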
u'minutes', u'movie', u'movies', u'music', u'need', u'new', u'nolan', u'old', u'opinion', u'original', u'oscar', u'overall', u'people', u'perfect', u'performance', u'performances', u'picture', u'place', u'played', u'plot', u'point', u'pretty', u'probably', u'quite', u'read', u'real', u'really', u'reason', u'right', u'role', u'said', u'saw', u'say', u'scene', u'scenes', u'score', u'screen', u'script', u'second', u'seeing', u'seen', u'sense', u'set', u'shows', u'simply', u'special', u'special effects', u'star', u'star wars', u'start', u'story', u'sure', u'takes', u'thats', u'theres', u'thing', u'things', u'think', u'thought', u'time', u'times', u'trilogy', u'true', u'truly', u'trying', u'understand', u'use', u'used', u'violence', u'want', u'war', u'wars', u'wasnt', u'watch', u'watched', u'watching', u'way', u'wife', u'wonderful', u'work', u'world', u'worth', 'year_tfidf', u'years', u'young'] X_prescale = X[features_to_scale] X_scaled = RobustScaler().fit_transform(X_prescale) X_scaled = pd.DataFrame(X_scaled, columns = features_to_scale, index = X_prescale.index) X_final_scaled = X_scaled.join(X[features_to_not_scale]) X_final_scaled.info() X.info() #Train Test Split the scaled data X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_final_scaled, y, test_size = .2, random_state = 31) #So what is the baseline prediction? print y.mean() y.value_counts() baseline_not10 = (1-y[y== 10].count()/float(y.count()))
class HousePrices(object): seq2 = pd.Series(np.arange(2)) #Static class models. lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, silent=1, random_state=7, nthread=-1) GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5) model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11) KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) #Constructor def __init__(self, trainData, testData): self.trainData = trainData self.testData = testData def dataImport(self): self.train = pd.read_csv(self.trainData) self.test = pd.read_csv(self.testData) self.train_Id = self.train['Id'] self.test_Id = self.test['Id'] self.train.drop("Id", axis=1, inplace=True) self.test.drop("Id", axis=1, inplace=True) def display(self): print(len(self.train.columns)) fig, ax = plt.subplots() ax.scatter(x=self.train['GrLivArea'], y=self.train['SalePrice']) plt.ylabel('SalePrice', fontsize=13) plt.xlabel('GrLivArea', fontsize=13) #plt.show() # corrmat = self.train.corr() # f, ax = plt.subplots(figsize=(12, 9)) # sns.heatmap(self.corrmat, vmax=.8, square=True); plt.show() # sns.distplot(self.train['SalePrice'] , fit=norm); # # Get the fitted parameters used by the function # (mu, sigma) = norm.fit(self.train['SalePrice']) # print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) # #Now plot the distribution # plt.legend(['Normal dist. 
($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best') # plt.ylabel('Frequency') # plt.title('SalePrice distribution') # #Get also the QQ-plot # fig = plt.figure() # res = stats.probplot(self.train['SalePrice'], plot=plt) # plt.show() # f, ax = plt.subplots(figsize=(15, 12)) # plt.xticks(rotation='90') # sns.barplot(x=self.all_data_na.index, y=self.all_data_na) # plt.xlabel('Features', fontsize=15) # plt.ylabel('Percent of missing values', fontsize=15) # plt.title('Percent missing data by feature', fontsize=15) #plt.show() def removeOutliers(self): self.train = self.train.drop( self.train[(self.train['GrLivArea'] > 4000) & (self.train['SalePrice'] < 300000)].index) def preProcess(self): self.removeOutliers() self.train['SalePrice'] = np.log1p(self.train['SalePrice']) self.ntrain = self.train.shape[0] self.ntest = self.test.shape[0] self.y_train = self.train.SalePrice.values self.all_data = pd.concat( (self.train, self.test)).reset_index(drop=True) self.all_data.drop(['SalePrice'], axis=1, inplace=True) print("all_data size is : {}".format(self.all_data.shape)) self.all_data_na = (self.all_data.isnull().sum() / len(self.all_data)) * 100 self.all_data_na = self.all_data_na.drop( self.all_data_na[self.all_data_na == 0].index).sort_values( ascending=False)[:30] self.missing_data = pd.DataFrame({'Missing Ratio': self.all_data_na}) self.preprocessCategoricalColumns() self.preProcessNumericalColumns() def preprocessCategoricalColumns(self): #Converting PoolQC column to categorical and then using a probability distribution to fill the None values. print("Total Number of values ", self.all_data['PoolQC'].shape[0]) print("Number of Null Values", self.all_data['PoolQC'].isna().sum()) # # PoolQC # # #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. # (1) Filling NaN with None values and make the column categorical self.all_data["PoolQC"] = self.all_data.PoolQC.fillna("None") self.all_data['PoolQC'] = pd.Categorical(self.all_data.PoolQC) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['PoolQC'].value_counts()) self.poolQC_probabilities = [ 0.98, 0.006666667, 0.006666667, 0.006666667 ] self.poolQC_Values = ['None', 'Gd', 'Fa', 'Ex'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['PoolQC'] == 'None'].index # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 65] = np.random.choice(self.poolQC_Values, len(self.indices), p=self.poolQC_probabilities) print("After filling :") print(self.all_data.PoolQC.value_counts()) ############################################################################################ # # MiscFeature # # #Number of Missing values in MiscFeature self.all_data.MiscFeature.isna().sum( ) # 1404 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. 
# (1) Filling NaN with None values and make the column categorical self.all_data["MiscFeature"] = self.all_data['MiscFeature'].fillna( "None") self.all_data['MiscFeature'] = pd.Categorical( self.all_data['MiscFeature']) self.all_data.MiscFeature = self.all_data.MiscFeature.astype( 'category') # print("Before Filling :") # print(self.all_data['MiscFeature'].value_counts()) # (2) Finding probabilities of each occurance print(self.all_data['MiscFeature'].value_counts()) self.MiscFeature_probabilities = [ 0.962962963, 0.033607682, 0.001371742, 0.001371742, 0.000685871 ] self.MiscFeature_Values = ['None', 'Shed', 'Othr', 'Gar2', 'TenC'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['MiscFeature'] == 'None'].index #Find the column index so as to use 'iloc' . 56 is the col np.argwhere(self.all_data.columns == 'MiscFeature') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 56] = np.random.choice( self.MiscFeature_Values, len(self.indices), p=self.MiscFeature_probabilities) # print("After filling") # print(self.all_data["MiscFeature"].value_counts()) ############################################################################################ # # Alley # # #Number of Missing values in Alley self.all_data['Alley'].isna().sum() # 1367 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. # (1) Filling NaN with None values and make the column categorical self.all_data["Alley"] = self.all_data['Alley'].fillna("None") self.all_data['Alley'] = pd.Categorical(self.all_data['Alley']) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['Alley'].value_counts()) # Count of 'None' : 1367 # Count of 'Grvl' : 50 # Count of 'Pave' : 41 self.Alley_probabilities = [0.937585734, 0.034293553, 0.028120713] self.Alleyy_Values = ['None', 'Grvl', 'Pave'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['Alley'] == 'None'].index #Find the column index so as to use 'iloc' . 3 is the col np.argwhere(self.all_data.columns == 'Alley') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 3] = np.random.choice(self.Alleyy_Values, len(self.indices), p=self.Alley_probabilities) print("gg") self.all_data['Alley'].value_counts() print("After filling :") print(self.all_data['Alley'].value_counts()) ########################################################################################### # # Fence # # #Number of Missing values in Alley self.all_data['Fence'].isna().sum() # 1177 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. 
# (1) Filling NaN with None values and make the column categorical self.all_data["Fence"] = self.all_data['Fence'].fillna("None") self.all_data['Fence'] = pd.Categorical(self.all_data['Fence']) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['Fence'].value_counts()) # Count of 'None' : 1177 # Count of 'MnPrv' : 157 # Count of 'GdPrv' : 59 # Count of 'GdWo' : 54 # Count of 'MnWw' : 11 self.Fence_probabilities = [ 0.807270233, 0.107681756, 0.040466392, 0.037037037, 0.007544582 ] self.Fence_Values = ['None', 'MnPrv', 'GdPrv', 'GdWo', 'MnWw'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['Fence'] == 'None'].index #Find the column index so as to use 'iloc' . 25 is the col np.argwhere(self.all_data.columns == 'Fence') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 25] = np.random.choice(self.Fence_Values, len(self.indices), p=self.Fence_probabilities) print("After filling :") print(self.all_data['Fence'].value_counts()) ######################################################################################### # # FirePlaceQu # # #Number of Missing values in FireplaceQu self.all_data['FireplaceQu'].isna().sum( ) # 690 Null values in this column #Filling NaN with None because if you convert to categorical without filling out NaN values, pandas does not consider NaN # as one of the values in the categorical column. # (1) Filling NaN with None values and make the column categorical self.all_data["FireplaceQu"] = self.all_data['FireplaceQu'].fillna( "None") self.all_data['FireplaceQu'] = pd.Categorical( self.all_data['FireplaceQu']) # (2) Finding probabilities of each occurance print("Before filling :") print(self.all_data['FireplaceQu'].value_counts()) # Count of 'None' : 690 # Count of 'Gd' : 378 # Count of 'TA' : 313 # Count of 'Fa' : 33 # Count of 'Ex' : 24 # Count of 'Po' : 20 self.FireplaceQu_probabilities = [ 0.473251029, 0.259259259, 0.214677641, 0.022633745, 0.016460905, 0.013717421 ] self.FireplaceQu_Values = ['None', 'Gd', 'TA', 'Fa', 'Ex', 'Po'] #We need to replace only the 'None' type. Generating a sample from probability distribution self.indices = self.all_data[self.all_data['FireplaceQu'] == 'None'].index #Find the column index so as to use 'iloc' . 26 is the col np.argwhere(self.all_data.columns == 'FireplaceQu') # (3) Use a distribution to fill out "None" values now. self.all_data.iloc[self.indices, 26] = np.random.choice( self.FireplaceQu_Values, len(self.indices), p=self.FireplaceQu_probabilities) print("After filling :") print(self.all_data['FireplaceQu'].value_counts()) ########################################################################################### # # LotFrontage # # ''' Assuming houses belonging to the same Neighborhood will have similar LotFrontage, we groupby Neighborhood and then take mean for each locality. 
Then we substitute the missing values of a particular Neighborhood with the mean of that Neighborhood ''' self.lotFrontage_df = self.all_data[['Neighborhood', 'LotFrontage']].copy() self.groupby_Neighborhood = self.lotFrontage_df.groupby('Neighborhood') self.indices = self.all_data[self.all_data['LotFrontage'].isna()].index self.mean_Neighborhood = self.groupby_Neighborhood.mean() self.mean_Neighborhood.head() for i in self.indices: self.locality = self.all_data.iloc[i, 59] self.value = self.mean_Neighborhood.get_value( self.locality, 'LotFrontage') self.all_data.iloc[i, 49] = self.value ########################################################################################### # # # (6)GarageYrBlt (7) GarageArea (8) GarageCar # # (9)GarageType (10) GarageFinish (11) GarageQual (12)GarageCond for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'): self.all_data[col] = self.all_data[col].fillna(0) self.all_data['GarageType'] = self.all_data['GarageType'].fillna( 'None') self.all_data['GarageFinish'] = self.all_data['GarageFinish'].fillna( 'None') self.all_data['GarageQual'] = self.all_data['GarageQual'].fillna( 'None') self.all_data['GarageCond'] = self.all_data['GarageCond'].fillna( 'None') for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'): self.all_data[col] = self.all_data[col].fillna(0) for col in ('BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual'): self.all_data[col] = self.all_data[col].fillna('None') ############################################################################################# # # # Electrical , Exterior1st,Exterior2nd,SaleType,KitchenQual # # #Electrical has only 1 Null value , hence replacing by most frequently occuring value i.e. mode of the column self.all_data['Electrical'] = self.all_data['Electrical'].fillna( self.all_data['Electrical'].mode()[0]) #Similarly for Exterior1st, Exterior2nd,SaleType and KitchenQual self.all_data['Exterior1st'] = self.all_data['Exterior1st'].fillna( self.all_data['Exterior1st'].mode()[0]) self.all_data['Exterior2nd'] = self.all_data['Exterior2nd'].fillna( self.all_data['Exterior2nd'].mode()[0]) self.all_data['KitchenQual'] = self.all_data['KitchenQual'].fillna( self.all_data['KitchenQual'].mode()[0]) self.all_data['SaleType'] = self.all_data['SaleType'].fillna( self.all_data['SaleType'].mode()[0]) ############################################################################################## # # # # 'MasVnrArea','MasVnrType' and other columns # # self.indices = self.all_data[self.all_data['MasVnrArea'] == 0].index self.all_data['MasVnrArea'] = self.all_data['MasVnrArea'].fillna(0) self.all_data['MasVnrType'] = self.all_data['MasVnrType'].fillna( 'None') self.all_data = self.all_data.drop(['Utilities'], axis=1) self.all_data["Functional"] = self.all_data["Functional"].fillna("Typ") self.all_data['MSSubClass'] = self.all_data['MSSubClass'].fillna( "None") ############################################################################################## # Hence no remaining Columns with missing values. # MSSubClass is categorical as only a certain set of numbers are appearing. Hence converting it to categorical # OverallCond is categorical as only a certain set of numbers are appearing. 
Hence converting it to categorical self.all_data['MSSubClass'].unique() #array([ 20, 180, 60, 80, 50, 75, 30, 70, 90, 120, 45, 190, 85, 160, 40]) self.all_data['MSSubClass'] = self.all_data['MSSubClass'].apply(str) self.all_data['OverallCond'].unique() #array([6, 5, 7, 8, 3, 4, 9, 2, 1]) self.all_data['OverallCond'] = self.all_data['OverallCond'].apply(str) #Unlike Yrbuilt , YrSold is taking only a set of numbers converting it to categorical. self.all_data['YrSold'].unique() #array([2008, 2006, 2010, 2007, 2009]) self.all_data['YrSold'] = self.all_data['YrSold'].astype(str) #Similarly for MonthSold ie MoSold self.all_data['MoSold'].unique() #array([ 5, 6, 3, 4, 12, 7, 8, 11, 1, 10, 2, 9]) self.all_data['MoSold'] = self.all_data['MoSold'].astype(str) # Linear regression works only on columns with numeric values , Using labelEncoder to convert # the categorical colums to a numeric values #Set of columns which have categorical values: self.columns = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold') for column in self.columns: self.lbl = LabelEncoder() self.lbl.fit(list(self.all_data[column].values)) self.all_data[column] = self.lbl.transform( list(self.all_data[column].values)) # skewness = skewness[abs(skewness) > 0.75] # print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0])) # from scipy.special import boxcox1p # self.skewed_features = skewness.index # lam = 0.15 # for feat in self.skewed_features: # #all_data[feat] += 1 # self.all_data[feat] = boxcox1p(self.all_data[feat], self.lam) # This will map the labels of categorical data to 0,1,2,3 etc. self.all_data = pd.get_dummies(self.all_data) def preProcessNumericalColumns(self): #These features are positively correlated with the salePrice hence creating new features by #taking 3 polynomials square, cube and square root # Taking the top 10 correlated valuse. # OverallQual 0.817315 # GrLivArea 0.715624 # GarageCars 0.687771 # GarageArea 0.662332 # TotalBsmtSF 0.637558 # 1stFlrSF 0.608198 # FullBath 0.582020 # YearBuilt 0.572574 # As total square feet is important. 
Adding total sqfootage feature self.all_data[ 'TotalSF'] = self.all_data['TotalBsmtSF'] + self.all_data[ '1stFlrSF'] + self.all_data['2ndFlrSF'] self.all_data["OverallQual-s2"] = self.all_data["OverallQual"]**2 self.all_data["OverallQual-s3"] = self.all_data["OverallQual"]**3 self.all_data["OverallQual-Sq"] = np.sqrt(self.all_data["OverallQual"]) self.all_data["GrLivArea-s2"] = self.all_data["GrLivArea"]**2 self.all_data["GrLivArea-s3"] = self.all_data["GrLivArea"]**3 self.all_data["GrLivArea-Sq"] = np.sqrt(self.all_data["GrLivArea"]) self.all_data["GarageCars-s2"] = self.all_data["GarageCars"]**2 self.all_data["GarageCars-s3"] = self.all_data["GarageCars"]**3 self.all_data["GarageCars-Sq"] = np.sqrt(self.all_data["GarageCars"]) self.all_data["GarageArea-s2"] = self.all_data["GarageArea"]**2 self.all_data["GarageArea-s3"] = self.all_data["GarageArea"]**3 self.all_data["GarageArea-Sq"] = np.sqrt(self.all_data["GarageArea"]) self.all_data["TotalBsmtSF-s2"] = self.all_data["TotalBsmtSF"]**2 self.all_data["TotalBsmtSF-s3"] = self.all_data["TotalBsmtSF"]**3 self.all_data["TotalBsmtSF-Sq"] = np.sqrt(self.all_data["TotalBsmtSF"]) self.all_data["1stFlrSF-s2"] = self.all_data["1stFlrSF"]**2 self.all_data["1stFlrSF-s3"] = self.all_data["1stFlrSF"]**3 self.all_data["1stFlrSF-Sq"] = np.sqrt(self.all_data["1stFlrSF"]) self.all_data["FullBath-s2"] = self.all_data["FullBath"]**2 self.all_data["FullBath-s3"] = self.all_data["FullBath"]**3 self.all_data["FullBath-Sq"] = np.sqrt(self.all_data["FullBath"]) self.all_data["YearBuilt-s2"] = self.all_data["YearBuilt"]**2 self.all_data["YearBuilt-s3"] = self.all_data["YearBuilt"]**3 self.all_data["YearBuilt-Sq"] = np.sqrt(self.all_data["YearBuilt"]) self.all_data["TotalSF-s2"] = self.all_data["TotalSF"]**2 self.all_data["TotalSF-s3"] = self.all_data["TotalSF"]**3 self.all_data["TotalSF-Sq"] = np.sqrt(self.all_data["TotalSF"]) self.train = self.all_data[:1020] self.test = self.all_data[1020:] self.all_data.to_csv('./all.csv') #Validation function def rmsle_cv(self, model): #self.n_folds = 5 self.kf = KFold(5, shuffle=True, random_state=42).get_n_splits(self.train.values) self.rmse = np.sqrt(-cross_val_score(model, self.train.values, self.y_train, scoring="neg_mean_squared_error", cv=self.kf)) return (self.rmse) #Lasso. Best alpha : 0.0005 / 91% accuracy def lasso_model(self): self.lasso_m = Lasso() self.alpha = [0.0005, 0.0003, 0.0007] self.param_grid = dict(alpha=self.alpha) self.grid_search = GridSearchCV(self.lasso_m, self.param_grid, scoring="r2", cv=10) self.grid_result = self.grid_search.fit(self.train, self.y_train) print("Best: %f using %s" % (self.grid_result.best_score_, self.grid_result.best_params_)) self.lasso = self.grid_search.best_estimator_ # #self.lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1)) # #self.score = self.rmsle_cv(self.lasso) # self.score = self.rmsle_cv(HousePrices.lasso) # print("\nLasso score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std())) # ElasticNet. Best Alpha : 0.001 / 91% accuracy. 
def elasticNet(self): self.enet_m = ElasticNet() self.alpha = [0.0005, 0.0007, 0.001] self.param_grid = dict(alpha=self.alpha) self.grid_search = GridSearchCV(self.enet_m, self.param_grid, scoring="r2", cv=10) self.grid_result = self.grid_search.fit(self.train, self.y_train) print("Best: %f using %s" % (self.grid_result.best_score_, self.grid_result.best_params_)) self.enet_m = self.grid_search.best_estimator_ # #self.ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) # self.score = self.rmsle_cv(HousePrices.ENet) # print("ElasticNet score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std())) #Kernel Ridge regression. Best alpha : .0005 / 79% accuracy def kernelRegression(self): self.krr_m = KernelRidge() self.alpha = [0.0005, 0.0007, 0.001, 0.0006, 0.0001] self.param_grid = dict(alpha=self.alpha) self.grid_search = GridSearchCV(self.krr_m, self.param_grid, scoring="r2", cv=10) self.grid_result = self.grid_search.fit(self.train, self.y_train) print("Best: %f using %s" % (self.grid_result.best_score_, self.grid_result.best_params_)) self.krr_m = self.grid_search.best_estimator_ # #self.KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) # self.score = self.rmsle_cv(HousePrices.KRR) # print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std())) #GradientBoosting. Best alpha : .00065 / 89% accuracy def gradientBoosting(self): self.gboost_m = GradientBoostingRegressor() self.alpha = [0.00068, 0.00065, 0.00066] self.param_grid = dict(alpha=self.alpha) self.grid_search = GridSearchCV(self.gboost_m, self.param_grid, scoring="r2", cv=10) self.grid_result = self.grid_search.fit(self.train, self.y_train) print("Best: %f using %s" % (self.grid_result.best_score_, self.grid_result.best_params_)) self.krr_m = self.grid_search.best_estimator_ # #self.GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10,loss='huber', random_state =5) # self.score = self.rmsle_cv(HousePrices.GBoost) # print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(self.score.mean(), self.score.std())) # XgbRegressor.Best alpha : .0005 / 79% accuracy def xgbRegressor(self): #self.model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,learning_rate=0.05, max_depth=3,min_child_weight=1.7817, n_estimators=2200,reg_alpha=0.4640, reg_lambda=0.8571,subsample=0.5213, silent=1,random_state =7, nthread = -1) self.score = self.rmsle_cv(HousePrices.model_xgb) print("Xgboost score: {:.4f} ({:.4f})\n".format( self.score.mean(), self.score.std())) # LgbRegressor. 
Best alpha : .0005 / 79% accuracy def lgbRegressor(self): #model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,learning_rate=0.05, n_estimators=720,max_bin = 55, bagging_fraction = 0.8,bagging_freq = 5, feature_fraction = 0.2319,feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf =6, min_sum_hessian_in_leaf = 11) self.score = self.rmsle_cv(HousePrices.model_lgb) print("LgbRegressor score: {:.4f} ({:.4f})\n".format( self.score.mean(), self.score.std())) def rmsle(self, y, y_pred): return np.sqrt(mean_squared_error(y, y_pred)) def stackingModels(self): #Lasso self.lasso_stacking = make_pipeline( RobustScaler(), Lasso(alpha=0.0005, random_state=1)) #ElasticNet self.ENet_stacking = make_pipeline( RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) #Kernel Ridge regression self.KRR_stacking = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) #GBoost self.GBoost_stacking = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5) #Lgb self.lgb_stacking = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11) #Stacking self.stacked_averaged_models = StackingAveragedModels( base_models=(self.ENet_stacking, self.GBoost_stacking, self.KRR_stacking), meta_model=self.lasso_stacking) self.score = self.rmsle_cv(self.stacked_averaged_models) print("Stacking Averaged models score: {:.4f} ({:.4f})".format( self.score.mean(), self.score.std())) self.stacked_averaged_models.fit(self.train.values, self.y_train) self.stacked_train_pred = self.stacked_averaged_models.predict( self.train.values) self.stacked_pred = np.expm1( self.stacked_averaged_models.predict(self.test.values)) print("RMSE of stacked ") print(self.rmsle(self.y_train, self.stacked_train_pred))
band_names = ['theta', 'alpha', 'low-beta', 'beta', 'high-beta'] bands = [(4, 8), (8, 12), (12, 16), (16, 20), (20, 24)] band_dict = dict(zip(band_names, bands)) bands_to_analyse = band_names[:3] subj_dir = '/home/nikolai/_Work/predict_alpha/!each data/Dasha' raw, channels, fs = load_p4_data(subj_dir) for day in []: scaler = RobustScaler() scaler.fit(raw.loc[raw.day == day, 'p4']) scaler.fit(raw.loc[raw.day == day, 'p4']) for band in bands_to_analyse: exp_sm = ExponentialSmoother(0.99) env_detector = ButterBandEnvelopeDetector(band_dict[band], fs, exp_sm, 3) raw[band] = env_detector.apply(raw['p4']) for day in []: #raw.loc[raw.day == day, band] -= raw.loc[(raw.day == day) & (raw.block_name == 'fon'), band].quantile(0.05) #print('mode', raw.loc[(raw.day == day) & (raw.block_name == 'fon'), band].mode()) raw.loc[raw.day == day, band] /= raw.loc[(raw.day == day), band].quantile(0.01) plt.plot(raw.loc[(raw.day == 1) & (raw.block_name == 'fon'), 'p4'], label='day1') plt.plot(raw.loc[(raw.day == 2) & (raw.block_name == 'fon'), 'p4'], label='day2') plt.legend()
# --------------
# MINMAXSCALAR |
# --------------
# Scaling features to lie between a given minimum and maximum value, often between 0 and 1
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)
print("\nMinMaxScalar:"
      "\n============="
      "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
# ROBUSTSCALAR |
# --------------
# This removes the median and scales the data according to the quantile range
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(X_train)
X_test = robust_scaler.transform(X_test)
print("\nRobustScalar:"
      "\n============="
      "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
# NORMALIZER |
# --------------
# Normalize samples individually to unit norm.
# Each sample (each row of the data matrix) with at least one non-zero component is rescaled
# independently of other samples so that its norm (l1 or l2) equals 1.
normalizer_scaler = Normalizer()
X_train = normalizer_scaler.fit_transform(X_train)
X_test = normalizer_scaler.transform(X_test)
def fit(self, X, y=None):
    self.scaler = RobustScaler()
    self.scaler.fit(X[['thinking_time', 'actual_interval']])
    return self
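# The matching transform step is not shown in this snippet; a plausible counterpart
# for the same class (an assumption -- the real implementation may handle columns
# differently) would reuse the fitted scaler on the same two columns:
def transform(self, X, y=None):
    X = X.copy()
    X[['thinking_time', 'actual_interval']] = self.scaler.transform(
        X[['thinking_time', 'actual_interval']])
    return X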
df_train.rename(columns={'Overall.Stage': 'Overall_Stage'}, inplace=True)
df_test.rename(columns={'Overall.Stage': 'Overall_Stage'}, inplace=True)

public_data = df_train.drop(['Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'], axis=1)
PA_data = df_test.drop(['Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'], axis=1)

public_labels = df_train.Histology
PA_labels = df_test.Histology

encoder = LabelEncoder()

# Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
scalers_to_test = [StandardScaler(), RobustScaler(), MinMaxScaler(), None]

df = pd.DataFrame()

# Designate distributions to sample hyperparameters from
R = np.arange(0.1, 10, 0.2)
n_features_to_test = np.arange(1, 11)

for i in range(1, 21):
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(public_data,
                                                        public_labels,
                                                        test_size=0.3,
                                                        stratify=public_labels,
                                                        random_state=i*500)
    # Vectorise the labels
def generate_batch_data(self, raw_data, name, timesteps=24): # if "data_processed"==name or "wind_direction" == name or "wind_speed" == name: # raw_data = self.one_hot_encoding(raw_data) raw_data = pd.DataFrame(raw_data) value = raw_data.values print('feature ------------ ', name.upper()) if self.scaler_type == 'standard': scaler = StandardScaler() if self.scaler_type == 'robust': scaler = RobustScaler() if self.scaler_type == 'min_max': scaler = MinMaxScaler(feature_range=(0, 1)) scaler = scaler.fit(value) normalized = scaler.transform(value) data = normalized print('Max: %f, Min: %f, Format: %d*%d' % (np.amax(data), np.amin(data), data.shape[0], data.shape[1])) # data = pd.DataFrame(data) # print(data) if name != 'target': input_serise = data[:(len(data) - 24 * 11)] x_batches = np.array([]) else: target_serise = self.shift(data, -(timesteps)).astype(np.float32) y_batches = np.array([]) # check if file exists if (self.scaler_type is None): seq_file_name = "test_np_processed_" + name + "_" + str( timesteps) + "_" + str(self.pca) + "_" + str( self.normal) + ".npz" else: seq_file_name = "test_np_" + self.scaler_type + "_processed_" + name + "_" + str( timesteps) + "_" + str(self.pca) + "_" + str( self.normal) + ".npz" if os.path.isfile("data_log/" + seq_file_name): npzfile = np.load("data_log/" + seq_file_name) if name != 'target': input_batches = npzfile['arr_0'] ret = input_batches else: target_batches = npzfile['arr_0'] ret = target_batches return ret, scaler else: for i in range(783): try: if name != 'target': x_batches = np.append( x_batches, input_serise[i * 11:(i + timesteps) * 11].reshape( -1, timesteps, 11)) else: y_batches = np.append( y_batches, target_serise[i:i + timesteps].reshape( -1, timesteps)) except ValueError: break if name != 'target': x_batches = x_batches.reshape(-1, timesteps, 11) np.savez("data_log/" + seq_file_name, x_batches) return x_batches, scaler else: y_batches = y_batches.reshape(-1, timesteps) np.savez("data_log/" + seq_file_name, y_batches) return y_batches, scaler
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from keras.utils import np_utils

ss = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()
rs = RobustScaler()
pca = PCA(n_components=3)

### 1. Data
x, y = load_iris(return_X_y=True)
print(x.shape)  # (150, 4)
print(y.shape)  # (150,)

## 1-1. Train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=77)
print(x_train.shape)  # (120, 4)
print(x_test.shape)   # (30, 4)
print(y_train.shape)  # (120,)
result[col] = abs((result[col].apply(hash))%2**(16)) del result['unique_id'] #%% handle missing value print ("handle missing data") result.fillna(result.mean(),inplace=True) #%% data preprocessing from sklearn import linear_model from sklearn.preprocessing import StandardScaler, RobustScaler standard_scaler = StandardScaler() robust_scaler = RobustScaler() X_train = robust_scaler.fit_transform(result) X_train1 = standard_scaler.fit_transform(result) #%% performace def performence(clf,train,label,clfName): re = cross_validation.ShuffleSplit(train.shape[0],n_iter=10,test_size =0.25,random_state =43) aucList = [] accuracyList = [] for train_index, test_index in re: clf.fit(train.iloc[train_index,:],y.iloc[train_index]) pre_y = clf.predict_proba(train.iloc[test_index,:]) # probablity to get the AUC aucList.append(roc_auc_score(y.iloc[test_index],pre_y[:,1]))
                      max_bin=200,
                      bagging_fraction=0.75,
                      bagging_freq=5,
                      bagging_seed=7,
                      feature_fraction=0.2,
                      feature_fraction_seed=7,
                      verbose=-1,
                      )
score = rmsle_cv(lgb)
lgb.fit(train, y_train)
y_train_pred = np.expm1(lgb.predict(train))
print(f"lightgbm score: {score.mean():.4f} ({score.std():.4f})")
LGB = np.expm1(lgb.predict(test))

lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0004, random_state=1))
score = rmsle_cv(lasso)
print(f"Lasso score: {score.mean():.4f} ({score.std():.4f})")
reg = lasso.fit(train, y_train)
predictions = lasso.predict(test)
LASSO = np.expm1(predictions)

svr = make_pipeline(RobustScaler(), SVR(
    C=20,
    epsilon=0.008,
    gamma=0.0003,
))
score = rmsle_cv(svr)
print(f"SVR score: {score.mean():.4f} ({score.std():.4f})")
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

n_folds = 5

def fold_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train_dummy.values)
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y_values, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)

lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.001, random_state=1))

ENet = make_pipeline(RobustScaler(),
                     ElasticNet(alpha=0.005, l1_ratio=0.9, random_state=3))

KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)

mode_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                            learning_rate=0.05, max_depth=3,
var_dums = pd.get_dummies(all_data["Variety"])
all_data = all_data.drop(columns="Variety")
all_data = pd.concat([all_data, var_dums], axis=1)
all_data = all_data.drop(columns="Site ID")
all_data = all_data.dropna()
all_data = all_data[all_data["Assessment Score"] != '*']

# split features and target
Y = all_data["Assessment Score"]
X = all_data.drop(columns="Assessment Score")

# scale features
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X)
X = transformer.transform(X)

Y = np.array(Y)
Y[Y == ''] = 0.0
Y = Y.astype(float)  # np.float was removed in recent NumPy; use the builtin float

# make dense network model
import neural_net
NeuralNet = neural_net.NeuralNet
# crop_score_model = NeuralNet(X, Y, 6, 256, "r", 20)

# check accuracy
from sklearn.metrics import mean_squared_error
if CD == 1: NCH = NK SSHP0 = (SNFL, SNBETA, SNS, NT, NCH) SSHP1 = (SNFL*SNBETA*SNS, NT, NCH) SSHP2 = (SNFL*SNBETA*SNS, NT*NCH) if CD == 3: NKS = np.int32(np.sqrt(NK)) NCH = 1 SSHP0 = (SNFL, SNBETA, SNS, NT, NKS, NKS, NCH) SSHP1 = (SNFL*SNBETA*SNS, NT, NKS, NKS, NCH) SSHP2 = (SNFL*SNBETA*SNS, NT*NKS*NKS*NCH) # scaler dictionary SCLRS = {'minmax':MinMaxScaler(feature_range=(0, 1)), 'standard':StandardScaler(), 'robust':RobustScaler(), 'tanh':TanhScaler()} try: SCDAT = np.load(CWD+'/results/%s.%d.%d.%d.%d.%d.%s.%d.dmp.sc.npy' \ % (NAME, NT, NK, CD, SNI, SNS, SCLR, SEED)).reshape(*SSHP1) if VERBOSE: print('scaled selected classification samples loaded from file') print(100*'-') except: if SCLR == 'none': SCDAT = CDAT.reshape(*SSHP1) else: SCDAT = SCLRS[SCLR].fit_transform(CDAT.reshape(*SSHP2)).reshape(*SSHP1) np.save(CWD+'/results/%s.%d.%d.%d.%d.%d.%s.%d.dmp.sc.npy' % (NAME, NT, NK, CD, SNI, SNS, SCLR, SEED), SCDAT.reshape(*SSHP0)) if VERBOSE:
    data['descriptor'] = data['smi'].swifter.apply(descriptors.QEDScore)
elif conf['descriptor'] == 'SYBA_score':
    syba = SybaClassifier()
    syba.fitDefaultScore()
    data['descriptor'] = data['smi'].swifter.apply(syba.predict)
else:
    print('Descriptor not recognised')

X = np.stack(data['descriptor'].values)
Y = np.stack(data['activity'].values)

if isinstance(X[0], float):
    X = np.array([[i] for i in X])

if conf['descriptor'] == 'features':
    scaler = RobustScaler()
    X = scaler.fit_transform(X)

study = optuna.create_study(direction="maximize")
objective = Objective(X, Y, conf)
study.optimize(objective, n_trials=conf['n_trials'])

with open(best_params_file, 'w') as outfile:
    json.dump(study.best_params, outfile)

df = study.trials_dataframe()
df.to_csv(out_df_file)

with open(best_value_file, 'w') as outfile:
    outfile.write("Best Trial Value: {}".format(study.best_value))
testy, testx = imdb_bag_of_word_libs.loadFeatsText('./exp/ivectors_imdb_test_NGMM_2048_W_2_DIM_200/feats.txt')
print('done in', time.time() - ts, len(x), len(y))
y = imdb_bag_of_word_libs.kaldiID_2_LB(y)
print(y[0], x[0])
x = np.array(x)
y = np.array(y)
trainx, trainy = x, y

robust_scaler = RobustScaler()
trainx = robust_scaler.fit_transform(trainx)
evalx = robust_scaler.transform(testx)

clf = LinearDiscriminantAnalysis()
clf.fit(trainx, trainy)
predictValue = clf.predict(evalx)

sdict = dict()
ptrue = list()
for id, score in zip(testy, predictValue):
    sdict[id] = score
    # print(id, score)
    truevalue = int(id.split('_')[2])
    if truevalue >= 5:
        ptrue.append('1')
    else:
athena = pd.read_csv('./data/UA_AthenaData.csv')

# Drop the encrypted phone number (LineNumber), and the Call category (as labeled by data team)
athena = athena.drop(['LineNumber', 'CallCategory'], axis=1)

# Split into subgroups, as training on the entire dataset breaks my computer
group = np.array_split(athena, 4)

# Iterate through each group
for i in range(len(group)):
    print('======= GROUP {} ======'.format(i))
    subdata = group[i]

    ## Scale the data (RobustScaler centres on the median and scales by the IQR)
    print('Scaling Data')
    scaler = RobustScaler().fit(athena)
    # the original called scaler.transform(athena) without keeping the result;
    # keep a scaled copy of this group's rows and cluster on it instead
    subdata_scaled = scaler.transform(subdata)

    ## Reduce data for clustering
    print('Reducing dimensions')
    model = umap.UMAP(n_neighbors=20, min_dist=0.15, metric='braycurtis')
    data_2d = model.fit_transform(subdata_scaled)

    print('Clustering Data')
    cluster = DBSCAN(eps=3, min_samples=2).fit(subdata_scaled)

    print('Configuring data to clusters')
    subdata['PCA1'] = data_2d[:, 0]
    subdata['PCA2'] = data_2d[:, 1]
    cluster.labels_[cluster.labels_ > 0] = 1
    subdata['cluster'] = cluster.labels_
#%%
#aa = X.groupby('VisitNumber').groups
#X_new = pd.DataFrame(columns = X.keys())
#for key in aa.keys():
#    X_new = X_new.append(X.iloc[aa[key],:].mean(),ignore_index=True)
#%%
#%%
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler

standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

X_train = robust_scaler.fit_transform(aa)
X_train1 = standard_scaler.fit_transform(aa)

#%% for the test data
X_test = testData
for col in colName:
    X_test[col] = abs((X_test[col].apply(hash)) % 2**(16))

#%%
print("handle missing data")
X_test.fillna(X_test.mean(), inplace=True)
print(test_length) upper_test = test_length + timesteps * 2 testset_length = test_length - upper_train print(testset_length) print(upper_train, upper_test, len(df_data_1)) # construct test set #subsetting df_data_1_test = df_data_1[upper_train:upper_test] test_set_y = np.nan_to_num(df_data_1_test['user_ts'].values) test_set = np.nan_to_num(df_data_1_test.loc[:, :].values) #scaling sc = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0)) scaled_test_values = sc.fit_transform(np.float64(test_set)) scaled_test_values_y = np.sign(test_set_y.reshape(-1, 1)) #scaled_test_values_y = sc.fit_transform(np.float64(test_set_y.reshape(-1,1))) #scaled_test_values = np.tanh(np.float64(test_set)) #scaled_test_values_y = np.tanh(np.float64(test_set_y.reshape(-1,1))) #creating input data x_test = [] y_test = [] for i in range(timesteps, testset_length + timesteps): x_test.append(scaled_test_values[i - timesteps:i, :]) y_test.append( scaled_test_values_y[i:timesteps +
def get_evoked_feats(f_list, stim_chan, sig_chan, pre_win=1., post_win=1.5, thresh=3, t_thresh=0.1): all_evoked_burst = None IBI = [] all_evoked_onset = [] all_prev_onset = [] stim_lockout_s = 1. for f in f_list: dat = pyabf.ABF(f) stim_id = abf.get_channel_id_by_label(dat, stim_chan) sig_id = abf.get_channel_id_by_label(dat, sig_chan) sr = dat.dataRate scl = RobustScaler() Y_cat = cat_sweeps(dat, sig_chan).T.ravel() scl.fit(Y_cat[:, np.newaxis]) for ii in range(dat.sweepCount): dat.setSweep(ii, stim_id) stim_samp = rlab_signal.binary_onsets(dat.sweepY, 4.)[0] dat.setSweep(ii, sig_id) # if sr == 10000: # print('Downsampling') # y = dat.sweepY # y = scipy.signal.decimate(y, 10) # sr = sr / 10 # else: # y = dat.sweepY y = dat.sweepY stim_lockout = int(stim_lockout_s * sr) yscl = scl.transform(y[:, np.newaxis]).ravel() yscl_NN = yscl - np.min(yscl) onsets, offsets = burst.detect_burst(yscl, sr, thresh=thresh, t_thresh=t_thresh) # onsets, offsets = burst.rm_endpoint_bursts(yscl, onsets, offsets, pre_win * sr, post_win * sr) # Get the threshold crossing time of the bursts that happened within a time window of the evoked #Used to get the evoked burst shapek try: evoked_onset_idx = np.where( onsets > (stim_samp - int(pre_win / 9. * sr)))[0][0] next_onset_idx = evoked_onset_idx + 1 prev_onset_idx = evoked_onset_idx - 1 evoked_onset = onsets[evoked_onset_idx] except: IBI.append(np.nan) all_prev_onset.append(np.nan) all_evoked_onset.append(np.nan) evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1 ]) * np.nan if all_evoked_burst is None: all_evoked_burst = evoked_burst else: all_evoked_burst = np.concatenate( [all_evoked_burst, evoked_burst], axis=1) continue # evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1]) * np.nan if next_onset_idx > len(onsets) - 1: next_onset = np.nan else: next_onset = onsets[next_onset_idx] if prev_onset_idx < 0: prev_onset = np.nan else: prev_onset = onsets[prev_onset_idx] # Get the threshold crossing of the second burst after stim (good for IBI) if evoked_onset < int(stim_samp + stim_lockout): evoked_burst = burst.get_aligned_bursts( yscl_NN, [evoked_onset], int(pre_win * sr), int(post_win * sr)) IBI.append(next_onset - evoked_onset) all_evoked_onset.append(evoked_onset) all_prev_onset.append(prev_onset) else: IBI.append(np.nan) all_prev_onset.append(np.nan) all_evoked_onset.append(np.nan) evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1 ]) * np.nan if all_evoked_burst is None: all_evoked_burst = evoked_burst else: all_evoked_burst = np.concatenate( [all_evoked_burst, evoked_burst], axis=1) evoked_onset = np.array(all_evoked_onset) / sr prev_onset = np.array(all_prev_onset) / sr IBI = np.array(IBI) / sr return (all_evoked_burst, evoked_onset, prev_onset, IBI)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sgmcmc_ssm.models.gauss_hmm import GaussHMMSampler
from tqdm import tqdm

np.random.seed(12345)

# Load and Scale Data
from scipy.io import loadmat
ion_data = loadmat('data/alamethicin.mat')

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
observations = scaler.fit_transform(ion_data['originaldata'][1095:-3000])
filtered_observations = scaler.transform(ion_data['filtereddata'])
T = len(observations)

# Plot Data
fig, ax = plt.subplots(1, 1)
ax.plot(np.arange(T)[::50], observations[::50], '-', label='scaled data')
ax.plot(np.arange(T)[::50], filtered_observations[::50], '-', label='scaled filtered data')
ax.set_title('Scaled Ion Data')
ax.set_xlabel('Time')
ax.set_ylabel('Voltage (Scaled)')
ax.legend()
print(len(train), len(test))
print('--------------------------------------------------')
print('\n')

df_data['MONTH'] = [d.strftime('%m') for d in df_data.index]

from sklearn.preprocessing import RobustScaler

f_columns = [
    '2_prev', '3_prev', '4_prev', '5_prev', '6_prev', '7_prev', '8_prev',
    '9_prev', '10_prev', '11_prev', '12_prev', 'MONTH', 'HOUR', 'WEEKDAY',
    'WEEKEND', 'Demand Forecast', 'SPOT Market Volume', 'Wind Forecast',
    'RoR Forecast', 'Yuk Tahmin Planı (MWh)', 'Market Clearing Price'
]

f_transformer = RobustScaler()
cnt_transformer = RobustScaler()

f_transformer = f_transformer.fit(train[f_columns].to_numpy())
cnt_transformer = cnt_transformer.fit(train[['NetOrder']])

train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['NetOrder'] = cnt_transformer.transform(train[['NetOrder']])

test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['NetOrder'] = cnt_transformer.transform(test[['NetOrder']])


def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
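# The helper above is truncated. A minimal sketch of the usual body of such a windowing helper,
# assuming X and y are a pandas DataFrame/Series as prepared above; this completion is an
# assumption, not the original code.
import numpy as np

def create_dataset_sketch(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].values)   # window of the past time_steps rows
        ys.append(y.iloc[i + time_steps])              # target right after the window
    return np.array(Xs), np.array(ys)

# Example usage under the same assumption:
# X_train_w, y_train_w = create_dataset_sketch(train[f_columns], train['NetOrder'], time_steps=24)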
# In[12]:

# Z-score standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print("MSE=", mean_squared_error(Y_test, Y_est))


# In[13]:

# Robust scaling
scaler2 = RobustScaler()
X_train_scaled = scaler2.fit_transform(X_train)
X_test_scaled = scaler2.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print("MSE=", mean_squared_error(Y_test, Y_est))


# In[14]:

# Apply a non-linear correction to a specific feature
non_linear_feat = 5
X_train_new_feat = np.sqrt(X_train[:, non_linear_feat])
X_test_new_feat = np.sqrt(X_test[:, non_linear_feat])
X_train_new_feat.shape = (X_train_new_feat.shape[0], 1)
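# The cell ends after reshaping the train-side feature. A minimal sketch of how such a derived
# feature is typically appended and re-evaluated; the scaler choice and variable names here are
# assumptions, not the original notebook's next cell.
X_test_new_feat.shape = (X_test_new_feat.shape[0], 1)
X_train_ext = np.hstack([X_train, X_train_new_feat])
X_test_ext = np.hstack([X_test, X_test_new_feat])

scaler3 = RobustScaler()                               # assumed scaler choice
regressor = KNeighborsRegressor()
regressor.fit(scaler3.fit_transform(X_train_ext), Y_train)
Y_est = regressor.predict(scaler3.transform(X_test_ext))
print("MSE=", mean_squared_error(Y_test, Y_est))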
def elasticReg(data, test):
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.001, l1_ratio=.8, random_state=3))
    stackedVerData = data.copy()
    Y_res = data['AdjustedPrice']
    data = data.drop(['AdjustedPrice', 'Id'], axis=1)
    ENet.fit(data, Y_res)
    score = scoreTest(ENet, data, Y_res)
    print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    lasso.fit(data, Y_res)
    score = scoreTest(lasso, data, Y_res)
    print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    predId = test['Id']
    test = test.drop(['Id'], axis=1)
    y_pred_Enet = ENet.predict(test)
    y_pred_Lasso = lasso.predict(test)

    ENetS = make_pipeline(RobustScaler(), ElasticNet(alpha=0.001, l1_ratio=.8, random_state=3))
    lassoS = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    # KRS = KernelRidge(alpha=0.8, kernel="polynomial")
    # GBRS = GradientBoostingRegressor(loss="huber", n_estimators=5000, learning_rate=0.001)
    GBRS = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                     max_depth=4, max_features='sqrt',
                                     min_samples_leaf=15, min_samples_split=10,
                                     loss='huber', random_state=5)
    KRS = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

    # YResStacked = pd.DataFrame(stackedVerData['AdjustedPrice'])
    # YResStacked.insert(0, 'Id', range(0, len(YResStacked)))
    # YResStacked.set_index(YResStacked.columns[0], inplace=True)
    # dataStacked = stackedVerData.drop(['AdjustedPrice', 'Id'], axis=1)
    # dataStacked.insert(0, 'Id', range(0, len(dataStacked)))
    # dataStacked.set_index(dataStacked.columns[0], inplace=True)
    # YResStacked = stackedVerData['AdjustedPrice'].values
    YResStacked = pd.DataFrame(stackedVerData['AdjustedPrice'])
    dataStacked = stackedVerData.drop(['AdjustedPrice', 'Id'], axis=1)

    averageStackedModel = StackingAveragedModel(base_models=(ENet, KRS, GBRS), meta_model=lasso)
    averageStackedModel.fit(dataStacked, YResStacked)
    y_pred_stacked = averageStackedModel.predict(test.values)
    # score = scoreTest(averageStackedModel, data, Y_res)
    # print("\nAVGStacked score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    saveSubmission(predId, y_pred_stacked)
"""Get non NaN samples in column of X""" return X[:, [col]][~np.isnan(X[:, col])] @pytest.mark.parametrize( "est, func, support_sparse, strictly_positive, omit_kwargs", [ (MaxAbsScaler(), maxabs_scale, True, False, []), (MinMaxScaler(), minmax_scale, False, False, ["clip"]), (StandardScaler(), scale, False, False, []), (StandardScaler(with_mean=False), scale, True, False, []), (PowerTransformer("yeo-johnson"), power_transform, False, False, []), (PowerTransformer("box-cox"), power_transform, False, True, []), (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []), (RobustScaler(), robust_scale, False, False, []), (RobustScaler(with_centering=False), robust_scale, True, False, []), ], ) def test_missing_value_handling(est, func, support_sparse, strictly_positive, omit_kwargs): # check that the preprocessing method let pass nan rng = np.random.RandomState(42) X = iris.data.copy() n_missing = 50 X[rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)] = np.nan if strictly_positive: X += np.nanmin(X) + 0.1 X_train, X_test = train_test_split(X, random_state=1) # sanity check
eeg_dataset.head()

X = eeg_dataset[['alpha', 'betha', 'delta', 'gamma', 'theta']].values
y = eeg_dataset[['class']].values.ravel()

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=.7, test_size=.3, random_state=25)

# Feature scaling
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(x_train)                # fit on the training split only
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model architecture
max_features = 512
model = Sequential()
model.add(Embedding(max_features, output_dim=64))
model.add(LSTM(64))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))
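# A minimal sketch of compiling the binary classifier defined above. Note (assumption about intent):
# an Embedding layer expects non-negative integer token indices, so the robust-scaled floating-point
# features would need to be discretized, or the Embedding layer dropped, before training on x_train.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()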
dff = df.loc[(df.Date >= '4/29/2019') & (df.Date <= '5/3/2019')]
dff['Timestamp'] = dff['Date'] + ' ' + dff['Time']
dff['Timestamp'] = pd.to_datetime(dff['Timestamp'])
dff = dff.sort_values(by=['Timestamp'])

X = dff.loc[dff.index, ['Position', 'Count']].to_numpy()
# y_full = dff.loc[dff.index, ['Count']].to_numpy()
y_full = Y_full[:len(X)]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling', StandardScaler().fit_transform(X)),
    ('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)', PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)', PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)

# plasma does not exist in matplotlib < 1.5
axes[0].legend(fontsize=13)
axes[0].set_xlabel('Redshift (distance)', fontsize=18)

axes[1].set_xlim([18, 32])
axes[1].set_title('Magnitude distributions (5 filters)', fontsize=18)
axes[1].legend(fontsize=13)
axes[1].set_xlabel('Magnitudes (higher == fainter)', fontsize=18)

axes[2].set_xlim([-6, 3])
axes[2].set_title('Distributions of the log-error', fontsize=18)
axes[2].legend(fontsize=11)
axes[2].set_xlabel('Log of error', fontsize=18)

df_train.head()

RS = RobustScaler()

### The training features ###
feat_train = ['g', 'log_g_err', 'r', 'log_r_err', 'i', 'log_i_err',
              'z', 'log_z_err', 'y', 'log_y_err']

### The features for the validation set: each galaxy has 5 distinct feature sets, one per exposure time ###
feat_SN_1 = ['g_SN_1', 'log_g_err_SN_1', 'r_SN_1', 'log_r_err_SN_1', 'i_SN_1',
             'log_i_err_SN_1', 'z_SN_1', 'log_z_err_SN_1', 'y_SN_1', 'log_y_err_SN_1']
feat_SN_2 = ['g_SN_2', 'log_g_err_SN_2', 'r_SN_2', 'log_r_err_SN_2', 'i_SN_2',
             'log_i_err_SN_2', 'z_SN_2', 'log_z_err_SN_2', 'y_SN_2', 'log_y_err_SN_2']
    if 'Unnamed' in col:
        del test[col]

train.to_csv(path_or_buf=filepath + "/trainfinal.csv", index=False)
test.to_csv(path_or_buf=filepath + "/testfinal.csv", index=False)
print("Exported")

train = []
test = []

# Obtaining the columns required for training the model
train = pd.read_csv(filepath + "/trainfinal.csv")
test = pd.read_csv(filepath + "/testfinal.csv")
cols = [c for c in train.columns if c not in ['is_churn', 'msno']]

# Pre-processing the file with Robust Scaler
scaler = RobustScaler()
scaler.fit(train[cols])
train_x = scaler.transform(train[cols])
test_x = scaler.transform(test[cols])
train_y = train['is_churn']
print("Pre-processing completed")

# Training Random Forest Classifier
model = RandomForestClassifier(n_estimators=50)
model.fit(train_x, train_y)
print("Training Completed")

# Predicting the test data with the trained model
predictions = model.predict(test_x)

# Exporting the msno and predicted values to a csv file
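# The export described by the last comment is not shown. A minimal sketch of one way it might
# look; the output file name is an assumption.
submission = pd.DataFrame({'msno': test['msno'], 'is_churn': predictions})
submission.to_csv(filepath + "/submission.csv", index=False)   # assumed file name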
from sklearn.svm import SVR
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler

path = "/Users/xiaofeifei/I/Kaggle/Benz/"
train = pd.read_csv(path + 'train_start.csv')
# test = pd.read_csv(path+'test_start.csv')

y = train["y"]
train = train.drop(["y"], axis=1)

# poly
svm = SVR(kernel='rbf', C=1.0, epsilon=0.05)

a = RobustScaler()
train = a.fit_transform(train, y)

kr = GridSearchCV(SVR(kernel='rbf', C=1.0, epsilon=0.05), cv=5, n_jobs=6, verbose=1, scoring='r2',
                  param_grid={"C": [20, 30], "epsilon": [0.02, 0.03, 0.05, 0.07]})

kr.fit(train, y)

print(kr.best_params_)
print(kr.best_score_)
print(kr.best_estimator_)

# {'epsilon': 0.01, 'C': 30}
# 0.536811148843
def __init__(
    self,
    X_train: np.array,
    y_train: np.array,
    X_valid: np.array,
    y_valid: np.array,
    n: int = 10,
    eval_method: str = "kfold",
    scaler: str = "standard",
    modelpath: str = "models",
    max_evals: int = 250,
    voting: str = "hard",
    calibrate: str = "sigmoid",
    timeout: int = 600,
    max_workers: int = 16,
    experiment: Experiment = None,
):  # pylint:disable=too-many-arguments
    self.x = X_train
    self.y = y_train
    self.x_valid = X_valid
    self.y_valid = y_valid
    self.experiment = experiment

    # We make sure that everything is logged on comet
    assert isinstance(experiment, Experiment)
    assert len(self.x) == len(self.y)
    assert len(self.x_valid) == len(self.y_valid)

    self.n = n
    self.eval_method = eval_method

    if scaler == "robust":
        self.scalername = "robust"
        self.scaler = RobustScaler()
    elif scaler == "standard":
        self.scalername = "standard"
        self.scaler = StandardScaler()
    elif scaler == "minmax":
        self.scalername = "minmax"
        self.scaler = MinMaxScaler()

    self.x = self.scaler.fit_transform(self.x)
    self.x_valid = self.scaler.transform(self.x_valid)

    classcounter = dict(Counter(self.y))
    trainlogger.info("the class distribution is %s", classcounter)

    classes_to_keep = []
    for oxidationstate, count in classcounter.items():
        if count > MIN_SAMPLES:
            classes_to_keep.append(oxidationstate)
        else:
            trainlogger.warning(
                "will drop class %s since it does not have enough examples",
                oxidationstate,
            )

    selected_idx = np.where(np.isin(self.y, classes_to_keep))[0]
    self.x = self.x[selected_idx]
    self.y = self.y[selected_idx]

    self.max_evals = max_evals
    self.voting = voting
    self.timeout = timeout
    self.timings = []
    self.modelpath = modelpath
    self.mix_ratios = {"rand": 0.15, "tpe": 0.7, "anneal": 0.15}
    self.max_workers = max_workers
    self.calibrate = calibrate
    self.classes = [1, 2, 3, 4, 5, 6, 7, 8]
    self.y = self.y.astype(np.int)
    self.y_valid = self.y_valid.astype(np.int)
    trainlogger.info("initialized training class")
                                                    random_state=6)
hbo2_x, hbo2_x_test, hbo2_y, hbo2_y_test = train_test_split(hbo2_x, hbo2_y, test_size=0.1, random_state=6)
ca_x, ca_x_test, ca_y, ca_y_test = train_test_split(ca_x, ca_y, test_size=0.1, random_state=6)
na_x, na_x_test, na_y, na_y_test = train_test_split(na_x, na_y, test_size=0.1, random_state=6)

# scaling
scaler = RobustScaler()
# scaler = MinMaxScaler()

hhb_x = scaler.fit_transform(hhb_x)
hhb_x_test = scaler.transform(hhb_x_test)
x_pred_hhb = scaler.transform(x_pred_hhb)

hbo2_x = scaler.fit_transform(hbo2_x)
hbo2_x_test = scaler.transform(hbo2_x_test)
x_pred_hbo2 = scaler.transform(x_pred_hbo2)

ca_x = scaler.fit_transform(ca_x)
ca_x_test = scaler.transform(ca_x_test)
x_pred_ca = scaler.transform(x_pred_ca)

na_x = scaler.fit_transform(na_x)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3,
                                                     stratify=public_labels, random_state=1)

# Encode the labels
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
train_labels_encoded = encoder.fit_transform(y_train)
test_labels_encoded = encoder.transform(y_test)

# Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
scalers_to_test = [StandardScaler(), RobustScaler()]

# Designate distributions to sample hyperparameters from
C_range = np.array([9.78736006e+00, 2.23814334e+01, 1.00000000e-04, 1.00000000e-04,
                    1.74371223e+01, 1.00000000e-04, 2.96832303e-01, 1.06931597e+01,
                    8.90706391e+00, 1.75488618e+01, 1.49564414e+01, 1.06939267e+01,
                    1.00000000e-04, 7.94862668e+00, 3.14271995e+00, 1.00000000e-04,
                    1.41729905e+01, 8.07236535e+00, 4.54900806e-01, 1.00000000e-04,
                    1.00000000e-04, 1.99524074e+00, 4.68439119e+00, 1.00000000e-04,
                    1.16220405e+01, 1.00000000e-04, 1.00000000e-04, 1.03972709e+01,
                    1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
                    1.25523737e+01, 1.00000000e-04, 1.66095249e+01, 8.07308186e+00,
                    1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
                    2.08711336e+01, 1.64441230e+00, 1.15020554e+01, 1.00000000e-04,
                    1.81035130e+00, 1.17786194e+01, 1.00000000e-04, 1.03111446e+01,
X_train = np.vstack([X1, X2])

X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_test = np.hstack([[-1] * n_datapoints, [1] * n_datapoints])
X_test = np.vstack([X1, X2])

X_train[0, 0] = -1000  # a fairly large outlier

# Scale data
standard_scaler = StandardScaler()
Xtr_s = standard_scaler.fit_transform(X_train)
Xte_s = standard_scaler.transform(X_test)

robust_scaler = RobustScaler()
Xtr_r = robust_scaler.fit_transform(X_train)
Xte_r = robust_scaler.transform(X_test)   # transform only: do not refit the scaler on the test set

# Plot data
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(X_train[:, 0], X_train[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[0].set_title("Unscaled data")
ax[1].set_title("After standard scaling (zoomed in)")
ax[2].set_title("After robust scaling (zoomed in)")

# for the scaled data, we zoom in to the data center (outlier can't be seen!)
for a in ax[1:]:
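# A minimal sketch quantifying how the outlier affects each scaling, assuming Y_train holds the
# training labels built earlier in the script; the choice of a KNN classifier is an assumption,
# not necessarily the model used later in the original.
from sklearn.neighbors import KNeighborsClassifier

knn_s = KNeighborsClassifier().fit(Xtr_s, Y_train)
knn_r = KNeighborsClassifier().fit(Xtr_r, Y_train)
print("Accuracy with StandardScaler:", knn_s.score(Xte_s, Y_test))
print("Accuracy with RobustScaler:  ", knn_r.score(Xte_r, Y_test))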
test.drop('Id', axis=1, inplace=True)

x = train.drop('SalePrice', axis=1)   # drop the target feature from train
y = train['SalePrice']
test = test.drop('SalePrice', axis=1)

# known outliers (some from author notes and some from notebook guides)
outliers = [30, 88, 462, 631, 1322]
x = x.drop(x.index[outliers])
y = y.drop(y.index[outliers])

x = x.drop('MSSubClass_150', axis=1)
test = test.drop('MSSubClass_150', axis=1)

# RobustScaler normalizes the data so it is more robust to outliers.
sc = RobustScaler()
x = sc.fit_transform(x)
test = sc.transform(test)

# Train
model = Lasso(alpha=0.0005, random_state=1)   # other alphas were tried too
model.fit(x, y)

# Predict
pred = model.predict(test)
predFinal = np.exp(pred)   # revert the log transform

# Data export
output = pd.DataFrame({'Id': test2.Id, 'SalePrice': predFinal})
output.to_csv('submission.csv', index=False)
output.head()
new


# In[204]:

pd.concat([new, data], axis=1).drop('number', axis=1).rename(columns={'number1': 'number'})


# In[218]:

data1 = pd.get_dummies(data)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler


# In[219]:

scaler = RobustScaler()
scaler.fit(data1)
scaler.transform(data1).shape
pd.DataFrame(scaler.transform(data1))


# In[212]:

data1


# In[214]:

pd.DataFrame(scaler.transform(data1))


# In[221]:

train.isnull().sum().sum()
def __init__(self, attribs=None, scaler=RobustScaler()):
    self.attribs = attribs
    self.scaler = scaler
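# A minimal sketch of the kind of custom transformer such an __init__ usually belongs to: select a
# subset of columns and delegate to the wrapped scaler. The class name and the fit/transform bodies
# are assumptions for illustration, not the original class.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler

class ScaledAttributeSelector(BaseEstimator, TransformerMixin):   # hypothetical name
    def __init__(self, attribs=None, scaler=RobustScaler()):
        self.attribs = attribs
        self.scaler = scaler

    def fit(self, X, y=None):
        # restrict to the chosen columns (if any) and fit the wrapped scaler
        cols = X[self.attribs] if self.attribs is not None else X
        self.scaler.fit(cols)
        return self

    def transform(self, X):
        cols = X[self.attribs] if self.attribs is not None else X
        return self.scaler.transform(cols)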
devtest = './exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev = './exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train = './exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'

trainy, trainx = imdb_bag_of_word_libs.loadFeatsText(train)
trainy = imdb_bag_of_word_libs.kaldiID_2_LB(trainy)

evaly, evalx = imdb_bag_of_word_libs.loadFeatsText(dev)
evaly = imdb_bag_of_word_libs.kaldiID_2_LB(evaly)

evaly2, evalx2 = imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2 = imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)

robust_scaler = RobustScaler()
trainx = robust_scaler.fit_transform(trainx)
evalx = robust_scaler.transform(evalx)

clf = LinearDiscriminantAnalysis()
clf.fit(trainx, trainy)   # the classifier must be fit before predicting
predictValue = clf.predict(evalx)
print(semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEV))

evalx2 = robust_scaler.transform(evalx2)
predictValue = clf.predict(evalx2)
print(semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEVTEST))
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9597368421052632
exported_pipeline = make_pipeline(
    RobustScaler(),
    LogisticRegression(C=25.0, dual=True, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
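# A small note on the exported estimator: on recent scikit-learn versions, dual=True is only
# supported by the liblinear solver with an l2 penalty, while the default solver is lbfgs. A
# hedged variant that makes the solver explicit (the hyperparameters themselves are kept as
# exported by TPOT):
exported_pipeline = make_pipeline(
    RobustScaler(),
    LogisticRegression(C=25.0, dual=True, penalty="l2", solver="liblinear")
)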
# # Min-Max Scaler $\frac{x_i - min(x)}{max(x) - min(x)}$

# In[5]:

mms = MinMaxScaler()
views['minmax'] = mms.fit_transform(views[['views']])
views


# In[6]:

(vw[0] - np.min(vw)) / (np.max(vw) - np.min(vw))


# # Robust Scaler $\frac{x_i - median(x)}{IQR_{(1,3)}(x)}$

# In[7]:

rs = RobustScaler()
views['robust'] = rs.fit_transform(views[['views']])
views


# In[8]:

quartiles = np.percentile(vw, (25., 75.))
iqr = quartiles[1] - quartiles[0]
(vw[0] - np.median(vw)) / iqr
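# A minimal sketch checking that the manual median/IQR computation matches RobustScaler's output,
# assuming vw holds the raw 'views' column as an array (as the single-element check above suggests);
# with the default quantile_range=(25, 75), RobustScaler centers by the median and scales by the IQR.
manual = (vw - np.median(vw)) / iqr
print(np.allclose(views['robust'].values, manual))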
def run_basic_svm(X_train, y_train, selected_features, scorers, refit_scorer_name,
                  subset_share=0.1, n_splits=10, parameters=None):
    '''Run an extensive grid search over all parameters to find the best parameters for the SVM
    classifier. The search is done only on a subset of the training data; the default subset
    share is 0.1.'''

    # Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300   # minimal subset is 300 samples
    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        number_of_samples = subset_share * X_train.shape[0]

    X_train_subset, y_train_subset = modelutil.extract_data_subset(X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:
        # If no parameters have been defined, use the full definition.
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
        # Main set of parameters for grid search run 1: select scaler, sampler and kernel for the problem
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            NearMiss(version=1),
            EditedNearestNeighbours(),
            AllKNN(),
            CondensedNearestNeighbour(random_state=0),
            InstanceHardnessThreshold(random_state=0,
                                      estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,                      # default C=1
                'svm__kernel': ['linear', 'sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,                      # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]                  # only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,                      # default C=1
                'svm__kernel': ['rbf'],
                'svm__gamma': [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
                # only relevant for rbf, default='auto'=1/n_features
            }
        ]

        # If there are missing values, test several imputer strategies; otherwise one strategy is enough.
        if X_train.isna().sum().sum() > 0:
            # parameters is a list of parameter grids, so the imputer strategy has to be added to
            # every grid (indexing the list with a string key would raise a TypeError)
            for param_set in parameters:
                param_set['imputer__strategy'] = ['mean', 'median', 'most_frequent']
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                          ('scaler', StandardScaler()),
                          ('sampling', modelutil.Nosampler()),
                          ('feat', modelutil.ColumnExtractor(cols=None)),
                          ('svm', SVC())])

    print("Pipeline: ", pipe_run1)
    print("Stratified KFold={} used.".format(n_splits))
    skf = StratifiedKFold(n_splits=n_splits)

    params_run1 = parameters  # params_debug  # params_run1
    grid_search_run1 = GridSearchCV(pipe_run1, params_run1, verbose=1, cv=skf,
                                    scoring=scorers, refit=refit_scorer_name,
                                    return_train_score=True, iid=True,
                                    n_jobs=-1).fit(X_train_subset, y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1, params_run1, refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1