def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
    - along the two axes
    - with dense and sparse arrays

    Check that:
    - the statistics (mean, median, mode) are correct
    - the missing values are imputed correctly
    """
    # Assert message template; axis / sparseness are filled in per case below.
    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    # Normal matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(X_trans, X_true, err_msg.format(0, False))

    # Normal matrix, axis = 1
    # With axis=1 the imputer is fitted on the transposed data; if any
    # expected statistic is NaN, transform must raise instead of imputing.
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        X_trans = imputer.transform(X.copy().transpose())
        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, False))

    # Sparse matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    # Densify before comparing (transform may return sparse output).
    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(X_trans, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose()))

        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()

        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, True))
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
    - along the two axes
    - with dense and sparse arrays

    Check that:
    - the statistics (mean, median, mode) are correct
    - the missing values are imputed correctly
    """
    msg = "Parameters: strategy = %s, missing_values = %s, " \
          "axis = {0}, sparse = {1}" % (strategy, missing_values)

    def _densify(result):
        # Transform may return a sparse matrix; compare as dense.
        return result.toarray() if sparse.issparse(result) else result

    # When any expected statistic is NaN, axis=1 transforms must raise.
    expect_value_error = np.isnan(statistics).any()

    # Dense input, axis = 0.
    imp = Imputer(missing_values, strategy=strategy, axis=0)
    transformed = imp.fit(X).transform(X.copy())
    assert_array_equal(imp.statistics_, statistics, msg.format(0, False))
    assert_array_equal(transformed, X_true, msg.format(0, False))

    # Dense input, axis = 1 (fit on the transpose).
    imp = Imputer(missing_values, strategy=strategy, axis=1)
    imp.fit(X.transpose())
    if expect_value_error:
        assert_raises(ValueError, imp.transform, X.copy().transpose())
    else:
        transformed = imp.transform(X.copy().transpose())
        assert_array_equal(transformed, X_true.transpose(),
                           msg.format(1, False))

    # Sparse input, axis = 0.
    imp = Imputer(missing_values, strategy=strategy, axis=0)
    imp.fit(sparse.csc_matrix(X))
    transformed = _densify(imp.transform(sparse.csc_matrix(X.copy())))
    assert_array_equal(imp.statistics_, statistics, msg.format(0, True))
    assert_array_equal(transformed, X_true, msg.format(0, True))

    # Sparse input, axis = 1.
    imp = Imputer(missing_values, strategy=strategy, axis=1)
    imp.fit(sparse.csc_matrix(X.transpose()))
    if expect_value_error:
        assert_raises(ValueError, imp.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        transformed = _densify(
            imp.transform(sparse.csc_matrix(X.copy().transpose())))
        assert_array_equal(transformed, X_true.transpose(),
                           msg.format(1, True))
def preprocessData(self, data):
    """Impute missing values with column means, then standardize.

    :param data: Raw feature matrix possibly containing NaNs.
    :return: Standardized data as a nested list.
    """
    mean_imputer = Imputer(missing_values=np.nan, strategy='mean')
    mean_imputer.fit(data)
    # Every NaN cell takes on its column mean.
    filled = mean_imputer.transform(data)
    return preprocessing.scale(filled).tolist()
def to_predict_instance(self, X, partition_columns):
    """Build one mean-imputed instance per combination of preference values.

    For every combination of the unique values observed in the preference
    (partition) columns of ``X``, assemble an instance where:
    - preference columns take the combination's value,
    - encoded columns (name contains ``#``) default to 0,
    - every other column is NaN and is then mean-imputed from ``X``.

    :param X: DataFrame the imputer statistics are learned from.
    :param partition_columns: columns that define the preference space.
    :return: list of imputed instances, one per value combination.
    """
    values_for_preferences = []
    for column in partition_columns:
        if PreferenceProcessor.is_parameter_in_preferences(column,
                                                           partition_columns):
            values_for_preferences.append(list(X[column].unique()))
    all_combinations = list(itertools.product(*values_for_preferences))

    # The imputer depends only on X, so fit it once here; the original
    # re-created and re-fitted it inside the loop for every combination,
    # which was redundant work with identical results.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    imputer.fit(X)

    instances = []
    for combination in all_combinations:
        instance = []
        for column in X.columns:
            # Parameter that appears in the preferences.
            if PreferenceProcessor.is_parameter_in_preferences(
                    column, partition_columns):
                instance.append(
                    combination[list(partition_columns).index(column)])
            # Not in the preferences, but an encoded (one-hot style) column.
            elif len(column.split("#")) > 1:
                instance.append(0)
            # Not in the preferences and not encoded.
            else:
                instance.append(np.nan)
        instances.append(imputer.transform([instance])[0])
    return instances
def impute_mean(df, attr):
    """Imputes the given attribute of the given DataFrame with the mean
    strategy. Returns a DataFrame object (the input, mutated in place)."""
    mean_imputer = Imputer(missing_values="NaN", strategy="mean")
    column = df[[attr]]
    mean_imputer.fit(column)
    # Flatten the (n, 1) imputed matrix back into the column.
    df[attr] = mean_imputer.transform(column).ravel()
    return df
def clean(df, strategy='median'):
    '''Cleans DataFrame: imputes missing float values, keeps object columns.

    :param df: input DataFrame with object and float64 columns.
    :param strategy: imputation strategy passed to the Imputer.
    :return: DataFrame with float columns imputed, object columns untouched.
    '''
    imputer = Imputer(strategy=strategy)
    object_df = df.select_dtypes(include=['object'])
    float_df = df.select_dtypes(include=['float64'])
    imputer.fit(float_df)
    # Preserve the original index: the imputer returns a bare ndarray, and
    # without ``index=`` the new frame would get a fresh RangeIndex, making
    # the concat below misalign rows whenever ``df`` has a non-default index.
    float_df = pd.DataFrame(imputer.transform(float_df),
                            columns=float_df.columns,
                            index=float_df.index)
    return pd.concat([object_df, float_df], axis=1)
def feature_inf(my_feature, dim_feature):
    """Replace infinite entries in ``my_feature`` with 0.

    The imputer is fitted on an all-zero placeholder array, so every cell
    matching ``missing_values=np.inf`` is replaced by the column mean of
    zeros, i.e. 0. Note the ``dim_feature`` argument is immediately
    overwritten from the data's shape and therefore ignored.
    """
    from sklearn.preprocessing.imputation import Imputer

    dim_feature = my_feature.shape[1]
    imp = Imputer(missing_values=np.inf, strategy='mean')
    # Two rows of zeros per feature column: fitting on this makes every
    # learned statistic 0, so the transform maps inf -> 0.
    placeholder = np.asarray([0] * (2 * dim_feature)).reshape(2, dim_feature)
    imp.fit(placeholder)
    # Preprocessing to get rid of NaN, infinity, etc.
    return imp.transform(my_feature)
def preprocessData(self, data):
    '''
    Handle missing values and scale the data (scaling necessary for SVM to function well).
    :param data: All of the original data.
    :return: Data that has been processed.
    '''
    imp = Imputer(missing_values=np.nan, strategy='mean')
    # NaN values take on their column mean.
    imputed = imp.fit(data).transform(data)
    scaled = preprocessing.scale(imputed)
    return scaled.tolist()
def test_imputation_pickle():
    # Test for pickling imputers: a fitted imputer must survive a pickle
    # round-trip and keep transforming identically.
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        restored = pickle.loads(pickle.dumps(imputer))

        assert_array_equal(
            imputer.transform(X.copy()),
            restored.transform(X.copy()),
            "Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
def test_imputation_pickle():
    """Test for pickling imputers."""
    import pickle

    l = 100
    X = sparse_random_matrix(l, l, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        # Round-trip the fitted imputer through pickle.
        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        # The unpickled imputer must transform exactly like the original.
        assert_array_equal(imputer.transform(X.copy()),
                           imputer_pickled.transform(X.copy()),
                           "Fail to transform the data after pickling "
                           "(strategy = %s)" % (strategy))
def test_mice_missing_at_transform():
    """Features with no missing values at fit time must be handled by the
    initial imputer alone at transform time."""
    n_rows, n_cols = 100, 10
    X_train = np.random.randint(low=0, high=3, size=(n_rows, n_cols))
    X_test = np.random.randint(low=0, high=3, size=(n_rows, n_cols))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0   # definitely missing value in 0th column

    for strategy in ["mean", "median", "most_frequent"]:
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           initial_strategy=strategy).fit(X_train)
        initial_imputer = Imputer(missing_values=0,
                                  strategy=strategy).fit(X_train)

        # If there were no missing values at time of fit, then mice will
        # only use the initial imputer for that feature at transform.
        assert np.all(mice.transform(X_test)[:, 0]
                      == initial_imputer.transform(X_test)[:, 0])
def modelo_4v():
    """Flask endpoint: load the 4-variable model, impute/scale the CSV named
    by the ``datacsv`` query parameter, and return one verdict line per row.
    """
    print(request.args)
    loaded_model, graph = cargarModelo_4v()

    # CSV to score, selected via the ``datacsv`` query argument.
    datatest_name = request.args.get("datacsv")
    data_path = '../samples/' + datatest_name + '.csv'
    dataset = pd.read_csv(data_path, delimiter='\t')

    sc = StandardScaler()
    # Impute missing (null) values with the default mean strategy.
    # (The original created a second, unused Imputer — one is enough.)
    imp = Imputer()

    X_ID = dataset.iloc[:, 0].values        # first column: row identifiers
    X_testing = dataset.iloc[:, 1:5].values  # the 4 feature columns

    imp.fit(X_testing)
    X_test = imp.transform(X_testing)
    X_test = sc.fit_transform(X_test)

    # Prediction under the model's TensorFlow graph.
    with graph.as_default():
        y_pred = loaded_model.predict(X_test)

    # Build the response with a list + join instead of quadratic
    # string concatenation.
    lines = []
    for i in range(len(y_pred)):
        if y_pred[i] > 0.5:
            print(X_ID[i], ' --> Genera Valor!')
            resultado = str(X_ID[i]) + ' --> Genera Valor!! '
        else:
            print(X_ID[i], ' --> No genera Valor ')
            resultado = str(X_ID[i]) + ' --> No genera Valor '
        lines.append(resultado + '\n')
    return ''.join(lines)
#%% Mass mobilization data mm = pd.read_csv( "/Users/danielgustafson/Documents/Grad/Fall 2018/Machine Learning/Final Project/full_mm.csv" ) #%% Separate into X and y ids = mm.iloc[:, 0:3] X = mm.iloc[:, 4:] y = mm.protests.values #%% Imputing the feature data imp = Imputer(missing_values=np.nan, strategy='median') imp.fit(X) X_impute = imp.transform(X) #%% Scale data # Get column names first names = list(X) # Create the Scaler object scaler = preprocessing.StandardScaler() # Fit your data on the scaler object X_impute_scaled = scaler.fit_transform(X_impute) X_impute_scaled = pd.DataFrame(X_impute_scaled, columns=names) #%% Split the data X_train, X_test, y_train, y_test = train_test_split(X_impute_scaled, y, test_size=0.2, random_state=1523)
#print("feature's name: ",[col for col in test_features.columns # if col not in train_features.columns]) #train_features,test_features = train_features.align(test_features, # join='left', # axis = 1) missing_cols_train = [ col for col in train_features.columns if train_features[col].isnull().any() ] print('missing features:' + str(missing_cols_train)) #print(train_features.LotFrontage) # 缺失值处理 my_imputer = Imputer(strategy='median') train_features = my_imputer.fit_transform(train_features) test_features = my_imputer.transform(test_features) #print(train_features.LotFrontage) #print("features num : "+len(train_features.columns)) ## 训练数据集分割成训练集和测试集,用于测试 X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, train_size=0.8, test_size=0.2, random_state=0) # 训练XGBOOST model = XGBRegressor(max_depth=7, learning_rate=0.1, Missing=None) model.fit(X_train, y_train, verbose=False) predictions = model.predict(X_test)
# категории print("building train") train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix() imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0) train_cat_matr = imp.fit_transform(train_cat_matr) # imp2 = Imputer(missing_values='NaN', strategy='median') train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix() # train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix() # train_noncat_matr = imp2.fit_transform(train_noncat_matr) # allf = np.hstack((train_cat_matr, train_noncat_matr)) print("building test") test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix() test_cat_matr = imp.transform(test_cat_matr) test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix() # test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix() # test_noncat_matr = imp2.transform(test_noncat_matr) # test_extra_matr = build_extra_features(test_noncat_matr[:,:10]) # test_noncat_matr = np.hstack((test_noncat_matr, test_extra_matr)) print("One-hot-encoding") enc = OneHotEncoder(categorical_features=range(CAT_COUNT)) preprocessed_features = np.hstack((train_cat_matr, train_noncat_matr)) enc_train_df = enc.fit_transform(preprocessed_features) print("test") enc_test_df = enc.transform(np.hstack((test_cat_matr, test_noncat_matr)))
# категории print("building train") train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix() imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0) train_cat_matr = imp.fit_transform(train_cat_matr) # imp2 = Imputer(missing_values='NaN', strategy='median') train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix() # train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix() # train_noncat_matr = imp2.fit_transform(train_noncat_matr) # allf = np.hstack((train_cat_matr, train_noncat_matr)) print("building test") test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix() test_cat_matr = imp.transform(test_cat_matr) test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix() # test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix() # test_noncat_matr = imp2.transform(test_noncat_matr) # test_extra_matr = build_extra_features(test_noncat_matr[:,:10]) # test_noncat_matr = np.hstack((test_noncat_matr, test_extra_matr)) print("One-hot-encoding") enc = OneHotEncoder(categorical_features=range(CAT_COUNT)) preprocessed_features = np.hstack((train_cat_matr, train_noncat_matr)) enc_train_df = enc.fit_transform(preprocessed_features) print("test") enc_test_df = enc.transform(np.hstack((test_cat_matr, test_noncat_matr)))
# Columns in the training set that contain at least one missing value.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

# imputer
my_imputer = Imputer()
# Fit first, then transform:
# fit: given only X_train, runs the unsupervised step (e.g. dimensionality
#      reduction, feature extraction, standardization) to learn parameters
# transform: depends on the object — for an Imputer() it performs the
#            imputation; for a StandardScaler() it would standardize
#            (also after a fit)
#print(len(X_train.columns))
imputed_X_train = my_imputer.fit_transform(X_train)
#print(len(imputed_X_train[0,:]))
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

# Data to be imputed, extended with missing-value indicator columns.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()
cols_with_missing = (col for col in X_train.columns
                     if X_train[col].isnull().any())
# Instead of dropping columns with missing values, flag them: False where
# a value is present, True where it is missing.
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
import numpy as np  # required: np.nan is used below but was never imported
import pandas as pd

"""Reading the dataset
1. iloc
.values removes the column and row labels
"""
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1]

"""Removing the missing values
strategy can be mean, median, most_frequent"""
from sklearn.preprocessing.imputation import Imputer
# from sklearn.impute import SimpleImputer
SI = Imputer(missing_values=np.nan, strategy='mean')
"""when we fit a model with 00data it calculates important parameters like
mean etc from given 00data , then when we transform another set using that
model then it utilizes that previous model.
"""
SI = SI.fit(X[:, 1:3])
X[:, 1:3] = SI.transform(X[:, 1:3])

"""we cant use english labels so we change it to 1,2,3 but it can give
different weight to columns so we change it to n different columns were n is
number of types of entries in categorical column"""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])
oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)

# splitting into test train
from sklearn.model_selection import train_test_split