from sklearn.preprocessing import Imputer

def impute_mean(df, attr):
    """Impute the given attribute of the given DataFrame with the mean
    strategy. Returns the modified DataFrame."""
    # Imputer is the pre-0.20 scikit-learn API; "NaN" was its legacy
    # spelling for np.nan as the missing-value marker.
    imp = Imputer(missing_values="NaN", strategy="mean")
    imp.fit(df[[attr]])
    df[attr] = imp.transform(df[[attr]]).ravel()
    return df
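# A minimal sketch (not from the original corpus) of the same helper on
# scikit-learn >= 0.22, where Imputer was removed in favour of
# sklearn.impute.SimpleImputer; the function name impute_mean_simple is
# hypothetical.
import numpy as np
from sklearn.impute import SimpleImputer

def impute_mean_simple(df, attr):
    """Mean-impute one DataFrame column using the modern SimpleImputer API."""
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    df[attr] = imp.fit_transform(df[[attr]]).ravel()
    return df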
import pandas as pd
from sklearn.preprocessing import Imputer

def clean(df, strategy='median'):
    '''Clean a DataFrame: impute its float columns with the given
    strategy and pass its object (string) columns through unchanged.'''
    imputer = Imputer(strategy=strategy)
    object_df = df.select_dtypes(include=['object'])
    float_df = df.select_dtypes(include=['float64'])
    imputer.fit(float_df)
    # Keep the original index so the concat below re-aligns correctly
    # even when df does not have a default RangeIndex.
    float_df = pd.DataFrame(imputer.transform(float_df),
                            columns=float_df.columns, index=float_df.index)
    return pd.concat([object_df, float_df], axis=1)
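# Hypothetical usage of clean() above (not from the original corpus),
# assuming the legacy Imputer's default missing_values="NaN", i.e.
# np.nan in the float columns:
import numpy as np
import pandas as pd

df = pd.DataFrame({'city': ['a', 'b', 'c'],
                   'temp': [1.0, np.nan, 3.0]})
print(clean(df))  # the NaN in 'temp' becomes the column median, 2.0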
def feature_inf(my_feature, dim_feature):
    """Replace infinite entries in the feature matrix with 0.

    Fitting the Imputer on an all-zero array pins every column "mean"
    to 0, so transform then swaps each np.inf for 0."""
    import numpy as np
    from sklearn.preprocessing import Imputer
    dim_feature = my_feature.shape[1]  # the argument is ignored and recomputed
    imp = Imputer(missing_values=np.inf, strategy='mean')
    correction_array = np.zeros((2, dim_feature))
    imp.fit(correction_array)
    # preprocessing to get rid of infinities
    my_feature = imp.transform(my_feature)
    return my_feature
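# A more direct sketch of the same correction (an assumption, not the
# original author's code) without the zero-fit round-trip: mark
# infinities as NaN, then mean-impute with the modern SimpleImputer.
# Note the column means here come from the finite values rather than
# being pinned to 0, which may or may not be what was intended.
import numpy as np
from sklearn.impute import SimpleImputer

def feature_inf_simple(my_feature):
    # Mark infinities as missing, then impute with each column's mean.
    X = np.where(np.isinf(my_feature), np.nan, my_feature)
    return SimpleImputer(strategy='mean').fit_transform(X)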
def preprocessData(self, data):
    '''
    Handle missing values and scale the data (scaling is necessary for
    the SVM to function well).

    :param data: All of the original data.
    :return: Data that has been processed.
    '''
    imputer = Imputer(missing_values=np.nan, strategy='mean')
    imputer.fit(data)
    imputedData = imputer.transform(data)  # NaN values take on the column mean
    scaledData = preprocessing.scale(imputedData).tolist()
    return scaledData
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False, sparse csr, axis=1 => no copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean",
                      copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=0 => no copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0], strategy="mean",
                      copy=False, axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean",
                      copy=False, axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=1 => copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0], strategy="mean",
                      copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    assert_false(sparse.issparse(Xt))
def to_predict_instance(self, X, partition_columns):
    values_for_preferences = []
    for column in partition_columns:
        if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
            values_for_preferences.append(list(X[column].unique()))
    all_combinations = list(itertools.product(*values_for_preferences))
    instances = []
    for combination in all_combinations:
        instance = []
        for column in X.columns:
            # the column is a parameter within the preferences
            if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                instance.append(
                    combination[list(partition_columns).index(column)])
            # not in the preferences, but one-hot encoded (name contains '#')
            elif len(column.split("#")) > 1:
                instance.append(0)
            # not in the preferences and not encoded: leave for imputation
            else:
                instance.append(np.nan)
        imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        imputer = imputer.fit(X)
        instance = imputer.transform([instance])[0]
        instances.append(instance)
    return instances
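# A small refactoring sketch (an assumption, using the legacy Imputer
# API above): the fitted statistics depend only on X, so the imputer
# can be fit once before the combination loop instead of once per
# combination, with identical results.
imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0).fit(X)
# ...then, inside the loop over combinations, only:
# instance = imputer.transform([instance])[0]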
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    l = 100
    X = sparse_random_matrix(l, l, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            "Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
def test_imputation_pickle(): """Test for pickling imputers.""" import pickle l = 100 X = sparse_random_matrix(l, l, density=0.10) for strategy in ["mean", "median", "most_frequent"]: imputer = Imputer(missing_values=0, strategy=strategy) imputer.fit(X) imputer_pickled = pickle.loads(pickle.dumps(imputer)) assert_array_equal(imputer.transform(X.copy()), imputer_pickled.transform(X.copy()), "Fail to transform the data after pickling " "(strategy = %s)" % (strategy))
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    # Normal matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(X_trans, X_true, err_msg.format(0, False))

    # Normal matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        X_trans = imputer.transform(X.copy().transpose())
        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, False))

    # Sparse matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(X_trans, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose()))

        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()

        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, True))
def modelo_4v():
    print(request.args)
    loaded_model, graph = cargarModelo_4v()

    # Load the test CSV named by the "datacsv" query parameter.
    datatest_name = request.args.get("datacsv")
    data_path = '../samples/' + datatest_name + '.csv'
    dataset = pd.read_csv(data_path, delimiter='\t')

    X_ID = dataset.iloc[:, 0].values
    X_testing = dataset.iloc[:, 1:5].values

    # Impute missing (null) values.
    # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = Imputer()
    imp.fit(X_testing)
    X_test = imp.transform(X_testing)

    # Scale the features.
    sc = StandardScaler()
    X_test = sc.fit_transform(X_test)

    # Prediction.
    with graph.as_default():
        y_pred = loaded_model.predict(X_test)

    resultado_final = ''
    for i in range(0, len(y_pred)):
        if y_pred[i] > 0.5:
            print(X_ID[i], ' --> Genera Valor!')
            resultado = str(X_ID[i]) + ' --> Genera Valor!! '
        else:
            print(X_ID[i], ' --> No genera Valor ')
            resultado = str(X_ID[i]) + ' --> No genera Valor '
        resultado_final = resultado_final + resultado + '\n'

    return resultado_final
def test_imputation_copy(): """Test imputation with copy=True.""" l = 5 # Test default behaviour and with copy=True for params in [{}, {'copy': True}]: X = sparse_random_matrix(l, l, density=0.75, random_state=0) # Dense imputer = Imputer(missing_values=0, strategy="mean", **params) Xt = imputer.fit(X).transform(X) Xt[0, 0] = np.nan # Check that the objects are different and that they don't use # the same buffer assert_false(np.all(X.todense() == Xt)) # Sparse imputer = Imputer(missing_values=0, strategy="mean", **params) X = X.todense() Xt = imputer.fit(X).transform(X) Xt[0, 0] = np.nan # Check that the objects are different and that they don't use # the same buffer assert_false(np.all(X == Xt))
#%% Mass mobilization data
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

mm = pd.read_csv(
    "/Users/danielgustafson/Documents/Grad/Fall 2018/Machine Learning/Final Project/full_mm.csv"
)

#%% Separate into X and y
ids = mm.iloc[:, 0:3]
X = mm.iloc[:, 4:]
y = mm.protests.values

#%% Imputing the feature data
imp = Imputer(missing_values=np.nan, strategy='median')
imp.fit(X)
X_impute = imp.transform(X)

#%% Scale data
# Get column names first
names = list(X)
# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
X_impute_scaled = scaler.fit_transform(X_impute)
X_impute_scaled = pd.DataFrame(X_impute_scaled, columns=names)

#%% Split the data
X_train, X_test, y_train, y_test = train_test_split(X_impute_scaled, y,
                                                    test_size=0.2)
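#%% Pipeline sketch
# A sketch (not in the original script) of the same median-impute +
# standard-scale preprocessing as a single scikit-learn Pipeline on the
# modern API; X_train_raw / X_test_raw are hypothetical raw splits.
# Fitting both steps on the training split only avoids leaking test-set
# statistics into the scaler and imputer.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

prep = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])
X_train_prep = prep.fit_transform(X_train_raw)  # fit on training data only
X_test_prep = prep.transform(X_test_raw)        # reuse fitted statistics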
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

"""Reading the dataset
1. iloc selects rows/columns by position; .values removes the column
   and row labels
"""
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1]

"""Removing the missing values
strategy can be mean, median, most_frequent"""
from sklearn.preprocessing import Imputer
# from sklearn.impute import SimpleImputer

SI = Imputer(missing_values=np.nan, strategy='mean')

"""When we fit a model with data, it calculates important parameters
such as the mean from that data; when we then transform another set
with the same model, it reuses those fitted parameters."""
SI = SI.fit(X[:, 1:3])
X[:, 1:3] = SI.transform(X[:, 1:3])

"""We can't use English labels directly, so we encode them as 1, 2, 3;
but that can give different weights to the categories, so we expand the
column into n indicator columns, where n is the number of distinct
entries in the categorical column."""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])
oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)

# splitting into test train
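# A hedged sketch of the same preprocessing on scikit-learn >= 0.22,
# where Imputer and OneHotEncoder's categorical_features argument were
# removed. It starts again from the raw X loaded above (before the
# legacy encoding); the column indices (0 for the category, 1:3 for the
# numerics) are carried over from the snippet.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0]),                     # expand column 0 into indicators
     ('impute', SimpleImputer(strategy='mean'), [1, 2])],  # mean-impute the numeric columns
    remainder='passthrough')
X = ct.fit_transform(X)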