Example #1
def impute_mean(df, attr):
    """Imputes the given attribute of the given DataFrame with the mean strategy.
    Returns a DataFrame object"""
    imp = Imputer(missing_values="NaN", strategy="mean")
    imp.fit(df[[attr]])
    df[attr] = imp.transform(df[[attr]]).ravel()
    return df
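Note: the snippets on this page use the legacy sklearn.preprocessing.Imputer, which was deprecated in scikit-learn 0.20 and removed in 0.22. A minimal sketch of the same helper written against its replacement, sklearn.impute.SimpleImputer (the class referenced in commented-out lines of some examples below), might look like this; the function name is illustrative:

import numpy as np
from sklearn.impute import SimpleImputer

def impute_mean_simple(df, attr):
    """Impute NaNs in a single column with the column mean (SimpleImputer variant)."""
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    # fit_transform expects 2-D input, hence the double brackets; ravel() flattens back
    df[attr] = imp.fit_transform(df[[attr]]).ravel()
    return df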
Example #3
def clean(df, strategy='median'):
    '''Impute missing values in the float columns of df with the given strategy
    and return them recombined with the untouched object columns.'''
    imputer = Imputer(strategy=strategy)
    object_df = df.select_dtypes(include=['object'])
    float_df = df.select_dtypes(include=['float64'])
    imputer.fit(float_df)
    # Keep the original index so the concat below realigns rows correctly
    float_df = pd.DataFrame(imputer.transform(float_df),
                            columns=float_df.columns,
                            index=float_df.index)

    return pd.concat([object_df, float_df], axis=1)
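A minimal usage sketch for the clean helper above (the toy frame is illustrative; the legacy Imputer import only resolves on scikit-learn < 0.22):

import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer  # legacy API, removed in scikit-learn 0.22

raw = pd.DataFrame({'city': ['a', 'b', 'c'],
                    'size': [1.0, np.nan, 3.0]})
cleaned = clean(raw, strategy='median')
print(cleaned)  # the NaN in 'size' is replaced by the column median, 2.0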
def feature_inf(my_feature, dim_feature):

    from sklearn.preprocessing import Imputer
    # dim_feature is recomputed from the input, overriding the argument
    dim_feature = my_feature.shape[1]
    imp = Imputer(missing_values=np.inf, strategy='mean')
    # Fitting on an all-zero (2 x dim_feature) array makes every per-column "mean" 0,
    # so the transform below replaces each inf entry of my_feature with 0
    correction_array = np.zeros((2, dim_feature))
    imp.fit(correction_array)
    my_feature = imp.transform(my_feature)

    return my_feature
Example #5
    def preprocessData(self, data):
        '''
        Handle missing values and scale the data (scaling necessary for SVM to function well).

        :param data: All of the original data.
        :return: Data that has been processed.
        '''
        imputer = Imputer(missing_values=np.nan, strategy='mean')
        imputer.fit(data)
        imputedData = imputer.transform(data)  # NaN values are replaced by the column mean
        scaledData = preprocessing.scale(imputedData).tolist()

        return scaledData
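As a quick stand-alone illustration of what this method does (a sketch, not part of the original class; SimpleImputer stands in for the legacy Imputer), imputing and then calling sklearn.preprocessing.scale yields columns with zero mean and unit variance:

import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

data = np.array([[1.0, 10.0],
                 [np.nan, 20.0],
                 [3.0, 30.0]])
imputed = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(data)
scaled = preprocessing.scale(imputed)  # zero mean, unit variance per column
print(scaled.mean(axis=0))  # approximately [0. 0.]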
    def to_predict_instance(self, X, partition_columns):
        values_for_preferences = []
        for column in partition_columns:
            if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                values_for_preferences.append(list(X[column].unique()))
        all_combinations = list(itertools.product(
            *values_for_preferences))

        instances = []
        for combination in all_combinations:
            instance = []
            for column in X.columns:
                # if the column is a parameter within the preferences
                if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                    instance.append(
                        combination[list(partition_columns).index(column)])
                # if it is not in the preferences and is encoded
                elif len(column.split("#")) > 1:
                    instance.append(0)
                # if it is not in the preferences and is not encoded
                else:
                    instance.append(np.nan)
            imputer = Imputer(
                missing_values=np.nan, strategy='mean', axis=0)
            imputer = imputer.fit(X)
            instance = imputer.transform([instance])[0]
            instances.append(instance)
        return instances
Example #8
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    l = 100
    X = sparse_random_matrix(l, l, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_equal(
            imputer.transform(X.copy()), imputer_pickled.transform(X.copy()),
            "Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
def _check_statistics(X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    # Normal matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(X_trans, X_true, err_msg.format(0, False))

    # Normal matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        X_trans = imputer.transform(X.copy().transpose())
        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, False))

    # Sparse matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(X_trans, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose()))

        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()

        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, True))
Example #12
def modelo_4v():
    print(request.args)
    loaded_model, graph = cargarModelo_4v()

    # Load the CSV named by the "datacsv" query parameter
    datatest_name = request.args.get("datacsv")
    data_path = '../samples/' + datatest_name + '.csv'

    dataset = pd.read_csv(data_path, delimiter='\t')
    # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    sc = StandardScaler()

    X_ID = dataset.iloc[:, 0].values
    X_testing = dataset.iloc[:, 1:5].values
    # Imputation of missing (null) values
    imp = Imputer()
    imp.fit(X_testing)
    X_test = imp.transform(X_testing)
    X_test = sc.fit_transform(X_test)

    # Prediction
    with graph.as_default():
        y_pred = loaded_model.predict(X_test)
        resultado_final = ''
        for i in range(0, len(y_pred)):

            if y_pred[i] > 0.5:
                print(X_ID[i], ' --> Genera Valor!')
                resultado = str(X_ID[i]) + ' --> Genera Valor!! '
            else:
                print(X_ID[i], ' --> No genera Valor ')
                resultado = str(X_ID[i]) + ' --> No genera Valor '
            resultado_final = resultado_final + resultado + '\n'

        # print('Prediccion:', score, ' Gato ' if score < 0.5 else ' Perro')
        return resultado_final
Example #13
def test_imputation_copy():
    """Test imputation with copy=True."""
    l = 5

    # Test default behaviour and with copy=True
    for params in [{}, {'copy': True}]:
        X = sparse_random_matrix(l, l, density=0.75, random_state=0)

        # Sparse input
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X.todense() == Xt))

        # Dense input
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        X = X.todense()
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X == Xt))
#%% Mass mobilization data
mm = pd.read_csv(
    "/Users/danielgustafson/Documents/Grad/Fall 2018/Machine Learning/Final Project/full_mm.csv"
)

#%% Separate into X and y
ids = mm.iloc[:, 0:3]

X = mm.iloc[:, 4:]

y = mm.protests.values

#%% Imputing the feature data
imp = Imputer(missing_values=np.nan, strategy='median')
imp.fit(X)
X_impute = imp.transform(X)

#%% Scale data
# Get column names first
names = list(X)
# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit the scaler and transform the feature data
X_impute_scaled = scaler.fit_transform(X_impute)
X_impute_scaled = pd.DataFrame(X_impute_scaled, columns=names)

#%% Split the data
X_train, X_test, y_train, y_test = train_test_split(X_impute_scaled,
                                                    y,
                                                    test_size=0.2)
Example #16
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False, sparse csr, axis=1 => no copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=0 => no copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=1 => copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    assert_false(sparse.issparse(Xt))
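For reference, the same copy semantics with the modern SimpleImputer, which defaults to copy=True and so leaves the input untouched (a minimal sketch; the toy array is illustrative):

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan],
              [3.0, 4.0]])
Xt = SimpleImputer(strategy="mean").fit_transform(X)  # copy=True by default
print(np.isnan(X[0, 1]))  # True: the original array is not modified
print(Xt[0, 1])           # 4.0: the NaN is imputed with the column mean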
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
"""Reading the dataset.
    .iloc[...].values drops the row and column labels and returns a plain NumPy array."""
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1]
"""Removing the missing values strategy can be mean, median, most_frequent"""
from sklearn.preprocessing.imputation import Imputer
# from sklearn.impute import SimpleImputer

SI = Imputer(missing_values=np.nan, strategy='mean')
"""when we fit a model with 00data it calculates important parameters like mean etc from 
   given 00data , then when we transform another set using that model then it utilizes that 
   previous model. """
SI = SI.fit(X[:, 1:3])
X[:, 1:3] = SI.transform(X[:, 1:3])
"""we cant use english labels so we change it to 1,2,3 but it can give different weight 
    to columns so we change it to n different columns were n is number of types of entries 
    in categorical column"""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])

oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)

# splitting into test train
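The snippet ends before the split itself; a minimal continuation under the usual scikit-learn API (the test size and random seed are illustrative, not taken from the original script):

from sklearn.model_selection import train_test_split

# Hypothetical parameters: 80/20 split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)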