def _check_statistics(X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Runs the checks along both axes and with both dense and sparse input,
    verifying that the fitted statistics (mean, median, mode) are correct
    and that the missing values are imputed as expected.
    """
    err_msg = ("Parameters: strategy = %s, missing_values = %s, "
               "axis = {0}, sparse = {1}" % (strategy, missing_values))
    stats_contain_nan = np.isnan(statistics).any()

    # Dense matrix, axis = 0: fit + transform must reproduce X_true.
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    X_dense_trans = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(X_dense_trans, X_true, err_msg.format(0, False))

    # Dense matrix, axis = 1: same data, transposed.
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if stats_contain_nan:
        # A row that cannot be imputed must raise at transform time.
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        X_dense_trans = imputer.transform(X.copy().transpose())
        assert_array_equal(X_dense_trans, X_true.transpose(),
                           err_msg.format(1, False))

    # Sparse matrix, axis = 0.
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    X_sparse_trans = imputer.transform(sparse.csc_matrix(X.copy()))
    if sparse.issparse(X_sparse_trans):
        X_sparse_trans = X_sparse_trans.toarray()
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(X_sparse_trans, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1.
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if stats_contain_nan:
        assert_raises(ValueError, imputer.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        X_sparse_trans = imputer.transform(
            sparse.csc_matrix(X.copy().transpose()))
        if sparse.issparse(X_sparse_trans):
            X_sparse_trans = X_sparse_trans.toarray()
        assert_array_equal(X_sparse_trans, X_true.transpose(),
                           err_msg.format(1, True))
def _check_statistics(X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Covers every combination of axis (0 and 1) and input format (dense
    and sparse), checking both the fitted statistics (mean, median, mode)
    and the imputed output.
    """
    err_msg = ("Parameters: strategy = %s, missing_values = %s, "
               "axis = {0}, sparse = {1}" % (strategy, missing_values))
    has_nan_stats = np.isnan(statistics).any()

    # --- Dense input, axis = 0 ---
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    transformed = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(transformed, X_true, err_msg.format(0, False))

    # --- Dense input, axis = 1 (operate on the transpose) ---
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if has_nan_stats:
        # Rows whose statistic is NaN cannot be imputed -> ValueError.
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        transformed = imputer.transform(X.copy().transpose())
        assert_array_equal(transformed, X_true.transpose(),
                           err_msg.format(1, False))

    # --- Sparse input, axis = 0 ---
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    transformed = imputer.transform(sparse.csc_matrix(X.copy()))
    if sparse.issparse(transformed):
        transformed = transformed.toarray()
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(transformed, X_true, err_msg.format(0, True))

    # --- Sparse input, axis = 1 ---
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if has_nan_stats:
        assert_raises(ValueError, imputer.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        transformed = imputer.transform(
            sparse.csc_matrix(X.copy().transpose()))
        if sparse.issparse(transformed):
            transformed = transformed.toarray()
        assert_array_equal(transformed, X_true.transpose(),
                           err_msg.format(1, True))
    def preprocessData(self, data):
        """Impute missing values with the column mean, then standard-scale.

        :param data: raw input data (may contain NaN entries).
        :return: scaled data as a list of rows.
        """
        mean_imputer = Imputer(missing_values=np.nan, strategy='mean')
        # fit + transform on the same data: NaN entries become column means.
        imputed = mean_imputer.fit_transform(data)
        return preprocessing.scale(imputed).tolist()
# Example 4
    def to_predict_instance(self, X, partition_columns):
        """Build one imputed instance per combination of preference values.

        For every combination of unique values of the preference
        (partition) columns, constructs a row where preference columns get
        the combination's value, one-hot-encoded columns ("#" in the name)
        get 0, and all remaining columns get NaN, which is then imputed
        with the column mean learned from ``X``.

        :param X: DataFrame the imputer statistics are computed from.
        :param partition_columns: columns defining the preference partition.
        :return: list of 1-D imputed instances, one per combination.
        """
        values_for_preferences = []
        for column in partition_columns:
            if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                values_for_preferences.append(list(X[column].unique()))
        all_combinations = list(itertools.product(
            *values_for_preferences))

        # The imputer depends only on X, so fit it once here instead of
        # refitting it on every loop iteration (loop-invariant hoist).
        imputer = Imputer(
            missing_values=np.nan, strategy='mean', axis=0)
        imputer = imputer.fit(X)

        instances = []
        for combination in all_combinations:
            instance = []
            for column in X.columns:
                # Parameter that belongs to the preferences: take its value
                # from the current combination.
                if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                    instance.append(
                        combination[list(partition_columns).index(column)])
                # Not in the preferences but one-hot encoded.
                elif len(column.split("#")) > 1:
                    instance.append(0)
                # Not in the preferences and not encoded: impute with mean.
                else:
                    instance.append(np.nan)
            instance = imputer.transform([instance])[0]
            instances.append(instance)
        return instances
# Example 5
def impute_mean(df, attr):
    """Imputes the given attribute of the given DataFrame with the mean strategy.
    Returns a DataFrame object"""
    imputer = Imputer(missing_values="NaN", strategy="mean")
    # fit + transform on the same single-column frame, flattened back
    # into the column.
    df[attr] = imputer.fit_transform(df[[attr]]).ravel()
    return df
# Example 6
def clean(df, strategy='median'):
    '''Cleans DataFrame.

    Imputes the float64 columns with the given strategy and passes the
    object (string) columns through untouched.
    '''
    object_part = df.select_dtypes(include=['object'])
    float_part = df.select_dtypes(include=['float64'])
    filled = Imputer(strategy=strategy).fit_transform(float_part)
    float_part = pd.DataFrame(filled, columns=float_part.columns)
    return pd.concat([object_part, float_part], axis=1)
def feature_inf(my_feature, dim_feature=None):
    """Replace ``np.inf`` entries of ``my_feature`` with 0.

    The imputer is fitted on an all-zero array, so the learned per-column
    "mean" statistic is 0 and every ``np.inf`` entry is replaced by 0.

    NOTE(review): despite the original comment, only ``np.inf`` is
    handled here -- NaN and ``-np.inf`` pass through untouched.

    :param my_feature: 2-D feature array to clean.
    :param dim_feature: kept for backward compatibility; it was always
        overwritten by ``my_feature.shape[1]`` and is ignored.
    :return: the cleaned feature array.
    """
    from sklearn.preprocessing.imputation import Imputer

    dim_feature = my_feature.shape[1]
    imp = Imputer(missing_values=np.inf, strategy='mean')
    # Two all-zero rows are enough for the imputer to learn a 0 statistic
    # for every column.
    correction_array = np.zeros((2, dim_feature))
    imp.fit(correction_array)
    my_feature = imp.transform(my_feature)

    return my_feature
# Example 8
    def preprocessData(self, data):
        '''
        Handle missing values and scale the data (scaling necessary for SVM to function well).

        :param data: All of the original data.
        :return: Data that has been processed.
        '''
        mean_imputer = Imputer(missing_values=np.nan, strategy='mean')
        # fit + transform on the same data: NaN entries take the column mean.
        imputed = mean_imputer.fit_transform(data)
        return preprocessing.scale(imputed).tolist()
# Example 9
def test_imputation_pickle():
    # A pickle round-trip must not change a fitted imputer's behavior.
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        unpickled = pickle.loads(pickle.dumps(imputer))

        assert_array_equal(imputer.transform(X.copy()),
                           unpickled.transform(X.copy()),
                           "Fail to transform the data after pickling "
                           "(strategy = %s)" % (strategy))
def test_imputation_pickle():
    """Test for pickling imputers."""
    import pickle

    n = 100
    X = sparse_random_matrix(n, n, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        fitted = Imputer(missing_values=0, strategy=strategy)
        fitted.fit(X)

        # Serialize and restore, then compare transforms on the same data.
        restored = pickle.loads(pickle.dumps(fitted))

        assert_array_equal(
            fitted.transform(X.copy()), restored.transform(X.copy()),
            "Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
# Example 11
def test_mice_missing_at_transform():
    # Features with no missing values at fit time must fall back to the
    # initial imputer at transform time.
    n_samples, n_features = 100, 10
    X_fit = np.random.randint(low=0, high=3, size=(n_samples, n_features))
    X_new = np.random.randint(low=0, high=3, size=(n_samples, n_features))

    X_fit[:, 0] = 1  # definitely no missing values in 0th column
    X_new[0, 0] = 0  # definitely missing value in 0th column

    for strategy in ["mean", "median", "most_frequent"]:
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           initial_strategy=strategy).fit(X_fit)
        initial_imputer = Imputer(missing_values=0,
                                  strategy=strategy).fit(X_fit)

        # Column 0 had no missing values during fit, so MICE should use
        # only the initial imputer for it.
        assert np.all(mice.transform(X_new)[:, 0]
                      == initial_imputer.transform(X_new)[:, 0])
# Example 12
def modelo_4v():
    """Flask endpoint: run the 4-variable model on the CSV named by ?datacsv=.

    Reads ``../samples/<datacsv>.csv`` (tab-separated), imputes missing
    values, standard-scales the 4 feature columns, and returns one result
    line per row ("Genera Valor" / "No genera Valor").
    """
    print(request.args)
    loaded_model, graph = cargarModelo_4v()

    # CSV file name comes from the "datacsv" query parameter.
    datatest_name = request.args.get("datacsv")
    data_path = '../samples/' + datatest_name + '.csv'

    dataset = pd.read_csv(data_path, delimiter='\t')
    sc = StandardScaler()

    X_ID = dataset.iloc[:, 0].values        # first column: row identifiers
    X_testing = dataset.iloc[:, 1:5].values  # columns 1-4: model features

    # Impute missing (null) values.
    # Fix: the original created a second, unused Imputer() before this one.
    imp = Imputer()
    imp.fit(X_testing)
    X_test = imp.transform(X_testing)
    X_test = sc.fit_transform(X_test, )

    # Prediction
    with graph.as_default():
        y_pred = loaded_model.predict(X_test)
        resultado_final = ''
        for i in range(0, len(y_pred)):

            if y_pred[i] > 0.5:
                print(X_ID[i], ' --> Genera Valor!')
                resultado = str(X_ID[i]) + ' --> Genera Valor!! '
            else:
                print(X_ID[i], ' --> No genera Valor ')
                resultado = str(X_ID[i]) + ' --> No genera Valor '
            resultado_final = resultado_final + resultado + '\n'

        return resultado_final
#%% Mass mobilization data
# NOTE(review): hard-coded absolute path -- this only runs on the
# original author's machine.
mm = pd.read_csv(
    "/Users/danielgustafson/Documents/Grad/Fall 2018/Machine Learning/Final Project/full_mm.csv"
)

#%% Separate into X and y
# First three columns are identifiers, not features.
ids = mm.iloc[:, 0:3]

# Features start at column 4; column 3 is skipped -- presumably the
# target column; TODO confirm against the CSV layout.
X = mm.iloc[:, 4:]

y = mm.protests.values

#%% Imputing the feature data
# Replace NaNs with the per-column median.
imp = Imputer(missing_values=np.nan, strategy='median')
imp.fit(X)
X_impute = imp.transform(X)

#%% Scale data
# Get column names first
names = list(X)
# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
X_impute_scaled = scaler.fit_transform(X_impute)
# Restore the column labels lost when the scaler returned a plain array.
X_impute_scaled = pd.DataFrame(X_impute_scaled, columns=names)

#%% Split the data
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_impute_scaled,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1523)
# Example 14
#print("feature's name: ",[col for col in test_features.columns
#                          if col not in train_features.columns])

#train_features,test_features = train_features.align(test_features,
#                                                    join='left',
#                                                    axis = 1)
missing_cols_train = [
    col for col in train_features.columns
    if train_features[col].isnull().any()
]
print('missing features:' + str(missing_cols_train))
#print(train_features.LotFrontage)
# 缺失值处理
my_imputer = Imputer(strategy='median')
train_features = my_imputer.fit_transform(train_features)
test_features = my_imputer.transform(test_features)
#print(train_features.LotFrontage)
#print("features num : "+len(train_features.columns))
## 训练数据集分割成训练集和测试集,用于测试

X_train, X_test, y_train, y_test = train_test_split(train_features,
                                                    train_target,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=0)

# 训练XGBOOST
model = XGBRegressor(max_depth=7, learning_rate=0.1, Missing=None)
model.fit(X_train, y_train, verbose=False)

predictions = model.predict(X_test)
# Categories
print("building train")
# NOTE(review): .ix and .as_matrix() are long-deprecated/removed pandas
# APIs; this snippet requires an old pandas version.
# First CAT_COUNT columns are categorical: impute missing values with the
# most frequent value per column.
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
# Non-categorical columns: missing values are filled with 0 instead.
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))


print("building test")
# Reuse the imputer fitted on the training categories for the test set.
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)
# test_extra_matr = build_extra_features(test_noncat_matr[:,:10])
# test_noncat_matr = np.hstack((test_noncat_matr, test_extra_matr))

print("One-hot-encoding")

# One-hot encode the categorical columns; fit on train, reuse on test.
enc = OneHotEncoder(categorical_features=range(CAT_COUNT))
preprocessed_features = np.hstack((train_cat_matr, train_noncat_matr))

enc_train_df = enc.fit_transform(preprocessed_features)

print("test")
enc_test_df = enc.transform(np.hstack((test_cat_matr, test_noncat_matr)))
# Categories
print("building train")
# NOTE(review): duplicate of the block above (only quoting style differs).
# .ix and .as_matrix() are long-deprecated/removed pandas APIs; this
# snippet requires an old pandas version.
# First CAT_COUNT columns are categorical: impute missing values with the
# most frequent value per column.
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
# Non-categorical columns: missing values are filled with 0 instead.
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))


print("building test")
# Reuse the imputer fitted on the training categories for the test set.
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)
# test_extra_matr = build_extra_features(test_noncat_matr[:,:10])
# test_noncat_matr = np.hstack((test_noncat_matr, test_extra_matr))

print("One-hot-encoding")

# One-hot encode the categorical columns; fit on train, reuse on test.
enc = OneHotEncoder(categorical_features=range(CAT_COUNT))
preprocessed_features = np.hstack((train_cat_matr, train_noncat_matr))

enc_train_df = enc.fit_transform(preprocessed_features)

print("test")
enc_test_df = enc.transform(np.hstack((test_cat_matr, test_noncat_matr)))
# Baseline 1: drop every column that has any missing value.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test  = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

# Baseline 2: impute missing values (column mean by default).
my_imputer = Imputer()
# fit first, then transform:
# fit: with only X_train this runs the unsupervised step (e.g. computing
#      the statistics for dimensionality reduction, feature extraction,
#      or standardization)
# transform: depends on the object; for an Imputer() it performs the
#      imputation itself
# it could equally be a StandardScaler(), i.e. standardization (which
# also requires a fit beforehand)
#print(len(X_train.columns))
imputed_X_train = my_imputer.fit_transform(X_train) 
#print(len(imputed_X_train[0,:]))
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

# Variant 3: keep the data and add "_was_missing" indicator columns.
# NOTE(review): despite the "imputed_" names, the *_plus frames are not
# imputed anywhere in this snippet -- presumably that happens later;
# confirm downstream.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

cols_with_missing = (col for col in X_train.columns 
                                 if X_train[col].isnull().any())

# Columns with missing values are not dropped; instead each present value
# maps to False and each missing one to True in a new indicator column.
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
import pandas as pd
import numpy as np  # fix: np is used below (np.nan) but was never imported

"""Reading the dataset
    1. iloc .values removes the column and row labels """
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1]
"""Removing the missing values; strategy can be mean, median, most_frequent"""
from sklearn.preprocessing.imputation import Imputer
# from sklearn.impute import SimpleImputer

SI = Imputer(missing_values=np.nan, strategy='mean')
"""when we fit a model with data it calculates important parameters like mean etc from
   given data, then when we transform another set using that model then it utilizes that
   previous model. """
# Only columns 1-2 contain numeric data with missing entries.
SI = SI.fit(X[:, 1:3])
X[:, 1:3] = SI.transform(X[:, 1:3])
"""we cant use english labels so we change it to 1,2,3 but it can give different weight
    to columns so we change it to n different columns where n is number of types of entries
    in categorical column"""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])

oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)

# splitting into test train
from sklearn.model_selection import train_test_split