Esempio n. 1
def test_imputation_shape():
    """Imputation must keep the input shape for every strategy.

    A random 10x2 matrix with every other row set to NaN is imputed, first
    as a dense array and then as a CSR sparse matrix; both results must
    still be 10x2.
    """
    data = np.random.randn(10, 2)
    data[::2] = np.nan
    expected_shape = (10, 2)

    # Identity first, CSR conversion second: the dense input is always
    # checked before the sparse matrix is built and checked.
    converters = (lambda arr: arr, sparse.csr_matrix)
    for strategy in ('mean', 'median', 'most_frequent'):
        imputer = Imputer(strategy=strategy)
        for convert in converters:
            imputed = imputer.fit_transform(convert(data))
            assert_equal(imputed.shape, expected_shape)
Esempio n. 2
def test_imputation_shape():
    """Check that every imputation strategy returns a (10, 2) result.

    The input is a random 10x2 array whose even-indexed rows are NaN.  Each
    strategy is exercised on the dense array and then on its CSR form.
    """
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    def imputed_shape(imputer, data):
        # Shape of the matrix produced by fitting and transforming ``data``.
        return imputer.fit_transform(data).shape

    strategies = ["mean", "median", "most_frequent"]
    for name in strategies:
        imputer = Imputer(strategy=name)
        assert_equal(imputed_shape(imputer, X), (10, 2))
        assert_equal(imputed_shape(imputer, sparse.csr_matrix(X)), (10, 2))
Esempio n. 3
def _impute(features, imputer=True):
    """
    Helper function that uses the safest imputing method to remove null values, in terms of compatibility with the data size

    With C{imputer} disabled the values are only passed through
    C{np.nan_to_num}. Otherwise a mean-strategy scikit C{Imputer} is fitted;
    its result is kept only when it has the same shape as the input, and
    C{np.nan_to_num} is used as the fallback when the shape differs. If the
    imputer raises C{ValueError} a warning is logged and the input values
    are returned unchanged.

    @param features: the feature values that need to be imputed
    @type features: numpy.array
    @param imputer: whether or not the scikit imputing method should be used
    @type imputer: boolean
    @return: the modified feature values
    @rtype: numpy.array
    """
    if not imputer:  # run imputer only if enabled (default)
        return np.nan_to_num(features)

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=2)
    try:
        impfeatures = imp.fit_transform(features)
    except ValueError as exc:
        # catch errors with illegal values (e.g. strings)
        log.warning("Exception trying to run scikit imputation: {}".format(exc))
        impfeatures = features
    # show size for debugging purposes
    # log.debug("Featurevectors {} after imputation: {}".format(impfeatures.shape, features))

    # we don't want the shape to change, so if this happens, then just
    # replace NaNs with zero and infinities with finite numbers
    if impfeatures.shape == features.shape:
        features = impfeatures
    else:
        log.warning("Imputer failed, filtering NaN based on numpy converter")
        features = np.nan_to_num(features)
    return features
    def setUp(self):
        # Test fixture: decodes datasets/dataset.arff, builds the raw feature
        # matrix and a one-hot encoded / imputed / scaled copy of it, and
        # stores precomputed helper values on the shared helper registry.
        # NOTE(review): this method looks damaged (see the notes further
        # down) and does not parse as shown.
        self.cwd = os.getcwd()
        # NOTE(review): tests_dir is assigned but not used in the visible code.
        tests_dir = __file__

        decoder = arff.ArffDecoder()
        # The path is relative, so it is resolved against the current
        # working directory.
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        # An attribute whose declared type is a list is treated as nominal,
        # everything else as numeric.
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]]
        # Boolean mask: True for nominal (categorical) attributes.
        self.categorical = [True if attribute == 'nominal' else False
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        # All columns but the last are features; the last column is the
        # target, flattened to one dimension.
        X = data[:,:-1]
        y = data[:,-1].reshape((-1,))

        # Preprocessing chain: one-hot encode -> impute -> standard-scale.
        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        # Only ask the scaler to centre the data when the matrix is not a
        # scipy sparse matrix.
        center = not scipy.sparse.isspmatrix((X_transformed))
        standard_scaler = StandardScaler(with_mean=center)
        X_transformed = standard_scaler.fit_transform(X_transformed)
        # NOTE(review): todense() is called unconditionally; this presumes
        # the matrix is still sparse at this point -- confirm.
        X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures
        # NOTE(review): assumes the encoded categorical columns come first
        # and the numerical columns last -- confirm against the encoder.
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X
        self.X_transformed = X_transformed
        # NOTE(review): this chained assignment binds both self.y and the
        # local y to meta_features.metafeatures, discarding the target vector
        # computed above. It looks like two statements fused together --
        # confirm against the original file.
        self.y = y = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"]
            (self.X_transformed, self.y))
        self.helpers.set_value("MissingValues", self.helpers[
            "MissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
            self.X, self.y, self.categorical))
        # NOTE(review): the three lines below are orphaned continuation
        # lines: the opening of each call is missing and the last two are
        # unterminated.
                               self.helpers["ClassOccurences"](self.X, self.y))
            self.helpers["Skewnesses"](self.X_transformed, self.y,
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
Esempio n. 5
def check_indicator(X, expected_imputed_features, axis):
    """Check the indicator columns added by ``add_indicator_features``.

    Fits a mean-strategy ``Imputer`` (missing-value marker ``-1``) and a
    clone with ``add_indicator_features=True`` on ``X``, then asserts that:

    * the plain imputer reports ``expected_imputed_features``;
    * the indicator variant's output has one extra column per imputed
      feature;
    * that output equals the plain output with the ``X == -1`` mask of the
      expected features stacked on the right.

    Parameters
    ----------
    X : array with a two-element ``shape``
        Input data in which ``-1`` marks a missing value.
    expected_imputed_features : sequence of column indices
        Features expected to be reported in ``imputed_features_``.
    axis : passed through unchanged to ``Imputer(axis=...)``.
    """
    n_samples, n_features = X.shape
    imputer = Imputer(missing_values=-1, strategy='mean', axis=axis)
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    Xt = imputer.fit_transform(X)
    Xt_with_in = imputer_with_in.fit_transform(X)
    imputed_features_mask = X[:, expected_imputed_features] == -1
    n_features_new = Xt.shape[1]
    n_imputed_features = len(imputer_with_in.imputed_features_)
    assert_array_equal(imputer.imputed_features_, expected_imputed_features)
    # Restored call opener: the expected-shape tuple was left dangling
    # without the assertion it belongs to.
    assert_equal(Xt_with_in.shape,
                 (n_samples, n_features_new + n_imputed_features))
    assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
    # NOTE(review): a fresh clone is created here but nothing visible uses
    # it; follow-up checks were presumably lost -- confirm against upstream.
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
Esempio n. 6
def check_indicator(X, expected_imputed_features, axis):
    """Check the indicator columns added by ``add_indicator_features``.

    Fits a mean-strategy ``Imputer`` (missing-value marker ``-1``) and a
    clone with ``add_indicator_features=True`` on ``X``, then asserts that:

    * the plain imputer reports ``expected_imputed_features``;
    * the indicator variant's output has one extra column per imputed
      feature;
    * that output equals the plain output with the ``X == -1`` mask of the
      expected features stacked on the right.

    Parameters
    ----------
    X : array with a two-element ``shape``
        Input data in which ``-1`` marks a missing value.
    expected_imputed_features : sequence of column indices
        Features expected to be reported in ``imputed_features_``.
    axis : passed through unchanged to ``Imputer(axis=...)``.
    """
    n_samples, n_features = X.shape
    imputer = Imputer(missing_values=-1, strategy='mean', axis=axis)
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    Xt = imputer.fit_transform(X)
    Xt_with_in = imputer_with_in.fit_transform(X)
    imputed_features_mask = X[:, expected_imputed_features] == -1
    n_features_new = Xt.shape[1]
    n_imputed_features = len(imputer_with_in.imputed_features_)
    assert_array_equal(imputer.imputed_features_, expected_imputed_features)
    # Restored call opener: the expected-shape tuple was left dangling
    # without the assertion it belongs to.
    assert_equal(Xt_with_in.shape,
                 (n_samples, n_features_new + n_imputed_features))
    assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
    # NOTE(review): a fresh clone is created here but nothing visible uses
    # it; follow-up checks were presumably lost -- confirm against upstream.
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    X, Y, test_size=validation_size, random_state=seed)
X_train = pd.DataFrame(data=X_train, columns=columns)
X_validation = pd.DataFrame(data=X_validation, columns=columns)

# Handle missing values (NaN / null).
# For every column that contains missing data, add a boolean companion column
# named after it with the suffix '_missing_data': True where the value was
# missing, False where it was present.
missing_columns = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
for col in missing_columns:
    X_train[col + '_missing_data'] = X_train[col].isnull()
# Keep a reference to the frame so the column labels can be restored after
# imputation (fit_transform output is wrapped in a fresh, unlabeled DataFrame).
original_data = X_train
# Fill missing values with the Imputer defaults.
imputer = Imputer()
X_train = pd.DataFrame(data=imputer.fit_transform(X_train))
X_train.columns = original_data.columns
# Collapse the per-column indicators into a single 'missing_values' column
# (sum of the indicators), then drop the per-column indicator columns.
X_train['missing_values'] = numpy.zeros((len(X_train), 1))
for col in missing_columns:
    X_train['missing_values'] += X_train[col + '_missing_data']
    X_train = X_train.drop([col + '_missing_data'], axis=1)
X_train['Age'] = X_train['Age'].values.round()
X_train = X_train.values

# Validation dataset: add the same kind of indicator columns.
missing_columns = [
    col for col in X_validation.columns if X_validation[col].isnull().any()
]
for col in missing_columns:
    X_validation[col + '_missing_data'] = X_validation[col].isnull()
    def setUp(self):
        # Test fixture (sparse variant): decodes datasets/dataset.arff, builds
        # a CSR copy of the features and a one-hot encoded / imputed / scaled
        # version of it, and stores precomputed helper values on the shared
        # helper registry.
        # NOTE(review): this method looks damaged (see the notes further
        # down) and does not parse as shown.
        self.cwd = os.getcwd()
        # NOTE(review): tests_dir is assigned but not used in the visible code.
        tests_dir = __file__

        decoder = arff.ArffDecoder()
        # The path is relative, so it is resolved against the current
        # working directory.
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        # An attribute whose declared type is a list is treated as nominal,
        # everything else as numeric.
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]]
        # Boolean mask: True for nominal (categorical) attributes.
        self.categorical = [True if attribute == 'nominal' else False
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        # All columns but the last are features; the last column is the
        # target, flattened to one dimension.
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        # First, swap NaNs and zeros, because when converting an encoded
        # dense matrix to sparse, the values which are encoded to zero are lost
        # NOTE(review): the visible code only sets non-finite entries to 0;
        # no zeros are turned into NaN here.
        X_sparse = X.copy()
        NaNs = ~np.isfinite(X_sparse)
        X_sparse[NaNs] = 0
        X_sparse = sparse.csr_matrix(X_sparse)

        # Preprocessing chain on a copy of the sparse matrix:
        # one-hot encode -> impute -> standard-scale.
        ohe = OneHotEncoder(self.categorical)
        X_transformed = X_sparse.copy()
        X_transformed = ohe.fit_transform(X_transformed)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)

        # Transform the array which indicates the categorical metafeatures
        # NOTE(review): assumes the encoded categorical columns come first
        # and the numerical columns last -- confirm against the encoder.
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X_sparse
        self.X_transformed = X_transformed
        # NOTE(review): this chained assignment binds both self.y and the
        # local y to meta_features.metafeatures, discarding the target vector
        # computed above. It looks like two statements fused together --
        # confirm against the original file.
        self.y = y = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"]
            (self.X_transformed, self.y))
        # NOTE(review): from here on the text is garbled: a string literal is
        # glued onto the end of the "MissingValues" call, the following line
        # has lost its call opener, and the last three lines are orphaned or
        # unterminated continuation lines.
        self.helpers.set_value("MissingValues", self.helpers[
            "MissingValues"](self.X, self.y, self.categorical))"NumberOfMissingValues",
  ["NumberOfMissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
            self.X, self.y, self.categorical))
            self.helpers["ClassOccurences"](self.X, self.y))
            self.helpers["Skewnesses"](self.X_transformed, self.y,
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
Esempio n. 9
# Script fragment: loads the train and test CSVs, stacks them, drops 'Cabin',
# derives an 'ImputedAge' column and prints the correlation with 'Survived'.
# NOTE(review): several lines appear to be missing from this fragment:
#   * `path` and `my_age_imputer` are used but not defined in the visible code;
#   * the `droplist` literal is never closed;
#   * the `inplace=True)` line and the final `range=[...]` line are orphaned
#     continuation lines whose call openers are not visible.
test_path = '../input/test.csv'
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(path)
# Stack train and test rows into a single frame.
total_data = train_data.append(test_data)
# Explore the data.
print((total_data.isnull().sum()))  # finding columns that have null values
# Get rid of Cabin since most of its values are missing (687).
data = total_data.drop('Cabin',
                       axis=1)  # drop Cabin because it is mostly blank
# Replace missing values in age; the imputation strategy is whatever
# `my_age_imputer` was configured with (not visible here).
# Every column named in droplist is removed before imputing.
droplist = [
    'PassengerId', 'Name', 'Sex', 'Ticket', 'Embarked', 'Survived', 'Pclass',
    'Parch', 'Fare', 'SibSp'
data1 = data.drop(droplist, axis=1)
imputed_age = my_age_imputer.fit_transform(data1)
# The imputer outputs a 2-D array, so take its first column and wrap it in a
# DataFrame before attaching it to `data`.
age_corrected = pd.DataFrame({'ImputedAge': imputed_age[:, 0]})
data['ImputedAge'] = age_corrected
                     inplace=True)  # filling na with mode of location Embarked
# Encode the embarkation port letters as integers: S -> 0, Q -> 1, C -> 2.
data.Embarked = data.Embarked.replace(['S', 'Q', 'C'], [0, 1, 2])
corr = data.corr()
print(corr.Survived)  # checking data correlation

# In[ ]:

from matplotlib import pyplot as plt
# Plot histograms of Age and ImputedAge to see if the distributions are
# similar - the histogram shows the imputed data in 25-30 is somewhat higher.
         range=[0, data.ImputedAge.max()],
Esempio n. 10
# Create the data and the labels corresponding to the data.
# Features come from the listed column positions of `dataset`; the label is
# column 15.
X = dataset.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14]].values
y = dataset.iloc[:, 15].values

# Preprocess the data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing.imputation import Imputer

# Column positions (within X) that are label-encoded below.
labelencode_array = [0, 2, 5, 6, 8, 9]

# Each listed column is cast to str and replaced by its integer label codes;
# a fresh encoder is fitted per column.
for i in labelencode_array:
    labelencoder = LabelEncoder()
    X[:, i] = labelencoder.fit_transform(X[:, i].astype(str))

# Replace NaN entries using the 'mean' strategy.
imp = Imputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(X)

# one-hot encode

# NOTE(review): these indices address columns of the matrix as it changes
# from pass to pass (each pass encodes one column and then drops the first
# output column), so they look hand-computed for one specific dataset --
# verify before reuse.
onehotencode_array = [0, 48, 60, 64, 69, 83]

for i in onehotencode_array:
    onehotencoder_make = OneHotEncoder(categorical_features=[i])
    X = onehotencoder_make.fit_transform(X).toarray()
    # Drop the first column of the encoded output.
    X = X[:, 1:]

# Split the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

# NOTE(review): the call below is truncated; its remaining arguments and the
# closing parenthesis are not visible.
X_train, X_test, y_train, y_test = train_test_split(X,
Esempio n. 11
#print("test_features's num: ",test_features.columns.size)
#print("feature's name: ",[col for col in test_features.columns
#                          if col not in train_features.columns])

#train_features,test_features = train_features.align(test_features,
#                                                    join='left',
#                                                    axis = 1)
# Names of the training columns that contain at least one missing value.
# NOTE(review): the closing bracket of this list comprehension is missing.
missing_cols_train = [
    col for col in train_features.columns
    if train_features[col].isnull().any()
print('missing features:' + str(missing_cols_train))
# Missing-value handling: the imputer is fitted on the training features and
# the same fitted imputer is then applied to the test features.
my_imputer = Imputer(strategy='median')
train_features = my_imputer.fit_transform(train_features)
test_features = my_imputer.transform(test_features)
#print("features num : "+len(train_features.columns))
## Split the training dataset into a training set and a test set, for testing.
# NOTE(review): the train_test_split call below is truncated, and the model
# line after it looks like two statements fused together (a constructor call
# followed by the tail of another call) -- neither parses as shown.

X_train, X_test, y_train, y_test = train_test_split(train_features,

model = XGBRegressor(max_depth=7, learning_rate=0.1, Missing=None), y_train, verbose=False)
Esempio n. 12
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

# Approach 1: drop every column that contains at least one missing value.
cols_with_missing = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

from sklearn.preprocessing.imputation import Imputer

# Approach 2: impute. The imputer is fitted on the training data only and the
# fitted imputer is reused for the test data.
my_imputer = Imputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
# Column labels are taken from `numeric_predictors`, which is defined outside
# this fragment.
imputed_X_train.columns = numeric_predictors.columns
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

# Approach 3 (start): work on copies and record where values were missing.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# Generator expression: it can be iterated only once (by the loop below).
cols_with_missing = (col for col in X_train.columns
                     if X_train[col].isnull().any())
for col in cols_with_missing:
    # Boolean flag columns: True where the original value was missing.
    imputed_X_train_plus[col +
                         '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col +
                        '_was_missing'] = imputed_X_test_plus[col].isnull()
        if count % 1000 == 0:
        val = noncat_matrix[x, y]
        if val - math.floor(val) != 0.0:
            for i in range(20):
                if abs(abs(val) * i - math.ceil(abs(val) * i)) < 0.001:
                    X[x, 2 * y] = math.ceil(abs(val) * i)
                    X[x, 2 * y + 1] = i
    return X

# Categories
# The first CAT_COUNT columns are treated as categorical and the remaining
# columns as non-categorical.
# NOTE(review): `.ix` and `.as_matrix()` were removed in pandas 1.0; this
# fragment needs an older pandas -- confirm the target version.
print("building train")
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
# Categorical columns: fill missing entries with the per-column most frequent
# value; the imputer is fitted on the training data.
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
# Non-categorical columns: missing entries are simply replaced with 0.
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))

print("building test")
# `test_set_to_encode` is defined outside this fragment.
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
# Reuse the imputer fitted on the training data (transform only).
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)
# test_extra_matr = build_extra_features(test_noncat_matr[:,:10])
Esempio n. 14
# Script fragment: compares MAE with and without categorical predictors, then
# one-hot encodes train and test, imputes, and writes a submission file.
# NOTE(review): several statements are truncated or fused and do not parse as
# shown: the select_dtypes call is never closed, the first two print calls
# lack their second operand and closing parenthesis, and the forest_model line
# looks like a constructor call fused with the tail of another call.
# `one_hot_encoded_training_predictors` is also used before the visible
# assignment further down.
predictors_without_categoricals = train_predictors.select_dtypes(

mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

print('Mean Absolute Error when Dropping Categoricals: ' +
print('Mean Abslute Error with One-Hot Encoding: ' +

one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
# Inner join on the column axis keeps only the columns present in both
# encoded frames.
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors, join='inner', axis=1)

print('Mean Absolute Error for Final train: ' +
      str(int(get_mae(final_train, target))))
from sklearn.preprocessing.imputation import Imputer

# NOTE(review): the imputer is fitted on the test frame itself here.
# The column labels are taken from the un-aligned encoded test frame, not
# from final_test -- confirm the two have the same columns.
my_imputer = Imputer()
imputed_final_test = pd.DataFrame(my_imputer.fit_transform(final_test))
imputed_final_test.columns = one_hot_encoded_test_predictors.columns
forest_model = RandomForestRegressor(50), target)
predicted_prices = forest_model.predict(imputed_final_test)
submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predicted_prices})
submission.to_csv('submission2.csv', index=False)
# Drop columns with missing values: every column with at least one null in
# X_train is removed from both the train and the test frame.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test  = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

# Imputer
my_imputer = Imputer()
# Fit first, then transform.
# fit: with only X_train given, runs an unsupervised learning step such as
#      dimensionality reduction, feature extraction or standardisation.
# transform: depends on the object; here it is an Imputer(), so it performs
#      the imputation.
# It could equally be a StandardScaler() object, which standardises (and also
# needs to be fitted first).
imputed_X_train = my_imputer.fit_transform(X_train) 
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

# The data to be imputed (copies, so the original frames stay untouched).
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# Generator expression: it can be iterated only once (by the loop below).
cols_with_missing = (col for col in X_train.columns 
                                 if X_train[col].isnull().any())

# Entries with missing values are not dropped; instead a flag column records
# False where data is present and True where it is missing.
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
Esempio n. 16
def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None):
    """Calculate all metafeatures.

    Builds a preprocessed copy of ``X`` (one-hot encoded using
    ``categorical``, mean-imputed, standard-scaled and densified if it is
    sparse), then pops metafeature names from a work queue, evaluates each
    one and stores the value on the ``metafeatures`` registry. The result is
    returned wrapped in ``DatasetMetafeatures(dataset_name, ...)``.

    NOTE(review): this function looks damaged and does not parse as shown.
    The ``check_arrays(...)`` call, the ``is_calculated(`` condition and the
    ``mf_.sort(key=lambda t:`` call are unterminated; the two
    ``calculate`` / ``dont_calculate`` tests and one ``elif`` branch have no
    body; the ``X_`` / ``y_`` selection has no visible ``else``; nothing
    visible fills ``to_visit``, appends to ``mf_`` or uses ``rs``.
    Confirm against the original file before relying on the details below.

    :param X: feature matrix handed to the encoder and to the metafeature
        functions.
    :param y: target values; indexed with an integer index array below.
    :param categorical: passed to ``OneHotEncoder`` and to helper functions.
    :param dataset_name: forwarded to ``DatasetMetafeatures``.
    :param calculate: optional collection of names; tested with
        ``name not in calculate`` (presumably to skip other names -- the
        branch body is missing).
    :param dont_calculate: optional collection of names; tested with
        ``name in dont_calculate`` (presumably to skip them -- the branch
        body is missing).
    :return: ``DatasetMetafeatures(dataset_name, mf_)``.
    """
    mf_ = list()

    visited = set()
    to_visit = deque()

    # TODO make sure this is done as efficient as possible (no copy for
    # sparse matrices because of wrong sparse format)
    # Preprocessing chain: one-hot encode -> mean-impute -> standard-scale.
    ohe = OneHotEncoder(categorical, sparse=True)
    X_transformed = ohe.fit_transform(X)
    imputer = Imputer(strategy='mean')
    X_transformed = imputer.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # TODO add possibility to not transform here
    if scipy.sparse.issparse(X_transformed):
        X_transformed = X_transformed.todense()

    # This is not only important for datasets which are somehow
    # sorted in a strange way, but also prevents lda from failing in
    # some cases.
    # Because this is advanced indexing, a copy of the data is returned!!!
    # NOTE(review): the comment above describes a shuffle, but no shuffle of
    # `indices` is visible; as shown the index array is just arange(n).
    X_transformed = check_arrays(X_transformed, sparse_format='dense',
    rs = np.random.RandomState(42)
    indices = np.arange(X_transformed.shape[0])
    X_transformed = X_transformed[indices]
    y_transformed = y[indices]

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
        if dont_calculate is not None and name in dont_calculate:

        # Names listed in npy_metafeatures are meant to receive the
        # preprocessed data; presumably an `else:` before the raw X / y
        # assignments was lost -- confirm.
        if name in npy_metafeatures:
            X_ = X_transformed
            y_ = y_transformed
            X_ = X
            y_ = y

        # A dependency must be exactly one of: a metafeature or a helper
        # function. Both or neither is reported as an error.
        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
            elif is_helper_function and not helper_functions.is_calculated(
                value = helper_functions[dependency](X_, y_, categorical)
                helper_functions.set_value(dependency, value)

        value = metafeatures[name](X_, y_)
        metafeatures.set_value(name, value)

    mf_.sort(key=lambda t:
    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
Esempio n. 17
    def setUp(self):
        # Test fixture (sparse variant): decodes datasets/dataset.arff, builds
        # a CSR copy of the features and a one-hot encoded / imputed / scaled
        # version of it, and stores precomputed helper values on the shared
        # helper registry.
        # NOTE(review): this method looks damaged (see the notes further
        # down) and does not parse as shown.
        self.cwd = os.getcwd()
        # NOTE(review): tests_dir is assigned but not used in the visible code.
        tests_dir = __file__

        decoder = arff.ArffDecoder()
        # The path is relative, so it is resolved against the current
        # working directory.
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        # An attribute whose declared type is a list is treated as nominal,
        # everything else as numeric.
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]]
        # Boolean mask: True for nominal (categorical) attributes.
        self.categorical = [True if attribute == 'nominal' else False
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        # All columns but the last are features; the last column is the
        # target, flattened to one dimension.
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        # First, swap NaNs and zeros, because when converting an encoded
        # dense matrix to sparse, the values which are encoded to zero are lost
        # NOTE(review): the visible code only sets non-finite entries to 0;
        # no zeros are turned into NaN here.
        X_sparse = X.copy()
        NaNs = ~np.isfinite(X_sparse)
        X_sparse[NaNs] = 0
        X_sparse = sparse.csr_matrix(X_sparse)

        # Preprocessing chain on a copy of the sparse matrix:
        # one-hot encode -> impute -> standard-scale.
        ohe = OneHotEncoder(self.categorical)
        X_transformed = X_sparse.copy()
        X_transformed = ohe.fit_transform(X_transformed)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)

        # Transform the array which indicates the categorical metafeatures
        # NOTE(review): assumes the encoded categorical columns come first
        # and the numerical columns last -- confirm against the encoder.
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X_sparse
        self.X_transformed = X_transformed
        # NOTE(review): this chained assignment binds both self.y and the
        # local y to meta_features.metafeatures, discarding the target vector
        # computed above. It looks like two statements fused together --
        # confirm against the original file.
        self.y = y = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"]
            (self.X_transformed, self.y))
        # NOTE(review): from here on the text is garbled: a string literal is
        # glued onto the end of the "MissingValues" call, the following line
        # has lost its call opener, and the last three lines are orphaned or
        # unterminated continuation lines.
        self.helpers.set_value("MissingValues", self.helpers[
            "MissingValues"](self.X, self.y, self.categorical))"NumberOfMissingValues",
  ["NumberOfMissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
            self.X, self.y, self.categorical))
            self.helpers["ClassOccurences"](self.X, self.y))
            self.helpers["Skewnesses"](self.X_transformed, self.y,
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
        if count % 1000 == 0:
        val = noncat_matrix[x, y]
        if val - math.floor(val) != 0.0:
            for i in range(20):
                if abs(abs(val) * i - math.ceil(abs(val) * i)) < 0.001:
                    X[x, 2 * y] = math.ceil(abs(val) * i)
                    X[x, 2 * y + 1] = i
    return X

# Categories
# The first CAT_COUNT columns are treated as categorical and the remaining
# columns as non-categorical.
# NOTE(review): `.ix` and `.as_matrix()` were removed in pandas 1.0; this
# fragment needs an older pandas -- confirm the target version.
print("building train")
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
# Categorical columns: fill missing entries with the per-column most frequent
# value; the imputer is fitted on the training data.
imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
# Non-categorical columns: missing entries are simply replaced with 0.
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))

print("building test")
# `test_set_to_encode` is defined outside this fragment.
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
# Reuse the imputer fitted on the training data (transform only).
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)
# test_extra_matr = build_extra_features(test_noncat_matr[:,:10])