Example #1
def imputation(df, strategy, cols_to_leave_out=None):
    """
    Imputes missing values in each column using that column's median, mean, etc., as selected by the strategy argument

    Args:
        df: (dataframe), pandas dataframe containing data
        strategy: (str), method of imputation, e.g. median, mean, etc.
        cols_to_leave_out: (list), list of column indices to not include in imputation

    Returns:
        df: (dataframe): dataframe with NaN or missing values resolved via imputation

    """
    col_names = df.columns.tolist()
    if cols_to_leave_out is None:
        df_imputed = pd.DataFrame(
            Imputer(missing_values='NaN', strategy=strategy,
                    axis=0).fit_transform(df),
            columns=col_names)
    else:
        df_include = df.drop(cols_to_leave_out, axis=1)
        df_hold_out = df.drop(
            [c for c in df.columns if c not in cols_to_leave_out], axis=1)
        df_imputed = pd.DataFrame(Imputer(missing_values='NaN',
                                          strategy=strategy,
                                          axis=0).fit_transform(df_include),
                                  columns=df_include.columns)
    # Need to join the imputed dataframe with the columns containing strings that were held out
    if cols_to_leave_out is None:
        df = df_imputed
    else:
        df = pd.concat([df_hold_out, df_imputed], axis=1)
        col_names = df.columns.tolist()
    return df
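For reference, a minimal modern equivalent: sklearn.preprocessing.Imputer (used above) was removed in scikit-learn 0.22, and its replacement sklearn.impute.SimpleImputer always imputes column-wise, so no axis argument is needed. The dataframe below is a made-up illustration, not part of the original example.

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
toy_imputed = pd.DataFrame(SimpleImputer(strategy='median').fit_transform(toy),
                           columns=toy.columns)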
Example #2
    def preprocess_data(self):

        # Step 1 - One Hot Encode
        self.get_categorical_columns()
        print('Step 2 - Categorical Column Identification Complete ...')

        self.x_train = pd.get_dummies(self.x_train, columns=self.categorical_columns, prefix='one_hot_encoded_')
        self.get_training_columns(self.x_train)
        # Hotfix for XGBoost
        for column in self.traincols:
            if ("<" in column):
                self.x_train.rename(index=str, columns={column: column.replace("<", "")}, inplace=True)
        self.get_training_columns(self.x_train)
        encoded_columns = [i for i in self.traincols if "one_hot_encoded_" in i][:-1]
        not_encoded_columns = [i for i in self.traincols if "one_hot_encoded_" not in i]
        self.x_train = self.x_train[self.union(encoded_columns, not_encoded_columns)]
        self.get_training_columns(self.x_train)
        print('Step 3 - One Hot Encoding Complete ...')

        # Step 2 - Null Value Impute
        imputer = Imputer(strategy='mean', copy=False)
        self.x_train = pd.DataFrame(data=imputer.fit_transform(self.x_train), columns=self.traincols)
        print('Step 4 - Null Value Imputation Complete ...')

        # Step 3 - Feature Scaling
        sc_X = scaler(copy=False)
        self.x_train[not_encoded_columns] = sc_X.fit_transform(self.x_train[not_encoded_columns])
        print('Step 5 - Standardisation Complete ...')

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_train, self.y_train, test_size=0.2, random_state=1)
        print('Step 6 - Train Test Splitting Complete ...')

        print('Shape:' + str(self.x_train.shape))

        return self.df, self.x_train, self.y_train, self.x_test, self.y_test, self.traincols, self.categorical_columns
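The "Hotfix for XGBoost" step above works around XGBoost's rule that feature names may not contain '[', ']' or '<'. A small stand-alone sketch of the same idea, extended to all three characters (the dataframe is hypothetical):

import pandas as pd

X = pd.DataFrame({'age': [25, 40], 'income<50k': [1, 0]})
X.columns = [c.replace('<', '').replace('[', '').replace(']', '') for c in X.columns]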
Example #3
def return_features_labels():
    global data
    global data_copy
    # The move columns take the values 'x', 'o', or 'b', so we label encode them.
    mapping_for_moves = {'x': 1, "o": 0}  # For b, we put mean of the data.
    # Positive is win, negative is lose
    mapping_for_wins = {"positive": 1, "negative": 0}
    data.is_win = data.is_win.map(mapping_for_wins)
    data_copy.is_win = data_copy.is_win.map(mapping_for_wins)

    data = data.drop(columns=["is_win"], axis=1)

    for i in data.columns:  # Applying map to all the columns except is_win.
        data[i] = data[i].map(mapping_for_moves)

    # Extracting features and labels
    features = data.values
    labels = data_copy.is_win.values

    # 'b' is not in the mapping, so those cells are now NaN; fill them with the column mean
    features = (Imputer().fit_transform(features))

    features = features.astype(int)
    labels = labels.astype(int)

    return features, labels
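Why the mean fill catches the 'b' cells: Series.map leaves values missing from the mapping dict as NaN, which the imputer then replaces. A tiny illustration with made-up data (using SimpleImputer, the current name of the class):

import pandas as pd
from sklearn.impute import SimpleImputer

cells = pd.Series(['x', 'o', 'b']).map({'x': 1, 'o': 0})       # 'b' -> NaN
filled = SimpleImputer(strategy='mean').fit_transform(cells.to_frame())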
Example #4
def impute_and_scale_array(mat, scaling=None):
    """ Impute missing values with mean and scale data included in numpy array.

        Parameters
        ----------
        mat : numpy array
            Array to scale
        scaling : string
            String describing type of scaling to apply.
            Options recognized: 'maxabs', 'minmax', 'std'.
            'maxabs' : scales data to range [-1 to 1].
            'minmax' : scales data to range [0 to 1].
            'std'    : scales data to normal variable with mean 0 and standard deviation 1.
            (Default: None, no scaling).

        Return
        ----------
        Returns the numpy array imputed with the mean value of the \
        column and scaled by the method specified. If no scaling method is specified, \
        it returns the imputed numpy array.
    """

    #    imputer = Imputer(strategy='mean', axis=0, copy=False)
    #    imputer = SimpleImputer(strategy='mean', copy=False)
    # Next line is from conditional import. axis=0 is default
    # in old version so it is not necessary.
    imputer = Imputer(strategy='mean', copy=False)
    imputer.fit_transform(mat)

    return scale_array(mat, scaling)
Example #5
def get_some_data():
    data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
Example #6
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
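A quick usage sketch for the dataframe version above, with a toy frame (assumes the pandas, Imputer, and scaler imports that the snippet relies on):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [0.5, 1.5, np.nan]})
scaled = impute_and_scale(toy, scaling='minmax')   # minmax scales each column to [0, 1]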
Example #7
 def test_categories_to_integers_grid_search(self):
     data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data",
                         "adult_set.txt")
     df = pandas.read_csv(data, sep="\t")
     X = df.drop('income', axis=1)
     y = df['income']
     pipe = make_pipeline(CategoriesToIntegers(), LogisticRegression())
     self.assertRaise(lambda: test_sklearn_grid_search_cv(lambda: pipe, df),
                      ValueError)
     self.assertRaise(
         lambda: test_sklearn_grid_search_cv(
             lambda: pipe, X, y, categoriestointegers__single=[True, False]
         ), ValueError, "Unable to find category value")
     pipe = make_pipeline(CategoriesToIntegers(),
                          Imputer(strategy='most_frequent'),
                          LogisticRegression())
     res = test_sklearn_grid_search_cv(
         lambda: pipe,
         X,
         y,
         categoriestointegers__single=[True, False],
         categoriestointegers__skip_errors=[True])
     self.assertIn('model', res)
     self.assertIn('score', res)
     self.assertGreater(res['score'], 0)
     self.assertLesser(res['score'], 1)
Example #8
    def __init__(self,
                 params,
                 max_iter=6,
                 ini_fill=True,
                 ini_strategy_reg='mean',
                 tol=1e-3,
                 model_reg="xgboost",
                 model_clf="xgboost"):
        '''
        - max_iter: number of iterations
        - ini_fill: whether to do a simple initial fill (False only takes effect for xgboost and lightgbm)
        - ini_strategy_reg: simple fill strategy for continuous variables, mean or median
        - ini_strategy_clf: simple fill strategy for categorical variables, only most_frequent
        - cat_index: indices of the categorical variables (int)
        - tol: tolerance threshold
        - model_reg: model used to predict missing values of continuous variables; one of xgboost, lightgbm, randomforest, knn
        - model_clf: model used to predict missing values of categorical variables
        '''
        self.params = params  # model parameters
        self.best_params = params  # stores the best model parameters so far
        self.ini_fill = ini_fill  # whether to do a simple initial fill of the missing values
        self.max_iter = max_iter  # maximum number of iterations; missing values are updated on each iteration
        self.imputer_reg = Imputer(strategy=ini_strategy_reg)  # TODO: pre-fill with the mean
        self.tol = tol  # error threshold; training stops once the change falls below it
        self.model_reg = model_reg  # regression model
Example #9
def impute_array(X_fit, *X_s, missing_values=np.nan, strategy="mean"):
    """
    :param X_fit: {array-like, sparse matrix} used to fit the imputer. This array is also imputed.
    :param X_s: the additional (optional) arrays that are imputed using the same imputer.
    :param missing_values: the value that will be substituted during the imputation.
    :param strategy: 'mean' (default) -> missing values are imputed with the mean value of the corresponding vector.
                     'median' -> missing values are imputed with the median value of the corresponding vector.
                     'mode' -> missing values are imputed with the mode of the corresponding vector.
                     ('constant', value) -> missing values are imputed with the constant value provided as the second term of the tuple.
                     None -> no-op (for internal use).
    :return: a list of imputed arrays, returned in the same order as they were provided.
    """
    if strategy is None:
        return [X_fit, *X_s]
    strategy, fill_value = strategy if isinstance(
        strategy, tuple) and strategy[0] == 'constant' else (strategy, None)
    strategy = dict(mode='most_frequent').get(strategy, strategy)

    imputer = Imputer(missing_values=missing_values,
                      strategy=strategy,
                      fill_value=fill_value)
    imputed = _restore_dtypes(imputer.fit_transform(X_fit), X_fit)
    if len(X_s) > 0:
        result = [imputed]
        for X in X_s:
            result.append(_restore_dtypes(imputer.transform(X), X))
        return result
    else:
        return imputed
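A usage sketch for the strategies described in the docstring. It assumes Imputer here resolves to scikit-learn's SimpleImputer (the legacy class has no 'constant' strategy) and that _restore_dtypes is the module's own helper; the arrays are made up:

import numpy as np

X = np.array([[1.0, np.nan], [3.0, 4.0]])
X_extra = np.array([[np.nan, 2.0]])

# constant fill with 0, applied to both arrays with the same fitted imputer
X_imp, X_extra_imp = impute_array(X, X_extra, strategy=('constant', 0))

# 'mode' is translated to scikit-learn's 'most_frequent'
X_mode = impute_array(X, strategy='mode')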
Example #10
def data_preprocess(data):
    # your code here
    # example:
    label = LabelEncoder()
    label_count = 0

    for col in data:
        if data[col].dtype == 'object':
            if len(list(data[col].unique())) <= 2:
                # Train on data
                label.fit(data[col])
                # Transform data
                data[col] = label.transform(data[col])
                label_count += 1
    x = pd.get_dummies(data)

    scaler = Normalizer()
    imputer = Imputer(strategy = 'median')
    imputer.fit(x)
    x = imputer.transform(x)
    scaler.fit(x)
    x = scaler.transform(x)

    # your code end
    return x
Example #11
def preprocessdata(dataset):
    categorical = pd.get_dummies(data=dataset,columns =['Embarked','Sex'])

    names= pd.DataFrame({"Names":dataset['Name']})


    column_titles = ['Mr.','Mrs.','Miss.','Master.', 'Rev.', 'Dr.','Col.','Mme.','Major.','Ms.','Lady.','Sir.', 'Mlle.','Capt.']

    names = names.reindex(columns=['Names'] + column_titles, fill_value=0)  # keep 'Names' for the title matching below

    titles = ['Mr\.','Mrs\.','Miss\.','Master\.', 'Rev\.', 'Dr\.','Col\.','Mme\.','Major\.','Ms\.','Lady\.','Sir\.', 'Mlle\.','Capt\.']

    for ColName, title in zip(column_titles, titles):
        names[ColName] = names['Names'].str.contains(title)

    names = names.drop(columns=['Names'])

    dataset = pd.concat([categorical, names],axis = 1)

    dataset = dataset.drop(columns = ['Name','Cabin','Ticket','PassengerId'])



    #taking care of missing data
    imputer = Imputer(missing_values=np.nan, strategy='mean')
    imputer = imputer.fit(dataset)
    dataset = imputer.transform(dataset)

  
    
    #Feature Scaling
    sc_X = StandardScaler()
    dataset= sc_X.fit_transform(dataset)
    
    return dataset
Example #12
def load_dataset(data_X, data_y, subsample_data, FIXED_SPLIT=True, TEST_SIZE=0.3, RANDOM_STATE=42, LIMIT_SPLIT=1000000):
    if isinstance(data_X,pd.DataFrame):
        data_X=pd.get_dummies(data_X)
        data_X=data_X.values

    if np.any(np.isnan(data_X)):
        imputer=Imputer(strategy="median")
        imputer.fit(data_X)
        data_X=imputer.transform(data_X)


    if isinstance(data_y,pd.Series):
        data_y=data_y.values

    if not isinstance(data_X,np.ndarray) or not isinstance(data_y,np.ndarray):
        raise "Incompatible dataset type. Must be pandas or np.ndarray."

    if subsample_data < 1.0 and FIXED_SPLIT is True:
        data_X, _, data_y, _ = train_test_split(data_X, data_y, train_size=subsample_data, random_state=RANDOM_STATE)

    data_X, data_y = check_X_y(data_X, data_y, accept_sparse=False) #change for true when the sparse grammar is ready

    temp_folder = tempfile.mkdtemp()
    filename = os.path.join(temp_folder, 'autocve_joblib.mmap')
    if os.path.exists(filename): os.unlink(filename)
    _ = dump(data_X, filename)
    data_X_memmap = load(filename, mmap_mode='r') 

    if FIXED_SPLIT is True:
        split=None
    else:
        TRAIN_SIZE=1-TEST_SIZE
        split=StratifiedShuffleSplit(train_size=(TRAIN_SIZE*subsample_data), test_size=(TEST_SIZE*subsample_data), random_state=RANDOM_STATE, n_splits=LIMIT_SPLIT).split(data_X_memmap,data_y)

    return data_X_memmap, data_y, split, filename, temp_folder, data_X.shape[1]
Example #13
def replacing_missing_numeric(df='dataframe', df_test='dataframe2'):

    # Test missing values
    #missing_value(df_test,df_name='TEST',visualizse=False,head_count=10)

    ## Fill in missing values

    #Strategy = median; the variance is high, so the median is more robust than the mean
    imputer = Imputer(strategy='median')
    scaler = MinMaxScaler(feature_range=[0, 1])
    train = df_test
    train_col = train.columns

    with open((r'Imputer_folder/_Imputer.pkl'), 'rb') as f:
        imputer = pickle.load(f)

    test = imputer.transform(df_test)
    with open((r'Scalar/_ScalarImputer.pkl'), 'rb') as f:
        scaler = pickle.load(f)
    test = scaler.transform(test)
    #print('Testing data shape: ', test.shape)

    new_df_test = pd.DataFrame(test, columns=train_col)
    new_df_test['SK_ID_CURR'] = df_test['SK_ID_CURR'].values

    # Test missing values
    #missing_value(new_df_test,df_name='TEST',visualizse=False,head_count=5)

    #print(' Observation : \n 1.Now there is no missing value in Train and test')

    return new_df_test
Example #14
def get_some_data(data):

    y = data.suit
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
Example #15
def get_some_data():
    cols_to_use = ['Distance', 'Landsize', 'BuildingArea']
    data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
Example #16
 def _imputation(self, data):
     imp = Imputer(strategy='median')
     attributes = [
         'Critic_Score', 'User_Score', 'Critic_Count', 'User_Count'
     ]
     for item in attributes:
         data[item] = imp.fit_transform(data[[item]]).ravel()
     return data
Example #17
def process_data_pipeline(raw_data: pd.DataFrame,
                          num_feat: 'list of numbers',
                          categ_feat: 'list of strings' = None,
                          categ_feat_vals: 'list of strings' = None,
                          just_transform: bool = False,
                          just_pipeline: bool = False):
    num_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(num_feat, True)),
        ('Grade',
         FeatureCreator(['OverallCond', 'OverallQual'],
                        lambda x, y: x / y,
                        as_dataframe=True,
                        feat_name='Grade')),
        ('Age',
         FeatureCreator(['YrSold', 'YearBuilt'],
                        lambda x, y: x - y,
                        as_dataframe=True,
                        feat_name='Age')),
        ('RemodAge',
         FeatureCreator(['YrSold', 'YearRemodAdd'],
                        lambda x, y: x - y,
                        as_dataframe=True,
                        feat_name='RemodAge')),
        ('TotalSF',
         FeatureCreator(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'],
                        lambda x, y: x + y,
                        as_dataframe=True,
                        feat_name='TotalSF')),
        ('drop_cat_feat',
         FeatureDropper(['YrSold', 'OverallCond'], as_dataframe=True)),
        ('imputer_mean', Imputer(strategy='mean')),
        ('std_scaler', RobustScaler())
    ])
    if categ_feat is None:
        if just_transform is True:
            return num_pipeline.transform(raw_data)
        return num_pipeline.fit_transform(raw_data)

    categ_cols = [raw_data[col].unique() for col in categ_feat
                  ] if categ_feat_vals is None else categ_feat_vals

    cat_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(categ_feat, True)),
        ('imputer_most_frequent', CategoricalImputer()),
        ('encode', OneHotEncoder(sparse=False) if categ_cols is None else
         OneHotEncoder(categories=categ_cols, sparse=False)),
    ])
    feat_union = FeatureUnion(transformer_list=[
        ('num_features', num_pipeline),
        ('cat_features', cat_pipeline),
    ])

    if just_pipeline is True:
        return feat_union

    if just_transform is True:
        return feat_union.transform(raw_data)
    return feat_union.fit_transform(raw_data)
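A usage sketch, assuming FeatureSelector, FeatureCreator, FeatureDropper, and CategoricalImputer are the project's own transformers (not shown here) and that train_df is a House-Prices style dataframe containing the columns referenced above:

# columns consumed by the FeatureCreator steps
num_feat = ['OverallCond', 'OverallQual', 'YrSold', 'YearBuilt',
            'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF']

# numeric-only: fit and transform in one call
X_num = process_data_pipeline(train_df, num_feat)

# numeric + categorical: return the (unfitted) FeatureUnion for later reuse
pipeline = process_data_pipeline(train_df, num_feat,
                                 categ_feat=['Neighborhood'], just_pipeline=True)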
Example #18
def pact_score(Xtrain,Xvalid,Xtest,ytrain,yvalid,ytest,costs,verbose=False):
    columns = Xtrain.columns#np.array(pd.read_pickle('../data/ed-trauma/'+"Xtrain_raw_cat.pickle").columns)
    Xtrain = pd.DataFrame(data=np.vstack((Xtrain,Xvalid)),columns=columns)
    ytrain = np.hstack((ytrain,yvalid))
    Xtest = pd.DataFrame(data=Xtest,columns=columns)
    ytest = ytest
    all_measurability = load_cost_dict()#{fname:cost for fname, cost in zip(columns,costs)}
    # PACT Score
    pact_meas = {"shock_index":all_measurability['scenefirstpulse']+all_measurability['scenefirstbloodpressure'],
                 "age":all_measurability['age'],
                 "not_mvc":all_measurability['causecode'],
                 "gcs":np.sum(all_measurability[f'scenegcs{k}'] for k in ['eye','motor','verbal']),
                 "intub":all_measurability['intub'],
                "cpr":all_measurability['cpr']}

    lr_pact = LogisticRegression()
    Xtrain_pact = pd.DataFrame()
    Xtrain_pact["shock_index"] = Xtrain['scenefirstpulse']/(Xtrain['scenefirstbloodpressure']+1)
    Xtrain_pact["age"] = Xtrain['age']
    bike_mv_mc_ped = np.array([2,14,15,18])
    Xtrain_pact["not_mvc"] = ~np.isin(Xtrain['causecode'].values,bike_mv_mc_ped)
    Xtrain_pact["gcs"] = 15-Xtrain['scenegcs']
    Xtrain_pact["cpr"] = Xtrain['cpr']
    Xtrain_pact["intub"] = Xtrain['intub']
    imp = Imputer()
    ss = StandardScaler()
    Xtrain_pact_imp = imp.fit_transform(Xtrain_pact.values.astype('float'))
    Xtrain_pact_ss = ss.fit_transform(Xtrain_pact_imp)


    Xtest_pact = pd.DataFrame()
    Xtest_pact["shock_index"] = Xtest['scenefirstpulse']/(Xtest['scenefirstbloodpressure']+1)
    Xtest_pact["age"] = Xtest['age']
    Xtest_pact["not_mvc"] = ~np.isin(Xtest['causecode'].values,bike_mv_mc_ped)
    Xtest_pact["gcs"] = 15-Xtest['scenegcs']
    Xtest_pact["cpr"] = Xtest['cpr']
    Xtest_pact["intub"] = Xtest['intub']
    Xtest_pact_imp = imp.transform(Xtest_pact.values.astype('float'))
    Xtest_pact_ss = ss.transform(Xtest_pact_imp)

    pact_lr = LogisticRegression()
    pact_lr.fit(Xtrain_pact_ss,ytrain)
    if verbose: print ("PACT ROC",roc_auc_score(ytest,pact_lr.predict_proba(Xtest_pact_ss)[:,1]))
    pact_cost = np.sum(list(pact_meas.values()))
    if verbose: print ("PACT Cost",  pact_cost)
    
    costvec = np.array([pact_meas[c] for c in Xtrain_pact.columns])
    exp = LinearExplainer
    model = LogisticRegression()
    DIO = knapsack.IncreasingCostRetainer(model,exp)
    DIO.fit(Xtrain_pact_ss,ytrain,costvec)
    DIO.score_models_proba(Xtest_pact_ss,ytest,roc_auc_score)
    
    preds = pact_lr.predict_proba(Xtest_pact_ss)[:,1]
    
    return pact_cost,roc_auc_score(ytest,preds), Xtest_pact_ss, pact_lr,DIO, Xtest_pact.columns, preds
Example #19
 def test_imputer(self):
     try:
         model = Imputer(missing_values='NaN', strategy='mean', axis=0)
     except TypeError:
         model = Imputer(missing_values=np.nan, strategy='mean')
         model.axis = 0
     data = [[1, 2], [np.nan, 3], [7, 6]]
     model.fit(data)
     from onnxmltools.convert.coreml.convert import convert
     import coremltools  # noqa
     try:
         model_coreml = coremltools.converters.sklearn.convert(model)
     except ValueError as e:
         if 'not supported' in str(e):
             # Python 2.7 + scikit-learn 0.22
             return
     model_onnx = convert(model_coreml.get_spec())
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(np.array(data, dtype=np.float32),
                         model, model_onnx, basename="CmlImputerMeanFloat32")
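The try/except above handles the scikit-learn 0.20 -> 0.22 API change. The same compatibility shim is often written at import time; a minimal sketch:

try:
    from sklearn.preprocessing import Imputer              # scikit-learn < 0.22
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer    # scikit-learn >= 0.22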
Example #20
def test_multiindex_df(multiindex_dataframe_incomplete):
    """
    Get a dataframe from a multiindex dataframe with missing data
    """
    df = multiindex_dataframe_incomplete
    mapper = DataFrameMapper([([c], Imputer()) for c in df.columns],
                             df_out=True)
    transformed = mapper.fit_transform(df)
    assert len(transformed) == len(multiindex_dataframe_incomplete)
    for c in df.columns:
        assert len(transformed[str(c)]) == len(df[c])
Example #21
def impute_data(X, X_test):

    if np.any(np.isnan(X.values)) or np.any(np.isnan(X_test.values)):
        imputer = Imputer(strategy="median")
        imputer.fit(X)
        X = imputer.transform(X)
        X_test = imputer.transform(X_test)
    else:
        X = X.values  # TPOT operators need numpy arrays to be applied
        X_test = X_test.values

    return X, X_test
Example #22
def createAuto(target):
    win = 13  # window size: we use 12 previous values of the target, since range(1, win) runs from 1 to 12
    dataAuto = np.empty((len(target), win - 1))
    for i in range(1, win):
        dataAuto[:, i - 1] = shift2(target, i)
    dataAuto[np.isinf(dataAuto)] = np.nan
    imp = Imputer(
        missing_values=np.nan, strategy='mean'
    )  # fill in the missing values with the mean of each column, works on axis=0 by default
    transformedDataAuto = imp.fit_transform(dataAuto)
    X_auto = transformedDataAuto
    return X_auto
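A hedged pandas equivalent of the lag-matrix construction, assuming shift2(target, i) returns the series shifted by i positions (and that len(target) > win, so no column is entirely missing):

import pandas as pd
from sklearn.impute import SimpleImputer

def create_auto_pd(target, win=13):
    lags = pd.concat({f'lag_{i}': pd.Series(target).shift(i) for i in range(1, win)}, axis=1)
    return SimpleImputer(strategy='mean').fit_transform(lags)   # mean-fill the leading gaps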
Example #23
def test_default_transformer():
    """
    If default=Transformer, non explicitly selected columns are applied this
    transformer.
    """
    df = pd.DataFrame({
        'a': [1, np.nan, 3],
    })
    mapper = DataFrameMapper([], default=Imputer())

    transformed = mapper.fit_transform(df)
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
Example #24
def impute_featureset(fset,
                      strategy='constant',
                      value=None,
                      max_value=1e20,
                      inplace=False):
    """Replace NaN/Inf values with imputed values as defined by `strategy`.
    Output should satisfy `sklearn.validation.assert_all_finite` so that
    training a model will not produce an error.

    Parameters
    ----------
    strategy : str, optional
    The imputation strategy. Defaults to 'constant'.

        - 'constant': replace all missing with `value`
        - 'mean': replace all missing with mean along `axis`
        - 'median': replace all missing with median along `axis`
        - 'most_frequent': replace all missing with mode along `axis`

    value : float or None, optional
        Replacement value to use for `strategy='constant'`. Defaults to
        `None`, in which case a very large negative value is used (a
        good choice for e.g. random forests).

    max_value : float, optional
        Maximum (absolute) value above which values are treated as infinite.
        Used to prevent overflow when fitting `sklearn` models.

    inplace : bool, optional
        If True, fill in place. If False, return a copy.

    Returns
    -------
    pd.DataFrame
        Feature data frame with no missing/infinite values.
    """
    if not inplace:
        fset = fset.copy()
    fset.values[np.isnan(
        fset.values)] = np.inf  # avoid NaN comparison warnings
    fset.values[np.abs(fset.values) > max_value] = np.nan
    if strategy == 'constant':
        if value is None:
            # If no fill-in value is provided, use a large negative value
            value = -2. * np.nanmax(np.abs(fset.values))
        fset.fillna(value, inplace=True)
    elif strategy in ('mean', 'median', 'most_frequent'):
        imputer = Imputer(strategy=strategy)
        fset.values[:] = imputer.fit_transform(fset.values)
    else:
        raise NotImplementedError("Imputation strategy '{}' not "
                                  "recognized.".format(strategy))
    return fset
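A short usage sketch for the two main strategies described in the docstring (toy feature set):

import numpy as np
import pandas as pd

fset = pd.DataFrame({'amp': [1.0, np.inf, 3.0], 'width': [np.nan, 2.0, 4.0]})

# default 'constant' strategy: missing/infinite entries become a large negative value
filled = impute_featureset(fset)

# median imputation, modifying fset in place instead of returning a copy
impute_featureset(fset, strategy='median', inplace=True)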
Example #25
def data_cleaning(df):

    #Data Cleaning for numbers
    imputer = Imputer(strategy="median")
    dfn = df.drop("ocean_proximity", axis=1)
    imputer.fit(dfn)
    Xn = pd.DataFrame(imputer.transform(dfn), columns=dfn.columns)

    #Data Cleaning for Text and Categorical Attributes
    encoder = LabelBinarizer(sparse_output=True)
    Xt = encoder.fit_transform(df["ocean_proximity"])

    return imputer.statistics_, Xn, Xt
Example #26
	def __init__(self, max_iter = 10, ini_fill = True, ini_strategy_reg = 'mean',
		ini_strategy_clf = 'most_frequent', with_cat = False, 
		cat_index = None, tol = 1e-3, model_reg = "knn", model_clf = "knn"):
		'''
		- max_iter: number of iterations
		- ini_fill: whether to do a simple initial fill (False only takes effect for xgboost and lightgbm)
		- ini_strategy_reg: simple fill strategy for continuous variables, mean or median
		- ini_strategy_clf: simple fill strategy for categorical variables, only most_frequent
		- cat_index: indices of the categorical variables (int)
		- tol: tolerance threshold
		- model_reg: model used to predict missing values of continuous variables; one of xgboost, lightgbm, randomforest, knn
		- model_clf: model used to predict missing values of categorical variables
		'''
		self.ini_fill = ini_fill
		self.max_iter = max_iter
		self.imputer_reg = Imputer(strategy = ini_strategy_reg)  # TODO: pre-fill with the mean
		self.imputer_clf = Imputer(strategy = ini_strategy_clf)
		self.with_cat = with_cat
		self.cat_index = cat_index
		self.tol = tol
		self.model_reg = model_reg
		self.model_clf = model_clf
		if (not self.ini_fill) and (self.model_reg not in ('lightgbm', 'xgboost')) and (self.model_clf not in ('lightgbm', 'xgboost')):
			raise ValueError("ini_fill = False only work when prams is lightgbm or xgboost")
Example #27
def preprocess_numerics(dataframe, numerical_columns):
    """
    Preprocess numerical dataframe columns for a Restricted Boltzmann Machine.

    Parameters
    ----------
    dataframe : pd.DataFrame
        A Pandas DataFrame to be used for training an RBM on.
    numerical_columns : list of str
        A list of the column names to be treated as numerical values.

    Returns
    -------
    numerics : np.array
        A numpy array of the numerical columns scaled to [0,1].
    scaler: sklearn.preprocessing.MinMaxScaler
        The scikit-learn scaler used to transform the values.

    """

    # converts to numerical values where possible, replaces with NaN if not
    numerics = pd.DataFrame(
        dataframe[numerical_columns]._convert(numeric=True))
    # selects only columns with some numerical values
    numerics = numerics.select_dtypes([np.number])

    if not numerics.empty:
        to_impute = np.logical_not(np.isfinite(numerics))

        # avoids that annoying pandas warning
        numerics.is_copy = False
        # replaces infs with nans
        numerics[to_impute] = np.nan

        # replace NaNs with column means to leave min-max scaling unaffected
        array = Imputer().fit_transform(numerics)

        # scale values to the range [0,1]
        scaler = MinMaxScaler().fit(array)
        numerics = scaler.transform(array)

        # put our NaNs back in to be imputed by the RBM
        numerics[to_impute] = np.nan
    else:
        numerics = np.empty((dataframe.shape[0], 0))
        scaler = None

    return numerics, scaler
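A brief usage sketch with a made-up dataframe; note that the NaNs are deliberately put back so that the downstream RBM can impute them itself:

import numpy as np
import pandas as pd

df = pd.DataFrame({'age': [25.0, np.nan, 40.0],
                   'income': [30000.0, 45000.0, np.nan],
                   'city': ['A', 'B', 'A']})
numerics, scaler = preprocess_numerics(df, numerical_columns=['age', 'income'])
# numerics is scaled to [0, 1], with the original missing entries restored as NaN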
Example #28
def impute_and_scale_array(mat, scaling=None):
    """Impute missing values with mean and scale data included in numpy array.

        Parameters
        ----------
        mat : numpy array
            array to scale
        scaling : 'maxabs', 'minmax', 'std', or None, optional (default 'None')
            type of scaling to apply
    """

    imputer = Imputer(strategy='mean', axis=0, copy=False)
    imputer.fit_transform(mat)
    #mat = imputer.fit_transform(mat)

    return scale_array(mat, scaling)
Example #29
def get_rfc_grid(cv, dim_reduction_methods, scoring, random_state=None, n_jobs=1, 
                 rfc_n_estimators_l=None):

    pipe = Pipeline([
        ("Fill_NaN", Imputer(strategy="median")),
        ('StdScaler', StandardScaler()),
        ('dim_reduction', SelectKBest(stats.ttest_ind)),
        ('classifier', RandomForestClassifier(random_state=random_state)),])

    param_grid = {'dim_reduction': dim_reduction_methods,}
    if rfc_n_estimators_l is not None:
        param_grid['classifier__n_estimators'] = rfc_n_estimators_l
    
    return GridSearchCV(
        estimator = pipe, param_grid = param_grid,
        scoring = scoring, cv = cv, n_jobs = n_jobs
    )
Example #30
    def makeit(self):
        self.housing_num = self.housing.drop(self.rem_attribs, axis=1)
        self.num_attribs = list(self.housing_num)

        self.num_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.num_attribs)),
            ('imputer', Imputer(strategy="median")),
            ('attribs_adder', self.attr_adder),
            ('std_scaler', StandardScaler()),
        ])

        self.cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.cat_attribs)),
            ('cat_encoder', OneHotEncoder(sparse=False))
        ])

        self.full_pipeline = FeatureUnion(transformer_list=[
            ('num_pipeline', self.num_pipeline),
            ('cat_pipeline', self.cat_pipeline),
        ])

        self.train_labels = self.strat_train_set['median_house_value'].copy(
        ).to_numpy()
        self.strat_train_set.drop('median_house_value', axis=1)
        self.train_features_prepared = self.full_pipeline.fit_transform(
            self.strat_train_set)

        self.test_lables = self.strat_test_set['median_house_value'].to_numpy()
        self.strat_test_set.drop('median_house_value', axis=1)
        self.test_features_prepared = self.full_pipeline.transform(
            self.strat_test_set)

        self.cat_encoder = self.cat_pipeline.named_steps['cat_encoder']
        self.cat_onehot_attribs = list(self.cat_encoder.categories_[0])
        self.headings = self.num_attribs + self.attr_adder.extras(
        ) + self.cat_onehot_attribs

        print('\n' + '=' * 80)
        # print('\nPipeline: {}'.format(self.attr_adder))
        print('Pipeline training array.shape:\t',
              self.train_features_prepared.shape)
        print('Pipeline test array.shape:\t\t',
              self.test_features_prepared.shape)

        return self.train_labels, self.train_features_prepared, self.test_lables, self.test_features_prepared