Example #1
0
def prepare_data(x_train, indexx):
    """Label-encode, impute and standardise every column except the ID column.

    Object-dtype columns are converted to integer codes with ``LabelEncoder``
    (values are cast to ``str`` first), then all feature columns go through a
    median imputer followed by ``StandardScaler``.  The ID column is kept out
    of that pipeline and written back unchanged.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Input table.  It is not modified.
    indexx : str
        Name of the ID / primary-key column of the table.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh default index.  Non-text columns come first in
        their original order, followed by the former text columns; the
        ``indexx`` column holds the original, untransformed IDs.

    Notes
    -----
    Relies on ``pd`` (pandas) being imported at module level.
    """
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:  # scikit-learn < 0.20 only ships the old Imputer class
        from sklearn.preprocessing import Imputer as SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    text_columns = x_train.select_dtypes(include="object").columns.values.tolist()

    # Column layout callers have always received: non-text columns in their
    # original order, then the text columns (each one was dropped and
    # re-appended while being encoded).
    text_lookup = set(text_columns)
    output_order = [c for c in x_train.columns if c not in text_lookup] + text_columns

    ids = x_train[indexx].tolist()
    # The result of drop() must be kept -- DataFrame.drop is not in-place.
    # Keyword ``axis`` is used because pandas 2.x rejects a positional axis.
    features = x_train.drop(indexx, axis=1)

    encoder = LabelEncoder()
    for column in text_columns:
        if column == indexx:
            # The ID column is restored verbatim below; nothing to encode.
            continue
        encoded = encoder.fit_transform(features[column].astype(str))
        features = features.drop(column, axis=1)
        features[column] = encoded

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
    scaled = num_pipeline.fit_transform(features)

    train_x = pd.DataFrame(scaled, columns=features.columns)
    train_x[indexx] = ids

    # Put the ID column back where the previous implementation left it.
    return train_x[output_order]
Example #2
0
def rph_graph(X, y, columns, features=None):
    """Fit a gradient-boosting regressor and draw partial-dependence plots.

    ``X`` is imputed with a default ``SimpleImputer`` before a
    ``GradientBoostingRegressor`` is fitted on it; the plots are produced
    from that fitted model and the imputed data.

    Parameters
    ----------
    X : array-like
        Raw predictors (may contain missing values).
    y : array-like
        Regression target.
    columns : sequence of str
        Labels shown on the graphs, passed as ``feature_names``.
    features : list, optional
        Column numbers of the plots to show.  Defaults to ``[0, 1, 2]``,
        the set that used to be hard-coded.

    Returns
    -------
    Whatever ``plot_partial_dependence`` returns.  Previously this value was
    stored in a local and discarded, so callers had no handle on the plots.

    NOTE(review): ``plot_partial_dependence`` is imported elsewhere in the
    file; recent scikit-learn releases replace it with
    ``PartialDependenceDisplay.from_estimator`` -- confirm the installed version.
    """
    if features is None:
        features = [0, 1, 2]

    imputed_X = SimpleImputer().fit_transform(X)

    model = GradientBoostingRegressor()
    model.fit(imputed_X, y)

    return plot_partial_dependence(
        model,
        features=features,  # column numbers of plots we want to show
        X=imputed_X,  # imputed predictors data
        feature_names=columns,  # labels on graphs
        grid_resolution=10)  # number of values to plot on x axis
def rph_create_output_file(model, test_data, id_column, column_to_predict,
                           output_file_path, imputer=None):
    """Predict on ``test_data`` and write the predictions to a CSV file.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_data : pandas.DataFrame
        Test table including the ID column.
    id_column : str
        Name of the ID column; it is excluded from the predictors and copied
        to the output file.
    column_to_predict : str
        Header used for the prediction column in the output file.
    output_file_path : str
        Destination passed to ``DataFrame.to_csv``.
    imputer : object, optional
        An already fitted imputer exposing ``transform``.  Pass the imputer
        that was fitted on the training data so test rows are filled with
        the same statistics the model was trained with.  When omitted, a new
        ``SimpleImputer`` is fitted on the test data itself (the previous
        behaviour).

    Returns
    -------
    None
        The result is the CSV file (written without the index) plus a
        confirmation message on stdout.
    """
    features = test_data.drop(columns=[id_column])

    if imputer is None:
        # Legacy path: statistics come from the test set itself.
        features = SimpleImputer().fit_transform(features)
    else:
        features = imputer.transform(features)

    predictions = model.predict(features)

    submission = pd.DataFrame({
        id_column: test_data[id_column],
        column_to_predict: predictions,
    })
    submission.to_csv(output_file_path, index=False)
    print("\nCSV created")
Example #4
0
def rph_cross_validation(train_X, train_y):
    """Cross-validate and fit an ``XGBRegressor`` with early stopping.

    The data is split 80/20 (``random_state=1``).  A default
    ``SimpleImputer`` is fitted on the 80 % part and applied to both parts.
    The 20 % part serves as the early-stopping evaluation set, both inside
    the 5-fold ``cross_val_score`` run and for the final fit.

    Returns
    -------
    tuple
        ``(scores, model)``: the ``neg_mean_absolute_error`` scores from
        ``cross_val_score`` and the regressor fitted on the 80 % part.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        train_X, train_y, test_size=0.20, random_state=1)

    imputer = SimpleImputer()
    X_fit = imputer.fit_transform(X_fit)
    X_holdout = imputer.transform(X_holdout)

    patience = 30
    regressor = XGBRegressor(n_estimators=600, learning_rate=0.06)

    # Keyword arguments forwarded to every fit() call made by cross_val_score.
    cv_fit_kwargs = dict(
        early_stopping_rounds=patience,
        eval_metric='mae',
        verbose=False,
        eval_set=[[X_holdout, y_holdout]],
    )
    scores = cross_val_score(
        regressor, X_fit, y_fit,
        cv=5,
        scoring='neg_mean_absolute_error',
        fit_params=cv_fit_kwargs,
    )

    # Final fit on the whole 80 % part, stopping early on the hold-out part.
    regressor.fit(
        X_fit, y_fit,
        early_stopping_rounds=patience,
        eval_set=[(X_holdout, y_holdout)],
        verbose=False,
    )
    return scores, regressor
def prepare_data(x_train, method):
    """Encode ``Sex``/``Embarked``, then impute and standardise the table.

    ``Sex`` and ``Embarked`` are label-encoded (``Embarked`` is cast to
    ``str`` first) and every feature column is passed through a median
    imputer followed by ``StandardScaler``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Input table containing at least ``Sex`` and ``Embarked``; when
        ``method == "test"`` it must also contain ``PassengerId``.  The
        frame is not modified.
    method : str
        ``"test"`` keeps ``PassengerId`` out of the scaling pipeline and
        writes the original IDs back into the result.  Any other value
        scales every column, ``PassengerId`` included if present.

    Returns
    -------
    pandas.DataFrame
        Scaled frame with a fresh default index; ``Sex`` and ``Embarked``
        are the last two columns, all other columns keep their order.

    Notes
    -----
    Relies on ``pd`` (pandas) being imported at module level.
    """
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:  # scikit-learn < 0.20 only ships the old Imputer class
        from sklearn.preprocessing import Imputer as SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    is_test = method == "test"

    # Column layout callers have always received: everything except the two
    # encoded columns in original order, then "Sex" and "Embarked".
    output_order = [c for c in x_train.columns if c not in ("Sex", "Embarked")]
    output_order += ["Sex", "Embarked"]

    features = x_train
    if is_test:
        passenger_ids = x_train['PassengerId'].tolist()
        # Keep the result: DataFrame.drop is not in-place.  Keyword ``axis``
        # is used because pandas 2.x rejects a positional axis.
        features = x_train.drop('PassengerId', axis=1)

    encoder = LabelEncoder()

    sex_codes = encoder.fit_transform(features["Sex"])
    features = features.drop("Sex", axis=1)
    features["Sex"] = sex_codes

    embarked_codes = encoder.fit_transform(features['Embarked'].astype(str))
    features = features.drop('Embarked', axis=1)
    features['Embarked'] = embarked_codes

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
    scaled = num_pipeline.fit_transform(features)

    train_x = pd.DataFrame(scaled, columns=features.columns)
    if is_test:
        train_x['PassengerId'] = passenger_ids
        # Put the ID column back where the previous implementation left it.
        train_x = train_x[output_order]

    return train_x
Example #6
0
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),


# In[60]:


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)


# In[61]:


import sklearn
sklearn.__version__


# In[62]:
"""

# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# importing the dataset
# Every column but the last is a predictor; the last column is the target.
dataset = pd.read_csv("Data.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data: Not needed in template
# SimpleImputer lives in sklearn.impute (sklearn.preprocessing does not export
# it).  The ``verbose`` keyword is omitted: its value was the default and the
# parameter no longer exists in current scikit-learn releases.
from sklearn.impute import SimpleImputer
missingvalues = SimpleImputer(missing_values=np.nan,
                              strategy="mean")
missingvalues = missingvalues.fit(X[:, 1:])
X[:, 1:] = missingvalues.transform(X[:, 1:])

# Encoding Categorical Data: Not needed in template
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 and pass the remaining columns through unchanged.
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
# ``np.float`` was only an alias of the builtin ``float`` and has been removed
# from NumPy, so the builtin is used directly.
X = np.array(ct.fit_transform(X), dtype=float)

from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(y)
Example #8
0
        strat_test_set = loaddata.loc[test_index]

    for _set in (strat_train_set, strat_test_set):
        _set.drop("income_cat", axis=1, inplace=True)

    train_y = strat_train_set['median_house_value']
    train_x = strat_train_set.drop('median_house_value', axis=1)
    test_y = strat_test_set['median_house_value']
    test_x = strat_test_set.drop('median_house_value', axis=1)

    num_col = [x for x in train_x.columns.values if x != 'ocean_proximity']
    cat_col = ['ocean_proximity']

with timer('PipeLine'):
    # Numeric branch: select the numeric columns, fill gaps with the median,
    # add the combined attributes and standardise.
    num_pipe = Pipeline([
        ('Selector', Selector(num_col)),
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    # Categorical branch: a single encoding step over the categorical columns.
    cat_pipe = Pipeline([
        ('Encoder', encoding(cat_col)),
    ])

    # Run both branches and concatenate their outputs.
    union = FeatureUnion(transformer_list=[
        ('num_pipe', num_pipe),
        ('cat_pipe', cat_pipe),
    ])

    # Fit on the training predictors only; the test predictors are merely
    # transformed with the fitted union.
    housing_x = union.fit_transform(train_x)
    housing_x_t = union.transform(test_x)

# Starting value 2147483647 == 2**31 - 1, the largest signed 32-bit integer.
# NOTE(review): presumably a "worse than any real score" sentinel for the
# parameter search that follows -- confirm against that code.
min_score = 2147483647
# No parameter set has been recorded yet.
min_param = None

with timer('parameter search'):
Example #9
0
sample_incomplete_rows


# In[70]:


housing.describe()


# In[48]:


# Let's use Scikit-Learn's SimpleImputer class to fill missing values.
# It is provided by sklearn.impute; sklearn.preprocessing does not export it.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')


# In[49]:


# Remove the text attribute because median can only be calculated on numerical attributes

housing_num = housing.drop('ocean_proximity', axis=1)


# In[50]:


# Fit the imputer instance to the training data
Example #10
0
    def transform(self, X):
        """Return ``X`` indexed by the selection stored in ``self.selected_clmns``."""
        wanted = self.selected_clmns
        return X[wanted]


# In[7]:


# Let's create a pipeline with two steps: select the columns, then impute the
# missing numeric values of the selected columns.
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20 only ships the old Imputer class
    from sklearn.preprocessing import Imputer as SimpleImputer
# num_pipeline = Pipeline([
#         ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
#         ("imputer", SimpleImputer(strategy="median")),
#     ])
numeric_values_pipe= Pipeline([('numiric_ coloumns' ,DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
                               ('imputer_for_missed' ,SimpleImputer(strategy='median'))])


# In[38]:


# Only the 4 numeric columns are selected; their missing values are filled
# with the column median.
num =numeric_values_pipe.fit_transform(train)


# In[9]:


# Let's select the categorical data and create an imputer for its missing values.
# We build a class that fills missing categorical values with the most frequent value.
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
Example #11
0
              "xgbrg__verbose": False}

# ``fit_params`` is not a GridSearchCV constructor argument in current
# scikit-learn; fit-time parameters are forwarded through ``fit`` instead.
searchCV = GridSearchCV(
    my_pipeline,
    cv=5,
    param_grid=param_grid,
)
searchCV.fit(train_X, train_y, **fit_params)

# Alternative to impute as a preprocessor:
from sklearn.compose import ColumnTransformer
# SimpleImputer is provided by sklearn.impute, OneHotEncoder by
# sklearn.preprocessing (the previous line also lacked the ``import`` keyword).
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode, ignoring categories not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)
# A Pipeline chains its steps and can itself be used as a transformer.

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
# Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
# Predictors are all columns but the last; the target is column 3.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Taking care of missing data
# SimpleImputer is provided by sklearn.impute.  It marks missing entries with
# np.nan (not the string 'NaN') and always works column-wise, so it takes no
# ``axis`` argument.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])