Ejemplo n.º 1
0
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the training CSV, using the "Id" column as the row index.
# NOTE(review): "__file__" is quoted, i.e. a plain string rather than the
# module's __file__ attribute, so abspath() resolves it against the current
# working directory and dirname() yields that directory. Confirm whether this
# is an intentional notebook-friendly workaround before changing it.
ames_csv_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")),
    'amnes/data/train.csv',
)
ames_data = pd.read_csv(ames_csv_path, index_col="Id")

# Separate the predictors from the target: every column except "SalePrice"
# is kept as a feature, and "SalePrice" itself is the value to predict.
x = ames_data.loc[:, ames_data.columns != "SalePrice"]
y = ames_data["SalePrice"]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Split the features into numeric and categorical subsets.
# NOTE(review): VariableSelector is defined outside this view; presumably it
# selects columns by variable type -- confirm.
#
# Each selector is fitted once, on the training split only, and the fitted
# instance is then re-used to transform the test split. Previously a fresh
# selector was fitted on x_test, which lets held-out data influence the
# preprocessing and can produce a column set that differs from the training
# one (the imputer fitted on x_train_num expects matching columns).
num_selector = VariableSelector(variable_type="numeric")
cat_selector = VariableSelector(variable_type="categorical")

x_train_num = num_selector.fit_transform(x_train)
x_train_cat = cat_selector.fit_transform(x_train)
x_test_num = num_selector.transform(x_test)
x_test_cat = cat_selector.transform(x_test)

# Numeric features: fit a median imputer on the training split only, then
# apply that same fitted imputer to both splits.
# NOTE(review): if this is sklearn.preprocessing.Imputer, it was removed in
# scikit-learn 0.22 in favour of sklearn.impute.SimpleImputer -- confirm the
# pinned version before upgrading.
num_imputer = Imputer(strategy="median").fit(x_train_num)
x_train_num, x_test_num = [
    num_imputer.transform(split) for split in (x_train_num, x_test_num)
]

# Categorical features: missing entries are replaced by the literal string
# "None" in both splits.
x_train_cat, x_test_cat = [
    split.fillna("None") for split in (x_train_cat, x_test_cat)
]

# Mapping that lazily creates one LabelEncoder per key: defaultdict calls
# LabelEncoder() the first time a missing key is looked up.
# NOTE(review): presumably keyed by categorical column name -- confirm
# against the code that consumes this mapping.
label_encoders = defaultdict(LabelEncoder)
# Interactive debugging leftover: dir(label_encoders)