Ejemplo n.º 1
0
def preprocess(data_path, save_path, random_state=None):
    """Split a CSV dataset into train/test parts, oversample train, save all.

    The CSV at ``data_path`` is read and passed through
    ``process_by_column``. The result is split 80/20 into train and test
    parts, stratified on the ``"stroke"`` column. The last column of the
    processed frame is used as the label and every other column as a
    feature. Only the training part is oversampled with SMOTE; the test
    part is written as produced by the split.

    Four files are written, each name appended directly to ``save_path``:
    ``train_data.csv``, ``train_label.csv``, ``test_data.csv`` and
    ``test_label.csv``.

    Args:
        data_path: Path of the CSV file to read.
        save_path: String prepended verbatim to every output file name;
            include a trailing path separator to write into a directory.
        random_state: Seed forwarded to both ``train_test_split`` and
            ``SMOTE`` so a run can be reproduced. The default ``None``
            leaves both unseeded.
    """
    data = pd.read_csv(data_path)
    preprocessed_data = process_by_column(data)

    train, test = train_test_split(
        preprocessed_data,
        test_size=0.2,
        stratify=preprocessed_data[["stroke"]],
        random_state=random_state,
    )

    # The label is taken positionally: last column is the target,
    # everything before it is a feature.
    columns = list(preprocessed_data.columns)
    x_columns = columns[:-1]
    y_columns = [columns[-1]]

    # Oversample the training part only; the test part is not resampled.
    final_train_data, final_train_label = SMOTE(
        random_state=random_state
    ).fit_resample(
        train[x_columns],
        train[y_columns],
    )

    outputs = (
        ("train_data.csv", final_train_data),
        ("train_label.csv", final_train_label),
        ("test_data.csv", test[x_columns]),
        ("test_label.csv", test[y_columns]),
    )
    for file_name, frame in outputs:
        # Plain concatenation is kept on purpose: save_path may be a
        # directory with a trailing separator or a file-name prefix.
        frame.to_csv(save_path + file_name, encoding="utf-8", index=False)
Ejemplo n.º 2
0
# Build a scaled, SMOTE-balanced training set and a scaled hold-out set
# from `dataset_train`, and persist the fitted scaler.

# Features: every column except `user_id` and `is_churned`;
# `is_churned` is kept separately as the target.
X = dataset_train.drop(['user_id', 'is_churned'], axis=1)
y = dataset_train['is_churned']

# NOTE(review): the scaler is fitted on all rows before the split, so the
# hold-out rows influence the min/max used for training rows. Confirm this
# is intended before changing it, because the pickled scaler is consumed
# elsewhere.
scaler = MinMaxScaler()
X_mm = scaler.fit_transform(X)
with open('source/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

X_train, X_test, y_train, y_test = train_test_split(X_mm,
                                                    y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=100)

# Reduce the class imbalance: ask SMOTE for class 1 to contain a number of
# samples equal to 30% of the pre-resampling training row count.
smote_on_1 = int(X_train.shape[0] * 3 / 10)
# `fit_resample` replaces the old `fit_sample` alias, which is no longer
# available in current imbalanced-learn releases.
X_train_balanced, y_train_balanced = SMOTE(random_state=42,
                                           sampling_strategy={
                                               1: smote_on_1
                                           }).fit_resample(X_train, y_train)

# Scaling returned plain arrays, so restore the original feature names
# before attaching the target and saving.
X_train_balanced = pd.DataFrame(X_train_balanced, columns=X.columns)
X_train_balanced['is_churned'] = y_train_balanced.values
X_train_balanced.to_csv('dataset/dataset_train_balanced.csv',
                        sep=';',
                        index=False)

# The hold-out part is saved without resampling.
X_test = pd.DataFrame(X_test, columns=X.columns)
X_test['is_churned'] = y_test.values
X_test.to_csv('dataset/dataset_test_balanced.csv', sep=';', index=False)