def test_split_dataset(self):
    """Check that ``model.split_dataset`` returns parts whose sizes add up.

    Loads the first file returned by
    ``utils.get_files("parquets", "*.parquet")``, splits it into two parts,
    and asserts that their combined length equals the length of the loaded
    frame. When no file is found the test returns without asserting anything.
    """
    parquet_paths = utils.get_files("parquets", "*.parquet")
    if len(parquet_paths) == 0:
        # No parquet file available: nothing to check.
        return

    frame = pd.read_parquet(parquet_paths[0])

    # Split into training and validation parts.
    # NOTE(review): 0.25 and 1 are presumably the validation fraction and the
    # random seed -- confirm against model.split_dataset's signature.
    train_part, valid_part = model.split_dataset(frame, 0.25, 1)

    # The two part sizes must account for every row of the input
    # (rows are presumably customers -- verify against the dataset).
    assert len(frame) == len(train_part) + len(valid_part)
, random_state=42, verbose=30 )) ovr.fit(x_train, y_train) cross_validated = np.mean(cross_val_score(ovr, x_train, y_train, cv=5)) print(f'Cross Validation Score: {cross_validated}') return ovr if __name__ == '__main__': df = import_data(train=True) test_df = import_data(features='Datasets/test_set_features.csv', train=False) cols = list(df.columns) set_df_values(df) df = clean_data(df) x_train, x_val, y_train, y_val, train_ids, val_ids = split_dataset(df, test_size=0.1, seed=42) x_train, y_train = x_train.astype(str), y_train.astype(int) x_val, y_val = x_val.astype(str), y_val.astype(int) # model = fit_random_search_model(x_train, y_train) model = fit_model(x_train, y_train) h1n1_preds, seasonal_preds = make_predictions(model, x_train) h1n1_true, seasonal_true = y_train['h1n1_vaccine'].values.tolist(), y_train['seasonal_vaccine'].values.tolist() train_score = get_scores(h1n1_true, h1n1_preds, seasonal_true, seasonal_preds) print(f'Training Accuracy: {train_score}') h1n1_preds, seasonal_preds = make_predictions(model, x_val) h1n1_true, seasonal_true = y_val['h1n1_vaccine'].values.tolist(), y_val['seasonal_vaccine'].values.tolist() validation_score = get_scores(h1n1_true, h1n1_preds, seasonal_true, seasonal_preds) print(f'Validation Accuracy: {validation_score}')