from sklearn.model_selection import train_test_split, cross_validate from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.ensemble import GradientBoostingClassifier x, y, sum_labels = load_data('red-velvet-features.csv', 'rv-comparison-features.csv') x_train, x_val, y_train, y_val = train_test_split(x, y) x_smote, y_smote = SMOTE().fit_resample(x_train, y_train) x_smote = pd.DataFrame(x_smote, columns=x_train.columns) num_processor = Pipeline([('scaler', StandardScaler())]) cat_processor = Pipeline([('ohe', OneHotEncoder())]) processor = ColumnTransformer([ ('num', num_processor, x_smote.select_dtypes('float64').columns), ('cat', cat_processor, x_smote.select_dtypes('object').columns) ]) model = Pipeline([('processor', processor), ('classifier', GradientBoostingClassifier(learning_rate=0.1, n_estimators=1000))]) model.fit(x_smote, y_smote) #%% from sklearn.metrics import classification_report, roc_auc_score guesses = model.predict_proba(x_val)[:, 1] answers = sum_labels.iloc[y_val.index][[ 'artist name', 'rating', 'song title', 'release date'