from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

# load_data is defined elsewhere; its three results are used below as the
# feature table (x), the targets (y) and a lookup table (sum_labels).
x, y, sum_labels = load_data('red-velvet-features.csv',
                             'rv-comparison-features.csv')

# Split before oversampling so that SMOTE only ever sees training rows and the
# validation split stays untouched.
x_train, x_val, y_train, y_val = train_test_split(x, y)
x_smote, y_smote = SMOTE().fit_resample(x_train, y_train)
# Re-wrap the resampled features with the training column labels so the
# dtype-based column selection below can be applied to them.
x_smote = pd.DataFrame(x_smote, columns=x_train.columns)

num_processor = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore': the encoder is fitted on the resampled training rows
# only, so validation rows may contain categories it has never seen. With the
# default ('error') transform would raise ValueError at predict time; with
# 'ignore' such a category is encoded as all zeros instead.
cat_processor = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])
# NOTE(review): only float64 and object columns are routed to a transformer.
# ColumnTransformer's default remainder='drop' discards every other column
# (e.g. integer or boolean features) -- confirm that this is intended.
processor = ColumnTransformer([
    ('num', num_processor, x_smote.select_dtypes('float64').columns),
    ('cat', cat_processor, x_smote.select_dtypes('object').columns)
])
model = Pipeline([('processor', processor),
                  ('classifier',
                   GradientBoostingClassifier(learning_rate=0.1,
                                              n_estimators=1000))])

# Fit preprocessing and classifier together on the oversampled training data.
model.fit(x_smote, y_smote)

#%%
from sklearn.metrics import classification_report, roc_auc_score

# Score the held-out validation features and keep only the second
# predict_proba column, i.e. the probability assigned to model.classes_[1].
guesses = model.predict_proba(x_val)[:, 1]
answers = sum_labels.iloc[y_val.index][[
    'artist name', 'rating', 'song title', 'release date'