-
Notifications
You must be signed in to change notification settings - Fork 0
/
generic-classifier sp.py
96 lines (70 loc) · 3.49 KB
/
generic-classifier sp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 8 16:45:41 2019
@author: Priyam
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def length_mask(df, upper=2.7e5, lower=1.5e5):
    """Flag rows whose ``duration_ms`` lies outside the accepted window.

    Returns a boolean Series, indexed like ``df``, that is True where the
    duration is strictly greater than ``upper`` or strictly less than
    ``lower``. Durations equal to either bound are not flagged.
    """
    durations = df['duration_ms']
    too_long = durations.gt(upper)
    too_short = durations.lt(lower)
    return too_long | too_short
def split_table(music_table):
    """Split a raw track table into model features and descriptive labels.

    Columns of object dtype form the label table; its ``'release date'``
    column is parsed to datetimes, with unparseable values coerced to NaT.
    All remaining columns form the feature table, in which ``key``, ``mode``
    and ``time_signature`` are cast to object dtype. Rows flagged by
    ``length_mask`` (called with its default bounds) are removed from both
    tables.

    The caller's ``music_table`` is left unmodified.

    Returns
    -------
    (features, labels)
        Two DataFrames that share the index of the surviving rows.
    """
    categorical = ['key', 'mode', 'time_signature']

    # Work on a private copy so the dtype cast below never leaks into the
    # frame the caller passed in.
    table = music_table.copy()

    # select_dtypes hands back a new frame; copy() makes the column
    # assignment below unambiguous (no SettingWithCopyWarning).
    labels = table.select_dtypes('object').copy()
    labels['release date'] = pd.to_datetime(labels['release date'], errors='coerce')

    # The label columns were chosen before this cast, so columns converted
    # here stay on the feature side with object dtype.
    table[categorical] = table[categorical].astype('object')

    # Keyword form: the positional axis argument is rejected by pandas >= 2.0.
    features = table.drop(columns=labels.columns)

    outliers = features[length_mask(features)].index
    labels = labels.drop(index=outliers)
    features = features.drop(index=outliers)
    return features, labels
def merge_tables(target_feats, target_labels, others_feats, others_labels):
    """Stack the target and reference tables and drop incomplete rows.

    A ``rating`` column is added to the labels: 1 for rows coming from
    ``target_labels`` and 0 for rows coming from ``others_labels``. Target
    rows are placed first. Any row whose features contain a missing value is
    removed from both the features and the labels, after which both tables
    are re-indexed 0..n-1.

    The input frames are not modified.

    Returns
    -------
    (labels, feats)
        Note the order: labels first, features second.
    """
    # assign() returns new frames, so the caller's label tables do not
    # silently gain a 'rating' column.
    labels = pd.concat(
        [target_labels.assign(rating=1), others_labels.assign(rating=0)],
        axis=0,
    ).reset_index(drop=True)
    feats = pd.concat([target_feats, others_feats], axis=0).reset_index(drop=True)

    # Both tables were just re-indexed 0..n-1, so the index labels of the
    # incomplete feature rows identify the matching label rows as well.
    incomplete = feats.index[feats.isnull().any(axis=1)]
    feats = feats.drop(index=incomplete).reset_index(drop=True)
    labels = labels.drop(index=incomplete).reset_index(drop=True)
    return labels, feats
def load_data(target_filename, reference_filename):
    """Load the target and reference CSV files into a modelling dataset.

    Each file is read with ``pd.read_csv`` and passed through
    ``split_table``; the two results are then combined by ``merge_tables``.

    Returns
    -------
    (x, y, sum_labels)
        ``x`` is the merged feature table, ``y`` is the ``rating`` column of
        the merged labels, and ``sum_labels`` is the full merged label table.

    Raises
    ------
    KeyError
        If the reference features have no column named ``'0'``.
    """
    target_feats, target_labels = split_table(pd.read_csv(target_filename))
    other_feats, other_labels = split_table(pd.read_csv(reference_filename))

    # NOTE(review): '0' is presumably a stray index column saved into the
    # reference CSV - confirm against the file that produces it.
    # Keyword form: the positional axis argument is rejected by pandas >= 2.0.
    other_feats = other_feats.drop(columns='0')

    sum_labels, sum_feats = merge_tables(
        target_feats, target_labels, other_feats, other_labels
    )
    return sum_feats, sum_labels['rating'], sum_labels
#%%
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
# Train the classifier: load both tables, hold out a validation split,
# oversample the minority class in the training split only, then fit a
# preprocessing + gradient-boosting pipeline.

# One seed for the split, the resampling and the booster so that repeated
# runs produce the same model and the same exported rows.
RANDOM_STATE = 0

x, y, sum_labels = load_data('red-velvet-features.csv', 'rv-comparison-features.csv')

# Stratify so the validation split keeps the class ratio of the full data.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, stratify=y, random_state=RANDOM_STATE
)

# Resampling happens after the split, so no synthetic rows reach x_val.
# NOTE(review): plain SMOTE interpolates every column, including the
# object-typed key/mode/time_signature columns; SMOTENC may be the better
# fit for them - confirm before relying on the categorical encoding.
x_smote, y_smote = SMOTE(random_state=RANDOM_STATE).fit_resample(x_train, y_train)
x_smote = pd.DataFrame(x_smote, columns=x_train.columns)

num_processor = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore': a category that only occurs in the validation
# rows is encoded as all zeros instead of raising at predict time.
cat_processor = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])
processor = ColumnTransformer([
    ('num', num_processor, x_smote.select_dtypes('float64').columns),
    ('cat', cat_processor, x_smote.select_dtypes('object').columns),
])

model = Pipeline([
    ('processor', processor),
    ('classifier', GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=1000, random_state=RANDOM_STATE)),
])
model.fit(x_smote, y_smote)
#%%
from sklearn.metrics import classification_report, roc_auc_score
# Evaluate on the held-out rows and collect the confident mistakes.

# Second column of predict_proba: probability of the higher class label
# (rating == 1 when the classes are 0 and 1).
guesses = model.predict_proba(x_val)[:, 1]

# y_val.index holds index labels, so select with .loc rather than .iloc.
answers = sum_labels.loc[
    y_val.index, ['artist name', 'rating', 'song title', 'release date']
]
results = pd.concat(
    [answers.reset_index(drop=True), pd.Series(guesses, name='guesses')],
    axis=1,
)
results = results.sort_values(['rating', 'guesses'], ascending=False)

print(classification_report(y_val, np.round(guesses)))
# ROC AUC is a ranking metric: score the raw probabilities. Rounding them
# first would reduce the curve to a single operating point.
print(roc_auc_score(y_val, guesses))

# Confident errors: target songs scored below 0.2, other songs scored above 0.8.
fn_mask = (results['guesses'] < 0.2) & (results['rating'] == 1)
fp_mask = (results['guesses'] > 0.8) & (results['rating'] == 0)
fn = results[fn_mask].drop(columns='rating')
fp = results[fp_mask].drop(columns='rating')
#%%
# Write the confident false positives to CSV with a header row and without
# the row index. index=False is the documented boolean form; the previous
# index=None only worked because None is falsy.
fp.to_csv('rv-false-positives.csv', header=True, index=False)