"""
This script is used to compare the performance of MlcLinReg and sklearn's SGDClassifier.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

from MlcLinReg import MlcLinReg
from helpers import shuffle_dataset, split_train_test, tic, toc, load_delicious

feature = 1

# Load, shuffle and split the data
X_train, y_train, X_test, y_test = load_delicious(feature)
X_train_s, y_train_s = shuffle_dataset(X_train, y_train)
X_train, y_train, X_test, y_test = split_train_test(X_train_s, y_train_s)

# N = np.array(range(100, 1000, 10))
# N = np.array([50, 100, 200, 300, 400, 600])
# N = np.array([0.001, 0.005, 0.025, 0.05, 0.1])
N = np.array([5, 32, 64, 100, 128, 200, 256, 350, 512, 700, 1024, 1500, 2048])

scores = list()
scores_sgd = list()
times = list()
times_sgd = list()
batch_size = 2048
iterations = 200
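# ---------------------------------------------------------------------------
# The comparison loop itself is not shown above; the lists are declared but
# never filled. Below is a minimal sketch of how they might be filled. It
# assumes that N sweeps MlcLinReg's batch_size, that helpers.toc() returns the
# seconds elapsed since the matching tic(), that MlcLinReg exposes the usual
# sklearn fit/predict interface, and that load_delicious may return SciPy
# sparse matrices. Treat it as illustrative, not as the original benchmark.
# ---------------------------------------------------------------------------
def _dense(y):
    # Flatten either a SciPy sparse column or a dense array into a 1-d label vector.
    return np.asarray(y.todense()).ravel() if hasattr(y, "todense") else np.ravel(y)

for n in N:
    # Time and score MlcLinReg for this batch size.
    mlc = MlcLinReg(batch_size=int(n), iterations=iterations)
    tic()
    mlc.fit(X_train, y_train)
    times.append(toc())
    scores.append(f1_score(_dense(y_test), _dense(mlc.predict(X_test))))

    # Time and score sklearn's SGDClassifier on the same split.
    sgd = SGDClassifier(max_iter=iterations)
    tic()
    sgd.fit(X_train, _dense(y_train))
    times_sgd.append(toc())
    scores_sgd.append(f1_score(_dense(y_test), sgd.predict(X_test)))

# Plot F1 score against the swept batch sizes.
plt.plot(N, scores, label="MlcLinReg")
plt.plot(N, scores_sgd, label="SGDClassifier")
plt.xlabel("batch size")
plt.ylabel("F1 score")
plt.legend()
plt.show()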
""" This script runs a randomised grid search on all features of delicious dataset """ warnings.filterwarnings("ignore") param_dist = { "learning_rate": st.uniform(0.001, 0.4), "iterations": sp_randint(50, 1000), "batch_size": sp_randint(2, 2000), "l_one": st.uniform(0.01, 0.5) } best_params = np.zeros((501, 4)) # run randomized search for feature in tqdm.tqdm(range(0, 501)): X_train, y_train, X_test, y_test = helpers.load_delicious(feature) clf = MlcLinReg() n_iter_search = 60 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search) start = time() random_search.fit(X_train.toarray(), y_train.toarray()) conf = helpers.report_params(random_search.cv_results_, n_top=1) best_params[feature, :] = conf.values() # 'delicious_best_params' has the following columns : # learning_rate l_one iterations batch_size np.savetxt("delicious_best_params.txt", best_params)
"""
Plot the ROC curve of MlcLinReg (batch_size=256) on feature 2 of the delicious dataset.
"""
import MlcLinReg
import helpers

# for i in range(100, 100):
#     helpers.plot_roc_curve(MlcLinReg.MlcLinReg(batch_size=i), savefig=True)

helpers.plot_roc_curve(MlcLinReg.MlcLinReg(batch_size=256),
                       dataset=helpers.load_delicious(2),
                       savefig=False)
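# ---------------------------------------------------------------------------
# A loop like the commented-out one above could sweep several batch sizes and
# save one figure per setting. The sizes below are illustrative, and it is
# assumed that plot_roc_curve accepts the same dataset/savefig keywords as the
# call above and handles naming the saved figures.
# ---------------------------------------------------------------------------
for bs in (64, 128, 256, 512):
    helpers.plot_roc_curve(MlcLinReg.MlcLinReg(batch_size=bs),
                           dataset=helpers.load_delicious(2),
                           savefig=True)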