def k_train(clusters, dataset, labelset, verbose=False):
    # Create k initial clusters and assign all points to them randomly from training data
    for (label_index, feature) in enumerate(dataset):
        point = Point(labelset[label_index], feature)
        cluster_index = rand.randrange(len(clusters))
        clusters[cluster_index].points.append(point)

    # Assign each cluster's center randomly from its own training samples
    # (caveat: random assignment above can leave a cluster empty, in which
    # case randrange(0) raises ValueError)
    for cluster in clusters:
        cluster.center = cluster.points[rand.randrange(len(cluster.points))].features

    # Iterate until update_clusters() reports no further change
    mse_, mss_, ent_ = calc_kmeans_stats(clusters)
    if verbose:
        helper.print_stats(mse_, mss_, ent_)

    counter = 0
    mse_t, mss_t, ent_t = 0, 0, 0
    converged = True
    while update_clusters(clusters):
        prev_mse, prev_mss, prev_ent = mse_t, mss_t, ent_t
        mse_, mss_, ent_ = calc_kmeans_stats(clusters)
        mse_t, mss_t, ent_t = mse_, mss_, ent_
        if verbose:
            helper.print_stats(mse_, mss_, ent_)
        counter += 1
        # If the combined improvement across all three stats is still small
        # after many iterations, treat the run as non-convergent and stop
        if (prev_mse - mse_t) + (prev_mss - mss_t) + (prev_ent - ent_t) < 5 and counter >= 100:
            converged = False
            if verbose:
                print("Non-convergence detected, finishing training sequence...")
            break

    return mse_, mss_, ent_, clusters, converged
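# A minimal sketch of the context k_train assumes, for readability. The field
# names follow how they are used above (point.features, cluster.points,
# cluster.center); the constructors themselves are assumptions, not the
# project's actual definitions.
import random as rand

class Point:
    def __init__(self, label, features):
        self.label = label        # ground-truth class, used for entropy stats
        self.features = features  # feature vector for this sample

class Cluster:
    def __init__(self):
        self.points = []    # Points currently assigned to this cluster
        self.center = None  # feature vector acting as the centroid

# Hypothetical usage: train k=10 clusters on a dataset/labelset pair.
# clusters = [Cluster() for _ in range(10)]
# mse_, mss_, ent_, clusters, converged = k_train(clusters, dataset, labelset, verbose=True)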
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import utility

# Assumed data setup: the ROC-curve calls below imply a binary problem, so the
# two-class split helper used by the Naive Bayes script is assumed here
docs_train, docs_test = utility.custom_2class_classifier()

# SVM ->
svm = SVC(kernel='linear', probability=True, random_state=40)
pipeline_svm = utility.pipeline_setup(svm)  # pipeline_svm obj to be used in all svm algos
pipeline_svm_fitted = pipeline_svm.fit(docs_train.data, docs_train.target)
# svm_predict = pipeline_svm_fitted.predict(docs_test.data)
# utility.print_stats(docs_test.target, svm_predict, 'SVM Normal')
# utility.draw_roc_curve(docs_test.target, pipeline_svm_fitted.predict_proba(docs_test.data)[:, 1])

# Soft margin SVM ->
# Margin softness is controlled by SVC's C penalty term (gamma has no effect
# with a linear kernel), so grid-search C over 10^-3 to 10^3
params = {
    'learning_algo__C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3]
}
svm_soft_margin = GridSearchCV(pipeline_svm, params, cv=5)
svm_soft_margin_fitted = svm_soft_margin.fit(docs_train.data, docs_train.target)
svm_soft_margin_predict = svm_soft_margin_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_soft_margin_predict, 'Soft Margin SVM')
utility.draw_roc_curve(
    docs_test.target,
    svm_soft_margin_fitted.predict_proba(docs_test.data)[:, 1])
best_params = svm_soft_margin.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t{}: {}".format(param_name, best_params[param_name]))

# Logistic Regression ->
# from sklearn.linear_model import LogisticRegression
# logistic_regr = LogisticRegression(penalty='l2', max_iter=5, random_state=40)
# pipeline_regr = utility.pipeline_setup(logistic_regr)
# pipeline_regr_fitted = pipeline_regr.fit(docs_train.data, docs_train.target)
# regr_predict = pipeline_regr_fitted.predict(docs_test.data)
# utility.print_stats(docs_test.target, regr_predict, 'Logistic Regression')
# utility.draw_roc_curve(docs_test.target, pipeline_regr_fitted.predict_proba(docs_test.data)[:, 1])
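# The 'learning_algo__C' grid key above implies utility.pipeline_setup names
# its final step 'learning_algo'. A minimal sketch of such a helper, assuming
# a standard tf-idf + LSI front end (the real utility module may differ):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD

def pipeline_setup_sketch(classifier):
    return Pipeline([
        ('vect', CountVectorizer(stop_words='english')),  # raw text -> term counts
        ('tfidf', TfidfTransformer()),                    # counts -> tf-idf weights
        ('svd', TruncatedSVD(n_components=50)),           # LSI; also densifies the matrix
        ('learning_algo', classifier),                    # step name matches the grid keys
    ])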
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import GaussianNB
import utility

categories = [
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
    'soc.religion.christian'
]
docs_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True,
    random_state=42)  # , remove=('headers','footers','quotes'))
docs_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True,
    random_state=42)  # , remove=('headers','footers','quotes'))

model = utility.pipeline_setup(GaussianNB())
model.fit(docs_train.data, docs_train.target)
# print(model)

# make predictions
expected = docs_test.target
predicted = model.predict(docs_test.data)
utility.print_stats(expected, predicted, 'Naive Bayes Multiclass')
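# Caveat: GaussianNB.fit rejects scipy sparse matrices, so the shared pipeline
# must hand it a dense array (a TruncatedSVD stage, as in the sketch above,
# densifies implicitly). If utility.pipeline_setup instead kept the tf-idf
# matrix sparse, a small densifying transformer would be needed -- a sketch:
from sklearn.base import BaseEstimator, TransformerMixin

class DenseTransformer(BaseEstimator, TransformerMixin):
    """Convert a sparse matrix to a dense ndarray inside a Pipeline."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.toarray()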
from sklearn.naive_bayes import GaussianNB
import utility

docs_train, docs_test = utility.custom_2class_classifier()

model = utility.pipeline_setup(GaussianNB())
model_fitted = model.fit(docs_train.data, docs_train.target)
# print(model)

# make predictions
expected = docs_test.target
predicted = model_fitted.predict(docs_test.data)
utility.print_stats(expected, predicted, 'Naive Bayes Basic')
utility.draw_roc_curve(expected, model_fitted.predict_proba(docs_test.data)[:, 1])
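# utility.draw_roc_curve receives the true labels and the positive-class
# probabilities. A minimal sketch of such a helper, assuming a standard
# matplotlib plot (the project's version may style it differently):
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def draw_roc_curve_sketch(y_true, y_score):
    fpr, tpr, _ = roc_curve(y_true, y_score)  # sweep thresholds over y_score
    plt.plot(fpr, tpr, label='ROC (AUC = {:.3f})'.format(auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()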
import utility
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

categories = [
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
    'soc.religion.christian'
]
docs_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
docs_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)

svm_basic = SVC(kernel='linear', class_weight='balanced', probability=True,
                random_state=40)
svm_onerest = OneVsRestClassifier(svm_basic)
pipeline_svm_onerest = utility.pipeline_setup(svm_onerest)
pipeline_svm_fitted = pipeline_svm_onerest.fit(docs_train.data, docs_train.target)
svm_predict = pipeline_svm_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_predict, 'SVM OneVSRest')

# class_weight='balanced' reweights each binary subproblem so unequal class
# sizes do not bias the pairwise one-vs-one classifiers
svm_weighted = SVC(kernel='linear', class_weight='balanced', probability=True,
                   random_state=40)
svm_oneone = OneVsOneClassifier(svm_weighted)
pipeline_svm_oneone = utility.pipeline_setup(svm_oneone)
pipeline_svm_fitted = pipeline_svm_oneone.fit(docs_train.data, docs_train.target)
svm_predict = pipeline_svm_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_predict, 'SVM OneVSOne')
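# utility.print_stats compares predictions with ground truth under a heading.
# A minimal sketch of such a helper using sklearn.metrics (the project's
# version may report a different set of figures):
from sklearn import metrics

def print_stats_sketch(y_true, y_pred, title):
    print(title)
    print("Accuracy: {:.4f}".format(metrics.accuracy_score(y_true, y_pred)))
    print(metrics.classification_report(y_true, y_pred))  # per-class precision/recall/F1
    print(metrics.confusion_matrix(y_true, y_pred))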