Example #1
import random as rand  # assumed alias for the stdlib random module

# Point, helper, calc_kmeans_stats and update_clusters are defined
# elsewhere in the surrounding module.
def k_train(clusters, dataset, labelset, verbose=False):
    # Randomly assign each training point to one of the k initial clusters
    for (label_index, feature) in enumerate(dataset):
        point = Point(labelset[label_index], feature)
        cluster_index = rand.randrange(len(clusters))
        clusters[cluster_index].points.append(point)
    # Assign cluster center randomly from training samples
    for cluster in clusters:
        cluster.center = cluster.points[rand.randrange(len(cluster.points))].features
    # Compute initial stats, then iterate until the clusters stop changing
    mse_, mss_, ent_ = calc_kmeans_stats(clusters)
    if verbose:
        helper.print_stats(mse_, mss_, ent_)

    counter = 0
    mse_t, mss_t, ent_t = 0, 0, 0
    converged = True
    while update_clusters(clusters):
        prev_mse, prev_mss, prev_ent = mse_t, mss_t, ent_t
        mse_, mss_, ent_ = calc_kmeans_stats(clusters)
        mse_t, mss_t, ent_t = mse_, mss_, ent_
        if verbose:
            helper.print_stats(mse_, mss_, ent_)
        counter += 1
        # Little combined improvement after 100+ iterations suggests the
        # run is oscillating rather than converging
        delta = (prev_mse - mse_t) + (prev_mss - mss_t) + (prev_ent - ent_t)
        if delta < 5 and counter >= 100:
            converged = False
            if verbose:
                print("Non-convergence detected, finishing training sequence...")
            break

    return mse_, mss_, ent_, clusters, converged
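
A quick sketch of how k_train might be driven. The Point and Cluster stubs below are hypothetical (the real definitions, along with calc_kmeans_stats, update_clusters and helper, live elsewhere in the module), so this shows the expected shapes of the arguments rather than a standalone program:

import random as rand

class Point:
    def __init__(self, label, features):
        self.label = label        # ground-truth class of the sample
        self.features = features  # feature vector

class Cluster:
    def __init__(self):
        self.points = []    # points currently assigned to this cluster
        self.center = None  # picked from member points inside k_train

# Toy run: k = 10 clusters over 100 random 2-D points with 3 fake labels
dataset = [[rand.random(), rand.random()] for _ in range(100)]
labelset = [rand.randrange(3) for _ in range(100)]
clusters = [Cluster() for _ in range(10)]
mse, mss, ent, clusters, converged = k_train(clusters, dataset, labelset, verbose=True)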
Example #2
#SVM ->
svm = SVC(kernel='linear', probability=True, random_state=40)
pipeline_svm = utility.pipeline_setup(
    svm)  #pipeline_svm obj to be used in all svm algos
pipeline_svm_fitted = pipeline_svm.fit(docs_train.data, docs_train.target)
# svm_predict = pipeline_svm_fitted.predict(docs_test.data)
# utility.print_stats(docs_test.target, svm_predict, 'SVM Normal')
# utility.draw_roc_curve(docs_test.target, pipeline_svm_fitted.predict_proba(docs_test.data)[:, 1])

#Soft margin SVM ->
#With a linear kernel the soft margin is controlled by the penalty C
#(gamma has no effect on a linear kernel), so the grid search tunes C
params = {
    'learning_algo__C': [1e-3, 1e3]  #two candidates: 10^-3 and 10^3
}
svm_soft_margin = GridSearchCV(pipeline_svm, params, cv=5)
svm_soft_margin_fitted = svm_soft_margin.fit(docs_train.data,
                                             docs_train.target)
svm_soft_margin_predict = svm_soft_margin_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_soft_margin_predict,
                    'Soft Margin SVM')
utility.draw_roc_curve(
    docs_test.target,
    svm_soft_margin_fitted.predict_proba(docs_test.data)[:, 1])

best_params = svm_soft_margin.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t{}: {}".format(param_name, best_params[param_name]))

#Logistic Regression ->
# logistic_regr = LogisticRegression(penalty='l2', max_iter=5, random_state=40)
# pipeline_regr = utility.pipeline_setup(logistic_regr)
# pipeline_regr_fitted = pipeline_regr.fit(docs_train.data, docs_train.target)
# regr_predict = pipeline_regr_fitted.predict(docs_test.data)
# utility.print_stats(docs_test.target, regr_predict, 'Logistic Regression')
# utility.draw_roc_curve(docs_test.target, pipeline_regr_fitted.predict_proba(docs_test.data)[:, 1])
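
The 'learning_algo__C' key above follows scikit-learn's step__parameter convention, so it only works if utility.pipeline_setup names its estimator step 'learning_algo'. A minimal sketch of such a helper, assuming a TF-IDF front end (both the vectorizer choice and the step names are guesses, not the project's actual code):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def pipeline_setup(estimator):
    # GridSearchCV keys like 'learning_algo__C' address the parameter C
    # of the step named 'learning_algo' in this pipeline.
    return Pipeline([
        ('vectorizer', TfidfVectorizer()),  # raw documents -> TF-IDF features
        ('learning_algo', estimator),       # the classifier under test
    ])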
Example #3
#SVM ->
svm = SVC(kernel='linear', probability=True, random_state=40)
pipeline_svm = utility.pipeline_setup(svm) #pipeline_svm obj to be used in all svm algos
pipeline_svm_fitted = pipeline_svm.fit(docs_train.data, docs_train.target)
# svm_predict = pipeline_svm_fitted.predict(docs_test.data)
# utility.print_stats(docs_test.target, svm_predict, 'SVM Normal')
# utility.draw_roc_curve(docs_test.target, pipeline_svm_fitted.predict_proba(docs_test.data)[:, 1])

#Soft margin SVM ->
#With a linear kernel the soft margin is controlled by the penalty C
#(gamma has no effect on a linear kernel), so the grid search tunes C
params = {
    'learning_algo__C': [1e-3, 1e3] #two candidates: 10^-3 and 10^3
}
svm_soft_margin = GridSearchCV(pipeline_svm, params, cv=5)
svm_soft_margin_fitted = svm_soft_margin.fit(docs_train.data, docs_train.target)
svm_soft_margin_predict = svm_soft_margin_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_soft_margin_predict, 'Soft Margin SVM')
utility.draw_roc_curve(docs_test.target, svm_soft_margin_fitted.predict_proba(docs_test.data)[:, 1])

best_params = svm_soft_margin.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t{}: {}".format(param_name, best_params[param_name]))
              
#Logistic Regression ->
# logistic_regr = LogisticRegression(penalty='l2', max_iter=5, random_state=40)
# pipeline_regr = utility.pipeline_setup(logistic_regr)
# pipeline_regr_fitted = pipeline_regr.fit(docs_train.data, docs_train.target)
# regr_predict = pipeline_regr_fitted.predict(docs_test.data)
# utility.print_stats(docs_test.target, regr_predict, 'Logistic Regression')
# utility.draw_roc_curve(docs_test.target, pipeline_regr_fitted.predict_proba(docs_test.data)[:, 1])
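
As an aside, a fitted GridSearchCV also records the winning grid values directly, so the get_params() round-trip above can be shortened (the printed value here is illustrative):

print(svm_soft_margin.best_params_)  # e.g. {'learning_algo__C': 1000.0}
print(svm_soft_margin.best_score_)   # mean cross-validated score of that setting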
Example #4
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import utility

categories = [
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
    'soc.religion.christian'
]
docs_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True,
    random_state=42)  #, remove=('headers','footers','quotes'))
docs_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True,
    random_state=42)  #, remove=('headers','footers','quotes'))

model = utility.pipeline_setup(GaussianNB())
model.fit(docs_train.data, docs_train.target)
# print(model)
# make predictions
expected = docs_test.target
predicted = model.predict(docs_test.data)

utility.print_stats(expected, predicted, 'Naive Bayes Multiclass')
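
A detail this example relies on: GaussianNB.fit rejects scipy sparse matrices, while text vectorizers emit them, so utility.pipeline_setup must densify the features somewhere. One way such a pipeline could look (the densify step and step names are assumptions about the unseen helper):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

dense_nb_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    # GaussianNB needs a dense ndarray, so convert the sparse TF-IDF output
    ('densify', FunctionTransformer(lambda X: X.toarray(), accept_sparse=True)),
    ('learning_algo', GaussianNB()),
])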
Example #5
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as pyplot
from sklearn.naive_bayes import GaussianNB
import utility

docs_train, docs_test = utility.custom_2class_classifier()

model = utility.pipeline_setup(GaussianNB())
model_fitted = model.fit(docs_train.data, docs_train.target)
#print(model)
# make predictions
expected = docs_test.target
predicted = model_fitted.predict(docs_test.data)
utility.print_stats(expected, predicted, 'Naive Bayes Basic')
utility.draw_roc_curve(expected, model_fitted.predict_proba(docs_test.data)[:, 1])
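
utility.draw_roc_curve is not shown anywhere in these snippets; a minimal sketch of such a helper built on sklearn.metrics (the plotting details are guesses, only the roc_curve/auc calls are standard):

import matplotlib.pyplot as pyplot
from sklearn.metrics import roc_curve, auc

def draw_roc_curve(y_true, y_score):
    # y_score is the positive-class probability, e.g. predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, y_score)
    pyplot.plot(fpr, tpr, label='ROC (AUC = {:.3f})'.format(auc(fpr, tpr)))
    pyplot.plot([0, 1], [0, 1], linestyle='--')  # chance diagonal
    pyplot.xlabel('False positive rate')
    pyplot.ylabel('True positive rate')
    pyplot.legend()
    pyplot.show()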
Example #6
import utility
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

categories = [
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
    'soc.religion.christian'
]
docs_train = fetch_20newsgroups(subset='train',
                                categories=categories,
                                shuffle=True,
                                random_state=42)
docs_test = fetch_20newsgroups(subset='test',
                               categories=categories,
                               shuffle=True,
                               random_state=42)

svm_basic = SVC(kernel='linear',
                class_weight='balanced',
                probability=True,
                random_state=40)
svm_onerest = OneVsRestClassifier(svm_basic)
pipeline_svm_onerest = utility.pipeline_setup(svm_onerest)
pipeline_svm_fitted = pipeline_svm_onerest.fit(docs_train.data,
                                               docs_train.target)
svm_predict = pipeline_svm_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_predict, 'SVM OneVSRest')

# class_weight='balanced' reweights classes inversely to their frequency,
# so no pairwise one-vs-one problem is dominated by the larger class
svm_weighted = SVC(kernel='linear',
                   class_weight='balanced',
                   probability=True,
                   random_state=40)
svm_oneone = OneVsOneClassifier(svm_weighted)
pipeline_svm_oneone = utility.pipeline_setup(svm_oneone)
pipeline_svm_fitted = pipeline_svm_oneone.fit(docs_train.data,
                                              docs_train.target)
svm_predict = pipeline_svm_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_predict, 'SVM OneVSOne')
Example #7
import utility
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn import metrics

categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']
docs_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
docs_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

svm_basic = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=40)
svm_onerest = OneVsRestClassifier(svm_basic)
pipeline_svm_onerest = utility.pipeline_setup(svm_onerest)
pipeline_svm_fitted = pipeline_svm_onerest.fit(docs_train.data, docs_train.target)
svm_predict = pipeline_svm_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_predict, 'SVM OneVSRest')


svm_weighted = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=40) #balanced reweights classes inversely to frequency so no pairwise one-vs-one problem is dominated by the larger class
svm_oneone = OneVsOneClassifier(svm_weighted)
pipeline_svm_oneone = utility.pipeline_setup(svm_oneone)
pipeline_svm_fitted = pipeline_svm_oneone.fit(docs_train.data, docs_train.target)
svm_predict = pipeline_svm_fitted.predict(docs_test.data)
utility.print_stats(docs_test.target, svm_predict, 'SVM OneVSRest')
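
With the four categories above, the two wrappers train different numbers of binary SVCs: one-vs-rest fits one per class (4), one-vs-one fits one per unordered pair (4*3/2 = 6). Assuming pipeline_setup names its estimator step 'learning_algo', this can be verified on the fitted pipelines:

ovr = pipeline_svm_onerest.named_steps['learning_algo']
ovo = pipeline_svm_oneone.named_steps['learning_algo']
print(len(ovr.estimators_))  # 4 binary SVCs, one per class
print(len(ovo.estimators_))  # 6 binary SVCs, one per class pair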