Exemple #1
0
    def experimentDefaultSetting(self, trainset, testset):
        """Fit a linear SVM (C=1.0) and print its scores on a dev split.

        The corpus named by ``trainset`` is read, vectorized and its labels
        converted to floats. The classifier is fitted on the training
        partition and evaluated on the test half of the development
        partition; accuracy and macro F1 are printed.

        ``testset`` is accepted but not used by this experiment.
        """
        print("Reading data")
        documents, labels = DataService().read_corpus(trainset)
        classifier = SVM().construct_classifier("linear", 1.0)

        # Vectorize the text data and return an (n_samples, n_features) matrix.
        features = DataService().vectorize_input(documents)
        conversion_dict, labels = DataService().labels_string_to_float(labels)

        # Three-way split first, then split the dev partition once more.
        splitter = DataService()
        x_train, y_train, x_dev, y_dev, x_test, y_test = \
            splitter.test_dev_train_split(features, labels)
        dev_splitter = DataService()
        x_dev_train, y_dev_train, x_dev_test, y_dev_test = \
            dev_splitter.test_train_split(x_dev, y_dev)

        started_at = datetime.utcnow()
        print('Fitting training data on', len(x_train), 'Samples')
        classifier.fit(x_train, y_train)

        elapsed = datetime.utcnow() - started_at
        training_time = elapsed.seconds
        print("Training took", training_time, 'seconds..')

        predictions = classifier.predict(x_dev_test)
        accuracy = accuracy_score(y_pred=predictions, y_true=y_dev_test)
        print("Accuracy score:", accuracy)
        macro_f1 = f1_score(y_pred=predictions,
                            y_true=y_dev_test,
                            average='macro')
        print("F1 score (macro):", macro_f1)
Exemple #2
0
    def experimentLinearKernel(self, trainset, testset):
        """Search C for the linear classifier using the cross-validation folds.

        For every C in ``arange(0.5, 2.25, 0.25)`` a fresh linear classifier
        (L2 penalty) is trained once per fold on the union of the other folds
        and scored with binary F1 on the held-out fold. The C whose rounded
        mean F1 is highest wins, and its classifier is used to print the
        macro F1 on the test partition.

        ``testset`` is accepted but not used by this experiment.
        """
        print("Reading data")
        x, y = DataService().read_corpus(trainset)

        # Vectorize the text data and return an (n_samples, n_features) matrix.
        x_vec = DataService().vectorize_input(x)
        conversion_dict, y = DataService().labels_string_to_float(y)

        x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
        ).test_dev_train_split(x_vec, y)

        dev_sets = DataService().cross_validation_split(x_train, y_train)

        best_accuracy = -inf
        best_classifier = None

        cv_results = {}
        for C in arange(0.5, 2.25, 0.25):
            print("\nProcessing C:", C)
            fold_scores = []
            for fold in dev_sets:
                clf = SVM().construct_linear_classifier(penalty='l2', C=C)
                union_set = DataService().construct_union_set(
                    fold.copy(), dev_sets.copy())

                # fit on the rest of the data
                clf.fit(union_set[0], union_set[1])

                # validate on the held-out fold
                y_pred = clf.predict(fold[0])
                fold_scores.append(
                    f1_score(y_true=fold[1], y_pred=y_pred, average='binary'))

            cv_results[C] = mean(fold_scores)

            # The selection score must come from the scores collected above;
            # averaging a list that is never filled leaves the comparison
            # below permanently false and best_classifier as None.
            score = round(cv_results[C], 3)
            print("Average F1 score for CLF1:", score)

            # save the best model and use that to classify the testset
            # NOTE(review): this keeps the classifier fitted for the last
            # fold of the winning C, not one refitted on all of x_train.
            if score > best_accuracy:
                best_accuracy = score
                best_classifier = clf

        y_pred = best_classifier.predict(x_test)
        print("F1 score (macro):",
              f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
Exemple #3
0
class EnsembleClassifier:
    """Ensemble of a logistic-regression model and an SVM model.

    Both models are fitted on the same data. At prediction time the two
    predictions are compared sample by sample; where they disagree, the
    prediction of the model with the higher score on the supplied data wins.
    """

    # SVM and LR classifiers are considered to make the final model more robust
    # Scratch implementation of SVM : SupportVectorMachine.py
    # Scratch implementation of Logistic Regression : LogisticRegression.py

    # Code by: Yashitha Agarwal (20230091)
    def __init__(self, lrAlpha=0.01, svmAlpha=0.01, iterations=1000):
        """Store the hyperparameters passed to LR and SVM when ``fit`` runs.

        ``lrAlpha`` and ``svmAlpha`` are the first constructor arguments of
        the LR and SVM models; ``iterations`` is passed to both as the second.
        """
        self.lrAlpha = lrAlpha
        self.svmAlpha = svmAlpha
        self.iterations = iterations
        self.lrModel = None   # set by fit()
        self.svmModel = None  # set by fit()

    # Code by: Yashitha Agarwal (20230091)
    def fit(self, X, y):
        """Create fresh LR and SVM models and fit both on ``(X, y)``."""
        self.lrModel = LogisticRegression(self.lrAlpha, self.iterations)
        self.svmModel = SVM(self.svmAlpha, self.iterations)
        self.lrModel.fit(X, y)   # Logistic Regression
        self.svmModel.fit(X, y)  # Support Vector Machine

    # Code by: Prakhar Gurawa (20231064)
    def predict(self, X, y):
        """Return the ensemble prediction for every sample in ``X`` as a list.

        ``y`` holds the true labels for ``X``; it is used to score both
        models so that disagreements can be resolved in favour of the one
        with the higher score. On equal scores the SVM prediction is used.
        """
        lrScore = self.lrModel.score(X, y)
        svmScore = self.svmModel.score(X, y)
        lrPrediction = self.lrModel.predict(X)
        svmPrediction = self.svmModel.predict(X)

        # Only two models are combined, so a disagreement is settled by
        # taking the prediction of the higher-scoring model.
        preferLr = lrScore > svmScore
        finalPrediction = []
        for lrLabel, svmLabel in zip(lrPrediction, svmPrediction):
            if lrLabel == svmLabel or preferLr:
                finalPrediction.append(lrLabel)
            else:
                finalPrediction.append(svmLabel)

        # Future work: with more than two algorithms this becomes a voting
        # classifier that takes the majority prediction.
        return finalPrediction

    # Code by: Prakhar Gurawa (20231064)
    def score(self, X, y):
        """Return the fraction of samples whose ensemble prediction equals ``y``.

        The comparison is done element by element so that ``y`` may be any
        sequence (a plain list as well as an array).
        """
        size = len(y)
        predictions = self.predict(X, y)
        matches = sum(
            1 for predicted, actual in zip(predictions, y)
            if predicted == actual)
        return matches / size  # number of matches divided by total inputs
Exemple #4
0
    def experimentBestModel(self, trainset, testset):
        """Fit the preconfigured best classifier and print its test macro F1.

        The corpus named by ``trainset`` is read, vectorized and its labels
        converted to floats. The classifier returned by
        ``SVM().construct_best_classifier()`` is fitted on the training
        partition and evaluated on the test partition.

        ``testset`` is accepted but not used by this experiment.
        """
        print("Reading data")
        x, y = DataService().read_corpus(trainset)
        clf = SVM().construct_best_classifier()

        # Vectorize the text data and return an (n_samples, n_features) matrix.
        x_vec = DataService().vectorize_input(x)
        conversion_dict, y = DataService().labels_string_to_float(y)

        x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
        ).test_dev_train_split(x_vec, y)

        # No model selection happens here, so no cross-validation folds or
        # best-score bookkeeping are needed: fit once and evaluate.
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        print("F1 score (macro):",
              f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
class CrossValidation():
    """Run a fixed list of classification algorithms and record their accuracy.

    Every algorithm in ``classificationAlgorithms`` is run on the supplied
    train/test data; the model it returns is then cross-validated on the
    training data. Mean accuracies (as percentages) and models are stored
    per algorithm name.
    """

    # NOTE: these objects are created once, when the class body executes, and
    # are shared by every CrossValidation instance.
    classificationAlgorithms = [
        logisticRegression(),
        RandomForest(),
        SVM(),
        AdaBoost(),
        XGBoost()
    ]

    def __init__(self, dataset, X_train, X_test, y_train, y_test):
        """Store the dataset and its train/test partitions."""
        self.ds = dataset
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.accuracyDict = {}  # algorithm name -> mean accuracy in percent
        self.models = {}        # algorithm name -> model

    def run(self):
        """Run every algorithm, cross-validate its model and record both."""
        for alg in self.classificationAlgorithms:
            results = alg.run(self.ds, self.X_train, self.X_test, self.y_train,
                              self.y_test)
            # results[0] is the name of the algorithm, results[1] the model
            self.appendToAccuracyDict(
                results[0], self.kFoldCrossValidation(results[0], results[1]))
            self.appendModel(results[0], results[1])

    def kFoldCrossValidation(self, algName, classifier, cv=300):
        """Return the mean ``cross_val_score`` of ``classifier`` on the training data.

        ``cv`` is forwarded to ``cross_val_score``; it defaults to the value
        this class has always used. The mean accuracy is also printed as a
        percentage.
        """
        accuracies = cross_val_score(estimator=classifier,
                                     X=self.X_train,
                                     y=self.y_train,
                                     cv=cv)
        accuracy = accuracies.mean()
        # A single pre-formatted string prints identically whether print is
        # the Python 2 statement or the Python 3 function.
        print('%s accuracy: %s %%' % (algName, accuracy * 100))
        return accuracy

    def appendToAccuracyDict(self, algName, accuracy):
        """Record ``accuracy`` (a fraction) for ``algName`` as a percentage."""
        self.accuracyDict[algName] = accuracy * 100

    def appendModel(self, algName, model):
        """Record ``model`` under ``algName``."""
        self.models[algName] = model

    def getAccuracyDict(self):
        """Return the mapping of algorithm name to accuracy percentage."""
        return self.accuracyDict

    def getModel(self, name):
        """Return the model recorded under ``name`` (KeyError if absent)."""
        return self.models[name]
Exemple #6
0
    def experimentCombinatorialCrossValidation(self, trainset, testset):
        """Grid-search gamma and C for an RBF classifier using the CV folds.

        For every (gamma, C) pair a fresh RBF classifier is trained once per
        fold on the union of the other folds and scored with binary F1 on the
        held-out fold. The pair with the highest rounded mean F1 wins, and
        its classifier is used to print the macro F1 on the test partition.

        ``testset`` is accepted but not used by this experiment.
        """
        print("Reading data")
        x, y = DataService().read_corpus(trainset)

        # Vectorize the text data and return an (n_samples, n_features) matrix.
        x_vec = DataService().vectorize_input(x)
        conversion_dict, y = DataService().labels_string_to_float(y)

        x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
        ).test_dev_train_split(x_vec, y)

        dev_sets = DataService().cross_validation_split(x_train, y_train)

        best_accuracy = -inf
        best_classifier = None

        cv_results = {}
        for gamma in arange(0.5, 1.4, 0.15):
            for C in arange(0.5, 2.1, 0.25):
                print("\nProcessing Gamma:", gamma, "C:", C)
                fold_scores = []
                for fold in dev_sets:
                    clf = SVM().construct_rbf_classifier(kernel='rbf',
                                                         gamma=gamma,
                                                         C=C)
                    union_set = DataService().construct_union_set(
                        fold.copy(), dev_sets.copy())

                    # fit on the rest of the data
                    clf.fit(union_set[0], union_set[1])

                    # validate on the held-out fold
                    y_pred = clf.predict(fold[0])
                    fold_scores.append(
                        f1_score(y_true=fold[1],
                                 y_pred=y_pred,
                                 average='binary'))

                score = round(mean(fold_scores), 3)
                # Dict keys must be hashable, so the pair is stored as a
                # tuple; a list key raises TypeError.
                cv_results[(C, gamma)] = score
                print("Average F1 score for C:", str(C) + ".", score)

                # save the best model and use that to classify the testset
                # NOTE(review): this keeps the classifier fitted for the last
                # fold of the winning pair, not one refitted on all of x_train.
                if score > best_accuracy:
                    best_accuracy = score
                    best_classifier = clf

        y_pred = best_classifier.predict(x_test)
        print("F1 score (macro):",
              f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
from SupportVectorMachine import SVM
from sklearn.svm import SVC


def test_classifier(cls, X_train, y_train, X_test, y_test):
    """Fit ``cls`` on the training data, then print fit time and test accuracy.

    Only the call to ``fit`` is timed; prediction happens after the clock
    has been stopped.
    """
    started_at = time()
    cls.fit(X_train, y_train)
    elapsed = time() - started_at

    predictions = cls.predict(X_test)

    print("Time:", elapsed)
    accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
    print("Accuracy:", accuracy)


# Load the first 50 rows of the admission data, indexed by serial number.
data = pd.read_csv('admission.csv', index_col="Serial No.")[:50]

# The TOEFL score column is the prediction target; every remaining column
# is used as a feature.
target_column = 'TOEFL Score'
y = data[target_column].to_numpy()
del data[target_column]

# Standardize the remaining feature columns.
scaler = StandardScaler()
X = scaler.fit_transform(data.values)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=126)

# Run the project's SVM first, then scikit-learn's linear SVC, on the same
# split. Each model is only constructed right before it is evaluated.
for build_model in (SVM, lambda: SVC(C=1.0, kernel='linear', tol=0.001)):
    model = build_model()
    test_classifier(model, X_train, y_train, X_test, y_test)
Exemple #8
0
    def experimentFeatures(self, trainset, testset):
        """Compare a linear SVM on all features with one on a trimmed feature set.

        A linear SVM (C=1.0) is first fitted on the training partition and its
        accuracy / macro F1 on the dev test split are printed. The TF-IDF
        feature names are then ranked by the first row of the classifier's
        coefficients; up to 200 of the most positive and up to 200 of the
        most negative are kept. The data is rebuilt with only those features
        and a second linear SVM is trained and evaluated on it.

        ``testset`` is accepted but not used by this experiment.
        """
        print("Reading data")
        x, y = DataService().read_corpus(trainset)
        clf = SVM().construct_classifier("linear", 1.0)

        # Vectorize the text data and return an (n_samples, n_features) matrix.
        x_vec = DataService().vectorize_input(x)
        conversion_dict, y = DataService().labels_string_to_float(y)

        x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
        ).test_dev_train_split(x_vec, y)
        x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
        ).test_train_split(x_dev, y_dev)

        start_time = datetime.utcnow()
        # Report the size of the set that is actually passed to fit().
        print('Fitting training data on', len(x_train), 'Samples')
        clf.fit(x_train, y_train)
        training_time = (datetime.utcnow() - start_time).seconds
        print("Training took", training_time, 'seconds..')

        y_pred = clf.predict(x_dev_test)
        print("Accuracy score:",
              accuracy_score(y_pred=y_pred, y_true=y_dev_test))
        print("F1 score (macro):",
              f1_score(y_pred=y_pred, y_true=y_dev_test, average='macro'))

        coef = clf.coef_

        def identity(x):
            return x

        # NOTE(review): assumes this vectorizer yields the same feature order
        # as DataService().vectorize_input -- confirm.
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
        vec.fit(x)
        # get_feature_names() no longer exists in recent scikit-learn
        # releases; prefer its replacement and fall back for old versions.
        if hasattr(vec, "get_feature_names_out"):
            names = vec.get_feature_names_out()
        else:
            names = vec.get_feature_names()
        coefs_and_features = list(zip(coef[0], names))
        list_sorted_pos = sorted(coefs_and_features,
                                 key=lambda pair: pair[0],
                                 reverse=True)
        list_sorted_neg = sorted(coefs_and_features, key=lambda pair: pair[0])

        # Slicing keeps this safe when fewer than 200 features exist.
        features = [name for _, name in list_sorted_pos[:200]]
        features.extend(name for _, name in list_sorted_neg[:200])
        print("\nneg", list_sorted_neg[:100], "\npos", list_sorted_pos[:100])

        new_data = DataService().get_features_from_data(x, features)

        clf2 = SVM().construct_classifier("linear", 1.0)
        # Vectorize the text data and return an (n_samples, n_features) matrix.
        x_vec = DataService().vectorize_input(new_data)
        # NOTE(review): y was already converted to floats above; confirm that
        # converting it a second time is intended.
        conversion_dict, y = DataService().labels_string_to_float(y)
        x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
        ).test_dev_train_split(x_vec, y)
        x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
        ).test_train_split(x_dev, y_dev)
        start_time = datetime.utcnow()
        print("\nTRIMMED DATA SET\n----------")
        # NOTE(review): the trimmed model is fitted on the dev training split
        # while the first model is fitted on x_train -- confirm this is the
        # intended comparison.
        print('Fitting training data on', len(x_dev_train), 'Samples')
        clf2.fit(x_dev_train, y_dev_train)

        training_time = (datetime.utcnow() - start_time).seconds
        print("Training took", training_time, 'seconds..')

        y_pred = clf2.predict(x_dev_test)
        print("Accuracy score:",
              accuracy_score(y_pred=y_pred, y_true=y_dev_test))
        print("F1 score (macro):",
              f1_score(y_pred=y_pred, y_true=y_dev_test, average='macro'))
from SupportVectorMachine import SVM
import numpy as np

# Toy 2-D data set: the first three points carry label -1, the last three +1.
features = np.array([
    [1, 7],
    [2, 8],
    [3, 8],
    [5, 1],
    [6, -1],
    [7, 3],
])
labels = np.array([-1] * 3 + [1] * 3)

clf = SVM()
clf.fit(features, labels)

# Print each unseen point next to the class the fitted model assigns to it.
predict_us = [[0, 10], [1, 3], [3, 4], [3, 5]]
for p in predict_us:
    prediction = clf.predict(p)
    print(p, prediction)
import pandas as pd
from SupportVectorMachine import SVM

df = pd.read_csv("../datasets/iris.data", header=None)

# Only the first 100 rows are used. Column 4 holds the class name, which is
# mapped to -1 for 'Iris-setosa' and to 1 for anything else.
first_rows = df.iloc[0:100]
y = np.where(first_rows.iloc[:, 4].values == 'Iris-setosa', -1, 1)

# Feature columns:
#   0 = sepal length
#   1 = sepal width
#   2 = petal length
#   3 = petal width
# Columns 0 and 3 are the two features used.
X = first_rows.iloc[:, [0, 3]].values

svm = SVM()
svm.fit(X, y)


def hyperplane(x, w, b, offset):
    """Return ``(-w[0] * x + b + offset) / w[1]`` for the given ``x``.

    ``w`` must have at least two components and ``w[1]`` must be non-zero.
    """
    numerator = -w[0] * x + b + offset
    denominator = w[1]
    return numerator / denominator


# Rows 0-49 are drawn as red circles labelled 'setosa', rows 50-99 as blue
# crosses labelled 'versicolor'.
plt.scatter(
    X[:50, 0],
    X[:50, 1],
    color='red',
    marker='o',
    label='setosa',
)
plt.scatter(
    X[50:100, 0],
    X[50:100, 1],
    color='blue',
    marker='x',
    label='versicolor',
)

# Largest value of the first feature column.
first_feature = X[:, 0]
x_max = np.amax(first_feature)
Exemple #11
0
def main():
    """Run ``SVM.execute_SVM_process`` for '12stars' / 'unigram' with ``create_new_samples=False``."""
    SVM.execute_SVM_process('12stars', 'unigram', create_new_samples=False)
Exemple #12
0
 def loss(self, X_batch, Y_batch, reg):
     """Return the result of ``SVM.svm_loss_vectorized`` for this model's weights.

     Delegates to ``SVM.svm_loss_vectorized`` with ``self.w``, the given
     batch (``X_batch``, ``Y_batch``) and ``reg``, and returns whatever that
     call returns.
     """
     return SVM.svm_loss_vectorized(self.w, X_batch, Y_batch, reg)
Exemple #13
0
 def fit(self,X,y):
     """Create fresh LR and SVM models from the stored hyperparameters and fit both.

     The models are stored on ``self.lrModel`` and ``self.svmModel``; the
     logistic-regression model is fitted first, then the SVM.
     """
     self.lrModel = LogisticRegression(self.lrAlpha, self.iterations)
     self.svmModel = SVM(self.svmAlpha, self.iterations)
     # Both models see the same features X and targets y.
     for model in (self.lrModel, self.svmModel):
         model.fit(X, y)