from collections import deque
from math import ceil
import importlib
import sys

import numpy as np
import pandas as pd

## metric helpers used below; the module name 'metrics' is an assumption
## (the original calls them both unqualified and via 'met.'). Other helpers
## referenced here (addBias, computeCost, saveToFile, and the fs/ls/swr
## modules) are defined elsewhere in the repo.
from metrics import computeAUC, computeCost, computeError, computeRates
import metrics as met


def run_classifier(features, labels, X_test, Y_test,
                   ## params
                   classifier,
                   ):
    try:
        classifier = importlib.import_module(classifier)
        print 'IMPORTED', classifier
    except ImportError as error:
        print error
        print "Failed to import classifier module in run_classifier.py"
        print "Available modules: 1) 'logistic_regression' 2) 'adaline' 3) 'naivebayes'"
        sys.exit(0)

    X, Y = features, labels

    w = classifier.fit(X, Y)
    pred = classifier.predict(X_test, w)

    error = computeError(Y_test, pred)
    TPR, FPR, FNR, TNR = computeRates(Y_test, pred, 0, 1)
    AUC = computeAUC(Y_test, pred)

    return (error, TPR, FPR, FNR, TNR, AUC)
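
## A minimal usage sketch for run_classifier on synthetic binary data.
## The data here is purely illustrative, and it assumes a
## 'logistic_regression' module exposing fit/predict is on the path.
def _demo_run_classifier():
    X_train = np.random.randint(2, size=(100, 20))  # 100 samples, 20 binary features
    Y_train = np.random.randint(2, size=(100, 1))
    X_test = np.random.randint(2, size=(40, 20))
    Y_test = np.random.randint(2, size=(40, 1))
    error, TPR, FPR, FNR, TNR, AUC = run_classifier(
        X_train, Y_train, X_test, Y_test, 'logistic_regression')
    print 'error:', error, 'AUC:', AUC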
def gradient_descent(features, labels,
                     ## functions specific to classifier:
                     calculate_output,
                     cost_function,
                     predict,
                     ## params:
                     batch_size,
                     learning_rate,
                     max_epochs,
                     initial_weights,
                     convergence_threshold,
                     convergence_look_back,
                     adaptive_learning_rate=False,  # currently unused
                     ):
    '''
    Returns the optimal weights for a given training set and a given model
    using the gradient descent method. The model is determined by the
    'calculate_output', 'cost_function' and 'predict' functions.

    /!\ Assumes bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
      with N: the number of training examples
      and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - batch_size: int between 1 and N
      1 = stochastic gradient descent
      N = batch gradient descent
      everything in between = mini-batch gradient descent
    - learning_rate: float, between 0 and 1
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
      stops if the error difference hasn't been over threshold
      for the last X epochs

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    ## notation
    X, Y = features, labels
    N, D = X.shape  # N training samples; D features

    ## initialize weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## evaluate the termination conditions
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in a different order
        ## at each epoch) -> commonly accepted to improve gradient descent,
        ## making convergence faster
        permuted_indices = np.random.permutation(N)
        no_batches = int(ceil(float(N) / batch_size))

        batch_number = 0
        while batch_number < no_batches:
            x, y = get_batch(X, Y, permuted_indices, batch_number, batch_size)

            ## classifier output for the current batch
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.mean(np.multiply((y - o), x), axis=0)

            ## update weights
            W = W - learning_rate * gradient.reshape(W.shape)
            batch_number += 1

        ## keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        cost = cost_function(Y, P)
        previous_errors.append(error)

        ## check for convergence over the last convergence_look_back epochs
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W

        epoch += 1

    return W
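
## gradient_descent relies on a get_batch helper that is not shown in this
## file. A minimal sketch of what it presumably does (slice the shuffled
## sample order into consecutive mini-batches) follows; this is inferred
## from the call site, not necessarily the repo's actual implementation.
def get_batch(X, Y, permuted_indices, batch_number, batch_size):
    ## rows of the batch_number-th slice of the shuffled sample order
    start = batch_number * batch_size
    batch_indices = permuted_indices[start:start + batch_size]
    return X[batch_indices, :], Y[batch_indices]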
def adadelta(features, labels,
             ## functions specific to classifier:
             calculate_output,
             cost_function,
             predict,
             ## params:
             learning_rate=.1,  # accepted for API symmetry; ADADELTA does not use it
             max_epochs=100,
             initial_weights=None,
             convergence_threshold=1e-5,
             convergence_look_back=1,
             smoothing_term=1e-8,
             decay=0.9,
             ):
    '''
    Returns the optimal weights for a given training set and a given model
    using stochastic gradient descent with an ADADELTA adaptive learning
    rate. The model is determined by the 'calculate_output', 'cost_function'
    and 'predict' functions.

    /!\ Assumes bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
      with N: the number of training examples
      and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - learning_rate: float, between 0 and 1
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
      stops if the error difference hasn't been over threshold
      for the last X epochs
    - smoothing_term: very small number; e.g. 1e-8; ensures no
      divide-by-zero errors
    - decay: decay rate of the squared-gradient running window

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    ## notation
    X, Y = features, labels
    N, D = X.shape  # N training samples; D features

    ## initialize weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## evaluate the termination conditions
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    ## initialise the running averages of squared gradients and updates
    mean_gradient_square = 0
    mean_updates_square = 0

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in a different order
        ## at each epoch) -> commonly accepted to improve gradient descent,
        ## making convergence faster
        permuted_indices = np.random.permutation(N)
        X = X[permuted_indices, :]
        Y = Y[permuted_indices]

        for instance in xrange(N):
            x = X[instance]
            y = Y[instance]

            ## classifier output for the current training instance
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.multiply((y - o), x)

            ## accumulate a decaying average of the squared gradient
            mean_gradient_square = decay * mean_gradient_square + \
                                   (1 - decay) * gradient**2

            ## adadelta adjustment: scale the gradient by the ratio of the
            ## RMS of past updates to the RMS of past gradients
            adjusted_gradient = np.sqrt(
                (mean_updates_square + smoothing_term) /
                (mean_gradient_square + smoothing_term)) * gradient

            ## accumulate a decaying average of the squared adjusted update
            mean_updates_square = decay * mean_updates_square + \
                                  (1 - decay) * adjusted_gradient**2

            ## update weights
            W = W - adjusted_gradient.reshape(W.shape)

        ## keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        cost = cost_function(Y, P)
        previous_errors.append(error)

        ## check for convergence over the last convergence_look_back epochs
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W

        epoch += 1

    return W
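
## A minimal sketch of calling adadelta for a linear unit (ADALINE-style
## output). The lambdas are illustrative stand-ins for a classifier module's
## calculate_output / cost_function / predict, and the data is synthetic.
def _demo_adadelta():
    ## bias column + 9 binary features, as the docstring assumes
    X = np.hstack((np.ones((100, 1)), np.random.randint(2, size=(100, 9))))
    Y = np.random.randint(2, size=(100, 1))
    calculate_output = lambda x, W: np.dot(x, W)
    cost_function = lambda Y, O: np.mean(np.square(Y - O)) / 2.  # LMS cost
    predict = lambda X, W: (np.dot(X, W) > 0.5).astype(int)
    W = adadelta(X, Y, calculate_output, cost_function, predict, max_epochs=10)
    return W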
def bagPredictors(X_train, y_train, X_test, y_test, no_predictors,
                  perc_instances=1,
                  perc_feature_subsampling=1,
                  perc_label_switching=0,
                  classifier='logistic_regression',
                  ham_label=0,
                  spam_label=1):
    '''
    Returns an array of error rates, one for each time a predictor is added
    to the bagged classifier. The last error rate in the array is the error
    rate of the fully bagged classifier.

    Inputs:
    - X_train: N * D Numpy matrix of binary feature values (0 and 1);
      training set, with N: the number of training examples
      and D: the number of features for each example
    - y_train: N * 1 Numpy vector of binary values (0 and 1); training set
    - X_test: M * D Numpy matrix of binary feature values (0 and 1); test set,
      with M: the number of test examples
    - y_test: M * 1 Numpy vector of binary values (0 and 1); test set
    - no_predictors: number of predictors to bag
    - perc_instances: float between 0 and 1; percentage of instances to
      include in each bootstrap replicate set, relative to the number of
      instances in the training set
    - perc_feature_subsampling: float between 0 and 1; percentage of features
      to use in bagging implemented with feature subsampling (sampling
      features without replacement)
    - perc_label_switching: float between 0 and 1; percentage of labels to
      switch (flip) in the training set
    - classifier: string classifier name;
      Options: 1) 'logistic_regression' 2) 'adaline'

    Output:
    - errors: 1 * no_predictors array listing the error at each bagging
      iteration (i.e. after each predictor is added to the bag)
    '''
    ## import classifier module
    try:
        classifier = importlib.import_module(classifier)
    except ImportError as error:
        print error
        print "Failed to import classifier module in bagging.py"
        print "Available modules: 1) 'logistic_regression' 2) 'adaline'"
        sys.exit(0)

    ## initialize metrics to keep track of
    errors = []
    TPRs = []
    FPRs = []
    FNRs = []
    TNRs = []
    AUCs = []

    ## implement feature subsampling and label switching
    ## (fs, ls and swr are local modules for feature subsampling, label
    ## switching and sampling with replacement, defined elsewhere in the repo)
    if perc_feature_subsampling != 1:
        X_train, X_test = fs.featureSubsampling(X_train, X_test,
                                                perc_feature_subsampling)
    if perc_label_switching != 0:
        y_train = ls.labelSwitching(y_train, perc_label_switching)

    ## generate a bootstrap replicate and train the first predictor
    replicate, labels = swr.generateReplicate(X_train, y_train, perc_instances)
    classifier_weights = classifier.fit(replicate, labels)
    votes = classifier.predict(X_test, classifier_weights)
    predictions = votes

    errors.append(met.computeError(y_test, predictions))
    AUCs.append(met.computeAUC(y_test, predictions))
    [TPR, FPR, FNR, TNR] = met.computeRates(y_test, predictions,
                                            ham_label, spam_label)
    TPRs.append(TPR)
    FPRs.append(FPR)
    FNRs.append(FNR)
    TNRs.append(TNR)

    for ith_predictor in range(1, no_predictors):
        if ith_predictor % 10 == 0:
            print 'Predictor:', ith_predictor

        ## sample every replicate with the same perc_instances as the first
        replicate, labels = swr.generateReplicate(X_train, y_train,
                                                  perc_instances)
        classifier_weights = classifier.fit(replicate, labels)
        votes += classifier.predict(X_test, classifier_weights)
        predictions = computeClass(votes, ith_predictor + 1)

        errors.append(met.computeError(y_test, predictions))
        AUCs.append(met.computeAUC(y_test, predictions))
        [TPR, FPR, FNR, TNR] = met.computeRates(y_test, predictions,
                                                ham_label, spam_label)
        TPRs.append(TPR)
        FPRs.append(FPR)
        FNRs.append(FNR)
        TNRs.append(TNR)

    return (errors, TPRs, FPRs, FNRs, TNRs, AUCs)
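
## bagPredictors accumulates the raw 0/1 predictions in 'votes' and converts
## them to a class via computeClass, which is not shown in this snippet.
## A plausible majority-vote sketch, inferred from the call site rather than
## taken from the repo's actual implementation:
def computeClass(votes, no_predictors):
    ## predict spam (1) wherever more than half the predictors voted 1;
    ## ties fall back to ham (0)
    predictions = np.zeros(votes.shape)
    predictions[votes > no_predictors / 2.] = 1
    return predictions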
def trainBaseClassifier(no_iterations, perc_poisoning, train_folder,
                        test_folder, data_folder,
                        attack='Dict',
                        classifier='logistic_regression',
                        dataset=None,
                        ham_label=0,
                        spam_label=1):
    '''
    Inputs:
    - no_iterations: integer number of experiments to run on a given test
      set-up; results of the experiments are averaged
    - perc_poisoning: integer between 0 and 100; percentage of poisoning of
      the training set
    - train_folder: string; path to the training sets of the given attack
    - test_folder: string; path to the test set folder
    - data_folder: string; folder for a given percentage of poisoning;
      empty for 'NoAttack'
    - attack: string; choose from: 1) 'Dict' 2) 'Empty' 3) 'Ham' 4) 'Optimal'
    - classifier: string; choose from:
      1) 'logistic_regression' 2) 'adaline' 3) 'naivebayes'
    - dataset: string; choose from: 1) 'enron' 2) 'lingspam'

    Output:
    - error: error value
    - TPR: true positive rate
    - FPR: false positive rate
    - FNR: false negative rate
    - TNR: true negative rate
    - AUC: AUC value (see sklearn.metrics.roc_auc_score documentation)
    '''
    ## import the classifier that we are going to train
    try:
        learner = importlib.import_module(classifier)
    except ImportError as error:
        print error
        print "Failed to import learner module in run_tests.py"
        print "Available modules: 1) 'logistic_regression' 2) 'adaline' 3) 'naivebayes'"
        sys.exit(0)

    ## initialize metrics
    sum_error, sum_TPR, sum_FPR, sum_FNR, sum_TNR, sum_AUC = 0, 0, 0, 0, 0, 0

    for iteration in xrange(no_iterations):
        print 'STARTING ITER:', iteration

        X_train_file = 'X_train_' + str(iteration) + '.csv'
        y_train_file = 'y_train_' + str(iteration) + '.csv'
        X_test_file = 'X_test_' + str(iteration) + '.csv'
        y_test_file = 'y_test_' + str(iteration) + '.csv'

        df_train = pd.read_csv(train_folder + data_folder + X_train_file, header=None)
        X_train = np.array(df_train)
        df_train = pd.read_csv(train_folder + data_folder + y_train_file, header=None)
        y_train = np.array(df_train)
        df_test = pd.read_csv(test_folder + X_test_file, header=None)
        X_test = np.array(df_test)
        df_test = pd.read_csv(test_folder + y_test_file, header=None)
        y_test = np.array(df_test)

        ## the linear models expect a bias term; naive Bayes does not
        if classifier != 'naivebayes':
            X_train = addBias(X_train)
            X_test = addBias(X_test)

        ## train the classifier and make predictions on the test set
        weights = learner.fit(X_train, y_train)
        predictions = learner.predict(X_test, weights)

        ## record the metrics for this iteration
        sum_error += met.computeError(y_test, predictions)
        sum_AUC += met.computeAUC(y_test, predictions)
        [TPR, FPR, FNR, TNR] = met.computeRates(y_test, predictions,
                                                ham_label, spam_label)
        sum_TPR += TPR
        sum_FPR += FPR
        sum_FNR += FNR
        sum_TNR += TNR

    ## take the average of all the metrics
    error = sum_error / no_iterations
    TPR = sum_TPR / no_iterations
    FPR = sum_FPR / no_iterations
    FNR = sum_FNR / no_iterations
    TNR = sum_TNR / no_iterations
    AUC = sum_AUC / no_iterations

    ## arguments 0, 0, 0 signal the base classifier
    saveToFile(0, 0, 0, perc_poisoning, error, TPR, FPR, FNR, TNR, AUC,
               attack, classifier, dataset=dataset)

    return (error, TPR, FPR, FNR, TNR, AUC)
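
## A hypothetical invocation of trainBaseClassifier. The paths and folder
## names below are illustrative only; the real experiment scripts define the
## actual directory layout:
##
##     error, TPR, FPR, FNR, TNR, AUC = trainBaseClassifier(
##         no_iterations=10,
##         perc_poisoning=10,
##         train_folder='./attacks/Dict/',    # hypothetical path
##         test_folder='./test_sets/',        # hypothetical path
##         data_folder='10_perc_poison/',     # hypothetical folder name
##         attack='Dict',
##         classifier='logistic_regression',
##         dataset='enron')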
def fit(features, labels,
        ## params:
        initial_weights=None,
        learning_rate=0.1,
        max_epoch=200,
        threshold=1e-5,  # currently unused
        ham_label=0,
        spam_label=1,
        add_bias=True,
        ):
    '''
    Returns the optimal weights for a given training set (features and
    corresponding label inputs) for the ADALINE model. These weights are
    found using the gradient descent method.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
      with N: the number of training examples
      and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - learning_rate: float between 0 and 1
    - initial_weights: D * 1 Numpy vector, beginning weights

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    if add_bias:
        features = addBias(features)

    ## 0. Prepare notations
    X, Y = features, labels
    N, D = features.shape  # N training samples; D features
    cost = []   # keep track of cost
    error = []  # keep track of error

    ## 1. Initialise weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## 2. Evaluate the termination condition
    epoch = 0
    last_epoch_error = 1e10
    while epoch < max_epoch:
        ## current iteration classifier output
        last_W = W
        O = np.dot(X, W)
        ## specialty of ADALINE is that training is done on the weighted sum,
        ## _before_ the activation function

        ## batch gradient descent
        gradient = -np.mean(np.multiply((Y - O), X), axis=0)

        ## 3. Update weights
        W = W - learning_rate * gradient.reshape(W.shape)

        ## keep track of error and cost (weights from previous iteration);
        ## T is the threshold/step activation function applied to the output
        if ham_label == 0:  ## spam label assumed 1
            T = np.zeros(O.shape)
            T[O > 0.5] = 1
        else:               ## ham label assumed -1, spam label assumed 1
            T = np.ones(O.shape)
            T[O < 0] = -1

        current_error = computeError(T, Y)
        error.append(current_error)
        current_cost = computeCost(Y, O)
        cost.append(current_cost)

        ## bold-driver learning-rate adaptation
        if current_error < last_epoch_error:
            ## error decreased: keep the step, grow the learning rate
            learning_rate = learning_rate * 1.05
        elif current_error - last_epoch_error > 1e-8:
            ## error increased: revert the step, shrink the learning rate
            ## and redo this epoch
            learning_rate = learning_rate * .5
            W = last_W
            epoch -= 1
            current_error = 1e10

        epoch += 1
        last_epoch_error = current_error

    return W
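
## Minimal usage sketch for the ADALINE fit and its threshold activation,
## on synthetic data (purely illustrative; relies on the repo's addBias
## helper used above):
def _demo_adaline_fit():
    X = np.random.randint(2, size=(200, 30))
    Y = np.random.randint(2, size=(200, 1))
    W = fit(X, Y, learning_rate=0.05, max_epoch=50)
    ## fit() added the bias column internally, so prepend it here too
    O = np.dot(addBias(X), W)
    predictions = (O > 0.5).astype(int)  # ham_label=0 / spam_label=1 case
    return np.mean(predictions != Y)     # training error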
def adagrad(features, labels,
            ## functions specific to classifier:
            calculate_output,
            cost_function,
            predict,
            ## params:
            learning_rate=.1,
            max_epochs=100,
            initial_weights=None,
            convergence_threshold=1e-5,
            convergence_look_back=1,
            smoothing_term=1e-8,
            ):
    '''
    Returns the optimal weights for a given training set and a given model
    using stochastic gradient descent with an ADAGRAD adaptive learning
    rate. The model is determined by the 'calculate_output', 'cost_function'
    and 'predict' functions.

    /!\ Assumes bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
      with N: the number of training examples
      and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - learning_rate: float, between 0 and 1
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
      stops if the error difference hasn't been over threshold
      for the last X epochs
    - smoothing_term: very small number; e.g. 1e-8; ensures no
      divide-by-zero errors

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    ## notation
    X, Y = features, labels
    N, D = X.shape  # N training samples; D features

    ## initialize weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## evaluate the termination conditions
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    ## store the running sum of squared gradients
    gti = np.zeros(D)

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in a different order
        ## at each epoch) -> commonly accepted to improve gradient descent,
        ## making convergence faster
        permuted_indices = np.random.permutation(N)
        X = X[permuted_indices, :]
        Y = Y[permuted_indices]

        for instance in xrange(N):
            x = X[instance]
            y = Y[instance]

            ## classifier output for the current training instance
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.multiply((y - o), x)

            ## accumulate the square of the gradient
            gti += gradient ** 2

            ## adagrad adjustment
            adjusted_gradient = gradient / (np.sqrt(gti) + smoothing_term)

            ## update weights
            W = W - learning_rate * adjusted_gradient.reshape(W.shape)

        ## keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        cost = cost_function(Y, P)
        previous_errors.append(error)

        ## check for convergence over the last convergence_look_back epochs
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W

        epoch += 1

    return W
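
## ADAGRAD divides each dimension's step by the square root of its
## accumulated squared gradients, so frequently/strongly updated dimensions
## get smaller steps. A tiny numeric illustration (values are made up):
def _demo_adagrad_step():
    gti = np.array([4.0, 0.01])      # accumulated squared gradients
    gradient = np.array([1.0, 1.0])  # same raw gradient in both dimensions
    smoothing_term = 1e-8
    adjusted = gradient / (np.sqrt(gti) + smoothing_term)
    ## dimension 0 has seen large gradients -> step ~0.5; dimension 1 -> ~10
    return adjusted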