Python computeError Examples, metrics.computeError Python Examples

Example #1

0

Show file

File: run_classifier.py Project: shapzka/SpamFilter

def run_classifier(
    features,
    labels,
    X_test,
    Y_test,
    ## params
    classifier,
):
    '''
    Trains the named classifier module on a training set and scores its
    predictions on a test set.

    Input:
    - features: training features, handed unchanged to the module's fit()
    - labels: training labels, handed unchanged to the module's fit()
    - X_test: test features, handed to the module's predict()
    - Y_test: true test labels the predictions are scored against
    - classifier: string; name of an importable module exposing
                  fit(X, Y) -> weights and predict(X, weights) -> predictions

    Output:
    - (error, TPR, FPR, FNR, TNR, AUC) tuple, as returned by computeError,
      computeRates (called with labels 0 and 1) and computeAUC

    Exits the process with status 1 if the classifier module cannot be
    imported.
    '''
    try:
        model = importlib.import_module(classifier)
    except ImportError as import_error:
        print(import_error)
        print("Failed to import classifier module in run_classifier.py")
        print("Available modules: 1) 'logistic_regression' 2) 'adaline' 3) 'naivebayes'")
        ## a failed import is an error: report a non-zero status so that
        ## calling scripts do not mistake it for a successful run
        sys.exit(1)
    print('IMPORTED %s' % (model,))

    weights = model.fit(features, labels)
    pred = model.predict(X_test, weights)

    error = computeError(Y_test, pred)
    TPR, FPR, FNR, TNR = computeRates(Y_test, pred, 0, 1)
    AUC = computeAUC(Y_test, pred)

    return (error, TPR, FPR, FNR, TNR, AUC)

Example #2

0

Show file

File: run_classifier.py Project: kaylashapiro/SpamFilter

def run_classifier(features, labels, X_test, Y_test,
                   ## params
                   classifier,
                   ):
    '''
    Fits the classifier module named by 'classifier' on the training data
    and evaluates its predictions on the test data.

    Input:
    - features: training features, passed as-is to the module's fit()
    - labels: training labels, passed as-is to the module's fit()
    - X_test: test features, passed to the module's predict()
    - Y_test: true test labels used to score the predictions
    - classifier: string; importable module name; the module must provide
                  fit(X, Y) -> weights and predict(X, weights) -> predictions

    Output:
    - (error, TPR, FPR, FNR, TNR, AUC) tuple built from computeError,
      computeRates (called with labels 0 and 1) and computeAUC

    Terminates the process with exit status 1 when the module cannot be
    imported.
    '''
    try:
        learner = importlib.import_module(classifier)
    except ImportError as import_error:
        print(import_error)
        print("Failed to import classifier module in run_classifier.py")
        print("Available modules: 1) 'logistic_regression' 2) 'adaline' 3) 'naivebayes'")
        ## exit status 0 would signal success to the caller; use 1 instead
        sys.exit(1)
    print('IMPORTED %s' % (learner,))

    w = learner.fit(features, labels)
    pred = learner.predict(X_test, w)

    error = computeError(Y_test, pred)
    TPR, FPR, FNR, TNR = computeRates(Y_test, pred, 0, 1)
    AUC = computeAUC(Y_test, pred)

    return (error, TPR, FPR, FNR, TNR, AUC)

Example #3

0

Show file

File: gradientdescent.py Project: kaylashapiro/SpamFilter

def gradient_descent(features, labels,
                     ## functions specific to classifier:
                     calculate_output,
                     cost_function,
                     predict,
                     ## params:
                     batch_size,
                     learning_rate,
                     max_epochs,
                     initial_weights,
                     convergence_threshold,
                     convergence_look_back,
                     adaptive_learning_rate=False
                     ):
    '''
    Returns the weights found for a given training set and a given model
    using the gradient descent method. The model is determined by the
    'calculate_output', 'cost_function' and 'predict' functions.

    NOTE: assumes the bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
                with N: the number of training examples
                and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - calculate_output: callable (x, W) -> model output for a batch
    - cost_function: callable (Y, P); called once per epoch, its result
                     is not used by this function
    - predict: callable (X, W) -> predictions for the whole training set
    - batch_size: int between 1 and N
                    1 = stochastic gradient descent
                    N = batch gradient descent
                    everything in between = mini-batch gradient descent
    - learning_rate: float, between 0 and 1
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights, or None to
                       start from zeros
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
                             stops once the current epoch's error differs by
                             less than the threshold from the error of each
                             of the previous X epochs.
    - adaptive_learning_rate: accepted but not used by this implementation

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    ## notation
    X, Y = features, labels
    N, D = X.shape # N training samples; D features

    ## initialize weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## errors of the previous epochs; the large sentinel guarantees that
    ## convergence cannot be declared before real history has replaced it
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    ## the number of batches does not change between epochs
    no_batches = int(ceil(float(N) / batch_size))

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in different order
        ## at each training) -> commonly accepted to improve gradient
        ## descent, making convergence faster
        permuted_indices = np.random.permutation(N)

        for batch_number in range(no_batches):
            x, y = get_batch(X, Y, permuted_indices, batch_number, batch_size)

            ## classifier output of current batch
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.mean(np.multiply((y - o), x), axis=0)

            ## update weights
            W = W - learning_rate * gradient.reshape(W.shape)

        ## Keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        ## result unused; the call is kept so the supplied cost function is
        ## still invoked once per epoch exactly as before
        cost_function(Y, P)

        ## check for convergence against the *previous* epochs only: the
        ## current error must not be in the window yet, otherwise it is
        ## compared with itself and a look-back of 1 always "converges"
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W
        previous_errors.append(error)
        epoch += 1

    return W

Example #4

0

Show file

File: adadelta.py Project: shapzka/SpamFilter

def adadelta(
    features,
    labels,
    ## functions specific to classifier:
    calculate_output,
    cost_function,
    predict,
    ## params:
    learning_rate=.1,
    max_epochs=100,
    initial_weights=None,
    convergence_threshold=1e-5,
    convergence_look_back=1,
    smoothing_term=1e-8,
    decay=0.9,
):
    '''
    Returns the weights found for a given training set and a given model
    using the stochastic gradient descent method with an ADADELTA adaptive
    learning rate. The model is determined by the 'calculate_output',
    'cost_function' and 'predict' functions.

    NOTE: assumes the bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
                with N: the number of training examples
                and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - calculate_output: callable (x, W) -> model output for one instance
    - cost_function: callable (Y, P); called once per epoch, its result
                     is not used by this function
    - predict: callable (X, W) -> predictions for the whole training set
    - learning_rate: accepted but not used by this implementation (the
                     step size comes from the ADADELTA ratio)
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights, or None to
                       start from zeros
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
                             stops once the current epoch's error differs by
                             less than the threshold from the error of each
                             of the previous X epochs.
    - smoothing_term: very small number; e.g. 1e-8,
                      ensure no divide by zero error.
    - decay: weight given to the running averages of squared gradients and
             squared updates at each step.

    Output:
    - W: D * 1 Numpy vector of real values
    '''

    ## notation
    X, Y = features, labels
    N, D = X.shape  # N training samples; D features

    ## initialize weights
    W = np.zeros(
        (D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## errors of the previous epochs; the large sentinel guarantees that
    ## convergence cannot be declared before real history has replaced it
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    ## running averages of squared gradients / squared updates
    mean_gradient_square = 0
    mean_updates_square = 0

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in different order
        ## at each training) -> commonly accepted to improve gradient
        ## descent, making convergence faster
        permuted_indices = np.random.permutation(N)

        ## fancy indexing builds new arrays: the caller's data is untouched
        X = X[permuted_indices, :]
        Y = Y[permuted_indices]

        for instance in range(N):
            x = X[instance]
            y = Y[instance]

            ## classifier output of current training instance
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.multiply((y - o), x)

            ## add proportion of square of gradient
            mean_gradient_square = (decay * mean_gradient_square
                                    + (1 - decay) * gradient**2)

            ## adadelta adjustment
            adjusted_gradient = np.sqrt(
                (mean_updates_square + smoothing_term) /
                (mean_gradient_square + smoothing_term)) * gradient

            ## add proportion of square of the adjusted gradient
            mean_updates_square = (decay * mean_updates_square
                                   + (1 - decay) * adjusted_gradient**2)

            ## update weights
            W = W - adjusted_gradient.reshape(W.shape)

        ## Keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        ## result unused; the call is kept so the supplied cost function is
        ## still invoked once per epoch exactly as before
        cost_function(Y, P)

        ## check for convergence against the *previous* epochs only: the
        ## current error must not be in the window yet, otherwise it is
        ## compared with itself and the default look-back of 1 would stop
        ## training after the very first epoch
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W
        previous_errors.append(error)
        epoch += 1

    return W

Example #5

0

Show file

File: bagging.py Project: shapzka/SpamFilter

def bagPredictors(X_train, y_train, X_test, y_test, no_predictors, 
                  perc_instances=1, perc_feature_subsampling=1, perc_label_switching=0,
                  classifier='logistic_regression',
                  ham_label=0,
                  spam_label=1):
    '''
    Trains a bag of predictors on bootstrap replicates of the training set
    and records the test metrics each time a predictor is added to the bag.
    The last entry of every returned list is the metric of the fully bagged
    classifier.

    Inputs:
    - X_train: N * D Numpy matrix of binary feature values (0 and 1); training set
               with N: the number of training examples
               and  D: the number of features for each example
    - y_train: N * 1 Numpy vector of binary values (0 and 1); training set
    - X_test: M * D Numpy matrix of binary feature values (0 and 1); test set
              with M: the number of test examples
    - y_test: M * 1 Numpy vector of binary values (0 and 1); test set
    - no_predictors: Number of predictors to bag; one predictor is always
                     trained, even when this is smaller than 1
    - perc_instances: Float between 0 and 1 representing the percentage of instances to include
                      in each bootstrap replicate set based on the number of instances in the
                      training set
    - perc_feature_subsampling: Float between 0 and 1 representing the percentage of features to
                                use in bagging implemented with feature subsampling (sampling
                                features without replacement)
    - perc_label_switching: Float between 0 and 1 representing the percentage of labels to switch
                            (flip) in the training set
    - classifier: string; name of the classifier module to import; the module
                  must provide fit(X, y) and predict(X, weights)
    - ham_label, spam_label: label values forwarded to met.computeRates

    Output:
    - (errors, TPRs, FPRs, FNRs, TNRs, AUCs): six lists with one entry per
      bagging iteration (i.e. after each predictor is added to the bag)

    Exits the process with status 1 if the classifier module cannot be
    imported.
    '''
    ## import classifier module
    try:
        classifier = importlib.import_module(classifier)
    except ImportError as error:
        print(error)
        print("Failed to import classifier module in bagging.py")
        print("Available modules: 1) 'logisticReg' 2) 'adaline'")
        ## a failed import is an error: report a non-zero exit status
        sys.exit(1)

    ## initialize metrics to keep track of
    errors, TPRs, FPRs, FNRs, TNRs, AUCs = [], [], [], [], [], []

    def record_metrics(predictions):
        ## score the current bag and append one entry to every metric list
        errors.append(met.computeError(y_test, predictions))
        AUCs.append(met.computeAUC(y_test, predictions))
        TPR, FPR, FNR, TNR = met.computeRates(y_test, predictions, ham_label, spam_label)
        TPRs.append(TPR)
        FPRs.append(FPR)
        FNRs.append(FNR)
        TNRs.append(TNR)

    ## implement feature subsampling and label switching
    if perc_feature_subsampling != 1:
        X_train, X_test = fs.featureSubsampling(X_train, X_test, perc_feature_subsampling)
    if perc_label_switching != 0:
        y_train = ls.labelSwitching(y_train, perc_label_switching)

    ## first predictor: its raw predictions are the bag's predictions
    replicate, labels = swr.generateReplicate(X_train, y_train, perc_instances)
    classifier_weights = classifier.fit(replicate, labels)
    votes = classifier.predict(X_test, classifier_weights)
    record_metrics(votes)

    for ith_predictor in range(1, no_predictors):
        if ith_predictor % 10 == 0:
            print('Predictor: %s' % ith_predictor)

        ## every replicate honours perc_instances, not only the first one
        replicate, labels = swr.generateReplicate(X_train, y_train, perc_instances)
        classifier_weights = classifier.fit(replicate, labels)

        ## accumulate the votes in place, then turn them into class labels
        votes += classifier.predict(X_test, classifier_weights)
        record_metrics(computeClass(votes, ith_predictor + 1))

    return (errors, TPRs, FPRs, FNRs, TNRs, AUCs)

Example #6

0

Show file

File: gradientdescent.py Project: shapzka/SpamFilter

def gradient_descent(
        features,
        labels,
        ## functions specific to classifier:
        calculate_output,
        cost_function,
        predict,
        ## params:
        batch_size,
        learning_rate,
        max_epochs,
        initial_weights,
        convergence_threshold,
        convergence_look_back,
        adaptive_learning_rate=False):
    '''
    Returns the weights found for a given training set and a given model
    using the gradient descent method. The model is determined by the
    'calculate_output', 'cost_function' and 'predict' functions.

    NOTE: assumes the bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
                with N: the number of training examples
                and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - calculate_output: callable (x, W) -> model output for a batch
    - cost_function: callable (Y, P); called once per epoch, its result
                     is not used by this function
    - predict: callable (X, W) -> predictions for the whole training set
    - batch_size: int between 1 and N
                    1 = stochastic gradient descent
                    N = batch gradient descent
                    everything in between = mini-batch gradient descent
    - learning_rate: float, between 0 and 1
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights, or None to
                       start from zeros
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
                             stops once the current epoch's error differs by
                             less than the threshold from the error of each
                             of the previous X epochs.
    - adaptive_learning_rate: accepted but not used by this implementation

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    ## notation
    X, Y = features, labels
    N, D = X.shape  # N training samples; D features

    ## initialize weights
    if initial_weights is None:
        W = np.zeros((D, 1))
    else:
        W = initial_weights.reshape((D, 1))

    ## window of errors from earlier epochs; seeded with a large sentinel so
    ## the first epochs can never be mistaken for convergence
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    ## loop invariant: how many batches make up one pass over the data
    no_batches = int(ceil(float(N) / batch_size))

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in different order
        ## at each training) -> commonly accepted to improve gradient
        ## descent, making convergence faster
        permuted_indices = np.random.permutation(N)

        for batch_number in range(no_batches):
            x, y = get_batch(X, Y, permuted_indices, batch_number, batch_size)

            ## classifier output of current batch
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.mean(np.multiply((y - o), x), axis=0)

            ## update weights
            W = W - learning_rate * gradient.reshape(W.shape)

        ## Keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        ## result unused; the call is kept so the supplied cost function is
        ## still invoked once per epoch exactly as before
        cost_function(Y, P)

        ## compare with earlier epochs *before* storing the current error;
        ## storing it first would compare the error with itself and make a
        ## look-back of 1 stop after the first epoch
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W
        previous_errors.append(error)
        epoch += 1

    return W

Example #7

0

Show file

def trainBaseClassifier(no_iterations, perc_poisoning, train_folder, test_folder, data_folder, 
                        attack='Dict', 
                        classifier='logistic_regression',
                        dataset=None,
                        ham_label=0,
                        spam_label=1):
    '''
    Trains the base (non-bagged) classifier once per experiment, averages the
    test metrics over all experiments, saves them with saveToFile and returns
    them.

    Inputs:
    - no_iterations: integer number of experiments to run on a given test set-up; results of the experiments are
                     averaged
    - perc_poisoning: integer between 0 and 100; percentage of poisoning of the training set
    - train_folder: string; path to correct attack folder training sets
    - test_folder: string; path to correct test set folder
    - data_folder: string; folder for a given percentage of poisoning; empty for 'NoAttack'
    - attack: string; Choose from: 1) 'Dict', 2) 'Empty', 3) 'Ham', 4) 'Optimal'
    - classifier: string; name of the classifier module to import; a bias
                  column is added to the features unless it is 'naivebayes'
    - dataset: string; choose from 1) 'enron' 2) 'lingspam'
    - ham_label, spam_label: label values forwarded to met.computeRates

    The folder arguments are concatenated directly with the file names
    (X_train_<i>.csv, y_train_<i>.csv, X_test_<i>.csv, y_test_<i>.csv), so
    they must already end with a path separator.

    Output:
    - error: error value
    - TPR: true positive rate
    - FPR: false positive rate
    - FNR: false negative rate
    - TNR: true negative rate
    - AUC: AUC value (see sklearn.metrics.roc_auc_score documentation)

    Exits the process with status 1 if the classifier module cannot be
    imported.
    '''
    ## import the classifier that we are going to train
    try:
        learner = importlib.import_module(classifier)
    except ImportError as error:
        print(error)
        print("Failed to import learner module in run_tests.py")
        print("Available modules: 1) 'logisticReg' 2) 'adaline'")
        ## a failed import is an error: report a non-zero exit status
        sys.exit(1)

    def load_csv(path):
        ## the data files carry no header row
        return np.array(pd.read_csv(path, header=None))

    ## string equality, not identity: 'is not' only works by accident when
    ## the interpreter happens to intern both strings
    needs_bias = classifier != 'naivebayes'

    ## initialize metrics
    sum_error, sum_TPR, sum_FPR, sum_FNR, sum_TNR, sum_AUC = 0, 0, 0, 0, 0, 0

    for iteration in xrange(no_iterations):
        print('STARTING ITER: %s' % iteration)
        suffix = str(iteration) + '.csv'

        X_train = load_csv(train_folder + data_folder + 'X_train_' + suffix)
        y_train = load_csv(train_folder + data_folder + 'y_train_' + suffix)
        X_test = load_csv(test_folder + 'X_test_' + suffix)
        y_test = load_csv(test_folder + 'y_test_' + suffix)

        if needs_bias:
            X_train = addBias(X_train)
            X_test = addBias(X_test)

        ## train the classifier and make predictions on the test set
        weights = learner.fit(X_train, y_train)
        predictions = learner.predict(X_test, weights)

        ## record the metrics for this iteration
        sum_error += met.computeError(y_test, predictions)
        sum_AUC += met.computeAUC(y_test, predictions)
        TPR, FPR, FNR, TNR = met.computeRates(y_test, predictions, ham_label, spam_label)

        sum_TPR += TPR
        sum_FPR += FPR
        sum_FNR += FNR
        sum_TNR += TNR

    ## take the average of all the metrics; dividing by a float keeps
    ## Python 2 from flooring the result when a metric sum is an integer
    runs = float(no_iterations)
    error = sum_error / runs
    TPR = sum_TPR / runs
    FPR = sum_FPR / runs
    FNR = sum_FNR / runs
    TNR = sum_TNR / runs
    AUC = sum_AUC / runs

    # arguments 0,0,0 signal base classifier
    saveToFile(0, 0, 0, perc_poisoning, error, TPR, FPR, FNR, TNR, AUC,
               attack, classifier, dataset=dataset)

    return (error, TPR, FPR, FNR, TNR, AUC)

Example #8

0

Show file

File: boldAdaline.py Project: kaylashapiro/SpamFilter

def fit(features, labels,
        ## params:
        initial_weights=None,
        learning_rate=0.1,
        max_epoch = 200,
        threshold=1e-5,
        ham_label=0,
        spam_label=1,
        add_bias = True
        ):
    '''
    Returns the weights for a given training set (features and
    corresponding label inputs) for the ADALINE model.
    These weights are found using batch gradient descent with a learning
    rate that grows by 5% after an epoch whose error decreased and is
    halved (with the weight update discarded) after an epoch whose error
    increased by more than 1e-8.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
                with N: the number of training examples
                and  D: the number of features for each example
    - labels:   N * 1 Numpy vector of binary values (0 and 1)
    - initial_weights: D * 1 Numpy vector, beginning weights; None starts
                       from zeros (D counts the bias column when add_bias)
    - learning_rate: float between 0 and 1; starting learning rate
    - max_epoch: int; number of accepted epochs to run (a rejected epoch
                 is not counted)
    - threshold: accepted but not used by this implementation
    - ham_label: 0 selects the {0, 1} labelling (output thresholded at 0.5);
                 any other value selects {-1, 1} (output thresholded at 0)
    - spam_label: accepted but not used by this implementation
    - add_bias: when true, addBias() is applied to the features first

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    if add_bias:
        features = addBias(features)

    ## 0. Prepare notations
    X, Y = features, labels
    N, D = features.shape   # N #training samples; D #features
    cost = []               # keep track of cost
    error = []              # keep track of error

    ## 1. Initialise weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## 2. Evaluate the termination condition
    epoch = 0
    last_epoch_error = 1e10

    while epoch < max_epoch:
        ## remember the weights this epoch started from
        last_W = W

        ## current iteration classifier output
        O = np.dot(X, W)

        ## specialty of ADALINE is that training is done on the weighted sum,
        ## _before_ the activation function
        ## batch gradient descent
        gradient = -np.mean(np.multiply((Y - O), X), axis=0)

        ## 3. Update weights
        W = W - learning_rate * gradient.reshape(W.shape)

        ## Keep track of error and cost (weights from previous iteration)
        ## T is equivalent to threshold/step activation function
        ## value comparison: 'is 0' tests object identity and is only true
        ## by accident of small-integer caching
        if ham_label == 0:               ## spam label assumed 1
            T = np.zeros(O.shape)
            T[O > 0.5] = 1
        else:   ## ham label is assumed -1, spam label assumed 1
            T = np.ones(O.shape)
            T[O < 0] = -1

        ## computed once per epoch; every branch below reuses this value
        current_error = computeError(T, Y)
        error.append(current_error)
        cost.append(computeCost(Y, O))

        if current_error < last_epoch_error:
            ## error went down: take bolder steps
            learning_rate = learning_rate * 1.05
            epoch += 1
        elif current_error - last_epoch_error > 1e-8:
            ## error went up: halve the step, drop this update and retry the
            ## epoch; resetting the reference error to 1e10 means the retry
            ## is always accepted
            ## NOTE(review): current_error is measured on last_W (the weights
            ## from the start of this epoch), so the rollback restores those
            ## same weights rather than an earlier set -- confirm intended
            learning_rate = learning_rate * .5
            W = last_W
            current_error = 1e10
        else:
            ## error effectively unchanged: keep the learning rate
            epoch += 1

        last_epoch_error = current_error

    return W

Example #9

0

Show file

def fit(
        features,
        labels,
        ## params:
        initial_weights=None,
        learning_rate=0.1,
        max_epoch=200,
        threshold=1e-5,
        ham_label=0,
        spam_label=1,
        add_bias=True):
    '''
    Returns the weights for a given training set (features and
    corresponding label inputs) for the ADALINE model.
    These weights are found using batch gradient descent with a learning
    rate that grows by 5% after an epoch whose error decreased and is
    halved (with the weight update discarded) after an epoch whose error
    increased by more than 1e-8.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
                with N: the number of training examples
                and  D: the number of features for each example
    - labels:   N * 1 Numpy vector of binary values (0 and 1)
    - initial_weights: D * 1 Numpy vector, beginning weights; None starts
                       from zeros (D counts the bias column when add_bias)
    - learning_rate: float between 0 and 1; starting learning rate
    - max_epoch: int; number of accepted epochs to run (a rejected epoch
                 is not counted)
    - threshold: accepted but not used by this implementation
    - ham_label: 0 selects the {0, 1} labelling (output thresholded at 0.5);
                 any other value selects {-1, 1} (output thresholded at 0)
    - spam_label: accepted but not used by this implementation
    - add_bias: when true, addBias() is applied to the features first

    Output:
    - W: D * 1 Numpy vector of real values
    '''
    if add_bias:
        features = addBias(features)

    ## 0. Prepare notations
    X, Y = features, labels
    N, D = features.shape  # N #training samples; D #features
    cost = []  # keep track of cost
    error = []  # keep track of error

    ## 1. Initialise weights
    if initial_weights is None:
        W = np.zeros((D, 1))
    else:
        W = initial_weights.reshape((D, 1))

    ## 2. Evaluate the termination condition
    epoch = 0
    last_epoch_error = 1e10

    while epoch < max_epoch:
        ## weights at the start of the epoch, needed if the update is rejected
        last_W = W

        ## current iteration classifier output
        O = np.dot(X, W)

        ## specialty of ADALINE is that training is done on the weighted sum,
        ## _before_ the activation function
        ## batch gradient descent
        gradient = -np.mean(np.multiply((Y - O), X), axis=0)

        ## 3. Update weights
        W = W - learning_rate * gradient.reshape(W.shape)

        ## Keep track of error and cost (weights from previous iteration)
        ## T is equivalent to threshold/step activation function
        ## compare by value: 'is 0' checks object identity, which only holds
        ## thanks to small-integer caching
        if ham_label == 0:  ## spam label assumed 1
            T = np.zeros(O.shape)
            T[O > 0.5] = 1
        else:  ## ham label is assumed -1, spam label assumed 1
            T = np.ones(O.shape)
            T[O < 0] = -1

        ## one evaluation per epoch is enough; the branches reuse it
        current_error = computeError(T, Y)
        error.append(current_error)
        cost.append(computeCost(Y, O))

        if current_error < last_epoch_error:
            ## improvement: speed up
            learning_rate = learning_rate * 1.05
            epoch += 1
        elif current_error - last_epoch_error > 1e-8:
            ## deterioration: slow down, discard the update and redo the
            ## epoch; the 1e10 reference error makes the retry always pass
            ## NOTE(review): current_error belongs to last_W (the weights the
            ## epoch started from), so the rollback keeps those weights rather
            ## than going back one step further -- confirm intended
            learning_rate = learning_rate * .5
            W = last_W
            current_error = 1e10
        else:
            ## error effectively unchanged: leave the learning rate alone
            epoch += 1

        last_epoch_error = current_error

    return W

Example #10

0

Show file

File: adagrad.py Project: kaylashapiro/SpamFilter

def adagrad(features, labels,
            ## functions specific to classifier:
            calculate_output,
            cost_function,
            predict,
            ## params:
            learning_rate=.1,
            max_epochs=100,
            initial_weights=None,
            convergence_threshold=1e-5,
            convergence_look_back=1,
            smoothing_term=1e-8,
            ):
    '''
    Returns the weights found for a given training set and a given model
    using the stochastic gradient descent method with an ADAGRAD adaptive
    learning rate. The model is determined by the 'calculate_output',
    'cost_function' and 'predict' functions.

    NOTE: assumes the bias term is already in the features input.

    Input:
    - features: N * D Numpy matrix of binary values (0 and 1)
                with N: the number of training examples
                and  D: the number of features for each example
    - labels: N * 1 Numpy vector of binary values (0 and 1)
    - calculate_output: callable (x, W) -> model output for one instance
    - cost_function: callable (Y, P); called once per epoch, its result
                     is not used by this function
    - predict: callable (X, W) -> predictions for the whole training set
    - learning_rate: float, between 0 and 1
    - max_epochs: int, >= 0; maximum number of times to run through training set
    - initial_weights: D * 1 Numpy vector of feature weights, or None to
                       start from zeros
    - convergence_threshold: float, very small number; e.g. 1e-5
    - convergence_look_back: int, >= 1
                             stops once the current epoch's error differs by
                             less than the threshold from the error of each
                             of the previous X epochs.
    - smoothing_term: very small number; e.g. 1e-8,
                      ensure no divide by zero error.

    Output:
    - W: D * 1 Numpy vector of real values
    '''

    ## notation
    X, Y = features, labels
    N, D = X.shape # N training samples; D features

    ## initialize weights
    W = np.zeros((D, 1)) if initial_weights is None else initial_weights.reshape((D, 1))

    ## errors of the previous epochs; the large sentinel guarantees that
    ## convergence cannot be declared before real history has replaced it
    previous_errors = deque(maxlen=convergence_look_back)
    previous_errors.append(1e6)

    ## store gradient sum of squares (one accumulator per feature)
    gti = np.zeros(D)

    epoch = 0
    while epoch < max_epochs:
        ## mix up samples (they will therefore be fed in different order
        ## at each training) -> commonly accepted to improve gradient
        ## descent, making convergence faster
        permuted_indices = np.random.permutation(N)

        ## fancy indexing builds new arrays: the caller's data is untouched
        X = X[permuted_indices, :]
        Y = Y[permuted_indices]

        for instance in range(N):
            x = X[instance]
            y = Y[instance]

            ## classifier output of current training instance
            o = calculate_output(x, W)

            ## gradient descent: minimize the cost function
            ## gradient equation was obtained by deriving the LMS cost function
            gradient = -np.multiply((y - o), x)

            ## add square of gradient
            gti += gradient ** 2

            ## adagrad adjustment
            adjusted_gradient = gradient / (np.sqrt(gti) + smoothing_term)

            ## update weights
            W = W - learning_rate * adjusted_gradient.reshape(W.shape)

        ## Keep track of cost and error
        P = predict(X, W)
        error = computeError(Y, P)
        ## result unused; the call is kept so the supplied cost function is
        ## still invoked once per epoch exactly as before
        cost_function(Y, P)

        ## check for convergence against the *previous* epochs only: the
        ## current error must not be in the window yet, otherwise it is
        ## compared with itself and the default look-back of 1 would stop
        ## training after the very first epoch
        if all(abs(np.array(previous_errors) - error) < convergence_threshold):
            return W
        previous_errors.append(error)
        epoch += 1

    return W