Example #1
0
def storeFScores(numBootstraps=1000):
    """
    Train one decision tree per feature configuration and store the
    bootstrapped F-score of each configuration on disk.

    The pickled training and testing frames are loaded from 'proc_data/' and
    rows containing NaNs are dropped. For each of the 7 column selections a
    tree is trained with get_tree on the training selection, predictions are
    obtained on the matching testing selection, and those predictions are
    scored against the testing 'isgood' column with bootStrapEval. The
    resulting list of (mean, stddev) tuples is pickled to
    'proc_data/bootstrappedFScores.pda'.

    @param numBootstraps: number of bootstrap samples handed to bootStrapEval
            as its K argument. Defaults to 1000, the value that used to be
            hard-coded here.
    """

    def buildFeatVecs(data):
        # The same 7 column selections are needed for both the training and
        # the testing frame, so they are built by one helper.
        # NOTE(review): every selection contains column 22, which is
        # presumably the 'isgood' label read further below -- confirm.
        return [
            data.ix[:, :23],  # content-based system (user + item features)
            data.ix[:, (22, 24)],  # using only item-item predicted ratings
            data.ix[:, (22, 23)],  # using only user-user predicted ratings
            data.ix[:, 22:25],  # using both item-item and user-user predicted ratings
            data.ix[:, :23].join(
                data.ix[:, 24:25]),  # content-based + item-item predicted ratings
            data.ix[:, :23].join(
                data.ix[:, 23:24]),  # content-based + user-user predicted ratings
            data.ix[:, :23].join(
                data.ix[:, 23:25]),  # content-based + both predicted ratings
        ]

    # load the prediction-ready training and testing data
    trainData = pd.load('proc_data/prediction_ready_trainDat.pda').dropna()
    testData = pd.load('proc_data/prediction_ready_testDat.pda').dropna()

    # To answer question 5, we need to perform binary classification on 7
    # different sub-datasets. For each one we train a decision tree, get its
    # predictions on the testing data and extract the bootstrapped F-score.
    trainFeatVecs = buildFeatVecs(trainData)
    testFeatVecs = buildFeatVecs(testData)

    # a list of (mean, stdev) tuples, one per configuration
    FScoreMeansAndVars = []

    for i, (trainVec, testVec) in enumerate(zip(trainFeatVecs, testFeatVecs)):

        print("Training decision tree for configuration %d." % (i + 1))
        tree = get_tree(trainVec, 'isgood')
        print("Trained a decision tree, found an optimal depth of: %d" % (
            tree.optimal_depth))
        print("Getting predictions of decision tree on testing data.")
        predictions = tree.predict(testVec)

        print("Computing bootstrapped F-score.")
        mean, stddev = bootStrapEval(testVec['isgood'].values, predictions,
                                     numBootstraps)
        print("Computed a mean F-score of %.4f with a std. dev. of %.4f." % (
            mean, stddev))

        print("Storing bootstrapped F-score of configuration %d in a list." % (
            i + 1))
        FScoreMeansAndVars.append((mean, stddev))

    print("Storing all F-scores on disk.")
    # The context manager closes the file even if pickling fails.
    with open('proc_data/bootstrappedFScores.pda', 'wb') as fp:
        pkl.dump(FScoreMeansAndVars, fp)
Example #2
0
def bootStrapEval(trueLabs, predictedLabs, K, bag=False, dataset=None):
    '''
    This method has two different usages, which are disambiguated between by its boolean
    4th argument, "bag".

        1) If bag is false (the default), bootstrap evaluation is implemented. The method takes
        K different bootstrapped samples of its first two arguments (true and predicted labels)
        and computes the F-score for each sample, which it then stores in a list. At the end of
        the execution, it returns the mean and the standard deviation of those F-scores. This is
        an encoding of algorithm 10 at page 65 of CIML.

        2) If bag is true, ensemble learning via bagging is implemented. This draws
        K bootstrapped samples (i.e samples with replacement) from the dataset pointed to by
        the 5th argument and for each sample it trains a decision tree classifier. Every tree
        is stored in a list, which is returned to the caller at the end of execution.

    @param trueLabs: If bag is false, a sequence of length N holding the true labels of our data
            (a list or a numpy array). Ignored otherwise.
    @param predictedLabs: If bag is false, a sequence of length N holding the predicted labels
            of our data. Ignored otherwise.
    @param K: If bag is false, the number of bootstrap folds to perform over the labels. Otherwise,
            the number of bootstrapped samples to draw from the training data. Must be > 0.
    @param bag: boolean flag. If false (the default), the method performs bootstrap resampling. If true,
            the method performs bagging of decision trees.
    @param dataset: by default, None. If bag is true, must be non-None and non-empty (this is checked
            for), and is a reference to a pandas.DataFrame which holds the training data to draw
            samples from.
    @return: If bag is false, mean and standard deviation of "K" - many F-scores. Otherwise, list of
            trained decision tree classifiers.
    @raise LogicalError: If K is None or not positive, or (bag false) if the labels are missing,
            empty or of mismatching lengths.
    @raise DatasetError: If bag is true and the dataset is None or empty.
    '''

    # Because this method serves two purposes, we need to make sure that
    # the arguments provided to it are consistent with the context
    # in which we want to use it.

    # Applicable in both usage contexts: we need K > 0. A negative K used to
    # slip through and produce the mean of an empty score list.
    if K is None or K <= 0:
        raise LogicalError(
            "Method %s: Please provide a positive integer for the K parameter."
            % inspect.stack()[0][3])

    if not bag:
        # Identity checks are used on purpose: "array == None" is evaluated
        # element-wise by numpy and cannot be used as a truth value.
        if trueLabs is None or predictedLabs is None or len(
                trueLabs) == 0 or len(predictedLabs) == 0:
            raise LogicalError(
                "Method %s: Cannot compute bootsrapped F-score without true or predicted labels."
                % inspect.stack()[0][3])
        if len(trueLabs) != len(predictedLabs):
            raise LogicalError(
                "Method %s: Mismatch between amount of true and predicted labels."
                % inspect.stack()[0][3])
    elif dataset is None or dataset.shape[0] == 0:
        raise DatasetError(
            "Method %s: Caller provided a null or empty dataset." %
            inspect.stack()[0][3])

    # Case 1: Bootstrap Resampling

    if not bag:

        scores = []  # one F-score per fold
        numExamples = len(trueLabs)

        for _fold in range(K):

            # Draw N example indices in [0, N - 1] with replacement, in a
            # single call instead of one call per example.
            sampledIndices = np.random.randint(numExamples, size=numExamples)

            # The same index is used for both sequences so that every true
            # label stays paired with its prediction.
            foldTrueLabels = [trueLabs[idx] for idx in sampledIndices]
            foldPredictedLabels = [predictedLabs[idx] for idx in sampledIndices]

            scores.append(
                __computeFScore__(foldTrueLabels, foldPredictedLabels))

        # Return mean and standard deviation of all F scores.
        return np.mean(scores), np.std(scores)

    # Case 2: Bagging of decision trees

    nexamples = dataset.shape[0]
    DTreeList = []  # every decision tree classifier that we train

    for datasetSample in range(K):

        # Select N examples for our sub-dataset by sampling rows with
        # replacement. This is a list of Series, which is then converted
        # to a pandas.DataFrame.
        examplesInSample = [
            dataset.irow(np.random.randint(0, nexamples))
            for _example in range(nexamples)
        ]

        subDataset = pd.DataFrame(examplesInSample)
        # Sampled rows carry their original (possibly repeated) labels, so
        # the index is replaced by a plain 0..N-1 range.
        subDataset.index = np.arange(subDataset.shape[0])

        # Train a decision tree classifier on the bootstrapped data
        # and store it in the list.
        print("Building random tree %d." % (datasetSample + 1))
        DTreeList.append(get_tree(subDataset, 'isgood'))

    return DTreeList
Example #3
0
def adaboost(trainDat, K):
    """
    Implement the ADABoost algorithm, as described in CIML, page 152.

    Every round trains a classifier with get_tree on the training data and the
    current example weights, measures its weighted training error, derives the
    adaptive parameter 0.5 * log((1 - error) / error) and re-weights the
    examples by exp(-a * y * y_hat), normalized to sum to 1.

    NOTE(review): the re-weighting multiplies true and predicted labels, which
    matches the textbook rule only if 'isgood' is encoded as -1 / +1 -- confirm
    the label encoding against the data preparation code.

    @param trainDat: a pandas.DataFrame representing our training data. Must
            contain an 'isgood' column holding the labels.
    @param K: the number of decision tree stumps that we would like to train.
    @return --- a list of K decision tree stumps, trained on weighted data.
            --- a list of K adaptive parameters, used on predictions alongside
                the individual classifiers' predictions.
    @raise LogicalError if K<= 0, None or not an int
    @raise DatasetError if trainDat is None or empty
    """

    if trainDat is None or len(trainDat) == 0:
        raise DatasetError(
            "Method %s: Cannot train ADAboost on a null or empty dataset." %
            (inspect.stack()[0][3]))
    # The type test comes first so that a non-int K (None included) is
    # rejected before it is compared against 0.
    if not isinstance(K, int) or K <= 0:
        raise LogicalError(
            "Method %s: Need to train a positive number of classifiers" %
            (inspect.stack()[0][3]))

    print("Starting AdaBoost algorithm.")

    # Initialize uniform weights. Float arithmetic is forced here: under
    # Python 2 the expression "1 / N" is an integer division and yields 0.
    numExamples = trainDat.shape[0]
    exampleWeights = np.ones(numExamples) / numExamples

    # Smallest distance kept between the weighted error and 0 or 1, so that
    # the adaptive parameter below always stays finite.
    errorMargin = np.finfo(float).eps

    # run main algorithm
    classifierList = []
    adaptParams = []
    for k in range(K):

        # Train a decision tree stump on the weighted training data.
        # NOTE(review): the trailing (1, 0) arguments presumably restrict the
        # tree to depth 1 -- confirm against get_tree.
        print("Training stump #%d." % (k + 1))
        stump = get_tree(trainDat, 'isgood', exampleWeights, 1, 0)
        classifierList.append(stump)

        # Run predictions on the training data
        print("Getting training data predictions for stump #%d." % (k + 1))
        predictions = stump.predict(trainDat)

        trueValues = trainDat['isgood'].values

        if len(predictions) != len(trueValues):
            raise LogicalError(
                "Method %s, model #%d: predictions have to be as many as the true labels."
                % (inspect.stack()[0][3], k + 1))

        predictedValues = np.array(
            [predictions[n] for n in range(numExamples)])

        # Weighted training error: total weight of the misclassified examples.
        trainingError = np.sum(exampleWeights[predictedValues != trueValues])

        # A perfect (error 0) or totally wrong (error 1) stump would give an
        # infinite adaptive parameter and NaN weights after normalization.
        trainingError = np.clip(trainingError, errorMargin, 1.0 - errorMargin)

        # Compute and store the "adaptive" parameter a(k)
        currentAdaptParam = 0.5 * np.log((1 - trainingError) / trainingError)
        adaptParams.append(currentAdaptParam)
        print("Computed adaptive parameter for classifier %d. It is equal to: %.4f" % (
            k + 1, currentAdaptParam))

        # Update and normalize example weights.
        # Note that this is not a dot product, but an element-wise multiplication.
        exponent = -currentAdaptParam * trueValues * predictedValues

        try:
            len(exponent)
        except TypeError:
            raise LogicalError(
                "Method %s: \"exponent\" is not an iterable." %
                (inspect.stack()[0][3]))
        if len(exponent) != numExamples:
            raise LogicalError(
                "Method %s: our derivation of \"exponent\" should've yielded a numpy.ndarray of size %d at this point."
                % (inspect.stack()[0][3], numExamples))

        multiplier = exampleWeights * np.exp(exponent)

        try:
            len(multiplier)
        except TypeError:
            raise LogicalError(
                "Method %s: \"multiplier\" is not an iterable." %
                (inspect.stack()[0][3]))
        if len(multiplier) != numExamples:
            raise LogicalError(
                "Method %s: our derivation of \"multiplier\" should've yielded a numpy.ndarray of size %d at this point."
                % (inspect.stack()[0][3], numExamples))

        # The new weights are the re-weighted values divided by their sum, so
        # that they form a distribution again. Dividing the OLD weights by
        # that sum would discard the re-weighting altogether.
        normalizer = np.sum(multiplier)
        exampleWeights = multiplier / normalizer

        try:
            len(exampleWeights)
        except TypeError:
            raise LogicalError(
                "Method %s, model #%d: after the update to \"exampleWeights\", this variable no longer represents a numpy.ndarray."
                % (inspect.stack()[0][3], k + 1))
        if len(exampleWeights) != numExamples:
            raise LogicalError(
                "Method %s, model #%d: the update to exampleWeights should've yielded a numpy.ndarray of size %d at this point."
                % (inspect.stack()[0][3], k + 1, numExamples))

    return classifierList, adaptParams
Example #4
0
def adaboost(trainDat, K):
    """
    Implement the ADABoost algorithm, as described in CIML, page 152.

    Every round trains a classifier with get_tree on the training data and the
    current example weights, measures its weighted training error, derives the
    adaptive parameter 0.5 * log((1 - error) / error) and re-weights the
    examples by exp(-a * y * y_hat), normalized to sum to 1.

    NOTE(review): the re-weighting multiplies true and predicted labels, which
    matches the textbook rule only if 'isgood' is encoded as -1 / +1 -- confirm
    the label encoding against the data preparation code.

    @param trainDat: a pandas.DataFrame representing our training data. Must
            contain an 'isgood' column holding the labels.
    @param K: the number of decision tree stumps that we would like to train.
    @return --- a list of K decision tree stumps, trained on weighted data.
            --- a list of K adaptive parameters, used on predictions alongside
                the individual classifiers' predictions.
    @raise LogicalError if K<= 0, None or not an int
    @raise DatasetError if trainDat is None or empty
    """

    if trainDat is None or len(trainDat) == 0:
        raise DatasetError(
            "Method %s: Cannot train ADAboost on a null or empty dataset." %
            (inspect.stack()[0][3]))
    # The type test comes first so that a non-int K (None included) is
    # rejected before it is compared against 0.
    if not isinstance(K, int) or K <= 0:
        raise LogicalError(
            "Method %s: Need to train a positive number of classifiers" %
            (inspect.stack()[0][3]))

    print("Starting AdaBoost algorithm.")

    # Initialize uniform weights. Float arithmetic is forced here: under
    # Python 2 the expression "1 / N" is an integer division and yields 0.
    numExamples = trainDat.shape[0]
    exampleWeights = np.ones(numExamples) / numExamples

    # Smallest distance kept between the weighted error and 0 or 1, so that
    # the adaptive parameter below always stays finite.
    errorMargin = np.finfo(float).eps

    # run main algorithm
    classifierList = []
    adaptParams = []
    for k in range(K):

        # Train a decision tree stump on the weighted training data.
        # NOTE(review): the trailing (1, 0) arguments presumably restrict the
        # tree to depth 1 -- confirm against get_tree.
        print("Training stump #%d." % (k + 1))
        stump = get_tree(trainDat, 'isgood', exampleWeights, 1, 0)
        classifierList.append(stump)

        # Run predictions on the training data
        print("Getting training data predictions for stump #%d." % (k + 1))
        predictions = stump.predict(trainDat)

        trueValues = trainDat['isgood'].values

        if len(predictions) != len(trueValues):
            raise LogicalError(
                "Method %s, model #%d: predictions have to be as many as the true labels."
                % (inspect.stack()[0][3], k + 1))

        predictedValues = np.array(
            [predictions[n] for n in range(numExamples)])

        # Weighted training error: total weight of the misclassified examples.
        trainingError = np.sum(exampleWeights[predictedValues != trueValues])

        # A perfect (error 0) or totally wrong (error 1) stump would give an
        # infinite adaptive parameter and NaN weights after normalization.
        trainingError = np.clip(trainingError, errorMargin, 1.0 - errorMargin)

        # Compute and store the "adaptive" parameter a(k)
        currentAdaptParam = 0.5 * np.log((1 - trainingError) / trainingError)
        adaptParams.append(currentAdaptParam)
        print("Computed adaptive parameter for classifier %d. It is equal to: %.4f" % (
            k + 1, currentAdaptParam))

        # Update and normalize example weights.
        # Note that this is not a dot product, but an element-wise multiplication.
        exponent = -currentAdaptParam * trueValues * predictedValues

        try:
            len(exponent)
        except TypeError:
            raise LogicalError(
                "Method %s: \"exponent\" is not an iterable." %
                (inspect.stack()[0][3]))
        if len(exponent) != numExamples:
            raise LogicalError(
                "Method %s: our derivation of \"exponent\" should've yielded a numpy.ndarray of size %d at this point."
                % (inspect.stack()[0][3], numExamples))

        multiplier = exampleWeights * np.exp(exponent)

        try:
            len(multiplier)
        except TypeError:
            raise LogicalError(
                "Method %s: \"multiplier\" is not an iterable." %
                (inspect.stack()[0][3]))
        if len(multiplier) != numExamples:
            raise LogicalError(
                "Method %s: our derivation of \"multiplier\" should've yielded a numpy.ndarray of size %d at this point."
                % (inspect.stack()[0][3], numExamples))

        # The new weights are the re-weighted values divided by their sum, so
        # that they form a distribution again. Dividing the OLD weights by
        # that sum would discard the re-weighting altogether.
        normalizer = np.sum(multiplier)
        exampleWeights = multiplier / normalizer

        try:
            len(exampleWeights)
        except TypeError:
            raise LogicalError(
                "Method %s, model #%d: after the update to \"exampleWeights\", this variable no longer represents a numpy.ndarray."
                % (inspect.stack()[0][3], k + 1))
        if len(exampleWeights) != numExamples:
            raise LogicalError(
                "Method %s, model #%d: the update to exampleWeights should've yielded a numpy.ndarray of size %d at this point."
                % (inspect.stack()[0][3], k + 1, numExamples))

    return classifierList, adaptParams
Example #5
0
def storeFScores(numBootstraps=1000):
    """
    Train one decision tree per feature configuration and store the
    bootstrapped F-score of each configuration on disk.

    The pickled training and testing frames are loaded from 'proc_data/' and
    rows containing NaNs are dropped. For each of the 7 column selections a
    tree is trained with get_tree on the training selection, predictions are
    obtained on the matching testing selection, and those predictions are
    scored against the testing 'isgood' column with bootStrapEval. The
    resulting list of (mean, stddev) tuples is pickled to
    'proc_data/bootstrappedFScores.pda'.

    @param numBootstraps: number of bootstrap samples handed to bootStrapEval
            as its K argument. Defaults to 1000, the value that used to be
            hard-coded here.
    """

    def buildFeatVecs(data):
        # The same 7 column selections are needed for both the training and
        # the testing frame, so they are built by one helper.
        # NOTE(review): every selection contains column 22, which is
        # presumably the 'isgood' label read further below -- confirm.
        return [
            data.ix[:, :23],  # content-based system (user + item features)
            data.ix[:, (22, 24)],  # using only item-item predicted ratings
            data.ix[:, (22, 23)],  # using only user-user predicted ratings
            data.ix[:, 22:25],  # using both item-item and user-user predicted ratings
            data.ix[:, :23].join(
                data.ix[:, 24:25]),  # content-based + item-item predicted ratings
            data.ix[:, :23].join(
                data.ix[:, 23:24]),  # content-based + user-user predicted ratings
            data.ix[:, :23].join(
                data.ix[:, 23:25]),  # content-based + both predicted ratings
        ]

    # load the prediction-ready training and testing data
    trainData = pd.load('proc_data/prediction_ready_trainDat.pda').dropna()
    testData = pd.load('proc_data/prediction_ready_testDat.pda').dropna()

    # To answer question 5, we need to perform binary classification on 7
    # different sub-datasets. For each one we train a decision tree, get its
    # predictions on the testing data and extract the bootstrapped F-score.
    trainFeatVecs = buildFeatVecs(trainData)
    testFeatVecs = buildFeatVecs(testData)

    # a list of (mean, stdev) tuples, one per configuration
    FScoreMeansAndVars = []

    for i, (trainVec, testVec) in enumerate(zip(trainFeatVecs, testFeatVecs)):

        print("Training decision tree for configuration %d." % (i + 1))
        tree = get_tree(trainVec, 'isgood')
        print("Trained a decision tree, found an optimal depth of: %d" % (
            tree.optimal_depth))
        print("Getting predictions of decision tree on testing data.")
        predictions = tree.predict(testVec)

        print("Computing bootstrapped F-score.")
        mean, stddev = bootStrapEval(testVec['isgood'].values, predictions,
                                     numBootstraps)
        print("Computed a mean F-score of %.4f with a std. dev. of %.4f." % (
            mean, stddev))

        print("Storing bootstrapped F-score of configuration %d in a list." % (
            i + 1))
        FScoreMeansAndVars.append((mean, stddev))

    print("Storing all F-scores on disk.")
    # The context manager closes the file even if pickling fails.
    with open('proc_data/bootstrappedFScores.pda', 'wb') as fp:
        pkl.dump(FScoreMeansAndVars, fp)
Example #6
0
def bootStrapEval(trueLabs, predictedLabs, K, bag=False, dataset=None):
    '''
    This method has two different usages, which are disambiguated between by its boolean
    4th argument, "bag".

        1) If bag is false (the default), bootstrap evaluation is implemented. The method takes
        K different bootstrapped samples of its first two arguments (true and predicted labels)
        and computes the F-score for each sample, which it then stores in a list. At the end of
        the execution, it returns the mean and the standard deviation of those F-scores. This is
        an encoding of algorithm 10 at page 65 of CIML.

        2) If bag is true, ensemble learning via bagging is implemented. This draws
        K bootstrapped samples (i.e samples with replacement) from the dataset pointed to by
        the 5th argument and for each sample it trains a decision tree classifier. Every tree
        is stored in a list, which is returned to the caller at the end of execution.

    @param trueLabs: If bag is false, a sequence of length N holding the true labels of our data
            (a list or a numpy array). Ignored otherwise.
    @param predictedLabs: If bag is false, a sequence of length N holding the predicted labels
            of our data. Ignored otherwise.
    @param K: If bag is false, the number of bootstrap folds to perform over the labels. Otherwise,
            the number of bootstrapped samples to draw from the training data. Must be > 0.
    @param bag: boolean flag. If false (the default), the method performs bootstrap resampling. If true,
            the method performs bagging of decision trees.
    @param dataset: by default, None. If bag is true, must be non-None and non-empty (this is checked
            for), and is a reference to a pandas.DataFrame which holds the training data to draw
            samples from.
    @return: If bag is false, mean and standard deviation of "K" - many F-scores. Otherwise, list of
            trained decision tree classifiers.
    @raise LogicalError: If K is None or not positive, or (bag false) if the labels are missing,
            empty or of mismatching lengths.
    @raise DatasetError: If bag is true and the dataset is None or empty.
    '''

    # Because this method serves two purposes, we need to make sure that
    # the arguments provided to it are consistent with the context
    # in which we want to use it.

    # Applicable in both usage contexts: we need K > 0. A negative K used to
    # slip through and produce the mean of an empty score list.
    if K is None or K <= 0:
        raise LogicalError(
            "Method %s: Please provide a positive integer for the K parameter."
            % inspect.stack()[0][3])

    if not bag:
        # Identity checks are used on purpose: "array == None" is evaluated
        # element-wise by numpy and cannot be used as a truth value.
        if trueLabs is None or predictedLabs is None or len(
                trueLabs) == 0 or len(predictedLabs) == 0:
            raise LogicalError(
                "Method %s: Cannot compute bootsrapped F-score without true or predicted labels."
                % inspect.stack()[0][3])
        if len(trueLabs) != len(predictedLabs):
            raise LogicalError(
                "Method %s: Mismatch between amount of true and predicted labels."
                % inspect.stack()[0][3])
    elif dataset is None or dataset.shape[0] == 0:
        raise DatasetError(
            "Method %s: Caller provided a null or empty dataset." %
            inspect.stack()[0][3])

    # Case 1: Bootstrap Resampling

    if not bag:

        scores = []  # one F-score per fold
        numExamples = len(trueLabs)

        for _fold in range(K):

            # Draw N example indices in [0, N - 1] with replacement, in a
            # single call instead of one call per example.
            sampledIndices = np.random.randint(numExamples, size=numExamples)

            # The same index is used for both sequences so that every true
            # label stays paired with its prediction.
            foldTrueLabels = [trueLabs[idx] for idx in sampledIndices]
            foldPredictedLabels = [predictedLabs[idx] for idx in sampledIndices]

            scores.append(
                __computeFScore__(foldTrueLabels, foldPredictedLabels))

        # Return mean and standard deviation of all F scores.
        return np.mean(scores), np.std(scores)

    # Case 2: Bagging of decision trees

    nexamples = dataset.shape[0]
    DTreeList = []  # every decision tree classifier that we train

    for datasetSample in range(K):

        # Select N examples for our sub-dataset by sampling rows with
        # replacement. This is a list of Series, which is then converted
        # to a pandas.DataFrame.
        examplesInSample = [
            dataset.irow(np.random.randint(0, nexamples))
            for _example in range(nexamples)
        ]

        subDataset = pd.DataFrame(examplesInSample)
        # Sampled rows carry their original (possibly repeated) labels, so
        # the index is replaced by a plain 0..N-1 range.
        subDataset.index = np.arange(subDataset.shape[0])

        # Train a decision tree classifier on the bootstrapped data
        # and store it in the list.
        print("Building random tree %d." % (datasetSample + 1))
        DTreeList.append(get_tree(subDataset, 'isgood'))

    return DTreeList