import inspect
import pickle as pkl   # cPickle would also work here

import numpy as np
import pandas as pd

# get_tree, __computeFScore__, LogicalError and DatasetError are assumed to be
# defined elsewhere in this project (the decision-tree wrapper, the F-score
# helper and the custom exception classes, respectively).


def storeFScores():

    # Load the prediction-ready training and testing data.
    trainData = pd.load('proc_data/prediction_ready_trainDat.pda')
    testData = pd.load('proc_data/prediction_ready_testDat.pda')
    trainData = trainData.dropna()
    testData = testData.dropna()

    # To answer question 5, we need to perform binary classification on 7 different
    # sub-datasets. We use Hector's decision trees, and for each case we get both the
    # true and the predicted ratings and extract the bootstrapped F-score, using
    # K = 1000 bootstrap samples per evaluation (see the call to bootStrapEval below).

    FScoreMeansAndVars = list()  # a list of (mean, stdev) tuples that might prove handy

    # We have 7 different feature vectors for both training and testing data
    # that we need to consider, which we now put in the following two lists.
    trainFeatVecs = [
        trainData.ix[:, :23],                               # content-based system (user + item features)
        trainData.ix[:, (22, 24)],                          # using only item-item predicted ratings
        trainData.ix[:, (22, 23)],                          # using only user-user predicted ratings
        trainData.ix[:, 22:25],                             # using both item-item and user-user predicted ratings
        trainData.ix[:, :23].join(trainData.ix[:, 24:25]),  # content-based + item-item predicted ratings
        trainData.ix[:, :23].join(trainData.ix[:, 23:24]),  # content-based + user-user predicted ratings
        trainData.ix[:, :23].join(trainData.ix[:, 23:25])   # content-based + both predicted ratings
    ]

    testFeatVecs = [
        testData.ix[:, :23],                                # content-based system (user + item features)
        testData.ix[:, (22, 24)],                           # using only item-item predicted ratings
        testData.ix[:, (22, 23)],                           # using only user-user predicted ratings
        testData.ix[:, 22:25],                              # using both item-item and user-user predicted ratings
        testData.ix[:, :23].join(testData.ix[:, 24:25]),    # content-based + item-item predicted ratings
        testData.ix[:, :23].join(testData.ix[:, 23:24]),    # content-based + user-user predicted ratings
        testData.ix[:, :23].join(testData.ix[:, 23:25])     # content-based + both predicted ratings
    ]

    # Now that we have all 7 different training and testing datasets, we can compute
    # the bootstrapped F-score for every setup. We store these F-scores in a list,
    # which we then write to disk for easy future access.
    for i in range(len(trainFeatVecs)):
        print "Training decision tree for configuration %d." % (i + 1)
        tree = get_tree(trainFeatVecs[i], 'isgood')
        print "Trained a decision tree, found an optimal depth of: %d" % tree.optimal_depth
        print "Getting predictions of decision tree on testing data."
        predictions = tree.predict(testFeatVecs[i])
        print "Computing bootstrapped F-score."
        mean, stddev = bootStrapEval(testFeatVecs[i]['isgood'].values, predictions, 1000)
        print "Computed a mean F-score of %.4f with a std. dev. of %.4f." % (mean, stddev)
        print "Storing bootstrapped F-score of configuration %d in a list." % (i + 1)
        FScoreMeansAndVars.append((mean, stddev))

    print "Storing all F-scores on disk."
    fp = open('proc_data/bootstrappedFScores.pda', 'wb')
    pkl.dump(FScoreMeansAndVars, fp)
    fp.close()
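# Hedged usage sketch (an addition, not part of the original pipeline): reading back
# the (mean, stdev) tuples that storeFScores() pickles to disk. The file path matches
# the one used above; the function name printStoredFScores is illustrative only.
def printStoredFScores():
    fp = open('proc_data/bootstrappedFScores.pda', 'rb')
    FScoreMeansAndVars = pkl.load(fp)
    fp.close()
    for i, (mean, stddev) in enumerate(FScoreMeansAndVars):
        print "Configuration %d: mean F-score = %.4f (std. dev. %.4f)" % (i + 1, mean, stddev)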
def bootStrapEval(trueLabs, predictedLabs, K, bag=False, dataset=None):
    '''
    This method has two different usages, which are disambiguated by its boolean
    4th argument, "bag".

    1) If bag == False (the default), bootstrap evaluation is performed. The method
       takes K different bootstrapped samples of its first two arguments (true and
       predicted labels) and computes the F-score for each sample, which it stores
       in a list. At the end of execution, it returns the mean and the standard
       deviation of those F-scores. This is an encoding of algorithm 10 on page 65
       of CIML.

    2) If bag == True, ensemble learning via bagging is performed. The method draws
       K bootstrapped samples (i.e. samples with replacement) from the dataset
       pointed to by the 5th argument and trains a decision tree classifier on each
       sample. Every tree is stored in a list, which is returned to the caller at
       the end of execution.

    @param trueLabs: if bag == False, a list of length N representing the true labels
        of our data; None otherwise.
    @param predictedLabs: if bag == False, a list of length N representing the predicted
        labels of our data; None otherwise.
    @param K: if bag == False, the number of bootstrap folds to perform over the labels;
        otherwise, the number of bootstrapped samples to draw from the training data.
    @param bag: boolean flag. If False (the default), the method performs bootstrap
        resampling. If True, the method performs bagging of decision trees.
    @param dataset: None by default. If bag == True, must be non-None (this is checked
        for) and is a reference to a pandas.DataFrame which holds the training data
        to draw samples from.
    @return: if bag == False, the mean and standard deviation of K F-scores;
        otherwise, a list of trained decision tree classifiers.
    @raise LogicalError: if there is some inconsistency, among numerous possible ones,
        with respect to the arguments provided in each case.
    '''

    # Because this method is quite complex, we need to make sure that the arguments
    # provided to it are consistent with the context in which we want to use it,
    # so we do some sanity checking first.
    if K is None or K <= 0:
        # applicable in both usage contexts: we need K > 0
        raise LogicalError, "Method %s: Please provide a positive integer for the K parameter." % inspect.stack()[0][3]

    if bag == False:
        # need to check the validity of the first two arguments
        if trueLabs is None or predictedLabs is None or len(trueLabs) == 0 or len(predictedLabs) == 0:
            raise LogicalError, "Method %s: Cannot compute bootstrapped F-score without true or predicted labels." % inspect.stack()[0][3]
        if len(trueLabs) != len(predictedLabs):
            raise LogicalError, "Method %s: Mismatch between the number of true and predicted labels." % inspect.stack()[0][3]
    else:
        # need to check the validity of the last argument
        if dataset is None or dataset.shape[0] == 0:
            raise DatasetError, "Method %s: Caller provided a null or empty dataset." % inspect.stack()[0][3]

    # Case 1: Bootstrap Resampling
    if bag == False:

        # Initialize algorithm
        scores = list()  # a list of F-scores, initially empty
        numExamples = len(trueLabs)

        # For every fold
        for _i in range(K):
            foldTrueLabels = list()
            foldPredictedLabels = list()
            # For every example
            for _j in range(numExamples):
                # sample a random example index from 0 up to N - 1 and store its
                # true and predicted label
                sampledExampleIndex = np.random.randint(numExamples)
                foldTrueLabels.append(trueLabs[sampledExampleIndex])
                foldPredictedLabels.append(predictedLabs[sampledExampleIndex])

            # Compute and store the F-score for the current fold.
            scores.append(__computeFScore__(foldTrueLabels, foldPredictedLabels))

        # Return mean and standard deviation of all F-scores.
        return np.mean(scores), np.std(scores)

    # Case 2: Bagging of decision trees
    else:

        nexamples = dataset.shape[0]

        # keep a list of all the decision tree classifiers that we will train
        DTreeList = list()

        # for every sample
        for datasetSample in range(K):

            # Keep a list of every example that we sample. In Python terms, this is a
            # list of Series, which we convert to a pandas.DataFrame after completing
            # the inner loop.
            examplesInSample = list()

            # Select N examples for our sub-dataset by sampling with replacement.
            for _example in range(nexamples):
                selectedExample = np.random.randint(0, nexamples)
                examplesInSample.append(dataset.irow(selectedExample))

            subDataset = pd.DataFrame(examplesInSample)
            subDataset.index = np.arange(subDataset.shape[0])

            # Train a decision tree classifier on the bootstrapped data
            # and store it in a list.
            print "Building random tree %d." % (datasetSample + 1)
            tree = get_tree(subDataset, 'isgood')
            #print "Tree number %d has an optimal depth of: %d" % (datasetSample + 1, tree.optimal_depth)
            DTreeList.append(tree)
        # end for datasetSample

        return DTreeList
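# Hedged sketch (an addition, not in the original code): one plausible way to combine
# the trees returned by bootStrapEval(None, None, K, bag=True, dataset=...) into a
# single bagged prediction via majority vote. It assumes every tree exposes the same
# predict(DataFrame) interface used elsewhere in this file; the helper name
# baggedPredict is illustrative only.
def baggedPredict(DTreeList, testData):
    from collections import Counter
    allPredictions = [tree.predict(testData) for tree in DTreeList]  # one prediction vector per tree
    numExamples = len(allPredictions[0])
    finalPredictions = list()
    for n in range(numExamples):
        # Majority vote over all trees for example n (ties resolved arbitrarily by Counter).
        votes = [allPredictions[t][n] for t in range(len(DTreeList))]
        finalPredictions.append(Counter(votes).most_common(1)[0][0])
    return finalPredictions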
def adaboost(trainDat, K):
    """
    Implement the AdaBoost algorithm, as described in CIML, page 152.

    @param trainDat: a pandas.DataFrame representing our training data.
    @param K: the number of decision tree stumps that we would like to train.
    @return:
        --- a list of K decision tree stumps, trained on weighted data.
        --- a list of K adaptive parameters, used at prediction time alongside the
            individual classifiers' predictions.
    @raise LogicalError: if K is None, not an int, or K <= 0.
    @raise DatasetError: if trainDat is None or empty.
    """

    if trainDat is None or len(trainDat) == 0:
        raise DatasetError, "Method %s: Cannot train AdaBoost on a null or empty dataset." % (inspect.stack()[0][3])
    if K is None or not isinstance(K, int) or K <= 0:
        raise LogicalError, "Method %s: Need to train a positive number of classifiers." % (inspect.stack()[0][3])

    print "Starting AdaBoost algorithm."

    # Initialize uniform weights. Note the float literal: with integer division,
    # Python 2 would silently make every weight zero.
    exampleWeights = np.array([1.0 / trainDat.shape[0] for _x_ in range(trainDat.shape[0])])

    # run main algorithm
    classifierList = list()
    adaptParams = list()
    for k in range(K):

        # Train a decision tree stump on the weighted training data.
        print "Training stump #%d." % (k + 1)
        stump = get_tree(trainDat, 'isgood', exampleWeights, 1, 0)
        classifierList.append(stump)

        # Run predictions on the weighted training data.
        print "Getting training data predictions for stump #%d." % (k + 1)
        predictions = stump.predict(trainDat)

        # Compute the weighted training error.
        trueValues = trainDat['isgood'].values
        if len(predictions) != len(trueValues):
            raise LogicalError, "Method %s, model #%d: predictions have to be as many as the true labels." % (inspect.stack()[0][3], k + 1)
        misclassifiedExampleWeights = [exampleWeights[n] for n in range(len(predictions))
                                       if predictions[n] != trueValues[n]]
        trainingError = np.sum(misclassifiedExampleWeights)  # this is how the weighted training error is defined in AdaBoost

        # Compute and store the "adaptive" parameter a(k).
        currentAdaptParam = 0.5 * np.log((1 - trainingError) / trainingError)
        #if type(currentAdaptParam) != float:
        #    raise LogicalError, "Method %s, model #%d: type of adaptive parameter was %s instead of float." % (inspect.stack()[0][3], k + 1, type(currentAdaptParam))
        adaptParams.append(currentAdaptParam)
        print "Computed adaptive parameter for classifier %d. It is equal to: %.4f" % (k + 1, currentAdaptParam)

        # Update and normalize the example weights. Note that this is an element-wise
        # multiplication, not a dot product; true labels and predictions are assumed
        # to be encoded as +/-1.
        exponent = -currentAdaptParam * np.asarray(trueValues) * np.asarray(predictions)
        try:
            len(exponent)
        except TypeError:
            raise LogicalError, "Method %s: \"exponent\" is not an iterable." % (inspect.stack()[0][3])
        if len(exponent) != trainDat.shape[0]:
            raise LogicalError, "Method %s: our derivation of \"exponent\" should've yielded a numpy.ndarray of size %d at this point." % (inspect.stack()[0][3], trainDat.shape[0])

        multiplier = exampleWeights * np.exp(exponent)
        try:
            len(multiplier)
        except TypeError:
            raise LogicalError, "Method %s: \"multiplier\" is not an iterable." % (inspect.stack()[0][3])
        if len(multiplier) != trainDat.shape[0]:
            raise LogicalError, "Method %s: our derivation of \"multiplier\" should've yielded a numpy.ndarray of size %d at this point." % (inspect.stack()[0][3], trainDat.shape[0])

        # Normalize so that the updated weights sum to 1: the new weight vector is the
        # element-wise product above divided by its sum.
        normalizer = np.sum(multiplier)
        exampleWeights = multiplier / normalizer
        try:
            len(exampleWeights)
        except TypeError:
            raise LogicalError, "Method %s, model #%d: after the update to \"exampleWeights\", this variable no longer represents a numpy.ndarray." % (inspect.stack()[0][3], k + 1)
        if len(exampleWeights) != trainDat.shape[0]:
            raise LogicalError, "Method %s, model #%d: the update to \"exampleWeights\" should've yielded a numpy.ndarray of size %d at this point." % (inspect.stack()[0][3], k + 1, trainDat.shape[0])

    return classifierList, adaptParams
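# Hedged sketch (an addition, not in the original code): how the stumps and adaptive
# parameters returned by adaboost() are typically combined at prediction time, namely
# the sign of the a(k)-weighted sum of the stumps' predictions (the standard AdaBoost
# decision rule). It assumes +/-1 labels and the same predict(DataFrame) interface as
# above; the helper name adaboostPredict is illustrative only.
def adaboostPredict(classifierList, adaptParams, testData):
    # Accumulate the weighted vote of every stump over the test examples.
    weightedSum = np.zeros(testData.shape[0])
    for stump, alpha in zip(classifierList, adaptParams):
        weightedSum += alpha * np.asarray(stump.predict(testData))
    # The ensemble prediction is the sign of the weighted vote.
    return np.sign(weightedSum)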