def main(**options):
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
def internal_cross_validation(cls, kwargs, paramname, paramrange, statistic,
                              X, y):
    """
    Performs internal cross validation, returning the best parameter value.

    Varies a parameter over a range and returns the value that maximizes a
    statistic in 5-fold cross validation.

    :param cls: The classifier class to use for cross validation.
    :param kwargs: The options (other than the parameter we're varying) to
        use for the classifier's constructor.
    :param paramname: The name of the constructor parameter we'll be varying.
    :param paramrange: The range of values we will vary it through.
    :param statistic: Name of the statistic to use for the decision.
    :param X: examples-by-features NumPy matrix
    :param y: vector of class labels
    """
    # Delay these imports so that we don't have circular imports!
    from main import get_folds
    from stats import StatisticsManager

    # Much of this code is sourced from main.py's template. It simply creates
    # a StatisticsManager for each parameter value. It does the cross
    # validation on the same folds and picks the best value of the parameter.
    stats_managers = [StatisticsManager() for _ in paramrange]
    folds = get_folds(X, y, 5)

    for train_X, train_y, test_X, test_y in folds:
        for value, stats_manager in zip(paramrange, stats_managers):
            kwargs[paramname] = value
            train_start = time.time()
            classifier = cls(**kwargs)
            classifier.fit(train_X, train_y)
            train_time = time.time() - train_start
            predictions = classifier.predict(test_X)
            scores = classifier.predict_proba(test_X)
            stats_manager.add_fold(test_y, predictions, scores, train_time)
        log.debug('internal-cv: fold completed')

    # Get values for our statistic of interest.
    stat_values = []
    for i, mgr in enumerate(stats_managers):
        # pooled might as well be True, since we don't want a std
        stat = mgr.get_statistic(statistic, pooled=True)
        stat_values.append(stat)
        log.debug('internal-cv gets %s=%r for param %s=%r'
                  % (statistic, stat, paramname, paramrange[i]))
    log.debug('internal-cv gets argmax=%d' % np.argmax(stat_values))

    # Get the parameter value that maximizes our statistic.
    selection = paramrange[np.argmax(stat_values)]
    log.info('internal-cv selects %s=%r' % (paramname, selection))
    return selection
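# Hypothetical usage sketch (an assumption, not part of the original source):
# how a caller might use internal_cross_validation to tune one hyperparameter
# before refitting on all of the training data. _ThresholdToy is a toy
# stand-in defined here only so the sketch is self-contained; a real
# classifier from this project would be passed instead. The parameter name
# 'threshold' and the candidate values are illustrative.
class _ThresholdToy(object):
    """Toy classifier: predicts +1 when the first feature exceeds a threshold."""

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def fit(self, X, y):
        pass  # nothing to learn in this toy example

    def predict(self, X):
        # Assumes the project's -1/+1 label convention.
        return np.where(np.asarray(X)[:, 0] > self.threshold, 1, -1)

    def predict_proba(self, X):
        # Use the raw feature value as a confidence score.
        return np.asarray(X)[:, 0]


def _example_internal_cv(X, y):
    # Pick the threshold maximizing 5-fold accuracy, then refit on all data.
    best = internal_cross_validation(
        _ThresholdToy, {}, 'threshold', [0.25, 0.5, 0.75], 'accuracy', X, y)
    classifier = _ThresholdToy(threshold=best)
    classifier.fit(X, y)
    return classifier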
def main(**options):
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting.
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    pool = mp.Pool(k)  # one process per fold -- CPU gogogo
    results = pool.map(train_and_evaluate,
                       [(fold, options) for fold in folds])
    for test_y, predictions, scores, train_time in results:
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    accuracy, std_dev = stats_manager.get_statistic('accuracy', pooled=False)
    print(' Accuracy: %.03f %.03f' % (accuracy, std_dev))
    precision, std_dev = stats_manager.get_statistic('precision', pooled=False)
    print(' Precision: %.03f %.03f' % (precision, std_dev))
    recall, std_dev = stats_manager.get_statistic('recall', pooled=False)
    print(' Recall: %.03f %.03f' % (recall, std_dev))
    area = stats_manager.get_statistic('auc', pooled=True)
    print('Area under ROC: %.03f' % area)
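# Hypothetical sketch (an assumption -- the real train_and_evaluate that
# pool.map calls above is defined elsewhere and is not shown in this excerpt).
# This version simply mirrors the per-fold loop of the non-parallel main()
# and returns the (test_y, predictions, scores, train_time) tuple the caller
# unpacks.
def train_and_evaluate(args):
    fold, options = args                      # pool.map passes one argument
    train_X, train_y, test_X, test_y = fold
    train_start = time.time()
    classifier = get_classifier(**options)
    classifier.fit(train_X, train_y)
    train_time = time.time() - train_start
    predictions = classifier.predict(test_X)
    scores = classifier.predict_proba(test_X)
    if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
        scores = scores[:, 1]                 # get the column for label 1
    return test_y, predictions, scores, train_time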
def main(**options):
    dataset_directory = options.pop("dataset_directory", ".")
    dataset = options.pop("dataset")
    k = options.pop("k")

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    # import pdb; pdb.set_trace()

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # This version passes the schema directly to fit()
        classifier.fit(train_X, train_y, schema)
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(" Accuracy: %.03f %.03f"
          % stats_manager.get_statistic("accuracy", pooled=False))
    print(" Precision: %.03f %.03f"
          % stats_manager.get_statistic("precision", pooled=False))
    print(" Recall: %.03f %.03f"
          % stats_manager.get_statistic("recall", pooled=False))
    print("Area under ROC: %.03f"
          % stats_manager.get_statistic("auc", pooled=True))
def main(**options):
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    # import pdb; pdb.set_trace()

    # Keep track of the maximum size and depth over the k folds, to print at
    # the end.
    maxSize = -1
    maxDepth = -1

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # Note that fit was changed to take in the schema
        classifier.fit(train_X, train_y, schema)
        train_time = time.time() - train_start

        # To see the values and confidences of the root node:
        # for attrVal, child in classifier.treeHead.children.iteritems():
        #     print("%d with confidence %f" % (attrVal, child.classLabelConfidence))

        # Bookkeeping to keep track of the maximum size and depth
        if classifier.size > maxSize:
            maxSize = classifier.size
        if classifier.depth > maxDepth:
            maxDepth = classifier.depth

        # For testing purposes, the train_time can be printed:
        # print("train time: %f" % train_time)
        # For the spam and voting tests, the root attribute was printed:
        # print("Root Attribute: [%d] %s" % (classifier.treeHead.attribute,
        #                                    schema.feature_names[classifier.treeHead.attribute]))

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The printouts specified by the assignment
    print('\tAccuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % maxSize)
    print("\tMaximum Depth: %d" % maxDepth)
def main(**options):
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError(
            "Please indicate number of iterations for {}".format(
                options["meta_algorithm"],
            ),
        )

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError(
                "Please indicate number of features for {}".format(fs_alg),
            )
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    options['schema'] = schema

    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)

        # Inject label noise: flip each training label (assumed to be -1/+1)
        # with probability flip_probs.
        randoms = np.random.rand(len(train_y))
        train_y[randoms < options['flip_probs']] = \
            -1 * train_y[randoms < options['flip_probs']]

        classifier.fit(train_X, train_y)
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        print(len(np.where(predictions == test_y)[0]), len(test_y))
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(
        ' Accuracy: %.03f %.03f'
        % stats_manager.get_statistic('accuracy', pooled=False),
    )
    print(
        ' Precision: %.03f %.03f'
        % stats_manager.get_statistic('precision', pooled=False),
    )
    print(
        ' Recall: %.03f %.03f'
        % stats_manager.get_statistic('recall', pooled=False),
    )
    print(
        'Area under ROC: %.03f'
        % stats_manager.get_statistic('auc', pooled=True),
    )
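# Hypothetical helper sketch (not in the original source): the label-flipping
# step above, factored into a reusable function. Assumes labels are -1/+1, as
# the in-place negation in main() implies.
def flip_labels(y, flip_prob, rng=np.random):
    """Return a copy of y with each label flipped with probability flip_prob."""
    y = np.array(y, copy=True)
    mask = rng.rand(len(y)) < flip_prob
    y[mask] = -y[mask]
    return y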
def main(**options):
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s"
                             % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    # Put the schema in the options so the classifier has it
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    sizes = []
    depths = []
    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        classifier = get_classifier(**options)

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)

        first_test = classifier.root.feature
        print_str = schema.feature_names[first_test]
        if classifier.root.split:
            print_str += " <= %f" % classifier.root.split
        # Print the first test of each learned classifier
        print("First test: %s" % print_str)

        train_time = time.time() - train_start
        sizes.append(classifier.size())
        depths.append(classifier.depth())

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(' Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print(' Average Size: %.03f' % np.mean(sizes))
    print(' Average Depth: %.03f' % np.mean(depths))