Example #1
0
def main(**options):
    """
    Run k-fold cross validation for the configured classifier and print
    its accuracy.

    The options listed below are consumed here; ``schema`` is added and
    everything that remains is forwarded to ``get_classifier``.

    :param dataset_directory: directory handed to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name handed to ``get_dataset`` (required).
    :param k: number of folds handed to ``get_folds`` (required).
    :param meta_algorithm: if present, ``meta_iters`` must be present too.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; requires
      ``fs_features``.
    :param fs_features: passed as ``n`` to the feature selector.
    :raises KeyError: if ``dataset`` or ``k`` is missing.
    :raises ValueError: if ``meta_algorithm`` is given without
      ``meta_iters``, or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection counts towards training time)
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        # Elapsed time is "now minus start"; the reverse order would yield
        # a negative duration.
        train_time = time.time() - train_start

        # The test set must go through the same feature selection.
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # get_statistic(..., pooled=False) must yield two values for this format
    print('      Accuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))
    '''
Example #2
0
def main(**options):
    """
    Cross-validate the configured classifier over ``k`` folds and print
    the accuracy statistic.

    Options consumed here (``schema`` is added, and whatever is left is
    passed on to ``get_classifier``):

    :param dataset_directory: directory given to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name given to ``get_dataset`` (required).
    :param k: fold count given to ``get_folds`` (required).
    :param meta_algorithm: when supplied, ``meta_iters`` is mandatory.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; needs
      ``fs_features`` as well.
    :param fs_features: forwarded as ``n`` to the feature selector.
    :raises KeyError: when ``dataset`` or ``k`` is absent.
    :raises ValueError: when ``meta_algorithm`` lacks ``meta_iters`` or
      ``fs_algorithm`` lacks ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError(
                "Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier; the timer also covers feature selection
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        # Subtract the start from the current time so the duration is
        # positive.
        train_time = time.time() - train_start

        # Apply the selector fitted on the training data to the test data.
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]    # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The format string needs two values from get_statistic(pooled=False)
    print('      Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    '''
Example #3
0
def internal_cross_validation(cls, kwargs, paramname, paramrange, statistic,
                              X, y):
    """
    Performs internal cross validation, returns the best parameter value.

    Varies a parameter over a range and returns the one that maximizes a
    statistic in 5-fold cross validation.  The caller's ``kwargs`` dict is
    left untouched; each classifier is built from a copy.

    :param cls: The classifier class to use for cross validation.
    :param kwargs: The options (other than the parameter we're varying) to use
      for the classifier's constructor.
    :param paramname: The name of the constructor parameter we'll be varying.
    :param paramrange: The values we will vary it through (any iterable).
    :param statistic: Name of the statistic to use for decision.
    :param X: examples-by-features NumPy matrix
    :param y: vector of class labels
    :returns: the element of ``paramrange`` with the highest pooled value
      of ``statistic`` (the first one on ties, per ``np.argmax``).
    :raises ValueError: if ``paramrange`` is empty.
    """

    # Delay these imports so that we don't have circular imports!
    from main import get_folds
    from stats import StatisticsManager

    # Materialize once: the range is walked once per fold and indexed at the
    # end, which a one-shot iterator could not support.
    candidates = list(paramrange)
    if not candidates:
        raise ValueError("paramrange must contain at least one value")

    # Much of this code is sourced from main.py's template.  It simply creates
    # a StatisticsManager for each parameter value.  It does the cross
    # validation on the same folds and picks the best value of the parameter.
    stats_managers = [StatisticsManager() for _ in candidates]
    folds = get_folds(X, y, 5)
    for train_X, train_y, test_X, test_y in folds:
        for value, stats_manager in zip(candidates, stats_managers):
            # Build the constructor arguments on a copy so the dict passed in
            # by the caller is not modified as a side effect.
            params = dict(kwargs)
            params[paramname] = value
            train_start = time.time()
            classifier = cls(**params)
            classifier.fit(train_X, train_y)
            # now - start, so the recorded duration is positive
            train_time = time.time() - train_start
            predictions = classifier.predict(test_X)
            scores = classifier.predict_proba(test_X)
            stats_manager.add_fold(test_y, predictions, scores, train_time)
        log.debug('internal-cv: fold completed')

    # Get values for our statistic of interest.
    stat_values = []
    for value, mgr in zip(candidates, stats_managers):
        # pooled might as well be True, since we don't want a std
        stat = mgr.get_statistic(statistic, pooled=True)
        stat_values.append(stat)
        log.debug('internal-cv gets %s=%r for param %s=%r' %
                  (statistic, stat, paramname, value))

    # Get the parameter value that maximizes our statistic.
    best_index = np.argmax(stat_values)
    log.debug('internal-cv gets argmax=%d' % best_index)
    selection = candidates[best_index]
    log.info('internal-cv selects %s=%r' % (paramname, selection))
    return selection
Example #4
0
def main(**options):
    """
    Run k-fold cross validation with one worker process per fold and print
    accuracy, precision, recall and area under ROC.

    Options consumed here (``schema`` is added, and the remaining options
    are shipped to ``train_and_evaluate`` together with each fold):

    :param dataset_directory: directory handed to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name handed to ``get_dataset`` (required).
    :param k: number of folds, also used as the worker pool size (required).
    :param meta_algorithm: if present, ``meta_iters`` must be present too.
    :param fs_algorithm: if present, ``fs_features`` must be present too.
    :param fs_features: removed from the options together with
      ``fs_algorithm``.
    :raises KeyError: if ``dataset`` or ``k`` is missing.
    :raises ValueError: if ``meta_algorithm`` is given without
      ``meta_iters``, or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting.
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        # NOTE(review): the feature-selection settings are validated and
        # removed from the options but not passed to the workers, so feature
        # selection appears to be ignored in this parallel path - confirm.
        options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    pool = mp.Pool(k)  # one process per fold
    try:
        results = pool.map(train_and_evaluate,
                           [(fold, options) for fold in folds])
    finally:
        # Always release the worker processes, even if a fold raised.
        pool.close()
        pool.join()

    for test_y, predictions, scores, train_time in results:
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    accuracy, std_dev = stats_manager.get_statistic(
        'accuracy',
        pooled=False,
    )
    print('      Accuracy: %.03f %.03f' % (accuracy, std_dev))
    precision, std_dev = stats_manager.get_statistic(
        'precision',
        pooled=False,
    )
    print('     Precision: %.03f %.03f' % (precision, std_dev))
    recall, std_dev = stats_manager.get_statistic(
        'recall',
        pooled=False,
    )
    print('        Recall: %.03f %.03f' % (recall, std_dev))
    area = stats_manager.get_statistic('auc', pooled=True)
    print('Area under ROC: %.03f' % area)
Example #5
0
def main(**options):
    """
    Run k-fold cross validation for the configured classifier and print
    accuracy, precision, recall and area under ROC.

    Options consumed here (everything left over is forwarded to
    ``get_classifier``; the schema is passed to ``fit`` instead):

    :param dataset_directory: directory handed to ``get_dataset``
      (defaults to ``"."``).
    :param dataset: dataset name handed to ``get_dataset`` (required).
    :param k: number of folds handed to ``get_folds`` (required).
    :param meta_algorithm: if present, ``meta_iters`` must be present too.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; requires
      ``fs_features``.
    :param fs_features: passed as ``n`` to the feature selector.
    :raises KeyError: if ``dataset`` or ``k`` is missing.
    :raises ValueError: if ``meta_algorithm`` is given without
      ``meta_iters``, or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop("dataset_directory", ".")
    dataset = options.pop("dataset")
    k = options.pop("k")

    # Make sure they use --meta-iters if they want to do bagging/boosting
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection counts towards training time)
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse gives a negative
        # duration.
        train_time = time.time() - train_start

        # The test set must go through the same feature selection.
        if fs_alg:
            test_X = selector.transform(test_X)

        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The pooled=False statistics must each yield two values for these formats
    print("      Accuracy: %.03f %.03f" % stats_manager.get_statistic("accuracy", pooled=False))

    print("     Precision: %.03f %.03f" % stats_manager.get_statistic("precision", pooled=False))

    print("        Recall: %.03f %.03f" % stats_manager.get_statistic("recall", pooled=False))

    print("Area under ROC: %.03f" % stats_manager.get_statistic("auc", pooled=True))
Example #6
0
def main(**options):
    """
    Run k-fold cross validation for the configured classifier and print
    its accuracy plus the largest ``size`` and ``depth`` seen in any fold.

    Options consumed here (everything left over is forwarded to
    ``get_classifier``; the schema is attached to the classifier and also
    passed to ``fit``):

    :param dataset_directory: directory handed to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name handed to ``get_dataset`` (required).
    :param k: number of folds handed to ``get_folds`` (required).
    :param meta_algorithm: if present, ``meta_iters`` must be present too.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; requires
      ``fs_features``.
    :param fs_features: passed as ``n`` to the feature selector.
    :raises KeyError: if ``dataset`` or ``k`` is missing.
    :raises ValueError: if ``meta_algorithm`` is given without
      ``meta_iters``, or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    # Largest size/depth over the k folds, printed at the end.  They stay at
    # -1 when there are no folds.
    maxSize = -1
    maxDepth = -1
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Train classifier (feature selection counts towards training time)
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # Note that fit takes the schema as a third argument here
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse gives a negative
        # duration.
        train_time = time.time() - train_start

        # Maintenance to keep track of the maximum size and depth
        maxSize = max(maxSize, classifier.size)
        maxDepth = max(maxDepth, classifier.depth)

        # The test set must go through the same feature selection.
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The printouts specified by the assignments
    print('\tAccuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % maxSize)
    print("\tMaximum Depth: %d" % maxDepth)
Example #7
0
def main(**options):
    """
    Run k-fold cross validation with randomly flipped training labels and
    print accuracy, precision, recall and area under ROC.

    Options consumed here (``schema`` is added, and everything left over,
    including ``flip_probs``, is forwarded to ``get_classifier``):

    :param dataset_directory: directory handed to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name handed to ``get_dataset`` (required).
    :param k: number of folds handed to ``get_folds`` (required).
    :param flip_probs: threshold compared against uniform random draws;
      training labels whose draw falls below it are multiplied by -1
      (read inside the fold loop).
    :param meta_algorithm: if present, ``meta_iters`` must be present too.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; requires
      ``fs_features``.
    :param fs_features: passed as ``n`` to the feature selector.
    :raises KeyError: if ``dataset``, ``k`` or ``flip_probs`` is missing.
    :raises ValueError: if ``meta_algorithm`` is given without
      ``meta_iters``, or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError(
            "Please indicate number of iterations for {}".format(
                options["meta_algorithm"], ), )

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError(
                "Please indicate number of features for {}".format(fs_alg), )
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    options['schema'] = schema
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (feature selection and label flipping count
        # towards training time)
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)

        # Flip on a private copy: the fold's label array may share memory
        # with the dataset labels, and flipping in place could then corrupt
        # labels used by other folds.
        train_y = train_y.copy()
        flip_mask = np.random.rand(len(train_y)) < options['flip_probs']
        train_y[flip_mask] = -1 * train_y[flip_mask]
        classifier.fit(train_X, train_y)
        # Elapsed time is "now minus start"; the reverse gives a negative
        # duration.
        train_time = time.time() - train_start

        # The test set must go through the same feature selection.
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        # Number of correct predictions and fold size, for quick inspection
        print(len(np.where(predictions == test_y)[0]), len(test_y))
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print(
        '      Accuracy: %.03f %.03f' %
        stats_manager.get_statistic('accuracy', pooled=False), )

    print(
        '     Precision: %.03f %.03f' %
        stats_manager.get_statistic('precision', pooled=False), )

    print(
        '        Recall: %.03f %.03f' %
        stats_manager.get_statistic('recall', pooled=False), )

    print(
        'Area under ROC: %.03f' %
        stats_manager.get_statistic('auc', pooled=True), )
Example #8
0
def main(**options):
    """
    Cross-validate the configured classifier over ``k`` folds, then print
    the accuracy and the maximum ``size`` and ``depth`` over all folds.

    Options consumed here (the rest go to ``get_classifier``; the schema is
    set on the classifier and also handed to ``fit``):

    :param dataset_directory: directory given to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name given to ``get_dataset`` (required).
    :param k: fold count given to ``get_folds`` (required).
    :param meta_algorithm: when supplied, ``meta_iters`` is mandatory.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; needs
      ``fs_features`` as well.
    :param fs_features: forwarded as ``n`` to the feature selector.
    :raises KeyError: when ``dataset`` or ``k`` is absent.
    :raises ValueError: when ``meta_algorithm`` lacks ``meta_iters`` or
      ``fs_algorithm`` lacks ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    # Make sure they use --meta-iters if they want to do bagging/boosting
    if "meta_algorithm" in options and "meta_iters" not in options:
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    # Running maxima of size/depth across the k folds, reported at the end.
    # Both remain -1 if no fold is produced.
    maxSize = -1
    maxDepth = -1
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Train classifier; the timer also covers feature selection
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # Note that fit takes the schema as a third argument here
        classifier.fit(train_X, train_y, schema)
        # Subtract the start from the current time so the duration is
        # positive.
        train_time = time.time() - train_start

        # Maintenance to keep track of the maximum size and depth
        maxSize = max(maxSize, classifier.size)
        maxDepth = max(maxDepth, classifier.depth)

        # Apply the selector fitted on the training data to the test data.
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]    # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The printouts specified by the assignments
    print('\tAccuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % maxSize)
    print("\tMaximum Depth: %d" % maxDepth)
Example #9
0
def main(**options):
    """
    Run k-fold cross validation for the configured classifier, printing
    the first test of each learned classifier and, at the end, the
    accuracy and the average size and depth.

    Options consumed here (``schema`` is added, and everything left over
    is forwarded to ``get_classifier``):

    :param dataset_directory: directory handed to ``get_dataset``
      (defaults to ``'.'``).
    :param dataset: dataset name handed to ``get_dataset`` (required).
    :param k: number of folds handed to ``get_folds`` (required).
    :param meta_algorithm: if present, ``meta_iters`` must be present too.
    :param fs_algorithm: optional key into ``FS_ALGORITHMS``; requires
      ``fs_features``.
    :param fs_features: passed as ``n`` to the feature selector.
    :raises KeyError: if ``dataset`` or ``k`` is missing.
    :raises ValueError: if ``meta_algorithm`` is given without
      ``meta_iters``, or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    fs_n = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    # put schema in the options so the classifier has it
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    sizes = []
    depths = []
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        classifier = get_classifier(**options)

        # Train classifier (feature selection counts towards training time)
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)

        classifier.fit(train_X, train_y)
        # Stop the clock right after fitting so the diagnostic output below
        # is not counted, and subtract start from now so the duration is
        # positive.
        train_time = time.time() - train_start

        # Print the first test of each learned classifier
        first_test = classifier.root.feature
        print_str = schema.feature_names[first_test]
        # NOTE(review): a split value of exactly 0 is falsy and would be
        # omitted here - confirm whether ``is not None`` is intended.
        if classifier.root.split:
            print_str += " <= %f" % classifier.root.split
        print("First test: %s" % print_str)
        sizes.append(classifier.size())
        depths.append(classifier.depth())

        # The test set must go through the same feature selection.
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print('      Accuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))
    print('      Average Size: %.03f' % np.mean(sizes))
    print('      Average Depth: %.03f' % np.mean(depths))
    '''