Code example #1
File: kmeans.py Project: CerJesus/CS221FinalProject
def trainAndTest():
    """Clusters the training data with k-means, fits a boosted regression to
    the per-example clustering losses, and prints the validation error for
    each held-out segment.
    """
    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')
    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')
    train_examples = []
    k_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))
        k_examples.append(feature_vector)
    # Train a k-means model on each training split and evaluate the
    # boosted predictor's error on the held-out validation segment

    random.shuffle(train_examples)
    for i in range(0, NUM_SPLITS, 2):
        startTest = i * len(train_examples) // NUM_SPLITS
        endTest = (i + 1) * len(train_examples) // NUM_SPLITS
        currentTrainExamples = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans(currentTrainExamples, NUM_CLUSTERS, 500)

        currentBoostedExamples = [(currentTrainExamples[ind][0],
                                   loss_list[ind])
                                  for ind in range(len(currentTrainExamples))]

        boostedRegPredictor = learnBoostedRegression(
            currentBoostedExamples, 500, 0.00000000001, num_trees=NUM_B_TREES)

        pre_computed_centroid_dots = [
            util.dotProduct(centroids[ind], centroids[ind])
            for ind in range(NUM_CLUSTERS)
        ]

        def kmeanspredictor(x):
            assignment = 0
            min_dist = float('inf')
            for j in range(NUM_CLUSTERS):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[j], x) + pre_computed_centroid_dots[j]
                if cur_dist < min_dist:
                    assignment = j
                    min_dist = cur_dist
            return centroid_vals[assignment]

        def boostedKPredictor(x):
            return kmeanspredictor(x) + boostedRegPredictor(x)

        print "leaving out the", (
            i + 1
        ), "th segment of the data, the validation error for the regression is:", util.evaluatePredictor(
            boostedKPredictor, train_examples[startTest:endTest])
Code example #2
def trainAndTest():
    """Trains neighborhood clustering and prints its results.
    """
    # Import the training data as a numpy array
    train_array = util.csvAsArray('data/neighborhood_data_final_w_loc.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/neighborhood_data_final_w_loc.csv')

    train_examples = []
    names = []
    for i in range(len(train_array)):
        feature_count  = range(2, len(train_array[i]))
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = util.featurize(feature_values, feature_names[1:])
        train_examples.append(feature_vector)
        names.append(train_array[i][1])

    # Cluster the training data with k-means for a range of cluster
    # counts and pickle the results of each run
    for i in range(STEP_SIZE, MAX_NUM_CLUSTERS + 1, STEP_SIZE):
        (centroids, assign, loss, loss_list, centroid_vals) \
                = kmeans(full_examples=train_examples, K=i, maxIters=500)
        filename = "neighborhood_centroids" + str(i) + ".p"
        pickle.dump((centroids, assign, loss, loss_list, centroid_vals, names),
                open(os.path.join("neighborhood_centroids", filename), "wb"))
        print names, assign
Code example #3
def trainAndEvaluate():
    """Trains a baseline predictor and prints its mean squared error.
    """
    # Import the training and test data as numpy matrices
    train_array = csvAsArray('data/train.csv')

    # Format the training data as a list of (input, output) tuples
    train_examples = []
    for i in range(len(train_array)):
        input_size = range(len(train_array[i]) - 1)
        # Use a list, not a generator: the examples are consumed by both
        # the baseline and oracle evaluations below
        input_data = [train_array[i][j] for j in input_size]
        # Column 80 holds the target value, scaled to thousands
        output     = train_array[i][80] / 1000.0
        train_examples.append((input_data, output))

    # Define predictor functions for baseline and oracle
    baseline     = learnBaseline(train_array)
    oracle_train = learnOracle(train_examples)

    # Evaluate mean squared error of predictors
    baseline_error = evaluatePredictor(baseline, train_examples)
    oracle_error   = evaluatePredictor(oracle_train, train_examples)

    # Print the results
    print ""
    print "-------------------"
    print "BASELINE AND ORACLE"
    print "-------------------"
    print "Number of examples:    ", len(train_examples)
    print "Baseline (median) MSE: ", baseline_error
    print "Oracle MSE:            ", oracle_error
    print ""
Code example #4
def crossValidate(predictor, num_folds):
    """Performs k-fold cross validation on a specified predictor function and
    prints the results.

    Args:
        predictor (func): A predictor function.
        num_folds (int): Number of data folds for cross-validation.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = getCsvHeaders('data/train_updated.csv')

    # Convert the training array into ([features], value) example tuples
    train_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    # Randomize the order of the example tuples to aid validation
    random.shuffle(train_examples)

    # Validation on each fold
    validation_set_size = len(train_examples) // num_folds
    for fold in range(num_folds):

        # Create training and validation sets
        validation_start = fold * validation_set_size
        validation_end = validation_start + validation_set_size
        validation_set = train_examples[validation_start:validation_end]
        training_set = train_examples[:validation_start] + train_examples[
            validation_end:]

        # Train a regression model on the training data and evaluate its mean
        # squared error with the validation set
        tuning_parameter = 1
        predictor_fn = predictor(training_set, 1, 0.01, tuning_parameter)
        regression_error = evaluatePredictor(predictor_fn, validation_set)

        # Print the results
        print ""
        print "----------"
        print "REGRESSION"
        print "----------"
        print "Lambda: ", tuning_parameter
        print "Number of examples: ", len(train_examples)
        print "Regression MSE:     ", regression_error
        print ""
Code example #5
def trainAndEvaluate():
    """Trains a linear regression predictor and prints its mean squared error.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = getCsvHeaders('data/train_updated.csv')
    train_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    test = train_examples[:len(train_examples) // 10]
    train = train_examples[len(train_examples) // 10:]

    # Train a regression model on the training data and evaluate its mean
    # squared error with the test data
    # Sweep the lasso penalty over lambda = 0.5, 1.0, 1.5, 2.0
    for tuning_parameter in range(5, 21, 5):
        tuning_parameter = tuning_parameter / 10.0
        regressionPredictor = learnRegression(train, 500, 0.00000000001,
                                              tuning_parameter)
        regression_error = evaluatePredictor(regressionPredictor, test)

        # Print the results
        print ""
        print "----------"
        print "REGRESSION"
        print "----------"
        print "Lambda (lasso): ", tuning_parameter
        print "Number of examples: ", len(train_examples)
        print "Regression MSE:     ", regression_error
        print ""
Code example #6
def trainAndEvaluate():
    """Trains a gradient-boosted linear regression predictor and prints its
    mean squared error.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = getCsvHeaders('data/train_updated.csv')
    train_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    test = train_examples[:len(train_examples) // 10]
    train_examples = train_examples[len(train_examples) // 10:]

    # Train a regression model on the training data and evaluate its mean
    # squared error with the test data
    boostedRegressionPredictor = learnBoostedRegression(
        train_examples, 500, 0.000000001, num_trees=5)
    regression_error = evaluatePredictor(boostedRegressionPredictor, test)

    # Print the results
    print ""
    print "------------------"
    print "BOOSTED REGRESSION"
    print "------------------"
    print "Number of examples: " + str(len(train_examples))
    print "Regression MSE:     " + str(regression_error)
    print ""
Code example #7
def trainAndTest():
    """Defines K-means clustering and perform clustered regression.
    """
    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')

    train_examples = []

    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = util.featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)

    for i in range(1, NUM_SPLITS, 2):
        startTest = i * len(train_examples) // NUM_SPLITS
        endTest = (i + 1) * len(train_examples) // NUM_SPLITS
        currentTrain = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        currentTest = train_examples[startTest:endTest]

        # Cluster the data using k-means
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans.kmeans(currentTrain, NUM_CENTROIDS, K_ITERS)

        # Make clusters
        cluster_list = [[] for _ in range(len(centroids))]

        for j in range(len(currentTrain)):
            cluster_list[assign[j]].append(currentTrain[j])

        # Train a boosted regression model on each cluster and evaluate
        # the combined predictor's mean squared error on the held-out segment
        regression_error = 0
        predictor_list = []
        pre_computed_centroid_dots = [
            util.dotProduct(centroids[k], centroids[k])
            for k in range(len(centroids))
        ]

        for cluster_points in cluster_list:
            boostedRegressionPredictor = boostedtree.learnBoostedRegression(
                cluster_points, SGD_ITERS, ETA, 5, 0)
            predictor_list.append(boostedRegressionPredictor)

        def predictor(x):
            # Assign x to its nearest centroid, then defer to that
            # cluster's boosted regression model
            assignment = 0
            min_dist = float('inf')
            for k in range(len(centroids)):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[k], x) + pre_computed_centroid_dots[k]
                if cur_dist < min_dist:
                    assignment = k
                    min_dist = cur_dist
            return predictor_list[assignment](x)

        regression_error = boostedtree.evaluatePredictor(
            predictor, currentTest)
        #regression_error /= len(train_examples)

        # Print the results
        print ""
        print "------------------"
        print "CLUSTERED REGRESSION WITH BOOSTING"
        print "------------------"
        print "Leaving out segment: " + str(i)
        print "Number of centroids: " + str(10)
        print "Number of examples: " + str(len(train_examples))
        print "Regression MSE:     " + str(regression_error)
        print ""

    return predictor_list, centroids, regression_error