def get_train_test_data():
    records = get_records()
    records.cache()
    print "Mapping of first categorical feature column: %s" % get_mapping(records, 2)

    mappings = [get_mapping(records, i) for i in range(2,10)]
    for m in mappings:
        print m
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))
    test = data_with_idx.sample(False, 0.2, 42)
    train = data_with_idx.subtractByKey(test)

    train_data = train.map(lambda (idx, p): p)
    test_data = test.map(lambda (idx, p): p)

    train_size = train_data.count()
    test_size = test_data.count()
    num_data = data.count()
    print "Training data size: %d" % train_size
    print "Test data size: %d" % test_size
    print "Total data size: %d " % num_data
    print "Train + Test size : %d" % (train_size + test_size)

    return train_data, test_data
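# A minimal usage sketch (not part of the original listing; the function name and the
# hyperparameter values are illustrative assumptions): fit a linear model on the training
# split returned above and score the held-out split with the same metrics helper used
# elsewhere in these examples.
def evaluate_linear_holdout():
    train_data, test_data = get_train_test_data()
    model = LinearRegressionWithSGD.train(train_data, iterations=10, step=0.1)
    # pair each held-out label with the model's prediction for its feature vector
    held_out = test_data.map(lambda p: (p.label, model.predict(p.features)))
    calculate_print_metrics("Linear Regression (held-out)", held_out)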
def main():
    records = get_records()
    records.cache()

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2,10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))


    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)

    data_dt_log = data_dt.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    dt_model_log = DecisionTree.trainRegressor(data_dt_log, {})

    preds_log = dt_model_log.predict(data_dt_log.map(lambda p: p.features))
    actual_log = data_dt_log.map(lambda p: p.label)
    true_vs_predicted_dt_log = actual_log.zip(preds_log).map(lambda (t, p): (np.exp(t), np.exp(p)))

    calculate_print_metrics("Decision Tree Log", true_vs_predicted_dt_log)
def main():
    records = get_records()
    records.cache()

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    data_dt = records.map(
        lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    first_point_dt = data_dt.first()
    print "Decision Tree feature vector: " + str(first_point_dt.features)
    print "Decision Tree feature vector length: " + str(
        len(first_point_dt.features))

    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())

    calculate_print_metrics("Decision Tree", true_vs_predicted_dt)
def main():

    records = get_records()
    records.cache()

    print "Mapping of first categorical feature column: %s" % get_mapping(
        records, 2)

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    data_dt = records.map(
        lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    cat_features = dict([(i - 2, len(get_mapping(records, i)) + 1)
                         for i in range(2, 10)])
    print "Categorical feature size mapping %s" % cat_features
    # train the model again
    dt_model = DecisionTree.trainRegressor(
        data_dt, categoricalFeaturesInfo=cat_features)
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)

    calculate_print_metrics("Decision Tree Categorical Features",
                            true_vs_predicted_dt)
def main():
    records = get_records()
    mappings = [get_mapping(records, i) for i in range(2,10)]

    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)
    true_vs_predicted_log = data_log.map(lambda p: (np.exp(p.label), np.exp(model_log.predict(p.features))))
    calculate_print_metrics("Linear Regression Log", true_vs_predicted_log)
def get_train_test_data():
    records = get_records()
    records.cache()

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    data_dt = records.map(
        lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
    test_dt = data_with_idx_dt.sample(False, 0.2, 42)
    train_dt = data_with_idx_dt.subtractByKey(test_dt)

    train_data_dt = train_dt.map(lambda (idx, p): p)
    test_data_dt = test_dt.map(lambda (idx, p): p)
    return train_data_dt, test_data_dt
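# A minimal usage sketch (not part of the original listing; the function name is an
# illustrative assumption): train a decision tree on the raw-feature training split
# returned above and evaluate it on the held-out split.
def evaluate_dt_holdout():
    train_data_dt, test_data_dt = get_train_test_data()
    dt_model = DecisionTree.trainRegressor(train_data_dt, categoricalFeaturesInfo={})
    # tree models predict on a whole RDD of feature vectors at once
    preds = dt_model.predict(test_data_dt.map(lambda p: p.features))
    actual = test_data_dt.map(lambda p: p.label)
    calculate_print_metrics("Decision Tree (held-out)", actual.zip(preds))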
def main():
    records = get_records()
    print records.first()
    print records.count()
    records.cache()
    print "Mapping of first categorical feature column: %s" % get_mapping(
        records, 2)

    mappings = [get_mapping(records, i) for i in range(2, 10)]
    for m in mappings:
        print m
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()

    print "Linear Model feature vector:\n" + str(first_point.features)
    print "Linear Model feature vector length: " + str(
        len(first_point.features))

    #linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
    linear_model = LinearRegressionWithSGD.train(data,
                                                 iterations=10,
                                                 step=0.025,
                                                 regParam=0.0,
                                                 regType=None,
                                                 intercept=False)

    true_vs_predicted = data.map(lambda p:
                                 (p.label, linear_model.predict(p.features)))
    print "Linear Model predictions: " + str(true_vs_predicted.take(5))

    calculate_print_metrics("Linear Regression", true_vs_predicted)
def main():
    records = get_records()
    records.cache()

    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    rr_model = RidgeRegressionWithSGD.train(data,
                                            iterations=10,
                                            step=0.1,
                                            intercept=False)
    true_vs_predicted_rr = data.map(lambda p:
                                    (p.label, rr_model.predict(p.features)))

    print "Ridge Regression Model predictions: " + str(
        true_vs_predicted_rr.take(5))

    calculate_print_metrics("Ridge Regression", true_vs_predicted_rr)
def main():
    records = get_records()
    first = records.first()
    records.cache()

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()

    gbt_model = GradientBoostedTrees.trainRegressor(data,
                                                    categoricalFeaturesInfo={},
                                                    numIterations=3)
    # note: in PySpark, tree-ensemble models cannot call predict() inside an RDD
    # transformation, so predictions are generated by passing the feature RDD to
    # predict() directly below

    predictions = gbt_model.predict(data.map(lambda x: x.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    print "GradientBoosted Trees predictions: " + str(
        labelsAndPredictions.take(5))

    mse = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(data.count())
    mae = labelsAndPredictions.map(lambda (v, p): np.abs(v - p)).sum() /\
        float(data.count())
    rmsle = np.sqrt(labelsAndPredictions.map(
        lambda (v, p): (np.log(p + 1) - np.log(v + 1)) ** 2).sum() / float(data.count()))
    print('Gradient Boosted Trees - Mean Squared Error = ' + str(mse))
    print('Gradient Boosted Trees - Mean Absolute Error = ' + str(mae))
    print('Gradient Boosted Trees - Root Mean Squared Log Error = ' +
          str(rmsle))
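# The helpers referenced throughout these examples (get_records, get_mapping, extract_label,
# extract_features, extract_features_dt, calculate_print_metrics) are not shown in this
# listing.  The sketches below are assumptions inferred from how they are called above
# (categorical columns 2-9, numeric columns 11-14, label in the last column); the data
# path and the SparkContext `sc` are placeholders, not the original definitions.

def get_records():
    # load the raw CSV (path and SparkContext `sc` are assumed) and split each line into fields
    raw_data = sc.textFile("data/hour_noheader.csv")  # hypothetical path
    return raw_data.map(lambda line: line.split(","))

def get_mapping(rdd, idx):
    # map each distinct value of one categorical column to a 0-based index
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

def extract_label(record):
    # the target count is assumed to sit in the last column
    return float(record[-1])

def extract_features(record, cat_len, mappings):
    # 1-of-k encode the categorical columns 2-9 and append the raw numeric columns 11-14
    cat_vec = np.zeros(cat_len)
    step = 0
    for i, field in enumerate(record[2:10]):
        m = mappings[i]
        cat_vec[m[field] + step] = 1
        step = step + len(m)
    num_vec = np.array([float(field) for field in record[11:15]])
    return np.concatenate((cat_vec, num_vec))

def extract_features_dt(record):
    # decision trees take the raw, un-encoded categorical values followed by the numerics
    return np.array([float(field) for field in record[2:10] + record[11:15]])

def calculate_print_metrics(model_name, true_vs_predicted):
    # report mean squared error, mean absolute error and root mean squared log error
    mse = true_vs_predicted.map(lambda (t, p): (t - p) ** 2).mean()
    mae = true_vs_predicted.map(lambda (t, p): np.abs(t - p)).mean()
    rmsle = np.sqrt(true_vs_predicted.map(
        lambda (t, p): (np.log(p + 1) - np.log(t + 1)) ** 2).mean())
    print "%s - Mean Squared Error: %2.4f" % (model_name, mse)
    print "%s - Mean Absolute Error: %2.4f" % (model_name, mae)
    print "%s - Root Mean Squared Log Error: %2.4f" % (model_name, rmsle)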