def main(): records = get_records() records.cache() print "Mapping of first categorical feature column: %s" % get_mapping( records, 2) # extract all the catgorical mappings mappings = [get_mapping(records, i) for i in range(2, 10)] cat_len = sum(map(len, mappings)) num_len = len(records.first()[11:15]) total_len = num_len + cat_len data = records.map(lambda r: LabeledPoint( extract_label(r), extract_features(r, cat_len, mappings))) data_dt = records.map( lambda r: LabeledPoint(extract_label(r), extract_features_dt(r))) cat_features = dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2, 10)]) print "Categorical feature size mapping %s" % cat_features # train the model again dt_model = DecisionTree.trainRegressor( data_dt, categoricalFeaturesInfo=cat_features) preds = dt_model.predict(data_dt.map(lambda p: p.features)) actual = data.map(lambda p: p.label) true_vs_predicted_dt = actual.zip(preds) calculate_print_metrics("Decision Tree Categorical Features", true_vs_predicted_dt)
def get_train_test_data(): records = get_records() records.cache() print "Mapping of first categorical feature column: %s" % get_mapping(records, 2) mappings = [get_mapping(records, i) for i in range(2,10)] for m in mappings: print m cat_len = sum(map(len, mappings)) num_len = len(records.first()[11:15]) total_len = num_len + cat_len data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings))) data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k)) test = data_with_idx.sample(False, 0.2, 42) train = data_with_idx.subtractByKey(test) train_data = train.map(lambda (idx, p): p) test_data = test.map(lambda (idx, p) : p) train_size = train_data.count() test_size = test_data.count() num_data = data.count() print "Training data size: %d" % train_size print "Test data size: %d" % test_size print "Total data size: %d " % num_data print "Train + Test size : %d" % (train_size + test_size) return train_data, test_data
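The split returned here is not evaluated inside the function itself. A minimal sketch of how the held-out set might be used with the linear model, assuming the same helpers as above; the function name and training parameters are illustrative, not from the source:

def evaluate_on_holdout():
    # hypothetical caller for the train/test split above
    train_data, test_data = get_train_test_data()
    linear_model = LinearRegressionWithSGD.train(train_data, iterations=10, step=0.1, intercept=False)
    # evaluate on the points the model has not seen during training
    true_vs_predicted = test_data.map(lambda p: (p.label, linear_model.predict(p.features)))
    calculate_print_metrics("Linear Regression (held-out test set)", true_vs_predicted)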
def main():
    records = get_records()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    # train on log-transformed targets and map the predictions back with exp
    data_dt_log = data_dt.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    dt_model_log = DecisionTree.trainRegressor(data_dt_log, {})
    preds_log = dt_model_log.predict(data_dt_log.map(lambda p: p.features))
    actual_log = data_dt_log.map(lambda p: p.label)
    true_vs_predicted_dt_log = actual_log.zip(preds_log).map(lambda (t, p): (np.exp(t), np.exp(p)))
    calculate_print_metrics("Decision Tree Log", true_vs_predicted_dt_log)
def main():
    records = get_records()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    first_point_dt = data_dt.first()
    print "Decision Tree feature vector: " + str(first_point_dt.features)
    print "Decision Tree feature vector length: " + str(len(first_point_dt.features))
    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())
    calculate_print_metrics("Decision Tree", true_vs_predicted_dt)
def main():
    records = get_records()
    print records.first()
    print records.count()
    records.cache()
    print "Mapping of first categorical feature column: %s" % get_mapping(records, 2)
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    for m in mappings:
        print m
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()
    print "Linear Model feature vector:\n" + str(first_point.features)
    print "Linear Model feature vector length: " + str(len(first_point.features))
    # linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
    linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.025,
                                                 regParam=0.0, regType=None, intercept=False)
    true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
    print "Linear Model predictions: " + str(true_vs_predicted.take(5))
    calculate_print_metrics("Linear Regression", true_vs_predicted)
def main():
    records = get_records()
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    # train on log-transformed targets and transform the predictions back with exp
    data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)
    true_vs_predicted_log = data_log.map(lambda p: (np.exp(p.label), np.exp(model_log.predict(p.features))))
    calculate_print_metrics("Linear Regression Log", true_vs_predicted_log)
def get_train_test_data():
    records = get_records()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    # key each point by its index so a random 20% sample can be subtracted out as the test set
    data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
    test_dt = data_with_idx_dt.sample(False, 0.2, 42)
    train_dt = data_with_idx_dt.subtractByKey(test_dt)
    train_data_dt = train_dt.map(lambda (idx, p): p)
    test_data_dt = test_dt.map(lambda (idx, p): p)
    return train_data_dt, test_data_dt
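As with the linear-model split above, this decision-tree split is only returned, not evaluated. A minimal sketch of a hypothetical caller that trains on the training portion and scores the held-out points; the function name is an assumption:

def evaluate_dt_on_holdout():
    # hypothetical usage of the decision-tree train/test split
    train_data_dt, test_data_dt = get_train_test_data()
    dt_model = DecisionTree.trainRegressor(train_data_dt, {})
    preds = dt_model.predict(test_data_dt.map(lambda p: p.features))
    actual = test_data_dt.map(lambda p: p.label)
    calculate_print_metrics("Decision Tree (held-out test set)", actual.zip(preds))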
def main():
    records = get_records()
    records.cache()
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    rr_model = RidgeRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
    true_vs_predicted_rr = data.map(lambda p: (p.label, rr_model.predict(p.features)))
    print "Ridge Regression Model predictions: " + str(true_vs_predicted_rr.take(5))
    calculate_print_metrics("Ridge Regression", true_vs_predicted_rr)
def main():
    records = get_records()
    records.cache()
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    for m in mappings:
        print m
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    # data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    # isotonic regression expects (label, feature, weight) tuples
    parsed_data = records.map(lambda r: (extract_label(r), extract_sum_feature(r, cat_len, mappings), 1.0))
    model = IsotonicRegression.train(parsed_data)
    first = parsed_data.first()
    print first
    # p[0] is the label and p[1] is the feature value; p[2] is only the weight
    true_vs_predicted = parsed_data.map(lambda p: (p[0], model.predict(p[1])))
    print "Isotonic Regression: " + str(true_vs_predicted.take(5))
    calculate_print_metrics("Isotonic Regression", true_vs_predicted)
def main():
    records = get_records()
    first = records.first()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()
    gbt_model = GradientBoostedTrees.trainRegressor(data, categoricalFeaturesInfo={}, numIterations=3)
    # unused; the evaluation below applies the model to an RDD of features instead
    true_vs_predicted_gbt = data.map(lambda p: (p.label, gbt_model.predict(p.features)))
    predictions = gbt_model.predict(data.map(lambda x: x.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    print "GradientBoosted Trees predictions: " + str(labelsAndPredictions.take(5))
    mse = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(data.count())
    mae = labelsAndPredictions.map(lambda (v, p): np.abs(v - p)).sum() / float(data.count())
    rmsle = np.sqrt(labelsAndPredictions.map(lambda (v, p): (np.log(p + 1) - np.log(v + 1)) ** 2).sum() /
                    float(data.count()))
    print('Gradient Boosted Trees - Mean Squared Error = ' + str(mse))
    print('Gradient Boosted Trees - Mean Absolute Error = ' + str(mae))
    print('Gradient Boosted Trees - Root Mean Squared Log Error = ' + str(rmsle))
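None of the scripts above include their imports or the shared helpers they call (get_records, get_mapping, extract_label, extract_features, extract_features_dt, extract_sum_feature, calculate_print_metrics). A minimal sketch of what those might look like, assuming the UCI Bike Sharing hour.csv data with the header removed and a local SparkContext; the file path, column slices, and the summed feature used for isotonic regression are assumptions, not the original implementation:

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import (LabeledPoint, LinearRegressionWithSGD,
                                      RidgeRegressionWithSGD, IsotonicRegression)
from pyspark.mllib.tree import DecisionTree, GradientBoostedTrees

sc = SparkContext("local[2]", "Bike Sharing Regression")

def get_records():
    # assumption: hour.csv from the UCI Bike Sharing dataset with its header stripped
    raw_data = sc.textFile("data/hour_noheader.csv")
    return raw_data.map(lambda line: line.split(","))

def get_mapping(rdd, idx):
    # map each distinct categorical value in column idx to a numeric index
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

def extract_label(record):
    # assumption: the target (rental count) is the last column
    return float(record[-1])

def extract_features(record, cat_len, mappings):
    # binary-encode the categorical columns 2-9, then append the numeric columns
    cat_vec = np.zeros(cat_len)
    step = 0
    for i, field in enumerate(record[2:10]):
        m = mappings[i]
        cat_vec[m[field] + step] = 1
        step = step + len(m)
    # assumption: four numeric weather columns (temp, atemp, hum, windspeed)
    num_vec = np.array([float(field) for field in record[10:14]])
    return np.concatenate((cat_vec, num_vec))

def extract_features_dt(record):
    # decision trees and GBTs take the raw, unencoded feature values
    return np.array(map(float, record[2:14]))

def extract_sum_feature(record, cat_len, mappings):
    # assumption: collapse the encoded vector to one scalar for isotonic regression
    return float(np.sum(extract_features(record, cat_len, mappings)))

def calculate_print_metrics(label, true_vs_predicted):
    # standard regression metrics over (true, predicted) pairs
    mse = true_vs_predicted.map(lambda (t, p): (t - p) ** 2).mean()
    mae = true_vs_predicted.map(lambda (t, p): np.abs(t - p)).mean()
    rmsle = np.sqrt(true_vs_predicted.map(lambda (t, p): (np.log(p + 1) - np.log(t + 1)) ** 2).mean())
    print "%s - Mean Squared Error: %2.4f" % (label, mse)
    print "%s - Mean Absolute Error: %2.4f" % (label, mae)
    print "%s - Root Mean Squared Log Error: %2.4f" % (label, rmsle)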