def test_logistic_random_data(self):
    """Check the LogisticRegression under test against a reference model.

    Data comes from ``get_classification_data(n_classes=2)``. Both the
    ``LogisticRegression(sparkSession)`` estimator and a reference
    ``linear_model.LogisticRegression`` are fitted on the training split,
    and their test-split predictions are handed to ``test_accuracy_score``
    together with the true labels and the value ``0.95``.
    """
    X_train, X_test, y_train, y_test = get_classification_data(n_classes=2)

    # Estimator under test.
    logistic = LogisticRegression(sparkSession)
    logistic.fit(X_train, y_train)
    mllearn_predicted = logistic.predict(X_test)

    # Reference estimator fitted on exactly the same split.
    sklearn_logistic = linear_model.LogisticRegression()
    sklearn_logistic.fit(X_train, y_train)
    reference_predicted = sklearn_logistic.predict(X_test)

    # NOTE(review): 0.95 is presumably the minimum acceptable agreement;
    # confirm against the definition of test_accuracy_score.
    # assertTrue replaces the deprecated ``failUnless`` alias, which no
    # longer exists on unittest.TestCase in Python 3.12+.
    self.assertTrue(
        test_accuracy_score(reference_predicted, mllearn_predicted, y_test, 0.95))
def testLogisticSK1(self):
    """Fit on the first 90% of the digits data and score on the rest.

    Uses the dataset returned by ``datasets.load_digits()`` and a
    ``LogisticRegression(sqlCtx)`` estimator; the test passes when the
    value returned by ``score`` on the held-out samples exceeds 0.9.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)

    # ``.9 * n_samples`` is a float; slice bounds must be integers, so
    # truncate once and reuse the same cut point for features and labels.
    split = int(.9 * n_samples)
    X_train, X_test = X_digits[:split], X_digits[split:]
    y_train, y_test = y_digits[:split], y_digits[split:]

    logistic = LogisticRegression(sqlCtx)
    score = logistic.fit(X_train, y_train).score(X_test, y_test)

    # assertGreater replaces the deprecated ``failUnless`` alias (absent in
    # Python 3.12+) and reports the actual score on failure.
    self.assertGreater(score, 0.9)
def testLogisticSK2(self):
    """Same 90/10 digits check, with ``transferUsingDF=True``.

    Uses the dataset returned by ``datasets.load_digits()`` and a
    ``LogisticRegression(sqlCtx, transferUsingDF=True)`` estimator; the
    test passes when the value returned by ``score`` on the held-out
    samples exceeds 0.9.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)

    # ``.9 * n_samples`` is a float; slice bounds must be integers, so
    # truncate once and reuse the same cut point for features and labels.
    split = int(.9 * n_samples)
    X_train, X_test = X_digits[:split], X_digits[split:]
    y_train, y_test = y_digits[:split], y_digits[split:]

    # Convert to DataFrame for i/o: current way to transfer data
    logistic = LogisticRegression(sqlCtx, transferUsingDF=True)
    score = logistic.fit(X_train, y_train).score(X_test, y_test)

    # assertGreater replaces the deprecated ``failUnless`` alias (absent in
    # Python 3.12+) and reports the actual score on failure.
    self.assertGreater(score, 0.9)
def test_logistic(self):
    """Compare predictions with a reference model on the digits data.

    The first 90% of the samples from ``datasets.load_digits()`` are used
    for fitting and the remainder for prediction. The test passes when
    ``accuracy_score`` between the reference model's predictions and the
    predictions of ``LogisticRegression(sparkSession)`` exceeds 0.95.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)

    # One integer cut point shared by features and labels.
    split = int(.9 * n_samples)
    X_train, X_test = X_digits[:split], X_digits[split:]
    y_train = y_digits[:split]
    y_test = y_digits[split:]  # kept for parity with the original; unused below

    logistic = LogisticRegression(sparkSession)
    logistic.fit(X_train, y_train)
    mllearn_predicted = logistic.predict(X_test)

    sklearn_logistic = linear_model.LogisticRegression()
    sklearn_logistic.fit(X_train, y_train)

    # We are comparable to a similar algorithm in scikit learn.
    # assertGreater replaces the deprecated ``failUnless`` alias (absent in
    # Python 3.12+) and reports the actual agreement on failure.
    agreement = accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted)
    self.assertGreater(agreement, 0.95)
def testLogisticMLPipeline1(self):
    """Run LogisticRegression as the last stage of a three-stage Pipeline.

    A ``Tokenizer`` -> ``HashingTF`` (20 features) -> ``LogisticRegression``
    pipeline is fitted on twelve labelled text rows and applied to four
    further rows. The test passes when ``MulticlassClassificationEvaluator``
    returns exactly 1.0 for the resulting (prediction, label) pairs.
    """
    columns = ["text", "label"]
    training_rows = [
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0),
    ]
    training = sqlCtx.createDataFrame(training_rows, columns)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    lr = LogisticRegression(sqlCtx)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)

    test_rows = [
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0),
    ]
    test = sqlCtx.createDataFrame(test_rows, columns)

    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)

    # assertEqual replaces the deprecated ``failUnless`` alias (absent in
    # Python 3.12+) and shows the actual score when the check fails.
    self.assertEqual(score, 1.0)
def test_logistic_sk2(self):
    """Compare predictions with a reference model, using ``transferUsingDF=True``.

    The first 90% of the samples from ``datasets.load_digits()`` are used
    for fitting and the remainder for prediction. The test passes when
    ``accuracy_score`` between the reference model's predictions and the
    predictions of ``LogisticRegression(sparkSession, transferUsingDF=True)``
    exceeds 0.95.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)

    # One integer cut point shared by features and labels.
    split = int(.9 * n_samples)
    X_train, X_test = X_digits[:split], X_digits[split:]
    y_train = y_digits[:split]
    y_test = y_digits[split:]  # kept for parity with the original; unused below

    # Convert to DataFrame for i/o: current way to transfer data
    logistic = LogisticRegression(sparkSession, transferUsingDF=True)
    logistic.fit(X_train, y_train)
    mllearn_predicted = logistic.predict(X_test)

    sklearn_logistic = linear_model.LogisticRegression()
    sklearn_logistic.fit(X_train, y_train)

    # We are comparable to a similar algorithm in scikit learn.
    # assertGreater replaces the deprecated ``failUnless`` alias (absent in
    # Python 3.12+) and reports the actual agreement on failure.
    agreement = accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted)
    self.assertGreater(agreement, 0.95)