def test_logistic_random_data(self):
    """Compare the estimator under test with scikit-learn on two-class data.

    Both models are fitted on the training split returned by
    ``get_classification_data(n_classes=2)``. Their test-set predictions,
    the true test labels and the threshold ``0.95`` are then handed to the
    ``test_accuracy_score`` helper, whose result must be truthy.
    NOTE(review): the exact acceptance rule lives in ``test_accuracy_score``,
    which is defined outside this snippet -- confirm there.
    """
    X_train, X_test, y_train, y_test = get_classification_data(n_classes=2)

    # Estimator under test.
    logistic = LogisticRegression(sparkSession)
    logistic.fit(X_train, y_train)
    mllearn_predicted = logistic.predict(X_test)

    # scikit-learn reference model trained on the same split.
    sklearn_logistic = linear_model.LogisticRegression()
    sklearn_logistic.fit(X_train, y_train)

    # assertTrue replaces the ``failUnless`` alias, which was deprecated and
    # then removed from unittest in Python 3.12.
    self.assertTrue(
        test_accuracy_score(sklearn_logistic.predict(X_test),
                            mllearn_predicted, y_test, 0.95))
Ejemplo n.º 2
0
 def testLogisticSK1(self):
     """Fit on the first 90% of the digits data and score on the remainder.

     The data comes from ``datasets.load_digits()``. The estimator under
     test is built with ``sqlCtx`` and its ``score`` on the held-out rows
     must exceed 0.9.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     # Slice bounds must be integers: a float bound such as ``.9 * n`` is
     # rejected by current NumPy and by plain Python sequences.
     split = int(.9 * len(X_digits))
     X_train, X_test = X_digits[:split], X_digits[split:]
     y_train, y_test = y_digits[:split], y_digits[split:]
     logistic = LogisticRegression(sqlCtx)
     score = logistic.fit(X_train, y_train).score(X_test, y_test)
     # assertGreater replaces the removed ``failUnless`` alias and reports
     # the actual score when the check fails.
     self.assertGreater(score, 0.9)
Ejemplo n.º 3
0
 def testLogisticSK1(self):
     """Train on a 90% prefix of the digits data; require score > 0.9.

     ``datasets.load_digits()`` supplies the features and targets. The
     estimator under test is constructed with ``sqlCtx`` and evaluated via
     its own ``score`` method on the trailing 10% of rows.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     n_samples = len(X_digits)
     # Convert the boundary to int once; float slice indices raise on
     # current NumPy and on built-in sequences.
     n_train = int(.9 * n_samples)
     X_train = X_digits[:n_train]
     y_train = y_digits[:n_train]
     X_test = X_digits[n_train:]
     y_test = y_digits[n_train:]
     logistic = LogisticRegression(sqlCtx)
     score = logistic.fit(X_train, y_train).score(X_test, y_test)
     # ``failUnless`` was removed from unittest in Python 3.12.
     self.assertGreater(score, 0.9)
Ejemplo n.º 4
0
 def testLogisticSK2(self):
     """Same 90/10 digits check as SK1, with ``transferUsingDF=True``.

     The estimator under test is built with ``sqlCtx`` and
     ``transferUsingDF=True``. Its ``score`` on the held-out 10% of the
     ``datasets.load_digits()`` rows must exceed 0.9.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     # Slice bounds must be integers: a float bound such as ``.9 * n`` is
     # rejected by current NumPy and by plain Python sequences.
     split = int(.9 * len(X_digits))
     X_train, X_test = X_digits[:split], X_digits[split:]
     y_train, y_test = y_digits[:split], y_digits[split:]
     # Convert to DataFrame for i/o: current way to transfer data
     logistic = LogisticRegression(sqlCtx, transferUsingDF=True)
     score = logistic.fit(X_train, y_train).score(X_test, y_test)
     # assertGreater replaces the removed ``failUnless`` alias and reports
     # the actual score when the check fails.
     self.assertGreater(score, 0.9)
Ejemplo n.º 5
0
 def testLogisticSK2(self):
     """Train on a 90% digits prefix using DataFrame transfer; score > 0.9.

     ``datasets.load_digits()`` supplies the data. The estimator under test
     is constructed with ``sqlCtx`` and ``transferUsingDF=True`` and is
     evaluated with its own ``score`` method on the trailing 10% of rows.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     n_samples = len(X_digits)
     # Convert the boundary to int once; float slice indices raise on
     # current NumPy and on built-in sequences.
     n_train = int(.9 * n_samples)
     X_train = X_digits[:n_train]
     y_train = y_digits[:n_train]
     X_test = X_digits[n_train:]
     y_test = y_digits[n_train:]
     # Convert to DataFrame for i/o: current way to transfer data
     logistic = LogisticRegression(sqlCtx, transferUsingDF=True)
     score = logistic.fit(X_train, y_train).score(X_test, y_test)
     # ``failUnless`` was removed from unittest in Python 3.12.
     self.assertGreater(score, 0.9)
Ejemplo n.º 6
0
 def test_logistic(self):
     """Check agreement with scikit-learn's LogisticRegression on digits.

     Both models are fitted on the first 90% of the
     ``datasets.load_digits()`` rows. Their predictions on the remaining
     rows are compared with ``accuracy_score`` and must agree on more
     than 95% of samples. The true test labels are not used.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     split = int(.9 * len(X_digits))
     X_train, X_test = X_digits[:split], X_digits[split:]
     # Only training labels are needed: the assertion compares the two
     # models' predictions with each other, not with the ground truth.
     y_train = y_digits[:split]

     logistic = LogisticRegression(sparkSession)
     logistic.fit(X_train, y_train)
     mllearn_predicted = logistic.predict(X_test)

     sklearn_logistic = linear_model.LogisticRegression()
     sklearn_logistic.fit(X_train, y_train)

     # We are comparable to a similar algorithm in scikit learn.
     # assertGreater replaces the ``failUnless`` alias removed in Python 3.12.
     self.assertGreater(
         accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted),
         0.95)
Ejemplo n.º 7
0
 def test_logistic(self):
     """Require >95% prediction agreement with scikit-learn on digits.

     The estimator under test (built with ``sparkSession``) and
     ``linear_model.LogisticRegression`` are trained on the same 90%
     prefix of ``datasets.load_digits()``. ``accuracy_score`` measures how
     often their predictions on the remaining rows coincide.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     n_train = int(.9 * len(X_digits))
     X_train = X_digits[:n_train]
     y_train = y_digits[:n_train]
     X_test = X_digits[n_train:]
     # The held-out labels are not sliced: the check below compares model
     # outputs with each other and never reads the ground truth.

     logistic = LogisticRegression(sparkSession)
     logistic.fit(X_train, y_train)
     mllearn_predicted = logistic.predict(X_test)

     sklearn_logistic = linear_model.LogisticRegression()
     sklearn_logistic.fit(X_train, y_train)
     sklearn_predicted = sklearn_logistic.predict(X_test)

     # We are comparable to a similar algorithm in scikit learn.
     # ``failUnless`` was removed from unittest in Python 3.12.
     self.assertGreater(
         accuracy_score(sklearn_predicted, mllearn_predicted), 0.95)
Ejemplo n.º 8
0
 def testLogisticMLPipeline1(self):
     """Run the estimator as the last stage of a text-classification Pipeline.

     A ``Tokenizer -> HashingTF -> LogisticRegression`` pipeline is fitted
     on twelve labelled text rows and applied to four test rows. The
     ``prediction``/``label`` columns are scored with a default-configured
     ``MulticlassClassificationEvaluator`` and the score must be exactly 1.0.
     """
     training = sqlCtx.createDataFrame([("a b c d e spark", 1.0),
                                        ("b d", 2.0), ("spark f g h", 1.0),
                                        ("hadoop mapreduce", 2.0),
                                        ("b spark who", 1.0),
                                        ("g d a y", 2.0),
                                        ("spark fly", 1.0),
                                        ("was mapreduce", 2.0),
                                        ("e spark program", 1.0),
                                        ("a e c l", 2.0),
                                        ("spark compile", 1.0),
                                        ("hadoop software", 2.0)],
                                       ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words",
                           outputCol="features",
                           numFeatures=20)
     lr = LogisticRegression(sqlCtx)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([("spark i j k", 1.0), ("l m n", 2.0),
                                    ("mapreduce spark", 1.0),
                                    ("apache hadoop", 2.0)],
                                   ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     # assertEqual replaces ``failUnless(score == 1.0)``: the alias was
     # removed in Python 3.12, and assertEqual shows the actual score.
     self.assertEqual(score, 1.0)
Ejemplo n.º 9
0
 def test_logistic_sk2(self):
     """Check agreement with scikit-learn when ``transferUsingDF=True``.

     Both models are fitted on the first 90% of the
     ``datasets.load_digits()`` rows. Their predictions on the remaining
     rows are compared with ``accuracy_score`` and must agree on more
     than 95% of samples. The true test labels are not used.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     split = int(.9 * len(X_digits))
     X_train, X_test = X_digits[:split], X_digits[split:]
     # Only training labels are needed: the assertion compares the two
     # models' predictions with each other, not with the ground truth.
     y_train = y_digits[:split]

     # Convert to DataFrame for i/o: current way to transfer data
     logistic = LogisticRegression(sparkSession, transferUsingDF=True)
     logistic.fit(X_train, y_train)
     mllearn_predicted = logistic.predict(X_test)

     sklearn_logistic = linear_model.LogisticRegression()
     sklearn_logistic.fit(X_train, y_train)

     # We are comparable to a similar algorithm in scikit learn.
     # assertGreater replaces the ``failUnless`` alias removed in Python 3.12.
     self.assertGreater(
         accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted),
         0.95)
Ejemplo n.º 10
0
 def test_logistic_sk2(self):
     """Require >95% agreement with scikit-learn using DataFrame transfer.

     The estimator under test (built with ``sparkSession`` and
     ``transferUsingDF=True``) and ``linear_model.LogisticRegression`` are
     trained on the same 90% prefix of ``datasets.load_digits()``.
     ``accuracy_score`` measures how often their predictions on the
     remaining rows coincide.
     """
     digits = datasets.load_digits()
     X_digits = digits.data
     y_digits = digits.target
     n_train = int(.9 * len(X_digits))
     X_train = X_digits[:n_train]
     y_train = y_digits[:n_train]
     X_test = X_digits[n_train:]
     # The held-out labels are not sliced: the check below compares model
     # outputs with each other and never reads the ground truth.

     # Convert to DataFrame for i/o: current way to transfer data
     logistic = LogisticRegression(sparkSession, transferUsingDF=True)
     logistic.fit(X_train, y_train)
     mllearn_predicted = logistic.predict(X_test)

     sklearn_logistic = linear_model.LogisticRegression()
     sklearn_logistic.fit(X_train, y_train)
     sklearn_predicted = sklearn_logistic.predict(X_test)

     # We are comparable to a similar algorithm in scikit learn.
     # ``failUnless`` was removed from unittest in Python 3.12.
     self.assertGreater(
         accuracy_score(sklearn_predicted, mllearn_predicted), 0.95)