def test_pipeline_same_results(self):
    """Check that the local and the distributed pipeline predict alike.

    A local pipeline (VarianceThreshold -> LogisticRegression) and a
    distributed one (SparkVarianceThreshold -> SparkLogisticRegression)
    are fitted on data produced by ``self.make_classification``.  The test
    passes when the mean absolute difference between the two prediction
    vectors is below 0.1.
    """
    # X, y: local data; Z: distributed data indexable by column ('X').
    # The meaning of the three positional arguments is defined by the
    # make_classification helper, which is not visible here.
    X, y, Z = self.make_classification(2, 10000, 2000)

    # Local reference pipeline.
    local_model = LogisticRegression()
    local_selector = VarianceThreshold()
    local_pipeline = Pipeline([
        ('threshold', local_selector),
        ('logistic', local_model),
    ])

    # Distributed pipeline using the same step names.
    spark_model = SparkLogisticRegression()
    spark_selector = SparkVarianceThreshold()
    spark_pipeline = SparkPipeline([
        ('threshold', spark_selector),
        ('logistic', spark_model),
    ])

    # The distributed filter is fitted standalone before it is fitted again
    # as a pipeline step (presumably redundant, kept to preserve behaviour).
    spark_selector.fit(Z)
    local_pipeline.fit(X, y)
    # The class labels are handed to the 'logistic' step through the
    # step__param keyword.
    spark_pipeline.fit(Z, logistic__classes=np.unique(y))

    local_labels = local_pipeline.predict(X)
    # The distributed prediction is collected block-wise and stitched
    # together into one array.
    spark_labels = np.concatenate(
        spark_pipeline.predict(Z[:, 'X']).collect()
    )
    disagreement = np.mean(np.abs(local_labels - spark_labels))
    assert_true(disagreement < 0.1)
def test_pipeline_same_results(self):
    """Compare predictions of a local pipeline with its Spark counterpart.

    Both pipelines consist of a variance-threshold step named 'threshold'
    followed by a logistic-regression step named 'logistic'.  After
    fitting, the mean absolute difference between the local predictions
    and the collected distributed predictions has to stay below 0.1.
    """
    X, y, Z = self.make_classification(2, 10000, 2000)

    # --- local side -----------------------------------------------------
    loc_clf = LogisticRegression()
    loc_filter = VarianceThreshold()
    loc_steps = [('threshold', loc_filter), ('logistic', loc_clf)]
    loc_pipe = Pipeline(loc_steps)

    # --- distributed side -----------------------------------------------
    dist_clf = SparkLogisticRegression()
    dist_filter = SparkVarianceThreshold()
    dist_steps = [('threshold', dist_filter), ('logistic', dist_clf)]
    dist_pipe = SparkPipeline(dist_steps)

    # Fitting order: standalone distributed filter, local pipeline, then
    # the distributed pipeline, which receives the class labels for its
    # 'logistic' step.
    dist_filter.fit(Z)
    loc_pipe.fit(X, y)
    dist_pipe.fit(Z, logistic__classes=np.unique(y))

    expected = loc_pipe.predict(X)
    # Only the 'X' column of Z is passed to predict; the result is
    # collected and concatenated into a single array.
    predicted_blocks = dist_pipe.predict(Z[:, 'X']).collect()
    actual = np.concatenate(predicted_blocks)

    mean_abs_diff = np.mean(np.abs(expected - actual))
    assert_true(mean_abs_diff < 0.1)
# Wrap the training data into distributed containers.  ``train_x`` is
# expected to be an RDD created before this point (not visible here).
train_y = sc.parallelize(target_train)
train_x = ArrayRDD(train_x)
train_y = ArrayRDD(train_y)
Z = DictRDD((train_x, train_y),
            columns=('X', 'y'),
            dtype=[np.ndarray, np.ndarray])

# pipeline
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),  # hashingTF for NB
    ('tfidf', SparkTfidfTransformer()),  # IDF
    ('clf', SparkMultinomialNB(alpha=0.05))  # NB
))

# fit (the two class labels 0 and 1 are passed to the 'clf' step)
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))

# test data to RDD
test_x = ArrayRDD(sc.parallelize(data_test))
test_y = ArrayRDD(sc.parallelize(target_test))
test_Z = DictRDD((test_x, test_y),
                 columns=('X', 'y'),
                 dtype=[np.ndarray, np.ndarray])

# predict test data (only the 'X' column is handed to the pipeline)
predicts = dist_pipeline.predict(test_Z[:, 'X'])

# metrics(accuracy, precision, recall, f1)
# NOTE(review): ``test`` is not defined in the visible part of the script;
# confirm it is the test-set container whose length is wanted here.
data_size = len(test)
# Ground-truth labels of the test set.  The name was misspelled as
# ``traget_test``, which would raise NameError; ``target_test`` is the
# variable used to build ``test_y``.
array_y = target_test
array_pred = predicts.toarray()