def test_cv_lasso_with_mllib_featurization(self):
    """Grid-search a Lasso on features produced by an MLlib pipeline.

    The reviews are tokenized and hashed with the MLlib ``Tokenizer`` and
    ``HashingTF`` stages, converted to pandas through ``self.converter``,
    and then fed to ``GridSearchCV``.  The test checks that the search
    reports exactly one parameter setting per candidate alpha.
    """
    rows = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    raw_df = self.sql.createDataFrame(rows, ["review", "rating"])

    # Feature extraction using MLlib: split into words, then hash them
    # into a fixed-width term-frequency vector.
    stages = [
        Tokenizer(inputCol="review", outputCol="words"),
        HashingTF(inputCol="words", outputCol="features", numFeatures=20000),
    ]
    featurized = Pipeline(stages=stages).fit(raw_df).transform(raw_df)

    # The hashed vectors are exposed under the name "review" so the
    # pandas frame has the same column names as the raw input.
    selected = featurized.select(featurized.features.alias("review"), "rating")
    df = self.converter.toPandas(selected)

    parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
    estimator = SKL_Pipeline([('lasso', SKL_Lasso())])
    grid_search = GridSearchCV(self.sc, estimator, parameters)
    skl_gs = grid_search.fit(df.review.values, df.rating.values)

    expected_candidates = len(parameters['lasso__alpha'])
    assert len(skl_gs.cv_results_['params']) == expected_candidates
def test_cv_pipeline(self):
    """Grid-search a text pipeline (hashing -> tf-idf -> Lasso).

    Fits ``GridSearchCV`` on a small set of (review, rating) pairs and
    checks that the search reports exactly one parameter setting per
    candidate alpha.
    """
    pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso(max_iter=1)),
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01),
    }

    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
    skl_gs = grid_search.fit(df.review.values, df.rating.values)

    # Use cv_results_ like the sibling tests: the legacy grid_scores_
    # attribute is not available on current scikit-learn search results.
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
    # TODO: also assert on the per-candidate scores.
def test_cv_linreg(self):
    """Grid-search a Lasso on a sparse design matrix.

    ``X`` is the vertical stack of 100 blocks, each built by
    ``self.list2csr`` from ``[x, x + 1.0]`` for ``x`` in ``0..99``; ``y``
    is the column vector ``0..99``.  The test checks that the search
    reports exactly one parameter setting per candidate alpha.
    """
    pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
    parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}

    grid_search = GridSearchCV(self.sc, pipeline, parameters)

    # scipy.sparse.vstack is documented to take a *sequence* of blocks.
    # On Python 3 map() yields a one-shot iterator instead, so build the
    # rows eagerly as a list before stacking.
    rows = [self.list2csr([x, x + 1.0]) for x in range(100)]
    X = scipy.sparse.vstack(rows)
    y = np.arange(100).reshape((100, 1))

    skl_gs = grid_search.fit(X, y)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])