def setUpClass(cls):
    """Load the datasets used by the tests and cache them on the class.

    Each dataset is stored as a dict with keys ``"X"`` (features) and
    ``"y"`` (labels/targets) under a private class attribute:
    ``_irisArr``, ``_irisDf``, ``_digits``, ``_housing``, ``_creditG``,
    ``_movies`` and ``_drugRev``. Where a loader also returns a test
    split, that split is discarded and only the training part is kept.
    """
    # NOTE(review): takes ``cls``, so this is presumably decorated with
    # @classmethod outside the visible chunk -- confirm.

    def as_xy(features, labels):
        # Uniform container shape shared by every cached dataset.
        return {"X": features, "y": labels}

    from sklearn.datasets import load_iris

    iris_bunch = load_iris()
    cls._irisArr = as_xy(iris_bunch.data, iris_bunch.target)

    from lale.datasets import sklearn_to_pandas

    (iris_X, iris_y), (_, _) = sklearn_to_pandas.load_iris_df()
    cls._irisDf = as_xy(iris_X, iris_y)

    (digits_X, digits_y), (_, _) = sklearn_to_pandas.digits_df()
    cls._digits = as_xy(digits_X, digits_y)

    (housing_X, housing_y), (_, _) = sklearn_to_pandas.california_housing_df()
    cls._housing = as_xy(housing_X, housing_y)

    from lale.datasets import openml

    (credit_X, credit_y), (_, _) = openml.fetch(
        "credit-g", "classification", preprocess=False
    )
    cls._creditG = as_xy(credit_X, credit_y)

    from lale.datasets import load_movie_review

    movie_X, movie_y = load_movie_review()
    cls._movies = as_xy(movie_X, movie_y)

    from lale.datasets.uci.uci_datasets import fetch_drugscom

    drug_X, drug_y, _, _ = fetch_drugscom()
    cls._drugRev = as_xy(drug_X, drug_y)
def test_text_and_structured(self):
    """Fit a Hyperopt search over a pipeline mixing text and numeric columns.

    The 'review' column is vectorized with TF-IDF, the numeric columns are
    projected as-is, both are concatenated, and the result feeds a choice
    between linear regression and a random-forest regressor. The search
    runs a single evaluation scored with r2; the test passes as long as
    fitting completes without raising.
    """
    from lale.datasets.uci.uci_datasets import fetch_drugscom
    from sklearn.model_selection import train_test_split

    full_X, full_y, _, _ = fetch_drugscom()
    # Subset to speed up debugging: keep 1% of the training rows.
    sample_X, _, sample_y, _ = train_test_split(
        full_X, full_y, train_size=0.01, random_state=42
    )

    from lale.lib.lale import Project
    from lale.lib.lale import ConcatFeatures as Cat
    from lale.lib.sklearn import TfidfVectorizer as Tfidf
    from lale.lib.sklearn import LinearRegression as LinReg
    from lale.lib.sklearn import RandomForestRegressor as Forest

    text_branch = Project(columns=['review']) >> Tfidf(max_features=100)
    numeric_branch = Project(columns={'type': 'number'})
    planned = (text_branch & numeric_branch) >> Cat >> (LinReg | Forest)

    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=planned, max_evals=1, scoring='r2')
    optimizer.fit(sample_X, sample_y)