def test_with_concat_features2(self):
    """Run a one-evaluation Hyperopt search over a choice/concat pipeline on iris.

    The search space is either
    ``((SimpleImputer | NoOp) >> PCA) & Nystroem >> ConcatFeatures >> LogisticRegression``
    or a plain ``KNeighborsClassifier``. The test fits the optimizer,
    predicts on the training data and prints the resulting accuracy; it makes
    no assertion on the score.

    All warnings are silenced for the duration of the test. The filters are
    reset in a ``finally`` clause so that a failure inside the test body does
    not leave the blanket "ignore" filter active for later tests.
    """
    import warnings

    warnings.filterwarnings("ignore")
    try:
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        from lale.lib.lale import Hyperopt
        from lale.operators import make_pipeline

        data = load_iris()
        X, y = data.data, data.target

        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)

        pipeline = make_pipeline(
            ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
            | KNeighborsClassifier()
        )
        clf = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
    finally:
        # Runs on success and on failure alike, mirroring the original
        # end-of-test reset.
        warnings.resetwarnings()
def _fit_gbt_num(self, X, y):
    """Assemble the 'gbt_num' pipeline and pass it to ``_try_and_add``.

    The pipeline keeps only the numeric columns, imputes missing values with
    the column mean, and ends with an instance of the operator returned by
    ``auto_gbt(self.prediction_type)``.
    """
    from lale.lib.lale import Project
    from lale.lib.sklearn import SimpleImputer

    make_learner = auto_gbt(self.prediction_type)
    numeric_only = Project(columns={'type': 'number'})
    mean_imputer = SimpleImputer(strategy='mean')
    candidate = numeric_only >> mean_imputer >> make_learner()
    self._try_and_add('gbt_num', candidate, X, y)
def _fit_gbt_num(self, X, y):
    """Register a numeric-columns-only pipeline under the name "gbt_num".

    Steps, in order: project onto columns of type "number", mean-impute, then
    apply a fresh instance of whatever ``auto_gbt`` returns for
    ``self.prediction_type``. The assembled pipeline, together with ``X`` and
    ``y``, is forwarded to ``self._try_and_add``.
    """
    from lale.lib.lale import Project
    from lale.lib.sklearn import SimpleImputer

    learner_factory = auto_gbt(self.prediction_type)
    steps_prefix = Project(columns={"type": "number"}) >> SimpleImputer(
        strategy="mean"
    )
    self._try_and_add("gbt_num", steps_prefix >> learner_factory(), X, y)
def test_pipeline_AWTTR_1(self):
    """Exercise a one-step AutoaiTSPipeline wrapping a lale pipeline regressor.

    The wrapped regressor is
    ``SmallDataWindowTransformer >> SimpleImputer >> RandomForestRegressor``,
    placed inside an ``AutoaiWindowTransformedTargetRegressor``. ``self.y`` is
    supplied for all four data arguments of ``doTestPipeline``.
    """
    inner_regressor = (
        SmallDataWindowTransformer() >> SimpleImputer() >> RandomForestRegressor()
    )
    only_step = (
        "AutoaiWindowTransformedTargetRegressor",
        AutoaiWindowTransformedTargetRegressor(regressor=inner_regressor),
    )
    trainable = AutoaiTSPipeline(steps=[only_step])
    self.doTestPipeline(trainable, self.y, self.y, self.y, self.y)
def auto_prep(X):
    """Pick a preprocessing pipeline based on how many columns of ``X`` are categorical.

    ``categorical()(X)`` supplies the categorical columns; its length is
    compared against ``X.shape[1]``:

    * no categorical columns: mean imputation only;
    * all columns categorical: most-frequent imputation followed by one-hot
      encoding (unknown categories ignored);
    * otherwise: both branches, each behind a ``Project`` selecting its
      columns, combined with ``&`` and piped into ``ConcatFeatures``.

    Returns the selected lale operator or pipeline.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    total_columns = X.shape[1]
    categorical_count = len(categorical()(X))

    numeric_steps = SimpleImputer(strategy="mean")
    categorical_steps = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore"
    )

    if categorical_count == 0:
        return numeric_steps
    if categorical_count == total_columns:
        return categorical_steps

    # Mixed data: route each kind of column through its own branch.
    numeric_branch = (
        Project(columns={"type": "number"}, drop_columns=categorical())
        >> numeric_steps
    )
    categorical_branch = Project(columns=categorical()) >> categorical_steps
    return (numeric_branch & categorical_branch) >> ConcatFeatures
def test_nested_pipeline1(self):
    """Run a one-evaluation Hyperopt search over a choice with a nested pipeline.

    The search space is ``KNeighborsClassifier | (SimpleImputer >> LogisticRegression)``
    on the iris data. The fitted optimizer predicts on the training data and
    the accuracy is printed; no assertion is made on the score.
    """
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score

    from lale.lib.lale import Hyperopt

    iris = load_iris()
    features, labels = iris.data, iris.target

    knn_choice = KNeighborsClassifier()
    imputed_logreg = SimpleImputer() >> LogisticRegression()
    search_space = knn_choice | imputed_logreg

    optimizer = Hyperopt(estimator=search_space, max_evals=1)
    fitted = optimizer.fit(features, labels)
    predicted = fitted.predict(features)
    print(accuracy_score(labels, predicted))
def auto_prep(X):
    """Return a preprocessing operator suited to the column types found in ``X``.

    The number of categorical columns (``len(categorical()(X))``) decides the
    result:

    * zero: a mean ``SimpleImputer``;
    * equal to ``X.shape[1]``: a most-frequent ``SimpleImputer`` followed by a
      ``OneHotEncoder`` that ignores unknown categories;
    * anything else: the two of them in parallel, each fed by a ``Project``
      that selects its columns, joined by ``ConcatFeatures``.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    column_count = X.shape[1]
    cat_count = len(categorical()(X))

    for_numbers = SimpleImputer(strategy='mean')
    for_categories = (
        SimpleImputer(strategy='most_frequent')
        >> OneHotEncoder(handle_unknown='ignore')
    )

    if cat_count == 0:
        return for_numbers
    if cat_count == column_count:
        return for_categories

    # Mixed columns: project each kind, preprocess separately, then join.
    numbers_path = (
        Project(columns={'type': 'number'}, drop_columns=categorical())
        >> for_numbers
    )
    categories_path = Project(columns=categorical()) >> for_categories
    both_paths = numbers_path & categories_path
    return both_paths >> ConcatFeatures
def test_pipeline_AWWR(self):
    """Exercise a one-step AutoaiTSPipeline around AutoaiWindowedWrappedRegressor.

    The wrapped regressor is
    ``SmallDataWindowTransformer >> SimpleImputer >> RandomForestRegressor``.
    ``self.y`` is supplied for all four data arguments of ``doTestPipeline``,
    which is called with ``optimization=True``.
    """
    inner_regressor = (
        SmallDataWindowTransformer() >> SimpleImputer() >> RandomForestRegressor()
    )
    # NOTE(review): the step label names AutoaiWindowTransformedTargetRegressor
    # although the step wraps AutoaiWindowedWrappedRegressor; kept as-is,
    # confirm whether the label is intentional.
    only_step = (
        "AutoaiWindowTransformedTargetRegressor",
        AutoaiWindowedWrappedRegressor(regressor=inner_regressor),
    )
    trainable = AutoaiTSPipeline(steps=[only_step])
    self.doTestPipeline(
        trainable, self.y, self.y, self.y, self.y, optimization=True
    )
def test_pipeline_AWTTR_2(self):
    """Exercise AutoaiWindowTransformedTargetRegressor with row-wise predictions.

    The wrapped regressor is
    ``SmallDataWindowTransformer >> SimpleImputer >> RandomForestRegressor``
    and the wrapper is built with ``estimator_prediction_type="rowwise"``.
    ``self.y`` is supplied for all four data arguments of ``doTestPipeline``,
    which is called with ``optimization=True``.
    """
    inner_regressor = (
        SmallDataWindowTransformer() >> SimpleImputer() >> RandomForestRegressor()
    )
    wrapper = AutoaiWindowTransformedTargetRegressor(
        regressor=inner_regressor,
        estimator_prediction_type="rowwise",
    )
    trainable = AutoaiTSPipeline(
        steps=[("AutoaiWindowTransformedTargetRegressor", wrapper)]
    )
    self.doTestPipeline(
        trainable, self.y, self.y, self.y, self.y, optimization=True
    )
def fetch(dataset_name, task_type, verbose=False, preprocess=True):
    """Load an ARFF dataset listed in ``experiments_dict`` and split it into train/test.

    The file ``<dataset_name>.arff`` is read from ``download_data_dir``; when
    it is not there yet it is first downloaded from the entry's
    ``'download_arff_url'`` (creating the directory if needed).

    Args:
        dataset_name: Key into ``experiments_dict``.
        task_type: Expected task type; its lower-cased form must equal the
            entry's ``'task_type'``.
        verbose: When true, print progress information.
        preprocess: When true, the features are imputed (most-frequent for
            categorical columns, mean for numeric ones) and the categorical
            columns are one-hot encoded. When false, the features are returned
            as the raw DataFrame columns other than the target.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` from a 67/33
        ``train_test_split`` with ``random_state=0``, after label-encoding
        ``y`` and passing all four parts through ``add_schemas``.

    Raises:
        KeyError: ``dataset_name`` is not a key of ``experiments_dict``.
        ValueError: ``task_type`` does not match the dataset's task type, or
            (with ``preprocess=True``) no attribute name lower-cases to the
            configured target column.
    """
    if verbose:
        print('Loading dataset:', dataset_name)

    # Guard only the lookup itself so that a KeyError raised here always means
    # "unknown dataset" and never a malformed entry.
    try:
        dataset_info = experiments_dict[dataset_name]
    except KeyError as err:
        raise KeyError(
            "Dataset name {} not found in the supported datasets".format(dataset_name)
        ) from err
    if dataset_info['task_type'] != task_type.lower():
        raise ValueError(
            "The task type {} does not match with the given datasets task type {}".format(
                task_type, dataset_info['task_type']
            )
        )

    data_file_name = os.path.join(download_data_dir, dataset_name + ".arff")
    if verbose:
        print(data_file_name)
    if not os.path.exists(data_file_name):
        # First use of this dataset: fetch the ARFF file into the data directory.
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print('created directory {}'.format(download_data_dir))
        urllib.request.urlretrieve(dataset_info['download_arff_url'], data_file_name)
    assert os.path.exists(data_file_name)

    # The context manager closes the file; no explicit close() is needed.
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)

    from lale.datasets.data_schemas import liac_arff_to_schema

    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = dataset_info['target']

    if preprocess:
        arffData = pd.DataFrame(dataDictionary['data'])
        attributes = dataDictionary['attributes']
        if verbose:
            print(attributes)

        # Locate the target first, then split it off. Doing this outside of a
        # loop avoids mutating the attribute list while iterating over it.
        target_indx = next(
            (i for i, item in enumerate(attributes) if item[0].lower() == target_col),
            None,
        )
        if target_indx is None:
            raise ValueError(
                "Target column {} not found in the attributes of dataset {}".format(
                    target_col, dataset_name
                )
            )
        y = arffData.iloc[:, target_indx]
        arffData = arffData.drop(target_indx, axis=1)
        # Indices below are positions in the frame after the target was dropped.
        feature_attributes = attributes[:target_indx] + attributes[target_indx + 1:]

        categorical_cols = []
        numeric_cols = []
        for i, item in enumerate(feature_attributes):
            if item[0].lower() == 'class':
                # Attributes named 'class' are assigned to neither group.
                continue
            attr_type = item[1]
            if isinstance(attr_type, list):
                # A list of allowed values is treated as categorical.
                categorical_cols.append(i)
            elif isinstance(attr_type, str):
                if attr_type.lower() in numeric_data_types_list:
                    numeric_cols.append(i)
                else:
                    categorical_cols.append(i)
        X_columns = list(range(len(feature_attributes)))
        if verbose:
            print(f'categorical columns: {categorical_cols}')
            print(f'numeric columns: {numeric_cols}')
        X = arffData.iloc[:, X_columns]

        # Check whether there is any error
        num_classes_from_last_row = len(set(y))
        if verbose:
            print('num_classes_from_last_row', num_classes_from_last_row)

        transformers1 = [
            (
                'imputer_str',
                SimpleImputer(missing_values=None, strategy='most_frequent'),
                categorical_cols,
            ),
            ('imputer_num', SimpleImputer(strategy='mean'), numeric_cols),
        ]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)

        # txm1 emits the categorical columns first and the numeric ones after
        # them, so txm2 addresses them by position in that order.
        n_categorical = len(categorical_cols)
        n_numeric = len(numeric_cols)
        # NOTE(review): scikit-learn 1.2 renamed OneHotEncoder's `sparse` to
        # `sparse_output` (old name removed in 1.4) -- confirm the pinned version.
        transformers2 = [
            ('ohe', OneHotEncoder(sparse=False), list(range(n_categorical))),
            (
                'no_op',
                'passthrough',
                list(range(n_categorical, n_categorical + n_numeric)),
            ),
        ]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)

        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from lale.operators import make_pipeline

        preprocessing = make_pipeline(txm1, txm2)
        X = preprocessing.fit(X).transform(X)
        if verbose:
            print("Shape of X after preprocessing", X.shape)
    else:
        col_names = [attr[0] for attr in dataDictionary['attributes']]
        df_all = pd.DataFrame(dataDictionary['data'], columns=col_names)
        y = df_all[target_col]
        y = y.squeeze()
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0
    )
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes: X {X_test.shape}, y {y_test.shape}')
    X_train, X_test, y_train, y_test = add_schemas(
        schema_orig, target_col, X_train, X_test, y_train, y_test
    )
    return (X_train, y_train), (X_test, y_test)
def test_simple_imputer(self):
    """Check that the lale SimpleImputer fits with strategy 'mean' and fill_value 10.

    The test only calls ``fit`` on the training data held on ``self``; it
    makes no assertion on the fitted result.
    """
    from lale.lib.sklearn import SimpleImputer

    hyperparams = {'strategy': 'mean', 'fill_value': 10}
    imputer = SimpleImputer(**hyperparams)
    imputer.fit(self.X_train, self.y_train)
def test_simple_imputer(self):
    """Fit a SimpleImputer configured with strategy "mean" and fill_value 10.

    Only ``fit`` is called, on ``self.X_train`` and ``self.y_train``; nothing
    is asserted about the outcome.
    """
    settings = {"strategy": "mean", "fill_value": 10}
    imputer = SimpleImputer(**settings)
    imputer.fit(self.X_train, self.y_train)