def operate(self, dm_list: typing.List, phase='train'):
    """Fit (in the 'train' phase) and apply the AutoCross feature generator.

    The single DataManager in ``dm_list`` supplies the raw features; the
    returned DataManager holds only the AutoCross-transformed matrix
    (plus the labels in the train phase).
    """
    assert len(dm_list) == 1
    dm = dm_list[0]
    assert isinstance(dm, DataManager)
    self.check_phase(phase)

    # Column groups AutoCross operates on: one-hot columns and numeric columns.
    onehot_index = [idx for idx, ftype in enumerate(dm.feature_types)
                    if ftype == "One-Hot"]
    numerical_index = [idx for idx, ftype in enumerate(dm.feature_types)
                       if ftype in ('Discrete', 'Float')]

    result_dm = DataManager()
    if phase == 'train':
        from sklearn.model_selection import train_test_split
        # Hold out 20% of the training data for AutoCross's internal validation;
        # stratify=None reproduces the plain (unstratified) split.
        stratify_target = dm.train_y if self.stratify else None
        train_x, val_x, train_y, val_y = train_test_split(
            dm.train_X, dm.train_y, test_size=0.2, stratify=stratify_target)
        self.autocross.fit(train_x, val_x, train_y, val_y,
                           onehot_index, numerical_index)
        # Transform the *full* training matrix, not just the 80% split.
        result_dm.train_X = self.autocross.transform(dm.train_X)
        result_dm.train_y = dm.train_y
    else:
        result_dm.test_X = self.autocross.transform(dm.test_X)
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Derive a single feature per sample: the number of zero-valued columns.

    :param dm_list: list containing exactly one DataManager.
    :param phase: 'train' reads train_X and copies train_y into the result;
        any other value reads/writes test_X.
    :return: a new DataManager whose X is an (n_samples, 1) matrix of
        per-row zero counts (float, matching the original np.zeros buffer).
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]
    # The original train/test branches duplicated the counting loop verbatim;
    # select the source matrix once and count with a single vectorized pass
    # instead of the per-element Python loop.
    x = dm.train_X if phase == 'train' else dm.test_X
    newfeature = np.count_nonzero(np.asarray(x) == 0, axis=1)
    newfeature = newfeature.reshape(-1, 1).astype(float)
    result_dm = DataManager()
    if phase == 'train':
        result_dm.train_X = newfeature
        result_dm.train_y = dm.train_y
    else:
        result_dm.test_X = newfeature
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Stack the X matrices of several DataManagers column-wise and run the
    wrapped feature selector over the combined matrix.

    Labels are taken from the first DataManager in the list.
    """
    self.check_phase(phase)
    if phase == 'train':
        blocks = [dm.train_X for dm in dm_list]
        labels = dm_list[0].train_y
        merged = blocks[0] if len(blocks) == 1 else np.hstack(blocks)
        selected = self.selector.fit_transform(merged, labels)
        return DataManager(selected, labels, spilt=False)
    blocks = [dm.test_X for dm in dm_list]
    merged = blocks[0] if len(blocks) == 1 else np.hstack(blocks)
    result = DataManager()
    result.test_X = self.selector.transform(merged)
    return result
def test_categorical_indexer():
    """Smoke-test categorical_indexer on a tiny mixed-type dataset and
    print the transformed partitions for visual inspection."""
    dm = DataManager()
    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]
    dm.train_X = np.array([["a", 1, "python", 4.5],
                           ["b", 2, "c++", 6.8],
                           ["c", 10, "java", 4.8]])
    dm.val_X = np.array([["a", 1, "scala", 4.5],
                         ["c", 2, "c++", 6.8],
                         ["d", 10, "python", 4.8]])
    dm.test_X = np.array([["a", 1, "scala", 4.5]])
    dm = categorical_indexer(dm)
    separator = "----------------------------"
    print(dm.feature_types)
    print(dm.train_X)
    print(separator)
    print(dm.val_X)
    print(separator)
    print(dm.test_X)
def test_impute_dm():
    """Smoke-test impute_dm on data containing both an explicit missing-value
    marker ("???") and NaN-style missing values, printing the result."""
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8], ["c", 10, "java", 4.8]])
    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8], ["d", 10, "python", 4.8]])
    test_x = np.array([["a", 1, "scala", 4.5]])
    # Inject missing values. Note: the arrays still have a string dtype here,
    # so np.nan is stored as the string 'nan' — preserved deliberately.
    train_x[2][0] = "???"
    train_x[2][2] = "???"
    valid_x[0][1] = np.nan
    test_x[0][-1] = np.nan
    dm = DataManager()
    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]
    # np.object was removed in NumPy 1.24; the builtin ``object`` is the
    # supported spelling and yields the identical dtype.
    dm.train_X = train_x.astype(object)
    dm.val_X = valid_x.astype(object)
    dm.test_X = test_x.astype(object)
    dm = impute_dm(dm, "???")
    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
def one_hot(dm: DataManager) -> DataManager:
    """
    Convert the categorical features to float with one-hot encoding.

    Train/val/test matrices are concatenated so the encoder sees every
    partition's categories, encoded together, and split back afterwards.
    The result puts the one-hot columns first, the remaining columns after,
    and updates ``dm.feature_types`` to match.

    :param dm: DataManager whose feature_types marks "Categorical" columns.
    :return: the same DataManager, mutated in place and returned.
    :raises ValueError: if the DataManager has no training data.
    """
    feature_types = dm.feature_types
    categorical_index = [
        i for i in range(len(feature_types)) if feature_types[i] == "Categorical"
    ]
    other_index = [
        i for i in range(len(feature_types)) if feature_types[i] != "Categorical"
    ]
    encoder = OneHotEncoder(handle_unknown="ignore")
    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()
    # Validate before calling len(train_x): len(None) would raise a confusing
    # TypeError and mask this explicit error.
    if train_x is None:
        raise ValueError("train_x has no value!!!")
    train_size = len(train_x)
    valid_size = 0
    test_size = 0
    if valid_x is not None and test_x is not None:
        x = np.concatenate([train_x, valid_x, test_x])
        valid_size = len(valid_x)
        test_size = len(test_x)
    elif valid_x is not None:
        x = np.concatenate([train_x, valid_x])
        valid_size = len(valid_x)
    else:
        # NOTE(review): a test set *without* a validation set is silently
        # ignored by this branch structure — confirm this is intended.
        x = train_x
    categorical_x = x[:, categorical_index]
    other_x = x[:, other_index]
    encoder.fit(categorical_x)
    categorical_x = encoder.transform(categorical_x).toarray()
    categorical_features = ["One-Hot"] * categorical_x.shape[1]
    other_features = [feature_types[i] for i in other_index]
    # np.float was removed in NumPy 1.24; builtin float is equivalent here.
    x = np.hstack((categorical_x, other_x)).astype(float)
    dm.feature_types = np.concatenate((categorical_features, other_features))
    train_x, valid_x, test_x = _split_data(x, train_size, valid_size, test_size)
    if valid_size == 0:
        valid_x = None
    if test_size == 0:
        test_x = None
    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x
    return dm
def operate(self, dm_list: typing.List, phase='train'):
    """Project the numerical columns of the input DataManager with PCA.

    :param dm_list: list containing exactly one DataManager.
    :param phase: 'train' fits the PCA on train_X and transforms it;
        any other value transforms test_X with the already-fitted PCA.
    :return: a new DataManager holding the projected matrix (and labels in
        the train phase).
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]
    feature_types = dm.feature_types
    numerical_index = [i for i in range(len(feature_types))
                       if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
    result_dm = DataManager()
    if phase == 'train':
        result_dm.train_X = self.pca.fit_transform(dm.train_X[:, numerical_index])
        result_dm.train_y = dm.train_y
    else:
        # Bug fix: use the PCA fitted during training instead of refitting on
        # the test data — fit_transform here would learn different components
        # from the test distribution, making train/test features inconsistent.
        result_dm.test_X = self.pca.transform(dm.test_X[:, numerical_index])
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Rank features by model-based importance and keep the top ``kbest``.

    In the 'train' phase the wrapped selector model is fitted on the
    horizontally stacked train matrices; in other phases the importances of
    the previously fitted model are reused on the stacked test matrices.
    The descending importance order is stored in ``self.sorted_features``.
    """
    if phase == 'train':
        blocks = [dm.train_X for dm in dm_list]
        y = dm_list[0].train_y
        x = blocks[0] if len(blocks) == 1 else np.hstack(blocks)
        self.selector.fit(x, y)
    else:
        blocks = [dm.test_X for dm in dm_list]
        y = None
        x = blocks[0] if len(blocks) == 1 else np.hstack(blocks)

    if self.model == self.RANDOM_FOREST:
        importances = self.selector.feature_importances_
        self.sorted_features = np.argsort(importances)[::-1]
    elif self.model == self.LASSO_REGRESSION:
        coef = self.selector.coef_
        # Multi-output lasso: fold per-output coefficients into one score.
        importances = coef if coef.ndim == 1 else np.linalg.norm(coef, axis=0, ord=1)
        self.sorted_features = np.argsort(importances)[::-1]

    x = x[:, self.sorted_features[:self.kbest]]
    result = DataManager()
    if phase == 'train':
        result.train_X = x
        result.train_y = y
    else:
        result.test_X = x
    return result
def operate(self, dm_list: typing.List, phase='train') -> DataManager:
    """Generate polynomial features from the numerical columns.

    Only the newly generated columns are returned: the leading columns of
    the PolynomialFeatures output (one bias column plus the original
    inputs, assuming the transformer includes a bias term) are sliced away.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]
    numerical_index = [idx for idx, ftype in enumerate(dm.feature_types)
                       if ftype in ("Float", "Discrete")]
    # Offset past the bias column and the untouched original features.
    skip = len(numerical_index) + 1
    result_dm = DataManager()
    if phase == 'train':
        expanded = self.polynomialfeatures.fit_transform(dm.train_X[:, numerical_index])
        result_dm.train_X = expanded[:, skip:]
        result_dm.train_y = dm.train_y
    else:
        expanded = self.polynomialfeatures.transform(dm.test_X[:, numerical_index])
        result_dm.test_X = expanded[:, skip:]
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Merge several DataManagers by horizontally stacking their X matrices.

    Labels (train phase only) are taken from the first DataManager.
    """
    self.check_phase(phase)
    if phase == 'train':
        blocks = [dm.train_X for dm in dm_list]
        y = dm_list[0].train_y
        x = blocks[0] if len(blocks) == 1 else np.hstack(blocks)
        return DataManager(x, y, spilt=False)
    blocks = [dm.test_X for dm in dm_list]
    merged = DataManager()
    merged.test_X = blocks[0] if len(blocks) == 1 else np.hstack(blocks)
    return merged
def operate(self, dm_list: typing.List, phase='train'):
    """Impute a raw DataFrame and pack it into a DataManager.

    :param dm_list: list containing exactly one pandas DataFrame.
    :param phase: 'train' splits the label column out of the data;
        any other value stores the whole frame as test features.
    :return: a DataManager with train_X/train_y or test_X populated.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], pd.DataFrame)
    self.check_phase(phase)
    df = self.impute_df(dm_list[0])
    dm = DataManager()
    label_col = df.columns[self.label_col] if phase == 'train' else None
    dm.set_col_type(df, label_col)
    data = df.values
    if phase == 'train':
        # Move the label column to the last position, then split X / y.
        reorder = list(range(data.shape[1]))
        reorder.append(reorder.pop(self.label_col))
        data = data[:, reorder]
        dm.train_X = data[:, :-1]
        dm.train_y = data[:, -1]
    else:
        dm.test_X = data
    return dm
# NOTE(review): these five prints appear to be the tail of a preceding
# scaler-test function not visible in this chunk — confirm against the
# full file before relying on their indentation level.
print("after normalize rescale\n")
print(dm.train_X)
print(dm.val_X)
print(dm.test_X)
print(dm.feature_types)


if __name__ == '__main__':
    # Fixed seed so the random matrices (and printed output) are reproducible.
    np.random.seed(19941125)
    dm = DataManager()
    dm.train_X = np.random.rand(5, 5)
    dm.val_X = np.random.rand(3, 5)
    dm.test_X = np.random.rand(2, 5)
    dm.feature_types = ["Discrete", "One-Hot", "Float", "Float", "Categorical"]
    print("Original data......\n")
    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)
    print("start test MinMaxScaler.......\n")
    test_minmax(dm)
    print("start test StandardScaler......\n")
    test_standard(dm)
    # NOTE(review): the MaxAbsScaler test call presumably follows this print
    # on a line outside this view — confirm.
    print("start test MaxAbsScaler......\n")
import pandas as pd
import warnings

from alphaml.engine.components.data_manager import DataManager
from alphaml.engine.components.feature_engineering.auto_feature import AutoFeature

warnings.filterwarnings("ignore")

# NOTE(review): ``os`` is used below but not imported in this view —
# confirm that ``import os`` appears earlier in the file.
home_path = os.path.expanduser('~')
train_path = os.path.join(home_path, "datasets/santander/train.csv")
test_path = os.path.join(home_path, "datasets/santander/test.csv")

# Load the Santander train/test CSVs and drop the row-identifier column.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_train.drop(labels=["ID_code"], axis=1, inplace=True)
df_test.drop(labels=["ID_code"], axis=1, inplace=True)

x_train = df_train.drop(labels=["target"], axis=1).values
y_train = df_train["target"].values
x_test = df_test.values
# Release the large frames before feature generation to reduce peak memory.
del df_train
del df_test

dm = DataManager(x_train, y_train)
dm.test_X = x_test
# Generate up to 100 new features, scored by AUC.
auto_feature = AutoFeature(metrics="auc")
dm = auto_feature.fit(dm, generated_num=100)