def fit(self, data: DataManager, **kwargs):
    """Prepare an image evaluator for the requested task, then delegate to the parent fit.

    For multiclass/binary image tasks the labels are remapped (the forward and
    reverse maps are stored on ``self.map_dict`` / ``self.rev_map_dict``) and,
    for multiclass, one-hot encoded.  The inferred input shape and class count
    are used to build a ``BaseImgEvaluator`` before calling ``super().fit``.
    """
    from alphaml.engine.evaluator.dl_evaluator import BaseImgEvaluator

    task_type = kwargs['task_type']

    # With no training data available, fall back to the explicitly supplied shape.
    if data.train_X is None and data.train_y is None:
        input_shape = data.target_shape
    else:
        input_shape = data.train_X.shape[1:]

    class_num = None  # stays None for task types not handled below
    if task_type == 'img_multiclass':
        data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y)
        data.train_y = to_categorical(data.train_y)
        # Validation labels reuse the mapping learned from the training labels.
        data.val_y, _, _ = map_label(data.val_y, self.map_dict)
        data.val_y = to_categorical(data.val_y)
        class_num = len(self.rev_map_dict)
    elif task_type == 'img_binary':
        data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y, if_binary=True)
        data.val_y, _, _ = map_label(data.val_y, self.map_dict, if_binary=True)
        class_num = 1  # single sigmoid output for binary classification
    elif task_type == 'img_multilabel-indicator':
        class_num = get_classnum(data.train_y)

    self.evaluator = BaseImgEvaluator(input_shape, class_num)
    return super().fit(data, **kwargs)
def operate(self, dm_list: typing.List, phase='train'):
    """Run AutoCross feature generation on the single DataManager in ``dm_list``.

    In the 'train' phase the internal AutoCross model is fitted on an 80/20
    split of the training data (stratified when ``self.stratify`` is set) and
    the full training matrix is transformed; in any other phase the
    already-fitted model transforms ``test_X``.
    """
    assert len(dm_list) == 1
    dm = dm_list[0]
    assert isinstance(dm, DataManager)
    self.check_phase(phase)

    types = dm.feature_types
    onehot_index = [idx for idx, t in enumerate(types) if t == "One-Hot"]
    numerical_index = [idx for idx, t in enumerate(types) if t in ('Discrete', 'Float')]

    result_dm = DataManager()
    if phase == 'train':
        from sklearn.model_selection import train_test_split

        # stratify=None is train_test_split's default (no stratification).
        strat = dm.train_y if self.stratify else None
        train_x, val_x, train_y, val_y = train_test_split(
            dm.train_X, dm.train_y, test_size=0.2, stratify=strat)
        self.autocross.fit(train_x, val_x, train_y, val_y,
                           onehot_index, numerical_index)
        # Transform the *entire* training set, not just the fit split.
        result_dm.train_X = self.autocross.transform(dm.train_X)
        result_dm.train_y = dm.train_y
    else:
        result_dm.test_X = self.autocross.transform(dm.test_X)
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Derive a single feature: the number of zero-valued columns in each row.

    ``dm_list`` must contain exactly one DataManager.  Returns a new
    DataManager whose train_X (plus the original train_y) or test_X is an
    (n_samples, 1) array of per-row zero counts.

    Previously the counting loop was duplicated verbatim for the train and
    test phases; it is now factored into ``_count_zero_features`` so both
    branches share one implementation.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]

    result_dm = DataManager()
    if phase == 'train':
        result_dm.train_X = self._count_zero_features(dm.train_X)
        result_dm.train_y = dm.train_y
    else:
        result_dm.test_X = self._count_zero_features(dm.test_X)
    return result_dm

@staticmethod
def _count_zero_features(x):
    """Return an (n_samples, 1) float array holding each row's count of zeros."""
    new_feature = np.zeros((len(x), 1))
    for i, sample in enumerate(x):
        # Generator-based count replaces the manual cnt += 1 loop.
        new_feature[i] = sum(1 for column in sample if column == 0)
    return new_feature
def operate(self, dm_list: typing.List, phase='train'):
    """Project the numerical columns of the input DataManager through PCA.

    ``dm_list`` must contain exactly one DataManager.  Only 'Float' and
    'Discrete' columns are projected.  In the 'train' phase the PCA is fitted
    and applied; in other phases the *already-fitted* PCA is applied.

    BUG FIX: the test branch previously called ``fit_transform``, which
    re-fitted the PCA on the test data and produced a projection inconsistent
    with the one learned at training time.  It now calls ``transform``.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]

    feature_types = dm.feature_types
    numerical_index = [i for i, t in enumerate(feature_types)
                       if t in ("Float", "Discrete")]

    result_dm = DataManager()
    if phase == 'train':
        result_dm.train_X = self.pca.fit_transform(dm.train_X[:, numerical_index])
        result_dm.train_y = dm.train_y
    else:
        # Reuse the components fitted on the training data.
        result_dm.test_X = self.pca.transform(dm.test_X[:, numerical_index])
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Select the top ``self.kbest`` features according to the selector model.

    Train phase: horizontally stacks ``train_X`` from every DataManager
    (labels taken from the first one), fits ``self.selector``, then ranks
    features by importance.  Test phase: stacks ``test_X`` and reuses the
    fitted selector.  In both phases the ranking is (re)computed from the
    selector's attributes and the top-k columns are kept.

    :return: a new DataManager holding the reduced feature matrix.
    """
    x = None
    y = None
    if phase == 'train':
        for idx, mgr in enumerate(dm_list):
            if idx == 0:
                x, y = mgr.train_X, mgr.train_y
            else:
                x = np.hstack((x, mgr.train_X))
        self.selector.fit(x, y)
    else:
        for idx, mgr in enumerate(dm_list):
            x = mgr.test_X if idx == 0 else np.hstack((x, mgr.test_X))

    # Rank features from most to least important.
    if self.model == self.RANDOM_FOREST:
        self.sorted_features = np.argsort(self.selector.feature_importances_)[::-1]
    elif self.model == self.LASSO_REGRESSION:
        coef = self.selector.coef_
        # Multi-output lasso: aggregate per-feature weight with an L1 norm
        # across outputs; single-output uses the raw coefficients.
        importances = coef if coef.ndim == 1 else np.linalg.norm(coef, axis=0, ord=1)
        self.sorted_features = np.argsort(importances)[::-1]

    x = x[:, self.sorted_features[:self.kbest]]

    out = DataManager()
    if phase == 'train':
        out.train_X = x
        out.train_y = y
    else:
        out.test_X = x
    return out
def operate(self, dm_list: typing.List, phase='train') -> DataManager:
    """Generate polynomial features from the numerical columns of the input.

    ``dm_list`` must contain exactly one DataManager.  The transformer is
    fitted on ``train_X`` during the 'train' phase and merely applied to
    ``test_X`` otherwise.  The first ``len(numerical) + 1`` output columns
    (bias term plus the original degree-1 features) are stripped so only the
    newly generated higher-order terms are returned.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]

    num_cols = [i for i, t in enumerate(dm.feature_types)
                if t in ("Float", "Discrete")]
    # Number of leading output columns to drop: bias + originals.
    skip = len(num_cols) + 1

    result_dm = DataManager()
    if phase == 'train':
        expanded = self.polynomialfeatures.fit_transform(dm.train_X[:, num_cols])
        result_dm.train_X = expanded[:, skip:]
        result_dm.train_y = dm.train_y
    else:
        expanded = self.polynomialfeatures.transform(dm.test_X[:, num_cols])
        result_dm.test_X = expanded[:, skip:]
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    """Impute a raw DataFrame and package the result as a DataManager.

    ``dm_list`` must contain exactly one pandas DataFrame.  During the
    'train' phase the label column (index ``self.label_col``) is moved to the
    last position so that the matrix can be split into train_X / train_y;
    during other phases the whole imputed matrix becomes test_X.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], pd.DataFrame)
    self.check_phase(phase)

    df = self.impute_df(dm_list[0])
    dm = DataManager()
    label_col = df.columns[self.label_col] if phase == 'train' else None
    dm.set_col_type(df, label_col)

    data = df.values
    if phase == 'train':
        # Move the label column to the end, then split features from label.
        order = list(range(data.shape[1]))
        order.pop(self.label_col)  # pop mirrors del: negative indices behave the same
        order.append(self.label_col)
        data = data[:, order]
        dm.train_X = data[:, :-1]
        dm.train_y = data[:, -1]
    else:
        dm.test_X = data
    return dm
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1]) df.drop(columns="Ticket", axis=1, inplace=True) for i in range(df.shape[0]): if df["Cabin"][i] == "C23 C25 C27": df["Cabin"][i] = 0 else: df["Cabin"][i] = 1 df["Cabin"] = df["Cabin"].astype("float") df = pd.get_dummies(df) x = df.values x_train = x[:train_size] x_test = x[train_size:] dm = DataManager() dm.train_X = x_train dm.train_y = y_train clf = Classifier(optimizer="smbo") clf.fit(dm, metric="accuracy", runcount=200) submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv") submission["Survived"] = clf.predict(x_test) submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)