Ejemplo n.º 1
0
    def fit(self, data: DataManager, **kwargs):
        """Encode the labels, build the image evaluator and delegate to the parent ``fit``.

        The input shape and the number of output units are derived from
        ``data`` and ``kwargs['task_type']`` and passed to the
        ``BaseImgEvaluator`` that is stored on ``self.evaluator``.

        :param data: DataManager; for the multiclass and binary tasks its
            ``train_y`` and ``val_y`` are re-encoded in place.
        :param kwargs: must contain ``task_type``; forwarded unchanged to
            ``super().fit``.
        :return: the result of ``super().fit(data, **kwargs)``.
        :raises KeyError: if ``task_type`` is not present in ``kwargs``.
        """
        from alphaml.engine.evaluator.dl_evaluator import BaseImgEvaluator

        task_type = kwargs['task_type']
        # Stays None unless one of the recognised task types sets it below.
        n_classes = None
        has_no_training_data = data.train_X is None and data.train_y is None

        if has_no_training_data:
            # No arrays to inspect: use the shape declared on the manager.
            input_shape = data.target_shape
        else:
            # Everything after the first axis of the training array.
            input_shape = data.train_X.shape[1:]

            if task_type == 'img_binary':
                data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y, if_binary=True)
                # Validation labels reuse the mapping built from the training labels.
                data.val_y, _, _ = map_label(data.val_y, self.map_dict, if_binary=True)
                n_classes = 1
            elif task_type == 'img_multiclass':
                data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y)
                data.train_y = to_categorical(data.train_y)
                # Validation labels reuse the mapping built from the training labels.
                data.val_y, _, _ = map_label(data.val_y, self.map_dict)
                data.val_y = to_categorical(data.val_y)
                n_classes = len(self.rev_map_dict)
            elif task_type == 'img_multilabel-indicator':
                n_classes = get_classnum(data.train_y)

        self.evaluator = BaseImgEvaluator(input_shape, n_classes)
        return super().fit(data, **kwargs)
Ejemplo n.º 2
0
    def operate(self, dm_list: typing.List, phase='train'):
        """Fit (train phase only) and apply ``self.autocross`` to a single DataManager.

        :param dm_list: list holding exactly one DataManager.
        :param phase: ``'train'`` fits ``self.autocross`` on an 80/20 split of
            the training data and transforms ``train_X``; any other phase
            accepted by ``self.check_phase`` transforms ``test_X`` only.
        :return: a new DataManager holding the transformed features (and
            ``train_y`` in the train phase).
        """
        # The operator takes exactly one DataManager as input.
        assert len(dm_list) == 1
        dm = dm_list[0]
        assert isinstance(dm, DataManager)
        self.check_phase(phase)

        # Partition the column positions by their declared feature type.
        onehot_index = []
        numerical_index = []
        for position, kind in enumerate(dm.feature_types):
            if kind == "One-Hot":
                onehot_index.append(position)
            if kind == 'Discrete' or kind == 'Float':
                numerical_index.append(position)

        if phase != 'train':
            source = dm.test_X
            result_dm = DataManager()
            result_dm.test_X = self.autocross.transform(source)
            return result_dm

        from sklearn.model_selection import train_test_split

        # Hold out 20% of the training data for the fit, optionally stratified on the labels.
        split_options = {'test_size': 0.2}
        if self.stratify:
            split_options['stratify'] = dm.train_y
        train_x, val_x, train_y, val_y = train_test_split(dm.train_X, dm.train_y, **split_options)

        source = dm.train_X
        self.autocross.fit(train_x, val_x, train_y, val_y, onehot_index, numerical_index)
        result_dm = DataManager()
        # The transform is applied to the full training matrix, not only the fitted split.
        result_dm.train_X = self.autocross.transform(source)
        result_dm.train_y = dm.train_y
        return result_dm
Ejemplo n.º 3
0
    def operate(self, dm_list: typing.List, phase='train'):
        """Build a single feature holding the number of zero entries in each sample.

        :param dm_list: list holding exactly one DataManager.
        :param phase: ``'train'`` reads ``train_X`` and also copies ``train_y``
            to the result; any other phase accepted by ``self.check_phase``
            reads ``test_X``.
        :return: a new DataManager whose ``train_X`` (or ``test_X``) is a
            float array of shape ``(n_samples, 1)`` with the per-row zero count.
        """
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        is_train = phase == 'train'
        x = dm.train_X if is_train else dm.test_X

        if len(x) == 0:
            # No samples: keep the (0, 1) shape without reducing over a missing axis.
            newfeature = np.zeros((0, 1))
        else:
            # One vectorized pass replaces the nested per-element Python loop.
            # NOTE(review): assumes x is a 2-D numeric (or object) matrix — confirm with callers.
            zero_counts = np.count_nonzero(np.asarray(x) == 0, axis=1)
            newfeature = zero_counts.astype(float).reshape(-1, 1)

        result_dm = DataManager()
        if is_train:
            result_dm.train_X = newfeature
            result_dm.train_y = dm.train_y
        else:
            result_dm.test_X = newfeature
        return result_dm
Ejemplo n.º 4
0
    def operate(self, dm_list: typing.List, phase='train'):
        """Project the numerical columns of a single DataManager with ``self.pca``.

        :param dm_list: list holding exactly one DataManager.
        :param phase: ``'train'`` fits ``self.pca`` on the numerical columns of
            ``train_X`` and transforms them; any other phase accepted by
            ``self.check_phase`` only transforms ``test_X`` with the
            already-fitted model.
        :return: a new DataManager holding the projected features (and
            ``train_y`` in the train phase).
        """
        # The input of a PCAOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        # Only columns typed "Float" or "Discrete" are fed to the PCA.
        numerical_index = [i for i, feature_type in enumerate(dm.feature_types)
                           if feature_type in ("Float", "Discrete")]

        result_dm = DataManager()
        if phase == 'train':
            x = dm.train_X
            result_dm.train_X = self.pca.fit_transform(x[:, numerical_index])
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            # Reuse the components learned on the training data. Re-fitting here
            # would project test samples into a different space than the one
            # the downstream model was trained on.
            result_dm.test_X = self.pca.transform(x[:, numerical_index])
        return result_dm
Ejemplo n.º 5
0
    def operate(self, dm_list: typing.List, phase='train'):
        """Keep the ``self.kbest`` features ranked highest by ``self.selector``.

        The feature matrices of all DataManagers in ``dm_list`` are stacked
        column-wise. In the train phase ``self.selector`` is fitted on the
        stacked matrix and the labels of the first DataManager; in every phase
        the ranking is then read back from the fitted selector and stored on
        ``self.sorted_features``.

        :param dm_list: non-empty list of DataManagers to concatenate.
        :param phase: ``'train'`` uses ``train_X``/``train_y`` and fits the
            selector; any other value uses ``test_X`` and reuses the fitted one.
        :return: a new DataManager holding only the selected columns (and
            ``train_y`` in the train phase).
        :raises ValueError: if ``dm_list`` is empty.
        """
        if not dm_list:
            raise ValueError("dm_list must contain at least one DataManager")

        is_train = phase == 'train'
        y = None
        if is_train:
            blocks = [dm.train_X for dm in dm_list]
            # Labels are taken from the first DataManager only.
            y = dm_list[0].train_y
        else:
            blocks = [dm.test_X for dm in dm_list]

        # A single hstack avoids re-copying the growing matrix for every input.
        x = blocks[0] if len(blocks) == 1 else np.hstack(blocks)

        if is_train:
            self.selector.fit(x, y)

        if self.model == self.RANDOM_FOREST:
            self.sorted_features = np.argsort(
                self.selector.feature_importances_)[::-1]
        elif self.model == self.LASSO_REGRESSION:
            coef = self.selector.coef_
            if coef.ndim == 1:
                # Rank by magnitude: a large negative coefficient is as
                # informative as a large positive one. This matches the L1-norm
                # ranking used for the multi-output case below.
                importances = np.abs(coef)
            else:
                importances = np.linalg.norm(coef, axis=0, ord=1)
            self.sorted_features = np.argsort(importances)[::-1]

        x = x[:, self.sorted_features[:self.kbest]]
        result_dm = DataManager()
        if is_train:
            result_dm.train_X = x
            result_dm.train_y = y
        else:
            result_dm.test_X = x
        return result_dm
Ejemplo n.º 6
0
    def operate(self, dm_list: typing.List, phase='train') -> DataManager:
        """Expand the numerical columns of a single DataManager with ``self.polynomialfeatures``.

        :param dm_list: list holding exactly one DataManager.
        :param phase: ``'train'`` fits and transforms ``train_X``; any other
            phase accepted by ``self.check_phase`` only transforms ``test_X``.
        :return: a new DataManager holding the generated columns from position
            ``len(numerical columns) + 1`` onwards (and ``train_y`` in the
            train phase).
        """
        # The operator takes exactly one DataManager as input.
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        numeric_columns = []
        for position, kind in enumerate(dm.feature_types):
            if kind == "Float" or kind == "Discrete":
                numeric_columns.append(position)

        # Leading output columns that are dropped from the result.
        # NOTE(review): presumably the bias column plus the untouched inputs — confirm
        # against how self.polynomialfeatures is configured.
        skip = len(numeric_columns) + 1

        if phase != 'train':
            source = dm.test_X
            expanded = self.polynomialfeatures.transform(source[:, numeric_columns])
            result_dm = DataManager()
            result_dm.test_X = expanded[:, skip:]
            return result_dm

        source = dm.train_X
        expanded = self.polynomialfeatures.fit_transform(source[:, numeric_columns])
        result_dm = DataManager()
        result_dm.train_X = expanded[:, skip:]
        result_dm.train_y = dm.train_y
        return result_dm
Ejemplo n.º 7
0
    def operate(self, dm_list: typing.List, phase='train'):
        """Impute a DataFrame and wrap its values in a new DataManager.

        :param dm_list: list holding exactly one ``pd.DataFrame``.
        :param phase: ``'train'`` splits off the column at ``self.label_col``
            as ``train_y``; any other phase accepted by ``self.check_phase``
            stores all columns as ``test_X``.
        :return: the populated DataManager.
        """
        # The operator takes exactly one pd.DataFrame as input.
        assert len(dm_list) == 1 and isinstance(dm_list[0], pd.DataFrame)
        self.check_phase(phase)

        is_train = phase == 'train'
        imputed = self.impute_df(dm_list[0])
        result = DataManager()

        # The label column is only looked up when training.
        label_name = None
        if is_train:
            label_name = imputed.columns[self.label_col]
        result.set_col_type(imputed, label_name)

        values = imputed.values
        if not is_train:
            result.test_X = values
            return result

        # Move the label column to the last position, then split features from labels.
        column_order = list(range(values.shape[1]))
        column_order.pop(self.label_col)
        column_order.append(self.label_col)
        values = values[:, column_order]
        result.train_X = values[:, :-1]
        result.train_y = values[:, -1]
        return result
Ejemplo n.º 8
0
# Encode the sex column as an integer feature (male -> 0, female -> 1).
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1])

# `columns=` already names the axis, so no separate `axis` argument is needed.
df.drop(columns="Ticket", inplace=True)

# Binary cabin indicator: 0.0 for the cabin "C23 C25 C27", 1.0 for everything else
# (including missing values). Written as one vectorized column assignment because
# the chained form `df["Cabin"][i] = ...` writes through an intermediate object
# and is not guaranteed to modify `df`.
# NOTE(review): presumably "C23 C25 C27" is a fill value set upstream — confirm.
df["Cabin"] = (df["Cabin"] != "C23 C25 C27").astype("float")

# One-hot encode whatever categorical columns remain.
df = pd.get_dummies(df)

x = df.values

# The first `train_size` rows are the training set, the rest are the test set.
x_train = x[:train_size]
x_test = x[train_size:]

dm = DataManager()
dm.train_X = x_train
dm.train_y = y_train


clf = Classifier(optimizer="smbo")
clf.fit(dm, metric="accuracy", runcount=200)

# Reuse the sample submission file as the template and overwrite its predictions.
submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv")
submission["Survived"] = clf.predict(x_test)
submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)