Example #1
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of an AutoCrossOperator is a DataManager
        assert len(dm_list) == 1
        dm = dm_list[0]
        assert isinstance(dm, DataManager)
        self.check_phase(phase)

        feature_types = dm.feature_types
        onehot_index = [i for i in range(len(feature_types))
                        if feature_types[i] == "One-Hot"]
        numerical_index = [i for i in range(len(feature_types))
                           if feature_types[i] == 'Discrete' or feature_types[i] == 'Float']

        if phase == 'train':
            from sklearn.model_selection import train_test_split
            if self.stratify:
                train_x, val_x, train_y, val_y = train_test_split(dm.train_X, dm.train_y, test_size=0.2,
                                                                  stratify=dm.train_y)
            else:
                train_x, val_x, train_y, val_y = train_test_split(dm.train_X, dm.train_y, test_size=0.2)
            x = dm.train_X
            self.autocross.fit(train_x, val_x, train_y, val_y, onehot_index, numerical_index)
            result_dm = DataManager()
            result_dm.train_X = self.autocross.transform(x)
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            result_dm = DataManager()
            result_dm.test_X = self.autocross.transform(x)
        return result_dm
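A minimal usage sketch for the operator above; the default constructor, the phase='test' value, and the X_train / y_train / X_test arrays are assumptions, since only the operate method is shown in this example.

op = AutoCrossOperator()                               # assumed default constructor
train_dm = DataManager()
train_dm.train_X, train_dm.train_y = X_train, y_train  # placeholder arrays
crossed_train = op.operate([train_dm], phase='train')  # fits AutoCross on a held-out split

test_dm = DataManager()
test_dm.test_X = X_test                                # placeholder array
crossed_test = op.operate([test_dm], phase='test')     # reuses the fitted AutoCross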
Example #2
    def operate(self, dm_list: typing.List, phase='train'):
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        if phase == 'train':
            x = dm.train_X
            newfeature = np.zeros((len(x), 1))
            for i, sample in enumerate(x):
                cnt = 0
                for column in sample:
                    if column == 0:
                        cnt += 1
                newfeature[i] = cnt
            result_dm = DataManager()
            result_dm.train_X = newfeature
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            newfeature = np.zeros((len(x), 1))
            for i, sample in enumerate(x):
                cnt = 0
                for column in sample:
                    if column == 0:
                        cnt += 1
                newfeature[i] = cnt
            result_dm = DataManager()
            result_dm.test_X = newfeature
        return result_dm
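The zero-counting loops above can be collapsed into a single NumPy expression; a minimal equivalent sketch, assuming x is a 2-D numeric array:

# Count zero-valued entries per row; equivalent to the loops above, shape (n, 1).
newfeature = (x == 0).sum(axis=1, keepdims=True).astype(float)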
Example #3
def test_impute_dm():
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8],
                        ["c", 10, "java", 4.8]])

    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8],
                        ["d", 10, "python", 4.8]])

    test_x = np.array([["a", 1, "scala", 4.5]])

    train_x[2][0] = "???"
    train_x[2][2] = "???"
    valid_x[0][1] = np.nan
    test_x[0][-1] = np.nan

    dm = DataManager()

    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]

    dm.train_X = train_x.astype(object)
    dm.val_X = valid_x.astype(object)
    dm.test_X = test_x.astype(object)

    dm = impute_dm(dm, "???")

    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
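impute_dm itself is not shown in this excerpt; as a rough point of comparison (not the actual implementation), a similar imputation step for categorical columns could be written with scikit-learn's SimpleImputer:

import numpy as np
from sklearn.impute import SimpleImputer

# Replace the "???" marker with the most frequent value per column.
imputer = SimpleImputer(missing_values="???", strategy="most_frequent")
cat_x = np.array([["a", "python"], ["b", "c++"], ["???", "???"]], dtype=object)
print(imputer.fit_transform(cat_x))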
Example #4
def test_categorical_indexer():
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8],
                        ["c", 10, "java", 4.8]])

    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8],
                        ["d", 10, "python", 4.8]])

    test_x = np.array([["a", 1, "scala", 4.5]])

    dm = DataManager()

    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]

    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x

    dm = categorical_indexer(dm)

    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
Example #5
def one_hot(dm: DataManager) -> DataManager:
    """
    Convert the categorical features to float with one-hot encoding
    :param dm:
    :return:
    """
    feature_types = dm.feature_types
    categorical_index = [
        i for i in range(len(feature_types))
        if feature_types[i] == "Categorical"
    ]
    other_index = [
        i for i in range(len(feature_types))
        if feature_types[i] != "Categorical"
    ]

    encoder = OneHotEncoder(handle_unknown="ignore")
    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()

    if train_x is None:
        raise ValueError("train_x has no value!!!")
    train_size = len(train_x)
    valid_size = 0
    test_size = 0
    if valid_x is not None and test_x is not None:
        x = np.concatenate([train_x, valid_x, test_x])
        valid_size = len(valid_x)
        test_size = len(test_x)
    elif valid_x is not None:
        x = np.concatenate([train_x, valid_x])
        valid_size = len(valid_x)
    elif test_x is not None:
        x = np.concatenate([train_x, test_x])
        test_size = len(test_x)
    else:
        x = train_x
    categorical_x = x[:, categorical_index]
    other_x = x[:, other_index]

    encoder.fit(categorical_x)
    categorical_x = encoder.transform(categorical_x).toarray()

    categorical_features = ["One-Hot"] * categorical_x.shape[1]
    other_features = [feature_types[i] for i in other_index]

    x = np.hstack((categorical_x, other_x)).astype(float)
    dm.feature_types = np.concatenate((categorical_features, other_features))

    train_x, valid_x, test_x = _split_data(x, train_size, valid_size,
                                           test_size)
    if valid_size == 0:
        valid_x = None
    if test_size == 0:
        test_x = None

    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x

    return dm
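one_hot relies on a _split_data helper that does not appear in this excerpt; the following is only a plausible sketch of its behavior, inferred from how it is called above:

def _split_data(x, train_size, valid_size, test_size):
    # Slice the concatenated matrix back into its train/valid/test parts.
    train_x = x[:train_size]
    valid_x = x[train_size:train_size + valid_size]
    test_x = x[train_size + valid_size:train_size + valid_size + test_size]
    return train_x, valid_x, test_x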
Example #6
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of a PCAOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        feature_types = dm.feature_types
        numerical_index = [i for i in range(len(feature_types))
                           if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
        if phase == 'train':
            x = dm.train_X
            result_dm = DataManager()
            result_dm.train_X = self.pca.fit_transform(x[:, numerical_index])
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            result_dm = DataManager()
            result_dm.test_X = self.pca.transform(x[:, numerical_index])
        return result_dm
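The fit-on-train / transform-on-test split used above mirrors plain scikit-learn usage; a small standalone sketch with made-up data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X_train, X_test = rng.rand(20, 5), rng.rand(4, 5)

pca = PCA(n_components=2)
X_train_reduced = pca.fit_transform(X_train)   # fit on training data only
X_test_reduced = pca.transform(X_test)         # reuse the fitted components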
Example #7
    def operate(self, dm_list: typing.List, phase='train'):
        '''
        :return: a new DataManager containing only the k best features selected by the fitted model
        '''
        x = None
        y = None
        if phase == 'train':
            for dm in dm_list:
                if x is None:
                    x = dm.train_X
                    y = dm.train_y
                else:
                    x = np.hstack((x, dm.train_X))
            self.selector.fit(x, y)
        else:
            for dm in dm_list:
                if x is None:
                    x = dm.test_X
                else:
                    x = np.hstack((x, dm.test_X))

        if self.model == self.RANDOM_FOREST:
            self.sorted_features = np.argsort(
                self.selector.feature_importances_)[::-1]
        elif self.model == self.LASSO_REGRESSION:
            if self.selector.coef_.ndim == 1:
                self.sorted_features = np.argsort(self.selector.coef_)[::-1]
            else:
                importances = np.linalg.norm(self.selector.coef_,
                                             axis=0,
                                             ord=1)
                self.sorted_features = np.argsort(importances)[::-1]
        x = x[:, self.sorted_features[:self.kbest]]
        dm = DataManager()
        if phase == 'train':
            dm.train_X = x
            dm.train_y = y
        else:
            dm.test_X = x
        return dm
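A small standalone illustration of the ranking logic above, with made-up importance values:

import numpy as np

importances = np.array([0.1, 0.5, 0.2, 0.05])
sorted_features = np.argsort(importances)[::-1]   # [1 2 0 3], most important first
kbest = 2
print(sorted_features[:kbest])                    # [1 2]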
Example #8
    def operate(self, dm_list: typing.List, phase='train') -> DataManager:
        # The input of a PolynomialFeatureOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        feature_types = dm.feature_types
        numerical_index = [i for i in range(len(feature_types))
                           if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
        init_length = len(numerical_index) + 1
        if phase == 'train':
            x = dm.train_X
            newfeatures = self.polynomialfeatures.fit_transform(x[:, numerical_index])
            result_dm = DataManager()
            result_dm.train_X = newfeatures[:, init_length:]
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            newfeatures = self.polynomialfeatures.transform(x[:, numerical_index])
            result_dm = DataManager()
            result_dm.test_X = newfeatures[:, init_length:]
        return result_dm
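A quick check of the slicing above, assuming the default PolynomialFeatures settings (degree=2, include_bias=True): for two numerical features a and b the expanded columns are [1, a, b, a^2, a*b, b^2], so dropping the first len(numerical_index) + 1 columns keeps only the newly generated terms.

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

x = np.array([[2.0, 3.0]])
expanded = PolynomialFeatures(degree=2).fit_transform(x)   # [[1. 2. 3. 4. 6. 9.]]
init_length = x.shape[1] + 1
print(expanded[:, init_length:])                           # [[4. 6. 9.]]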
Example #9
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of an ImputeOperator is a pd.DataFrame
        assert len(dm_list) == 1 and isinstance(dm_list[0], pd.DataFrame)
        self.check_phase(phase)

        input_df = dm_list[0]
        df = self.impute_df(input_df)
        dm = DataManager()

        label_col = df.columns[self.label_col] if phase == 'train' else None
        dm.set_col_type(df, label_col)
        data = df.values
        if phase == 'train':
            # Move the label column to the last position
            swap_list = list(range(data.shape[1]))
            del swap_list[self.label_col]
            swap_list.append(self.label_col)
            data = data[:, swap_list]
            dm.train_X = data[:, :-1]
            dm.train_y = data[:, -1]
        else:
            dm.test_X = data
        return dm
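A tiny illustration of the column swap above, assuming label_col = 1 and four columns:

import numpy as np

data = np.array([[10, 0, 11, 12],
                 [20, 1, 21, 22]])
label_col = 1
swap_list = list(range(data.shape[1]))   # [0, 1, 2, 3]
del swap_list[label_col]                 # [0, 2, 3]
swap_list.append(label_col)              # [0, 2, 3, 1] -> label column moves last
print(data[:, swap_list])
# [[10 11 12  0]
#  [20 21 22  1]]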
Example #10
def test_normalize(dm):
    dm = normalize(dm)

    print("after normalize rescale\n")

    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)


if __name__ == '__main__':
    np.random.seed(19941125)

    dm = DataManager()
    dm.train_X = np.random.rand(5, 5)
    dm.val_X = np.random.rand(3, 5)
    dm.test_X = np.random.rand(2, 5)
    dm.feature_types = ["Discrete", "One-Hot", "Float", "Float", "Categorical"]

    print("Original data......\n")
    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)

    print("start test MinMaxScaler.......\n")
    test_minmax(dm)

    print("start test StandardScaler......\n")
    test_standard(dm)
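test_minmax and test_standard are not shown in this excerpt; a comparable check written directly against scikit-learn's scalers (an assumption, not the project's helpers) could look like:

def test_scalers(dm):
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    for scaler in (MinMaxScaler(), StandardScaler()):
        print(scaler.__class__.__name__)
        print(scaler.fit_transform(dm.train_X))   # fit on train only
        print(scaler.transform(dm.val_X))
        print(scaler.transform(dm.test_X))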
Example #11
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1])

df.drop(columns="Ticket", axis=1, inplace=True)

# Map the shared cabin "C23 C25 C27" to 0 and every other value to 1
df["Cabin"] = (df["Cabin"] != "C23 C25 C27").astype("float")

df = pd.get_dummies(df)

x = df.values

x_train = x[:train_size]
x_test = x[train_size:]

dm = DataManager()
dm.train_X = x_train
dm.train_y = y_train


clf = Classifier(optimizer="smbo")
clf.fit(dm, metric="accuracy", runcount=200)

submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv")
submission["Survived"] = clf.predict(x_test)
submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)
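The snippet above assumes df, train_size, y_train, and home_path were prepared earlier; a plausible setup for the Titanic data, written here as an assumption:

import pandas as pd

home_path = "."   # assumed project root
train_df = pd.read_csv(home_path + "/datasets/titanic/train.csv")
test_df = pd.read_csv(home_path + "/datasets/titanic/test.csv")

y_train = train_df.pop("Survived").values
train_size = len(train_df)
df = pd.concat([train_df, test_df], ignore_index=True)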