Example #1
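    # Note: this snippet assumes the surrounding module imports numpy as np,
    # pandas as pd, gc, copy, os, time, datetime and sklearn's roc_auc_score,
    # plus project modules providing CONSTANT, log, Config, Graph, Merger,
    # DeafultMergeFeatPipeline, DefaultFeatPipeline, FeatEngine, FeatOutput,
    # AutoLGB, time_train_test_split, Model, feature_importance_path and version.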
    def my_fit(self, Xs, y, time_ramain, X_test):
        np.random.seed(CONSTANT.SEED)

        split = CONSTANT.SPLIT

        self.split = split

        log(f'split {split}')

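        # CONSTANT.SPLIT selects the fit mode: -1 fits on all training data and keeps
        # the transformed test set for later prediction; -2 carves out a time-based
        # validation split and reports AUC on it.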
        if split == -1:
            config = Config(time.time(), self.info['time_budget'])

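            # Give test rows negative indices so they can be told apart from training
            # rows after train and test are concatenated.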
            X_test.index = -X_test.index - 1

            main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0]
            main_max_shape = 2888888
            main_min_shape = min(main_shape, 100000)

            test_shape = X_test.shape[0]
            max_accept_shape = 3999999

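            # If train + test exceeds the accepted row budget, keep only the most
            # recent training rows (sorted by the time column when one is present).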
            if main_shape + test_shape > max_accept_shape:
                sample_main_shape = max_accept_shape - test_shape
                if sample_main_shape > main_max_shape:
                    sample_main_shape = main_max_shape
                if sample_main_shape < main_min_shape:
                    sample_main_shape = main_min_shape
                log(f'start sample main table. origin main shape {main_shape} '
                    f'test shape {test_shape} sample rows num {sample_main_shape}')
                if 'time_col' in self.info:
                    key_time_col = self.info['time_col']
                    if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns:
                        Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(
                            by=key_time_col, inplace=True)
                Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[
                    CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:]
                gc.collect()

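            # Append the test rows to the main table so preprocessing and feature
            # engineering see train and test together.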
            Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat(
                [Xs[CONSTANT.MAIN_TABLE_NAME], X_test])

            X_test.drop(X_test.columns, axis=1, inplace=True)
            gc.collect()

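            # Build the table graph and recover row indices: training rows are >= 0,
            # test rows carry the negative indices assigned above; y is aligned to
            # the training rows.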
            graph = Graph(self.info, Xs)
            graph.sort_tables()
            train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
            y = y.loc[train_index]
            test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index < 0]

            graph.preprocess_fit_transform()
            gc.collect()

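            # Merge all related tables into the main table, then release the originals.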
            merge_feat_pipeline = DeafultMergeFeatPipeline()
            merger = Merger(merge_feat_pipeline)

            merger.merge_table(graph)
            main_table = merger.merge_to_main_fit_transform(graph)
            self.release_tables(Xs, graph)
            del merger
            del graph
            gc.collect()

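            # First-order feature engineering on the merged table.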
            feat_pipeline = DefaultFeatPipeline()
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_order1(main_table, y)

            sample_for_combine_features = True

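            # Order-2 combination features: when the training set is large enough, fit
            # feature selection on a tail sample of the training rows, then generate
            # the selected features on the full table without re-running selection.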
            if sample_for_combine_features:
                main_data = main_table.data
                train_data = main_data.loc[main_data.index >= 0]

                del main_data

                sample_num = CONSTANT.SAMPLE_NUM
                train_shape = train_data.shape

                if train_shape[0] <= sample_num:
                    sample_for_combine_features = False
                else:
                    data_tail_new = train_data.iloc[-sample_num:]

                    gc.collect()

                    y_tail_new = y.loc[data_tail_new.index]

                    table_tail_new = copy.deepcopy(main_table)
                    table_tail_new.data = data_tail_new

                    del data_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(table_tail_new,
                                                         y_tail_new,
                                                         sample=True)
                    feat_engine.fit_transform_keys_order2(table_tail_new,
                                                          y_tail_new,
                                                          sample=True)

                    del table_tail_new, y_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(main_table,
                                                         y,
                                                         selection=False)
                    feat_engine.fit_transform_keys_order2(main_table,
                                                          y,
                                                          selection=False)

                    feat_engine.fit_transform_post_order1(main_table, y)

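            # Small training sets take the direct path: order-2 (and order-3 key)
            # features are fitted on the full table.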
            if not sample_for_combine_features:
                gc.collect()

                feat_engine.fit_transform_all_order2(main_table, y)
                feat_engine.fit_transform_keys_order2(main_table, y)

                feat_engine.fit_transform_keys_order3(main_table, y)
                feat_engine.fit_transform_post_order1(main_table, y)

            del feat_engine
            gc.collect()

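            # Split the engineered data back into train and test rows; the test part
            # is kept on self.test_table for prediction later.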
            X_test = main_table.data.loc[test_index]
            main_table.data = main_table.data.loc[train_index]

            gc.collect()

            test_table = copy.deepcopy(main_table)
            test_table.data = X_test
            self.test_table = test_table
            len_test = X_test.shape[0]
            gc.collect()

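            # Merge-order-1 features and the final output transform are fitted on the
            # training rows and stored so the test table can be transformed the same way.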
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_merge_order1(main_table, y)
            self.feat_engine = feat_engine

            feat_output = FeatOutput()
            self.feat_output = feat_output
            X, y, categories = feat_output.final_fit_transform_output(
                main_table, y)

            del main_table
            gc.collect()

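            # Model fitting: compute and tune AutoLGB parameters on a time-based
            # 80/20 split, then train the ensemble on the full shuffled training data.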
            lgb = AutoLGB()

            lgb.param_compute(X, y, categories, config)
            X_train, y_train, X_test, y_test = time_train_test_split(
                X, y, test_rate=0.2)

            lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)

            gc.collect()

            del X_train, y_train, X_test, y_test

            gc.collect()

            X, y = self.shuffle(X, y, 2019)
            gc.collect()

            lgb.ensemble_train(X, y, categories, config, len_test)

            gc.collect()

            importances = lgb.get_ensemble_importances()

            self.model = lgb
            del X, y

        elif split == -2:

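            # split == -2: offline validation path. Feature engineering runs on the
            # training data only and a time-based holdout is used to report AUC.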
            config = Config(time.time(), self.info['time_budget'])

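            # No test rows are appended here; the single-element concat just copies
            # the main table.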
            Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME]])

            gc.collect()

            graph = Graph(self.info, Xs)
            graph.sort_tables()
            train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
            y = y.loc[train_index]

            graph.preprocess_fit_transform()
            gc.collect()

            merge_feat_pipeline = DeafultMergeFeatPipeline()
            merger = Merger(merge_feat_pipeline)

            merger.merge_table(graph)
            main_table = merger.merge_to_main_fit_transform(graph)
            self.release_tables(Xs, graph)
            del merger
            del graph
            gc.collect()

            feat_pipeline = DefaultFeatPipeline()
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_order1(main_table, y)

            sample_for_combine_features = True

            if sample_for_combine_features:
                main_data = main_table.data
                train_data = main_data.loc[main_data.index >= 0]

                del main_data

                sample_num = CONSTANT.SAMPLE_NUM
                train_shape = train_data.shape

                if train_shape[0] <= sample_num:
                    sample_for_combine_features = False
                else:
                    data_tail_new = train_data.iloc[-sample_num:]

                    gc.collect()
                    log(f'sample data shape {data_tail_new.shape}')

                    y_tail_new = y.loc[data_tail_new.index]

                    table_tail_new = copy.deepcopy(main_table)
                    table_tail_new.data = data_tail_new

                    del data_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(table_tail_new,
                                                         y_tail_new,
                                                         sample=True)
                    feat_engine.fit_transform_keys_order2(table_tail_new,
                                                          y_tail_new,
                                                          sample=True)

                    del table_tail_new, y_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(main_table,
                                                         y,
                                                         selection=False)
                    feat_engine.fit_transform_keys_order2(main_table,
                                                          y,
                                                          selection=False)
                    feat_engine.fit_transform_post_order1(main_table, y)

            if not sample_for_combine_features:
                gc.collect()

                feat_engine.fit_transform_all_order2(main_table, y)
                feat_engine.fit_transform_keys_order2(main_table, y)
                feat_engine.fit_transform_keys_order3(main_table, y)
                feat_engine.fit_transform_post_order1(main_table, y)

            del feat_engine
            gc.collect()

            main_table.data = main_table.data.loc[train_index]

            gc.collect()

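            # Hold out the most recent 20% of rows (time-ordered) for validation.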
            def split_table(table, y):
                X = table.data
                X_train, y_train, X_test, y_test = time_train_test_split(
                    X, y, shuffle=False, test_rate=0.2)
                table1 = copy.deepcopy(table)
                table1.data = X_train
                table2 = copy.deepcopy(table)
                table2.data = X_test
                return table1, y_train, table2, y_test

            table1, y_train, table2, y_test = split_table(main_table, y)

            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_merge_order1(table1, y_train)
            self.feat_engine = feat_engine

            feat_output = FeatOutput()
            self.feat_output = feat_output

            X_train, y_train, categories = feat_output.fit_transform_output(
                table1, y_train)

            gc.collect()
            self.feat_engine.transform_merge_order1(table2)
            X_test = self.feat_output.transform_output(table2)

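            # Train on the 80% split and report both single-model and ensemble AUC
            # on the holdout.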
            lgb = AutoLGB()

            lgb.param_compute(X_train, y_train, categories, config)

            lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)

            len_test = X_test.shape[0]

            lgb.ensemble_train(X_train, y_train, categories, config, len_test)
            gc.collect()

            pred, pred0 = lgb.ensemble_predict_test(X_test)

            auc = roc_auc_score(y_test, pred0)
            print('source AUC:', auc)

            auc = roc_auc_score(y_test, pred)
            Model.ensemble_auc.append(auc)
            print('ensemble AUC:', auc)

            importances = lgb.get_ensemble_importances()

            self.model = lgb

            del X_train, y_train, X_test, y_test
            gc.collect()

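        # Save the ensemble feature importances for this run.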
        paths = os.path.join(feature_importance_path, version)
        if not os.path.exists(paths):
            os.makedirs(paths)
        importances.to_csv(
            os.path.join(paths, '{}_importances.csv'.format(
                datetime.now().strftime('%Y%m%d%H%M%S'))),
            index=False)