Ejemplo n.º 1
0
    def train_predict(self, data, time_budget, n_class, schema):
        """Train an ensemble of GNN models within a time budget and predict.

        The method pre-processes ``data``, builds per-model feature sets, then
        trains up to ``repeat`` rounds of each model type ('sage', 'gat',
        'tagc', 'gcn').  Before each model the expected run time is compared
        with what is left of ``time_budget`` to pick 500, 300 or 150 epochs,
        a shorter run (only if no model has been trained yet), or to skip.
        Finally the two prediction sets returned by ``get_preds`` are averaged
        and turned into class indices with ``argmax``.

        Args:
            data: Mapping read here through the keys ``'fea_table'``,
                ``'edge_file'``, ``'train_label'`` and, when the module-level
                ``offline`` flag is set, ``'test_label'``.
            time_budget: Total time allowed, compared against ``time.time()``
                differences (i.e. seconds).
            n_class: Forwarded to ``ProcessData.pre_process``.
            schema: Forwarded to ``ProcessData.pre_process``.

        Returns:
            ``preds`` when ``offline`` is falsy; otherwise the tuple
            ``(preds, valid_acc1, valid_acc2, preds1, preds2)``.
        """
        s1 = time.time()
        seed = SEED
        fix_seed(seed)
        LOGGER.info(f'time_budget:{time_budget}')
        LOGGER.info(f'n_class:{n_class}')
        LOGGER.info(f'node:{data["fea_table"].shape[0]}')
        LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

        # Pre-process data
        process_data = ProcessData(data)
        table = process_data.pre_process(time_budget, n_class, schema)

        # Feature dimension reduction
        feat = Feat()

        process_data.drop_unique_columns(table)
        drop_sum_columns = process_data.drop_excessive_columns(table)

        feat.fit_transform(table, drop_sum_columns)
        LOGGER.info(
            f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}'
        )

        # NOTE (translated from the original comment): this flag does not
        # appear to be used here.
        table.large_features = table.ori_columns.shape[0] > 500

        base_model_types = ['sage', 'gat', 'tagc', 'gcn']
        repeat = 3
        model_type_list = base_model_types * repeat
        # Names are "<type><running index>": sage0, gat1, ..., gcn11.
        model_name_list = [
            f'{model_type}{idx}'
            for idx, model_type in enumerate(model_type_list)
        ]

        LOGGER.info('use node embedding')
        categories = [
            'node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins'
        ]

        # Iterate in declared order (not over a set) so the preparation order
        # is the same on every run; use getattr/setattr instead of eval/exec.
        for model in base_model_types:
            model_columns = getattr(table, f'{model}_columns')
            LOGGER.info(f'{model} feature num:{model_columns.shape[0]}')
            setattr(
                table, f'{model}_data',
                process_data.process_gnn_data(table, model_columns,
                                              categories))

        allmodel = AllModel()

        table.lr_epoch = 16

        table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]

        train_valid_idx_list, valid_idx_list = split_train_and_valid(
            table, train_rate=0.8, seed=SEED, mode=split_mode)
        train_idx, test_idx = split_train_and_test(table)

        test_idx = test_idx.sort_values()
        run_model = []
        run_type = []
        # model_type -> (init_time, one_epoch_time, early_stopping_rounds,
        #                run_lr_time), measured once per model type.
        run_time = {}
        epoch_options = (500, 300, 150)
        for i, (model_type, model_name) in enumerate(
                zip(model_type_list, model_name_list)):
            seed = SEED * (i + 1)
            fix_seed(seed)

            first_of_type = model_type not in run_time
            if first_of_type:
                init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                    table,
                    model_type,
                    model_name,
                    train_idx,
                    test_idx,
                    seed=seed)
                run_lr_time = len(table.lr_list) * (
                    init_time + table.lr_epoch * one_epoch_time)
                run_time[model_type] = (init_time, one_epoch_time,
                                        early_stopping_rounds, run_lr_time)
            init_time, one_epoch_time, early_stopping_rounds, run_lr_time = run_time[
                model_type]

            # As in the original logic, the learning-rate search time is only
            # added to the estimate the first time a model type is seen.
            lr_overhead = run_lr_time if first_of_type else 0
            estimates = {
                epochs: init_time * 2 + one_epoch_time *
                (epochs + early_stopping_rounds) * 2 + lr_overhead
                for epochs in epoch_options
            }

            s2 = time.time()
            elapsed = s2 - s1
            LOGGER.info(
                f"time_budget:{time_budget}s,used time:{elapsed:.2f}s,{model_name} model will use {estimates[500]:.2f}s|{estimates[300]:.2f}s|{estimates[150]:.2f}s"
            )

            # Largest epoch count whose estimate (plus a 5 s margin) still fits.
            num_boost_round = next(
                (epochs for epochs in epoch_options
                 if elapsed + estimates[epochs] + 5 < time_budget), None)

            if num_boost_round is not None:
                LOGGER.info(f'train {num_boost_round} epoch')
            elif len(allmodel.valid_models[0]) == 0:
                # Nothing has been trained yet: squeeze in as many epochs as
                # the remaining budget allows.
                this_epoch = int((
                    (time_budget - (elapsed + 5) - run_lr_time) / 2 -
                    init_time) / one_epoch_time - early_stopping_rounds)
                # The estimate can drop to zero or below when the budget is
                # almost spent; still train at least one epoch so that a
                # prediction exists.
                this_epoch = max(this_epoch, 1)
                LOGGER.info(f'short time train {this_epoch} epoch')
                num_boost_round = this_epoch
            elif time_budget - elapsed < 5:
                LOGGER.info('never train; break')
                break
            else:
                LOGGER.info('no train this model; continue')
                continue

            allmodel.V37_fit_transform(table,
                                       model_type,
                                       model_name,
                                       train_valid_idx_list,
                                       valid_idx_list,
                                       train_idx,
                                       test_idx,
                                       mode=split_mode,
                                       num_boost_round=num_boost_round,
                                       seed=seed)
            run_model.append(model_name)
            run_type.append(model_type)

        def _directed_mask(index):
            # Wrap the mask in a boolean Series: logical ops between a Series
            # and a plain list are deprecated in recent pandas versions.
            return pd.Series(table.directed_mask.tolist(),
                             index=index,
                             dtype=bool)

        def _fill_masked_out_rows(model_preds):
            # Expand predictions made for test rows selected by
            # ``directed_mask`` to all test rows; rows outside the mask get
            # the most frequent label found among the rows outside the mask.
            frame = table.df[['label', 'is_test']].copy()
            mask = _directed_mask(frame.index)
            is_test = frame['is_test'] == 1
            frame['preds'] = int(
                frame.loc[~mask, 'label'].value_counts().index[0])
            frame.loc[is_test & mask, 'preds'] = model_preds
            return frame.loc[is_test, 'preds'].values

        if offline:
            if table.especial:
                df = table.df[['node_index', 'is_test']]
                df = df.merge(data['test_label'], how='left', on='node_index')
                test_label = df.loc[(df['is_test'] == 1) &
                                    _directed_mask(df.index),
                                    'label'].astype('int').values
            else:
                test_label = data['test_label']['label'].values
        else:
            test_label = None

        preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds = (preds1 + preds2) / 2

        preds = preds.argmax(axis=1).flatten()

        if table.especial:
            LOGGER.info(f'preds\n{preds}')
            preds = _fill_masked_out_rows(preds)

        LOGGER.info(
            f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}"
        )
        df_preds = pd.Series(preds, name='preds')
        LOGGER.info(
            f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

        if not offline:
            return preds

        preds1 = preds1.argmax(axis=1).flatten()
        preds2 = preds2.argmax(axis=1).flatten()
        if table.especial:
            LOGGER.info(f'preds1\n{preds1}')
            preds1 = _fill_masked_out_rows(preds1)

            LOGGER.info(f'preds2\n{preds2}')
            preds2 = _fill_masked_out_rows(preds2)

        # Per-degree accuracy report on the test rows (offline only).
        df_test = table.df[['degree', 'label', 'is_test']]
        # Copy so the column assignments below do not write into a slice.
        df_test = df_test.loc[df_test['is_test'] == 1].copy()
        df_test['preds'] = preds
        df_test['label'] = data['test_label']['label'].values
        df_test['acc'] = df_test['preds'] == df_test['label']

        pd.set_option('display.max_rows', 1000)
        print(df_test.groupby('degree')['acc'].mean())

        return preds, valid_acc1, valid_acc2, preds1, preds2