def train_predict(self, data, time_budget, n_class, schema):
    """Train an ensemble of GNN models within a time budget and predict test labels.

    Args:
        data: dict holding at least 'fea_table', 'edge_file', 'train_label'
            (DataFrames); 'test_label' is read when running offline.
        time_budget: total wall-clock seconds allowed for this call.
        n_class: number of target classes.
        schema: dataset schema, forwarded to preprocessing.

    Returns:
        Online: the flat array of predicted labels for the test nodes.
        Offline: (preds, valid_acc1, valid_acc2, preds1, preds2).
    """
    s1 = time.time()
    seed = SEED
    fix_seed(seed)
    LOGGER.info(f'time_budget:{time_budget}')
    LOGGER.info(f'n_class:{n_class}')
    LOGGER.info(f'node:{data["fea_table"].shape[0]}')
    LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

    # Pre-process data.
    process_data = ProcessData(data)
    table = process_data.pre_process(time_budget, n_class, schema)

    # Feature dimension reduction.
    feat = Feat()
    process_data.drop_unique_columns(table)
    drop_sum_columns = process_data.drop_excessive_columns(table)
    feat.fit_transform(table, drop_sum_columns)
    LOGGER.info(
        f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}'
    )

    # NOTE(review): large_features looks unused later in this method — confirm
    # whether any consumer reads it before removing.
    table.large_features = False
    if table.ori_columns.shape[0] > 500:
        table.large_features = True

    model_type_list = ['sage', 'gat', 'tagc', 'gcn']
    repeat = 3
    # One unique name per (repeat, model-type) slot, e.g. sage0, gat1, ...
    model_name_list = [
        f'{model_type_list[i]}{i+len(model_type_list)*j}'
        for j in range(repeat) for i in range(len(model_type_list))
    ]
    model_type_list = model_type_list * repeat

    LOGGER.info('use node embedding')
    categories = [
        'node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins'
    ]
    for model in set(model_type_list):
        # getattr/setattr instead of eval/exec: same dynamic attribute access
        # (table.<model>_columns -> table.<model>_data) without executing
        # generated code.
        model_columns = getattr(table, f'{model}_columns')
        LOGGER.info(f'{model} feature num:{model_columns.shape[0]}')
        setattr(
            table, f'{model}_data',
            process_data.process_gnn_data(table, model_columns, categories))

    allmodel = AllModel()
    table.lr_epoch = 16
    table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]
    train_valid_idx_list, valid_idx_list = split_train_and_valid(
        table, train_rate=0.8, seed=SEED, mode=split_mode)
    train_idx, test_idx = split_train_and_test(table)
    test_idx = test_idx.sort_values()

    run_model = []   # names of models actually trained
    run_type = []    # their corresponding model types
    run_time = {}    # per-type timing estimates, measured once and cached
    for i in range(len(model_type_list)):
        seed = SEED * (i + 1)
        fix_seed(seed)
        model_type = model_type_list[i]
        model_name = model_name_list[i]
        if model_type not in run_time:
            init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                table, model_type, model_name, train_idx, test_idx, seed=seed)
            # Estimated cost of the learning-rate search plus a full run at
            # each epoch budget (x2: train on split + retrain on full data).
            run_lr_time = len(table.lr_list) * (
                init_time + table.lr_epoch * one_epoch_time)
            run_time500 = init_time * 2 + one_epoch_time * (
                500 + early_stopping_rounds) * 2 + run_lr_time
            run_time300 = init_time * 2 + one_epoch_time * (
                300 + early_stopping_rounds) * 2 + run_lr_time
            run_time150 = init_time * 2 + one_epoch_time * (
                150 + early_stopping_rounds) * 2 + run_lr_time
            # NOTE(review): the cached values exclude run_lr_time while the
            # freshly-computed ones above include it, so the first instance of
            # a type budgets for the lr search and later instances don't —
            # presumably intentional (lr search runs once per type); confirm.
            run_time[model_type] = (run_time500 - run_lr_time,
                                    run_time300 - run_lr_time,
                                    run_time150 - run_lr_time,
                                    early_stopping_rounds, init_time,
                                    one_epoch_time, run_lr_time)
        else:
            (run_time500, run_time300, run_time150, early_stopping_rounds,
             init_time, one_epoch_time, run_lr_time) = run_time[model_type]

        s2 = time.time()
        LOGGER.info(
            f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s"
        )
        # Pick the largest epoch budget that still fits (5s safety margin).
        if s2 - s1 + run_time500 + 5 < time_budget:
            LOGGER.info('train 500 epoch')
            allmodel.V37_fit_transform(table,
                                       model_type,
                                       model_name,
                                       train_valid_idx_list,
                                       valid_idx_list,
                                       train_idx,
                                       test_idx,
                                       mode=split_mode,
                                       num_boost_round=500,
                                       seed=seed)
            run_model.append(model_name)
            run_type.append(model_type)
        elif s2 - s1 + run_time300 + 5 < time_budget:
            LOGGER.info('train 300 epoch')
            allmodel.V37_fit_transform(table,
                                       model_type,
                                       model_name,
                                       train_valid_idx_list,
                                       valid_idx_list,
                                       train_idx,
                                       test_idx,
                                       mode=split_mode,
                                       num_boost_round=300,
                                       seed=seed)
            run_model.append(model_name)
            run_type.append(model_type)
        elif s2 - s1 + run_time150 + 5 < time_budget:
            LOGGER.info('train 150 epoch')
            allmodel.V37_fit_transform(table,
                                       model_type,
                                       model_name,
                                       train_valid_idx_list,
                                       valid_idx_list,
                                       train_idx,
                                       test_idx,
                                       mode=split_mode,
                                       num_boost_round=150,
                                       seed=seed)
            run_model.append(model_name)
            run_type.append(model_type)
        elif len(allmodel.valid_models[0]) == 0:
            # Nothing trained yet: squeeze in whatever epoch count the
            # remaining budget allows so we return at least one model.
            this_epoch = int(
                ((time_budget - (s2 - s1 + 5) - run_lr_time) / 2 - init_time)
                / one_epoch_time - early_stopping_rounds)
            LOGGER.info(f'short time train {this_epoch} epoch')
            allmodel.V37_fit_transform(table,
                                       model_type,
                                       model_name,
                                       train_valid_idx_list,
                                       valid_idx_list,
                                       train_idx,
                                       test_idx,
                                       mode=split_mode,
                                       num_boost_round=this_epoch,
                                       seed=seed)
            run_model.append(model_name)
            run_type.append(model_type)
        elif time_budget - (s2 - s1) < 5:
            LOGGER.info('never train; break')
            break
        else:
            LOGGER.info('no train this model; continue')
            continue

    def _fill_especial(pred_values):
        # For the "especial" (directed) case: predictions only cover test
        # nodes on the directed mask; fill the remaining test nodes with the
        # majority label of the non-masked part, then return the full test
        # prediction column in df order.
        df = table.df[['label', 'is_test']]
        undirected = [not m for m in table.directed_mask.tolist()]
        df['preds'] = int(
            df.loc[undirected, 'label'].value_counts().index[0])
        df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
               'preds'] = pred_values
        return df.loc[df['is_test'] == 1, 'preds'].values

    if offline:
        if table.especial:
            df = table.df[['node_index', 'is_test']]
            df = df.merge(data['test_label'], how='left', on='node_index')
            test_label = df.loc[(df['is_test'] == 1)
                                & (table.directed_mask.tolist()),
                                'label'].astype('int').values
        else:
            test_label = data['test_label']['label'].values
    else:
        test_label = None

    # Two ensemble halves, averaged before the argmax.
    preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel,
                                   model_name_list, table, test_label,
                                   valid_idx_list)
    preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel,
                                   model_name_list, table, test_label,
                                   valid_idx_list)
    preds = (preds1 + preds2) / 2
    preds = preds.argmax(axis=1).flatten()

    if table.especial:
        LOGGER.info(f'preds\n{preds}')
        preds = _fill_especial(preds)

    LOGGER.info(
        f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}"
    )
    df_preds = pd.Series(preds, name='preds')
    LOGGER.info(
        f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

    if offline:
        preds1 = preds1.argmax(axis=1).flatten()
        preds2 = preds2.argmax(axis=1).flatten()
        if table.especial:
            LOGGER.info(f'preds1\n{preds1}')
            preds1 = _fill_especial(preds1)
            LOGGER.info(f'preds2\n{preds2}')
            preds2 = _fill_especial(preds2)
        # Offline diagnostics: per-degree accuracy on the test split.
        df_test = table.df[['degree', 'label', 'is_test']]
        df_test = df_test.loc[df_test['is_test'] == 1]
        df_test['preds'] = preds
        df_test['label'] = data['test_label']['label'].values
        df_test['acc'] = df_test['preds'] == df_test['label']
        pd.set_option('display.max_rows', 1000)
        print(df_test.groupby('degree')['acc'].mean())
        return preds, valid_acc1, valid_acc2, preds1, preds2
    else:
        return preds