def run_svm_using_test_train_split(data, data_with_features, split, task):
    """Train and evaluate an SVM classifier on a stratified train/test split.

    @param data: dataset object exposing split_train_test()
    @param data_with_features: pandas DataFrame of features indexed by sample id
    @param split: split argument passed to split_train_test()
    @param task: task identifier passed to split_train_test()
    """
    # split training set and test set
    (x_train, y_train), (x_test, y_test) = data.split_train_test(
        split, stratify=True, task=task)

    x_train_with_features = data_with_features.loc[x_train]
    x_test_with_features = data_with_features.loc[x_test]

    # train model
    clf = svm.SVC(gamma='scale', decision_function_shape='ovr',
                  probability=True)
    clf.fit(x_train_with_features, y_train)

    # train set: predicted labels and decision scores
    # (argmax over the ovr decision scores is an index into clf.classes_;
    #  assumes a multi-class task)
    y_train_pred = clf.classes_[
        clf.decision_function(x_train_with_features).argmax(1)]
    y_train_pred_proba = clf.decision_function(x_train_with_features)

    # test set: predicted labels and decision scores
    y_test_pred = clf.classes_[
        clf.decision_function(x_test_with_features).argmax(1)]
    y_test_pred_proba = clf.decision_function(x_test_with_features)

    # report performance of model
    print('=======training set=======')
    train_report_np, train_columns, train_index = report_performance(
        y_train, y_train_pred, y_train_pred_proba, np.unique(y_train),
        plot=True, verbose=True, return_value_for_cv=True)

    print('=======test set=======')
    test_report_np, test_columns, test_index = report_performance(
        y_test, y_test_pred, y_test_pred_proba, np.unique(y_test),
        plot=True, verbose=True, return_value_for_cv=True)

    # ========= save to file =========
    save2file(train_report_np, train_columns, train_index,
              test_report_np, test_columns, test_index)
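# A minimal usage sketch for the runner above. `dataset` is assumed to be
# whatever project object exposes split_train_test(...) with the signature used
# here, and `node_features_df` a pandas DataFrame indexed by the same sample ids
# the split returns; the split fraction and task string are placeholders, not
# values taken from this repository.
def _example_run_svm(dataset, node_features_df):
    run_svm_using_test_train_split(dataset, node_features_df,
                                   split=0.2, task='node_classification')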
def run_rf_using_test_train_split(data, data_with_features, split, task):
    """Train and evaluate a random forest classifier on a stratified train/test split.

    @param data: dataset object exposing split_train_test()
    @param data_with_features: pandas DataFrame of features indexed by sample id
    @param split: split argument passed to split_train_test()
    @param task: task identifier passed to split_train_test()
    """
    # split training set and test set
    (x_train, y_train), (x_test, y_test) = data.split_train_test(
        split, stratify=True, task=task)

    x_train_with_features = data_with_features.loc[x_train]
    x_test_with_features = data_with_features.loc[x_test]

    # train model
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(x_train_with_features, y_train)

    # train set: predicted labels and class probabilities
    y_train_pred = clf.predict(x_train_with_features)
    y_train_pred_proba = clf.predict_proba(x_train_with_features)

    # test set: predicted labels and class probabilities
    y_test_pred = clf.predict(x_test_with_features)
    y_test_pred_proba = clf.predict_proba(x_test_with_features)

    # report performance of model
    print('=======training set=======')
    train_report_np, train_columns, train_index = report_performance(
        y_train, y_train_pred, y_train_pred_proba, np.unique(y_train),
        plot=True, verbose=True, return_value_for_cv=True)

    print('=======test set=======')
    test_report_np, test_columns, test_index = report_performance(
        y_test, y_test_pred, y_test_pred_proba, np.unique(y_test),
        plot=True, verbose=True, return_value_for_cv=True)

    # ========= save to file =========
    save2file(train_report_np, train_columns, train_index,
              test_report_np, test_columns, test_index)
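# Why the label predictions above use clf.predict(...) rather than
# predict_proba(...).argmax(1): the argmax is an index into clf.classes_, which
# only coincides with the label itself when the labels happen to be 0..K-1.
# A self-contained sketch with toy data (not the project's dataset):
def _example_argmax_vs_predict():
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(0)
    x = rng.rand(30, 3)
    y = rng.choice([2, 5, 9], size=30)              # non-contiguous labels
    clf = RandomForestClassifier(random_state=0).fit(x, y)

    idx = clf.predict_proba(x).argmax(1)            # indices into clf.classes_
    assert (clf.classes_[idx] == clf.predict(x)).all()
    return clf.classes_[idx]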
def run_neural_network_using_train_test_split(data, x_with_features, task,
                                              splitted_edges_dir, split=None,
                                              split_by_node=None):
    """Train and evaluate a neural network on a link-prediction train/test split."""
    # TODO make sure that x_with_features has rows = nodes and columns = feature dimensions
    train_set_np, test_set_np = data.split_train_test(
        split, stratify=True, task=task,
        splitted_edges_dir=splitted_edges_dir,
        split_by_node=split_by_node)

    def get_train_test_set():
        # x_train_ind/x_test_ind are (src, dst) node pairs used as row indices
        # into x_with_features; the last column of the split arrays is the label.
        x_train_ind, y_train = train_set_np[:, :2], train_set_np[:, -1].astype(float)
        x_test_ind, y_test = test_set_np[:, :2], test_set_np[:, -1].astype(float)

        # build edge features by concatenating the features of both endpoints
        x_with_features.index = x_with_features.index.map(str)
        tmp = x_with_features.reindex(x_train_ind[:, 0]).dropna().to_numpy()
        tmp_1 = x_with_features.reindex(x_train_ind[:, 1]).dropna().to_numpy()
        x_train = np.concatenate([tmp, tmp_1], axis=1)

        tmp = x_with_features.reindex(x_test_ind[:, 0]).dropna().to_numpy()
        tmp_1 = x_with_features.reindex(x_test_ind[:, 1]).dropna().to_numpy()
        x_test = np.concatenate([tmp, tmp_1], axis=1)

        return (y_train, x_train, x_train_ind), (y_test, x_test, x_test_ind)

    (y_train, x_train, x_train_ind), (y_test, x_test, x_test_ind) = get_train_test_set()

    assert x_train.shape[0] == x_train_ind.shape[0], \
        'some training edges have endpoints with no features'

    # TODO pass the correct value in
    y_train_pred, y_train_pred_proba, y_test_pred, y_test_pred_proba = \
        run_neural_network_for_each_fold(x_train, x_test, y_train, y_test)

    print("================training set==================")
    report_final_train_performance_report_np, columns_of_performance_metric, indices_of_performance_metric = report_performance(
        y_train, y_train_pred, y_train_pred_proba, np.unique(y_train),
        verbose=True, plot=False, return_value_for_cv=True)

    print("================test set==================")
    report_final_test_performance_report_np, columns_of_performance_metric, indices_of_performance_metric = report_performance(
        y_test, y_test_pred, y_test_pred_proba, np.unique(y_test),
        verbose=True, plot=False, return_value_for_cv=True)

    save2file(report_final_train_performance_report_np,
              columns_of_performance_metric, indices_of_performance_metric,
              report_final_test_performance_report_np,
              columns_of_performance_metric, indices_of_performance_metric)
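# A self-contained sketch of the edge-feature construction used in
# get_train_test_set() above: each (src, dst) pair is mapped to the
# concatenation of the two endpoints' node-feature rows. Node ids, feature
# values and shapes are illustrative only.
def _example_edge_feature_construction():
    import numpy as np
    import pandas as pd

    node_features = pd.DataFrame([[0.1, 0.2],
                                  [0.3, 0.4],
                                  [0.5, 0.6]],
                                 index=['a', 'b', 'c'])   # rows = nodes
    edges = np.array([['a', 'b'],
                      ['b', 'c']])                        # rows = (src, dst)

    src = node_features.reindex(edges[:, 0]).to_numpy()
    dst = node_features.reindex(edges[:, 1]).to_numpy()
    edge_features = np.concatenate([src, dst], axis=1)    # (n_edges, 2 * n_features)
    return edge_features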
def run_lr_using_test_train_split_with_link_prediction(data, x_with_features,
                                                       task, splitted_edges_dir,
                                                       split, split_by_node):
    """Train and evaluate logistic regression on a link-prediction train/test split."""
    train_set_np, test_set_np = data.split_train_test(
        split, stratify=True, task=task,
        splitted_edges_dir=splitted_edges_dir,
        split_by_node=split_by_node)

    # x_train_ind/x_test_ind are (src, dst) node pairs used as row indices
    # into x_with_features; the last column of the split arrays is the label.
    x_train_ind, y_train = train_set_np[:, :2], train_set_np[:, -1].astype(float)
    x_test_ind, y_test = test_set_np[:, :2], test_set_np[:, -1].astype(float)

    # build edge features by concatenating the features of both endpoints
    x_with_features.index = x_with_features.index.map(str)
    tmp = x_with_features.reindex(x_train_ind[:, 0]).dropna().to_numpy()
    tmp_1 = x_with_features.reindex(x_train_ind[:, 1]).dropna().to_numpy()
    x_train = np.concatenate([tmp, tmp_1], axis=1)

    tmp = x_with_features.reindex(x_test_ind[:, 0]).dropna().to_numpy()
    tmp_1 = x_with_features.reindex(x_test_ind[:, 1]).dropna().to_numpy()
    x_test = np.concatenate([tmp, tmp_1], axis=1)

    assert x_train.shape[0] == x_train_ind.shape[0], \
        'some training edges have endpoints with no features'

    # train model
    clf = LogisticRegression(random_state=0)
    clf.fit(x_train, y_train)

    # train set: predicted labels and class probabilities
    y_train_pred = clf.predict(x_train)
    y_train_pred_proba = clf.predict_proba(x_train)

    # test set: predicted labels and class probabilities
    y_test_pred = clf.predict(x_test)
    y_test_pred_proba = clf.predict_proba(x_test)

    # report performance of model
    print('=======training set=======')
    train_report_np, train_columns, train_index = report_performance(
        y_train, y_train_pred, y_train_pred_proba, np.unique(y_train),
        plot=True, verbose=True, return_value_for_cv=True)

    print('=======test set=======')
    test_report_np, test_columns, test_index = report_performance(
        y_test, y_test_pred, y_test_pred_proba, np.unique(y_test),
        plot=True, verbose=True, return_value_for_cv=True)

    # ========= save to file =========
    save2file(train_report_np, train_columns, train_index,
              test_report_np, test_columns, test_index)
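# Both link-prediction runners above slice their split arrays the same way, so
# the assumed layout is one row per edge: [source_node, target_node, label].
# A tiny runnable illustration of that slicing (values are made up):
def _example_split_array_layout():
    import numpy as np

    train_set_np = np.array([['a', 'b', '1'],
                             ['b', 'c', '0'],
                             ['a', 'c', '1']])
    x_train_ind = train_set_np[:, :2]               # (src, dst) node pairs
    y_train = train_set_np[:, -1].astype(float)     # class labels
    return x_train_ind, y_train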