def execute_model(network_function, criterion, device, print_details=False):
    acc_list = []
    auc_list = []
    for _ in range(METRIC_COMPUTATION_ITER):
        network = network_function().to(device)
        optimiser = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)

        # Split the dataset into contiguous train/test index ranges and wrap
        # each range in a DataLoader with random sampling.
        df = Dataset(PATH)
        end = len(df)
        indices = list(range(end))
        set_split = end - round(end * SET_RATIO)
        train_indices = indices[:set_split]
        test_indices = indices[set_split:]
        training_data = data.DataLoader(
            df, batch_size=TRAIN_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(train_indices))
        test_data = data.DataLoader(
            df, batch_size=TEST_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(test_indices))
        training_data_batches = len(training_data)
        test_data_batches = len(test_data)

        for epoch in range(EPOCH):
            running_loss = 0
            for i, batch in enumerate(training_data):
                inputs, labels = batch
                inputs, labels = inputs.to(device), labels.to(device)
                optimiser.zero_grad()
                outputs = network(inputs)
                loss = criterion(outputs, labels.type_as(outputs))
                loss.backward()
                optimiser.step()
                running_loss += loss.item()
                # Report the accumulated loss once per epoch, after the last batch.
                if print_details and i % training_data_batches == training_data_batches - 1:
                    print("Epoch : %2d, Loss : %.3f" % (epoch + 1, running_loss))

        evaluate_model(network, training_data, 'training data', device)
        acc_tmp, auc_tmp = evaluate_model(network, test_data, 'test data', device)
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)
    write_metrics(acc_list, auc_list)
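# NOTE: evaluate_model and write_metrics are called above but defined elsewhere
# in the repository. Below is a minimal, hypothetical sketch of the torch-side
# evaluate_model, assuming a single-output binary classifier whose logits are
# squashed with a sigmoid and scored with sklearn.metrics.roc_auc_score; the
# actual helper may differ.
def evaluate_model_sketch(network, loader, name, device, print_details=False):
    import numpy as np
    from sklearn.metrics import roc_auc_score
    network.eval()
    scores, targets = [], []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = torch.sigmoid(network(inputs.to(device))).cpu().view(-1)
            scores.extend(outputs.tolist())
            targets.extend(labels.view(-1).tolist())
    scores, targets = np.array(scores), np.array(targets)
    acc = float(((scores > 0.5) == (targets > 0.5)).mean())
    auc = roc_auc_score(targets, scores)
    if print_details:
        print('Accuracy on %s : %.3f, AUC : %.3f' % (name, acc, auc))
    return acc, auc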
def execute_tl_model(network_function, criterion, device, print_details=False):
    print('Transfer Learning - pretraining using NSCLC data to predict GBM')
    acc_list = []
    auc_list = []
    for _ in range(METRIC_COMPUTATION_ITER):
        network = network_function().to(device)
        optimiser = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)
        _, _ = get_data_and_train_model(NSCLC_PATH, network, criterion, optimiser,
                                        device, print_details)
        training_data, test_data = get_data_and_train_model(
            GBM_PATH, network, criterion, optimiser, device, print_details)
        evaluate_model(network, training_data, 'training data', device)
        acc_tmp, auc_tmp = evaluate_model(network, test_data, 'test data', device,
                                          print_details=True)
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)
    write_metrics(acc_list, auc_list)
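# NOTE: get_data_and_train_model is defined elsewhere in the repository. A
# minimal sketch follows, assuming it factors out the data-loading and
# training loop of execute_model and returns both DataLoaders so the caller
# can evaluate on them (hypothetical implementation; the real helper may
# differ):
def get_data_and_train_model_sketch(path, network, criterion, optimiser, device,
                                    print_details=False):
    df = Dataset(path)
    end = len(df)
    set_split = end - round(end * SET_RATIO)
    training_data = data.DataLoader(
        df, batch_size=TRAIN_BATCH_SIZE,
        sampler=data.SubsetRandomSampler(list(range(set_split))))
    test_data = data.DataLoader(
        df, batch_size=TEST_BATCH_SIZE,
        sampler=data.SubsetRandomSampler(list(range(set_split, end))))
    for epoch in range(EPOCH):
        running_loss = 0
        for inputs, labels in training_data:
            inputs, labels = inputs.to(device), labels.to(device)
            optimiser.zero_grad()
            outputs = network(inputs)
            loss = criterion(outputs, labels.type_as(outputs))
            loss.backward()
            optimiser.step()
            running_loss += loss.item()
        if print_details:
            print("Epoch : %2d, Loss : %.3f" % (epoch + 1, running_loss))
    return training_data, test_data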
def execute_logistic(data, labels):
    print('Logistic Regression --------')
    acc_list = []
    auc_list = []
    coeff_sum = None
    intercept_sum = None
    data = preprocessing.scale(data)
    for _ in range(METRIC_COMPUTATION_ITER):
        training_data, test_data, training_labels, test_labels = train_test_split(
            data, labels, test_size=0.2)
        log_reg_model = LogisticRegression(solver='liblinear')
        log_reg_model.fit(training_data, training_labels)
        # Accumulate parameters across iterations so their means can be reported.
        coeff_sum, intercept_sum = compute_param_sum(
            log_reg_model.coef_[0], log_reg_model.intercept_, coeff_sum, intercept_sum)
        evaluate_model(log_reg_model, training_data, training_labels, 'training data')
        acc_tmp, auc_tmp = evaluate_model(log_reg_model, test_data, test_labels,
                                          'test data')
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)
    write_metrics(acc_list, auc_list)
    coeff_mean = coeff_sum / METRIC_COMPUTATION_ITER
    intercept_mean = intercept_sum / METRIC_COMPUTATION_ITER
    write_model_params(coeff_mean, intercept_mean, 'logistic')
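# NOTE: compute_param_sum and write_model_params are defined elsewhere in the
# repository. A plausible sketch of compute_param_sum, assuming it simply
# accumulates coefficients and intercepts across iterations (hypothetical
# implementation):
def compute_param_sum_sketch(coef, intercept, coeff_sum, intercept_sum):
    if coeff_sum is None:
        # First iteration: start the running sums with copies of the parameters.
        return np.array(coef, copy=True), np.array(intercept, copy=True)
    return coeff_sum + coef, intercept_sum + intercept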
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", "-fp", type=str, required=True,
                        help='File path including the file name of the data file')
    parser.add_argument("--penalty", "-p", type=str,
                        help='Penalty used in logistic regression regularization '
                             '(only supports l1 and l2, default is l2)')
    parser.add_argument("--epochs", "-e", type=int, default=10,
                        help='Number of train-test combinations created to report '
                             'accuracy and AUC (default = 10). For each such '
                             'combination, 100 train-validation combinations are '
                             'created to obtain the model parameter distribution')
    args = parser.parse_args()
    path = args.file_path
    penalty = args.penalty
    epochs = args.epochs
    print('Executing the model on', path.split('/')[-1], 'with', epochs, 'epochs')

    data, labels = read_data(path)
    acc_list = []
    auc_list = []
    acc_list_filtered = []
    auc_list_filtered = []
    avg_num_significant_features = 0.0
    for i in range(epochs):
        # First pass over all features to find the significant weights, then a
        # second pass restricted to those features.
        significant_weights, _, acc, auc = execute_logistic(
            data, labels, penalty, random_state=i)
        significant_weights, num_significant_features, acc_filt, auc_filt = \
            execute_logistic(data, labels, penalty, significant_weights,
                             random_state=i)
        acc_list.append(acc)
        auc_list.append(auc)
        acc_list_filtered.append(acc_filt)
        auc_list_filtered.append(auc_filt)
        avg_num_significant_features += num_significant_features
    avg_num_significant_features /= epochs
    print('Average number of significant features :', avg_num_significant_features)
    print('\nResults on test data with all features...')
    write_metrics(acc_list, auc_list, write_to_file=False, show_all=False)
    print('\nResults on test data with filtered features...')
    write_metrics(acc_list_filtered, auc_list_filtered, write_to_file=False,
                  show_all=False)
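# Example invocation, assuming this script is saved as run_logistic.py and the
# data file lives at data/expression.csv (both names are hypothetical):
#
#     python run_logistic.py --file_path data/expression.csv --penalty l1 --epochs 10
#
if __name__ == '__main__':
    main()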
def execute_svm(data, labels):
    print('\nSVM --------')
    acc_list = []
    auc_list = []
    data = preprocessing.scale(data)
    for _ in range(METRIC_COMPUTATION_ITER):
        training_data, test_data, training_labels, test_labels = train_test_split(
            data, labels, test_size=0.2)
        svm_model = svm.SVC(gamma='scale', kernel='linear')
        svm_model.fit(training_data, training_labels)
        evaluate_model(svm_model, training_data, training_labels, 'training data',
                       svm=True)
        acc_tmp, auc_tmp = evaluate_model(svm_model, test_data, test_labels,
                                          'test data', svm=True)
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)
    write_metrics(acc_list, auc_list)
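# NOTE: the sklearn-side evaluate_model shared by execute_logistic and
# execute_svm is defined elsewhere in the repository. A minimal sketch,
# assuming the svm flag switches AUC scoring from predict_proba to
# decision_function, since SVC without probability=True exposes no
# predict_proba (hypothetical implementation):
def evaluate_model_sk_sketch(model, inputs, labels, name, svm=False):
    from sklearn.metrics import accuracy_score, roc_auc_score
    acc = accuracy_score(labels, model.predict(inputs))
    scores = model.decision_function(inputs) if svm else model.predict_proba(inputs)[:, 1]
    auc = roc_auc_score(labels, scores)
    print('Accuracy on %s : %.3f, AUC : %.3f' % (name, acc, auc))
    return acc, auc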
def execute_logistic(data, labels, penalty, significant_weights=None, random_state=0):
    if not penalty:
        penalty = 'l2'
    if random_state == 0:
        print('\nLogistic Regression with', penalty, 'penalty ....')
    acc_list = []
    auc_list = []
    if significant_weights is not None:
        # The indices include the intercept at position 0; drop it and shift
        # the remaining indices so they address feature columns directly.
        if significant_weights[0] == 0:
            significant_weights = np.delete(significant_weights, 0)
        significant_weights -= 1
        data = data[:, significant_weights]
        print(data.shape)
    num_params = data.shape[1] + 1
    all_iter_params = np.zeros((NUM_SPLITS, num_params))

    # Create a separate test set for the final evaluation.
    training_data, test_data, training_labels, test_labels = train_test_split(
        data, labels, test_size=0.2, random_state=random_state)
    data = training_data
    labels = training_labels
    if random_state == 0:
        print('training data size :', data.shape,
              'training labels size :', labels.shape)
        print('test data size :', test_data.shape,
              'test labels size :', test_labels.shape)

    rs = ShuffleSplit(n_splits=NUM_SPLITS, test_size=0.2, random_state=0)
    split_count = 0
    for train_index, validation_index in rs.split(data):
        training_data = data[train_index, :]
        training_labels = labels[train_index]
        validation_data = data[validation_index, :]
        validation_labels = labels[validation_index]
        log_reg_model = LogisticRegression(solver='liblinear', penalty=penalty)
        pipe = Pipeline([('scaler', StandardScaler()), ('logreg', log_reg_model)])
        pipe.fit(training_data, training_labels)
        all_iter_params[split_count, :] = np.append(log_reg_model.intercept_,
                                                    log_reg_model.coef_)
        evaluate_model(pipe, training_data, training_labels, 'training data')
        acc_tmp, auc_tmp = evaluate_model(pipe, validation_data, validation_labels,
                                          'validation data')
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)
        split_count += 1
    if random_state == 0:
        write_metrics(acc_list, auc_list, write_to_file=False, show_all=False)

    # Evaluate on the held-out test set.
    log_reg_model = LogisticRegression(solver='liblinear', penalty=penalty)
    pipe = Pipeline([('scaler', StandardScaler()), ('logreg', log_reg_model)])
    pipe.fit(data, labels)
    acc, auc = evaluate_model(pipe, test_data, test_labels, 'test data')
    if significant_weights is not None:
        return significant_weights, data.shape[1], acc, auc

    # Calculating the Z value - method 1: mean over splits divided by the
    # standard error of the mean.
    coeff_mean = np.mean(all_iter_params, axis=0)
    coeff_se = stats.sem(all_iter_params)
    coeff_z = coeff_mean / coeff_se
    coeff_CI_l = np.percentile(all_iter_params, 2.5, axis=0)
    coeff_CI_u = np.percentile(all_iter_params, 97.5, axis=0)

    # Calculating the Z value - method 2: standard error derived from the
    # width of the 95% percentile interval.
    coeff_se_method2 = (coeff_CI_u - coeff_CI_l) / (2 * 1.96)
    coeff_z_method2 = coeff_mean / coeff_se_method2

    # Using method 2 for now, since its Z values look better behaved, with a
    # shorter Z range.
    significant_weights = np.argwhere(np.absolute(coeff_z_method2) > 2).flatten()
    x = np.arange(num_params)

    # Show graphs only for the first epoch.
    if random_state == 0:
        plt.plot(x, coeff_mean)
        plt.xlabel('Intercept and Features')
        plt.ylabel('LogReg Model Mean Weights')
        plt.show()

        fig, ax = plt.subplots()
        ax.plot(x, coeff_mean)
        ax.fill_between(x, coeff_CI_l, coeff_CI_u, color='g')
        plt.xlabel('Intercept and Features')
        plt.ylabel('LogReg Model Mean Weights with CI')
        plt.show()

        plt.scatter(x, coeff_z, s=2)
        plt.xlabel('Intercept and Features')
        plt.ylabel('LogReg Model Weights Z values - method 1')
        plt.show()

        plt.scatter(x, coeff_z_method2, s=2)
        plt.xlabel('Intercept and Features')
        plt.ylabel('LogReg Model Weights Z values')
        plt.show()

        print('Number of significant weights : ', len(significant_weights))
        plt.scatter(significant_weights, coeff_z_method2[significant_weights], s=2)
        plt.xlabel('Intercept and Features Selected')
        plt.ylabel('LogReg Model Weights Z values')
        plt.show()

    return significant_weights, data.shape[1], acc, auc
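# Worked illustration of the method-2 Z value above (illustrative numbers
# only): a weight with mean 0.50 over splits and a 95% percentile interval of
# [0.10, 0.90] gets SE = (0.90 - 0.10) / (2 * 1.96) ≈ 0.204, hence
# Z = 0.50 / 0.204 ≈ 2.45, which passes the |Z| > 2 significance cut.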