Example 1
0
def execute_model(network_function, criterion, device, print_details=False):
    """Train and evaluate a freshly-built network METRIC_COMPUTATION_ITER times.

    Parameters
    ----------
    network_function : callable
        Zero-argument factory returning a new, untrained model.
    criterion : callable
        Loss function taking (outputs, targets).
    device : torch.device
        Device to move the model and batches onto.
    print_details : bool
        If True, print the accumulated loss at the end of each epoch.

    Accuracy/AUC from each iteration are collected and handed to
    ``write_metrics`` at the end.
    """
    acc_list = []
    auc_list = []

    for _ in range(METRIC_COMPUTATION_ITER):
        # Fresh model and optimiser for every metric-computation iteration.
        network = network_function().to(device)
        optimiser = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)

        df = Dataset(PATH)

        # Sequential split: the first (1 - SET_RATIO) fraction of indices is
        # used for training, the tail for testing.  Sampling within each
        # partition is still random via SubsetRandomSampler.
        end = len(df)
        indices = list(range(end))
        set_split = end - round(end * SET_RATIO)
        train_indices = indices[:set_split]
        test_indices = indices[set_split:]

        training_data = data.DataLoader(
            df,
            batch_size=TRAIN_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(train_indices))
        test_data = data.DataLoader(
            df,
            batch_size=TEST_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(test_indices))

        training_data_batches = len(training_data)

        for epoch in range(EPOCH):
            running_loss = 0

            for i, batch in enumerate(training_data):
                inputs, labels = batch
                inputs, labels = inputs.to(device), labels.to(device)

                optimiser.zero_grad()

                outputs = network(inputs)

                # Cast targets to the output dtype (e.g. float targets for BCE).
                loss = criterion(outputs, labels.type_as(outputs))
                loss.backward()

                optimiser.step()

                running_loss += loss.item()

                # Report accumulated loss once per epoch, after the last batch.
                if print_details:
                    if i % training_data_batches == training_data_batches - 1:
                        print("Epoch : %2d, Loss : %.3f" %
                              (epoch + 1, running_loss))

        evaluate_model(network, training_data, 'training data', device)
        acc_tmp, auc_tmp = evaluate_model(network, test_data, 'test data',
                                          device)
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)

    write_metrics(acc_list, auc_list)
Example 2
0
def execute_tl_model(network_function, criterion, device, print_details=False):
    """Transfer learning: pretrain on NSCLC data, fine-tune on GBM data.

    Repeats the pretrain/fine-tune cycle METRIC_COMPUTATION_ITER times and
    reports the collected accuracy/AUC via ``write_metrics``.
    """
    print('Transfer Learning - pretraining using NSCLC data to predict GBM')

    accuracies, aucs = [], []

    for _ in range(METRIC_COMPUTATION_ITER):
        net = network_function().to(device)
        opt = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

        # Pretraining pass; its data loaders are not needed afterwards.
        get_data_and_train_model(NSCLC_PATH, net, criterion, opt, device,
                                 print_details)
        # Fine-tuning pass; keep the loaders for evaluation below.
        train_loader, test_loader = get_data_and_train_model(
            GBM_PATH, net, criterion, opt, device, print_details)

        evaluate_model(net, train_loader, 'training data', device)
        acc, auc = evaluate_model(net, test_loader, 'test data', device,
                                  print_details=True)
        accuracies.append(acc)
        aucs.append(auc)

    write_metrics(accuracies, aucs)
def execute_logistic(data, labels):
    """Fit logistic regression on repeated random splits; report metrics.

    Also accumulates the fitted coefficients/intercept across iterations and
    writes their means via ``write_model_params``.
    """
    print('Logistic Regression --------')
    accuracies = []
    aucs = []

    coeff_sum = None
    intercept_sum = None
    data = preprocessing.scale(data)

    for _ in range(METRIC_COMPUTATION_ITER):
        x_train, x_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.2)

        model = LogisticRegression(solver='liblinear')
        model.fit(x_train, y_train)

        # Running sums of model parameters, averaged after the loop.
        coeff_sum, intercept_sum = compute_param_sum(
            model.coef_[0], model.intercept_, coeff_sum, intercept_sum)

        evaluate_model(model, x_train, y_train, 'training data')
        acc, auc = evaluate_model(model, x_test, y_test, 'test data')
        accuracies.append(acc)
        aucs.append(auc)

    write_metrics(accuracies, aucs)

    coeff_mean = coeff_sum / METRIC_COMPUTATION_ITER
    intercept_mean = intercept_sum / METRIC_COMPUTATION_ITER
    write_model_params(coeff_mean, intercept_mean, 'logistic')
def main():
    """Parse CLI arguments and run the two-stage logistic-regression study.

    For each of ``epochs`` train/test splits the model is first fit on all
    features (to discover the significant ones), then refit on only the
    significant features; metrics for both runs are reported at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--file_path", "-fp", type=str, required=True,
        help='File path including the file name of the data file')
    parser.add_argument(
        "--penalty", "-p", type=str,
        help='penalty used in logistic regression regularization '
             '(only supports l1 and l2, default is l2)')
    # default=10 replaces the former manual "if epochs is None" fallback.
    parser.add_argument(
        "--epochs", "-e", type=int, default=10,
        help='Number of train test combinations created to report accuracy '
             'and auc (default = 10). For each such combination, 100 '
             'combinations of train-validation are created to obtain model '
             'param distribution')
    args = parser.parse_args()

    path = args.file_path
    penalty = args.penalty
    epochs = args.epochs

    print('Executing the model on', path.split('/')[-1], 'with', epochs,
          'epochs')

    data, labels = read_data(path)

    acc_list = []
    auc_list = []
    acc_list_filtered = []
    auc_list_filtered = []
    avg_num_significant_features = 0.0
    for i in range(epochs):
        # First pass over all features to find the significant ones.
        significant_weights, _, acc, auc = execute_logistic(
            data, labels, penalty, random_state=i)
        # Second pass restricted to the significant features.
        significant_weights, num_significant_features, acc_filt, auc_filt = \
            execute_logistic(data, labels, penalty, significant_weights,
                             random_state=i)
        acc_list.append(acc)
        auc_list.append(auc)
        acc_list_filtered.append(acc_filt)
        auc_list_filtered.append(auc_filt)
        avg_num_significant_features += num_significant_features
    avg_num_significant_features /= epochs

    print('Average number of significant features :',
          avg_num_significant_features)
    print('\nResults on test data with all features...')
    write_metrics(acc_list, auc_list, write_to_file=False, show_all=False)
    print('\nResults on test data with filtered features...')
    write_metrics(acc_list_filtered, auc_list_filtered, write_to_file=False,
                  show_all=False)
def execute_svm(data, labels):
    """Fit a linear SVM on repeated random splits and report accuracy/AUC."""
    print('\nSVM --------')
    accuracies, aucs = [], []
    data = preprocessing.scale(data)

    for _ in range(METRIC_COMPUTATION_ITER):
        x_train, x_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.2)

        model = svm.SVC(gamma='scale', kernel='linear')
        model.fit(x_train, y_train)

        evaluate_model(model, x_train, y_train, 'training data', svm=True)
        acc, auc = evaluate_model(model, x_test, y_test, 'test data',
                                  svm=True)
        accuracies.append(acc)
        aucs.append(auc)

    write_metrics(accuracies, aucs)
def execute_logistic(data, labels, penalty, significant_weights = None, random_state = 0):
	"""Fit an L1/L2 logistic regression with a held-out test set.

	A 20% test split is carved off first; the remaining data is resampled
	NUM_SPLITS times with ShuffleSplit to estimate the distribution of the
	model parameters.  Parameters whose |z| > 2 (z derived from the 95% CI
	width) are deemed significant.

	When ``significant_weights`` is given, the data is first restricted to
	those feature columns and the significance analysis is skipped.

	Returns (significant_weights, num_features_used, test_accuracy, test_auc).
	Plots and verbose prints are emitted only when random_state == 0.
	"""
	if not penalty:
		penalty = 'l2'
	if random_state == 0:
		print('\nLogistic Regression with', penalty, 'penalty ....')
	acc_list = []
	auc_list = []

	if significant_weights is not None:
		# Index 0 of the parameter vector is the intercept (see the
		# np.append(intercept_, coef_) below): drop it if present, then
		# shift the remaining indices so they address feature columns.
		if significant_weights[0] == 0:
			significant_weights = np.delete(significant_weights, 0)
		# NOTE(review): in-place subtraction mutates the caller's array.
		significant_weights -= 1
		data = data[:, significant_weights]
		print(data.shape)

	# One row per ShuffleSplit iteration: [intercept, coef_0, ..., coef_{p-1}].
	num_params = data.shape[1] + 1
	all_iter_params = np.zeros((NUM_SPLITS, num_params))

	#creating a separate test set for final evaluation
	training_data, test_data, training_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = random_state)
	data = training_data
	labels = training_labels
	if random_state == 0:
		print('training data size :', data.shape, 'training labels size :', labels.shape)
		print('test data size :', test_data.shape, 'test labels size :', test_labels.shape)

	# Repeated train/validation resampling of the non-test data.
	rs = ShuffleSplit(n_splits = NUM_SPLITS, test_size = 0.2, random_state = 0)
	split_count = 0
	for train_index, validation_index in rs.split(data):
		training_data = data[train_index, :]
		training_labels = labels[train_index]
		validation_data = data[validation_index, :]
		validation_labels = labels[validation_index]

		log_reg_model = LogisticRegression(solver = 'liblinear', penalty = penalty) 

		# Scaler lives inside the pipeline, so it is fit on the training fold only.
		pipe = Pipeline([('scaler', StandardScaler()), ('logreg', log_reg_model)])
		pipe.fit(training_data, training_labels)

		all_iter_params[split_count, :] = np.append(log_reg_model.intercept_, log_reg_model.coef_)

		evaluate_model(pipe, training_data, training_labels, 'training data')
		acc_tmp, auc_tmp = evaluate_model(pipe, validation_data, validation_labels, 'validation data')	
		acc_list.append(acc_tmp)
		auc_list.append(auc_tmp)

		split_count += 1

	if random_state == 0:	
		write_metrics(acc_list, auc_list, write_to_file = False, show_all = False)	

	#evaluating on the separated out test data set
	log_reg_model = LogisticRegression(solver = 'liblinear', penalty = penalty) 
	pipe = Pipeline([('scaler', StandardScaler()), ('logreg', log_reg_model)])
	pipe.fit(data, labels)
	acc, auc = evaluate_model(pipe, test_data, test_labels, 'test data')

	if significant_weights is not None:
		# Filtered run: significance analysis was done by the caller's first pass.
		return significant_weights, data.shape[1], acc, auc

	#calculating z value - method 1: mean / standard error across splits
	coeff_mean = np.mean(all_iter_params, axis = 0)
	coeff_se = stats.sem(all_iter_params)

	# NOTE(review): divides by zero if a parameter is constant across splits.
	coeff_z = coeff_mean / coeff_se

	# Empirical 95% confidence interval of each parameter.
	coeff_CI_l = np.percentile(all_iter_params, 2.5, axis = 0)
	coeff_CI_u = np.percentile(all_iter_params, 97.5, axis = 0)

	#calculating z value - method 2: standard error backed out of the CI width
	coeff_se_method2 = (coeff_CI_u - coeff_CI_l) / (2 * 1.96)
	coeff_z_method2 = coeff_mean / coeff_se_method2	

	#for now using method2 since, method2 results for Z look better, with shorter z range
	significant_weights = np.argwhere(np.absolute(coeff_z_method2) > 2).flatten()	

	x = np.array([i for i in range(num_params)])

	#show graphs only for 1st epoch
	if random_state == 0:
		plt.plot(x, coeff_mean)
		plt.xlabel('Intercept and Features')
		plt.ylabel('LogReg Model Mean Weights')
		plt.show()
	
		fig, ax = plt.subplots()
		ax.plot(x, coeff_mean)
		ax.fill_between(x, coeff_CI_l, coeff_CI_u, color='g')
		plt.xlabel('Intercept and Features')
		plt.ylabel('LogReg Model Mean Weights with CI')	
		plt.show()
	
		plt.scatter(x, coeff_z, s=2)
		plt.xlabel('Intercept and Features')
		plt.ylabel('LogReg Model Weights Z values - method 1')
		plt.show()
	
		plt.scatter(x, coeff_z_method2, s=2)
		plt.xlabel('Intercept and Features')
		plt.ylabel('LogReg Model Weights Z values')
		plt.show()
	
		print('Number of significant Weights : ', len(significant_weights))
	
		plt.scatter(significant_weights, coeff_z_method2[significant_weights], s = 2)
		plt.xlabel('Intercept and Features Selected')
		plt.ylabel('LogReg Model Weights Z values')
		plt.show()
	
	return significant_weights, data.shape[1], acc, auc