Example #1
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, target_id, feature_selection, oversampling, survival, undersampling, aggregation):
	'''execute learning task using the specified algorithm'''

	# feature selection: k currently depends only on aggregation (same value for survival and non-survival runs)
	if aggregation:
		k = 150
	else:
		k = 220

	new_X, best_features, headers = pearson_fs(X, y, k, headers, feature_selection, survival)

	if aggregation:
		new_X = new_X[:,0:-1]
		headers = headers[0:-1]

	# execute algorithm
	if alg == 'DT':
		results, model = ML.CART(new_X, y, best_features, out_dir+"{}.dot".format(fname), headers, oversampling, undersampling, aggregation)
	elif alg == 'RF':
		results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, aggregation, n_estimators=200)
	elif alg == 'RFsmall':
		results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, aggregation, n_estimators=100)
	elif alg == 'SVM':
		results, model = ML.SVM(new_X, y, best_features, oversampling, undersampling, aggregation)
	elif alg == 'LR':
		results, features, model = ML.LR(new_X, y, best_features, oversampling, undersampling, aggregation)
	elif alg == 'XGBoost':
		results, features, model = ML.XGBoost(new_X, y, best_features, oversampling, undersampling, aggregation)
	elif alg == 'COX':
		results, features, model = ML.COX(new_X, y, best_features, oversampling, undersampling, aggregation)
	elif alg == 'survSVM':
		results, features, model = ML.survSVM(new_X, y, best_features, oversampling, undersampling, aggregation)
	elif alg == 'GBS':
		results, features, model = ML.GradientBoostingSurvival(new_X, y, best_features, oversampling, undersampling, aggregation)
	else:
		print 'Unknown algorithm: {}'.format(alg)
		return

	if not results:
		return

	# set2model_instance[fname] = (model, best_features)

	# export results
	# results_list.append([fname] + results[0:3])

	if not survival:
		in_out.save_results(out_dir+fname+'.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y),len(y)])
	# else:
		# in_out.save_results(out_dir+fname+'.csv', ["CI"], results, [sum(y),len(y)])

	if 'features' in locals():
		features = features.flatten()
		in_out.save_features(out_dir+"features_" + fname + '.csv', zip(headers[1:-1], features))
	
	return model, best_features, [fname] + results[0:3]
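
A minimal sketch of how this function might be driven, assuming `in_out.import_data` returns `(X, y, headers)` as in the later examples; the file path `f` and the flag values below are placeholders, not taken from the source:

algorithms = ['RF', 'LR', 'XGBoost']  # hypothetical list of algorithms to compare
X, y, headers = in_out.import_data(f, record_id, target_id, True)
results_list = []
for alg in algorithms:
	out = execute_with_algorithm(alg, X, y, 'run_' + alg, list(headers), out_dir, record_id, target_id,
		True, False, False, False, False)  # feature_selection, oversampling, survival, undersampling, aggregation
	if out is not None:
		model, best_features, row = out
		results_list.append(row)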
Example #2
def predict_separate(X, y, fname, out_dir, record_id, target_id, feature_selection, model, best_features):
	'''apply a previously fitted model to new (held-out) data and report ROC results'''
	print '  ...testing on new data'

	# select the feature selected attribute only
	if best_features == 'all':
		new_X = X
	else:
		new_X = X[:,best_features]

	# execute algorithm
	y_pred = model.predict_proba(new_X)
	fpr, tpr, _ = roc_curve(y, y_pred[:, 1])
	mean_fpr = np.linspace(0, 1, 100)
	mean_tpr = interp(mean_fpr, fpr, tpr)
	mean_auc = auc(fpr, tpr)
	results = [mean_fpr, mean_tpr, mean_auc, np.zeros((2,2))]
	in_out.save_results(out_dir+fname+'.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y),len(y)])

	results = [fname] + results[0:3]
	return results
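
The snippet above assumes `roc_curve`/`auc` from sklearn and `interp` (the scipy alias of `numpy.interp`); a minimal, self-contained sketch of the same interpolation step, with those assumed imports spelled out:

import numpy as np
from numpy import interp  # scipy.interp, used above, is an alias of numpy.interp
from sklearn.metrics import roc_curve, auc

def interpolated_roc(y_true, y_score):
	'''re-sample an ROC curve onto a fixed FPR grid so curves can be compared/averaged (sketch)'''
	fpr, tpr, _ = roc_curve(y_true, y_score)
	mean_fpr = np.linspace(0, 1, 100)
	mean_tpr = interp(mean_fpr, fpr, tpr)
	return mean_fpr, mean_tpr, auc(fpr, tpr)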
Example #4
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, target_id, feature_selection):
	'''execute learning task using the specified algorithm'''

	# feature selection
	k = 30
	#k = 100000
	if feature_selection:
		print '  ...performing feature selection'
		if X.shape[1] < k:
			k = X.shape[1]

		pearsons = []
		pearsons_print = []
		for i in range(X.shape[1]):
			if sum(np.asarray(X[:,i])) != 0:
				p = pearsonr(np.squeeze(np.asarray(X[:,i])), y)
				pearsons.append(abs(p[0]))
				pearsons_print.append(p[0])
			else:
				pearsons.append(0)
				pearsons_print.append(0)

		# best_features = np.array(pearsons).argsort()[-k:][::-1]

		sorted_features = np.array(pearsons).argsort()[:][::-1]

		best_features = []
		remove_list = []
		i = 0
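		# greedily keep the highest-ranked features; whenever a feature is kept, flag all later
		# features whose absolute Pearson correlation with it is >= 0.7 so they are skipped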
		while len(best_features) < k and i < len(sorted_features):
			if not i in remove_list:
				best_features.append(sorted_features[i])
				for j in range(i, X.shape[1]):
					p = pearsonr(np.asarray(X[:,sorted_features[i]]).tolist(), np.asarray(X[:,sorted_features[j]]).tolist())
					if abs(p[0]) >= 0.7:
						remove_list.append(j)
			i += 1


		old_headers = list(headers)
		headers = [headers[i] for i in best_features]
		f = open(out_dir+"correlations_" + fname + '.csv', 'w')
		for header in headers:
			f.write(str(header) + ' & ' + str(float("{0:.2f}".format(pearsons_print[old_headers.index(header)]))) + '\n')
		f.close()
		new_X = X[:,best_features]

	else:
		new_X = X
		best_features = 'all'

	print alg

	# execute algorithm
	if alg == 'DT':
		results, model = ML.CART(new_X, y, best_features, out_dir+"{}.dot".format(fname), headers)
	elif alg == 'RF':
		results, features, model = ML.RF(new_X, y, best_features, n_estimators=100)
	elif alg == 'RFsmall':
		results, features, model = ML.RF(new_X, y, best_features, n_estimators=10)
	elif alg == 'SVM':
		results, model = ML.SVM(new_X, y, best_features)
	elif alg == 'LR':
		results, features, model = ML.LR(new_X, y, best_features)
	else:
		print 'Unknown algorithm: {}'.format(alg)
		return

	if not results:
		return

	# set2model_instance[fname] = (model, best_features)

	# export results
	# results_list.append([fname] + results[0:3])

	in_out.save_results(out_dir+fname+'.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y),len(y)])
	if 'features' in locals():
		features = features.flatten()
		in_out.save_features(out_dir+"features_" + fname + '.csv', zip(headers[1:-1], features))

	return model, best_features, [fname] + results[0:3]
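
Example #1 calls a `pearson_fs` helper that is not shown in this listing; the inline code above suggests what it does. A minimal sketch reconstructed from this example (the signature and the handling of the `survival` flag are assumptions):

from scipy.stats import pearsonr
import numpy as np

def pearson_fs(X, y, k, headers, feature_selection, survival):
	'''rank features by absolute Pearson correlation with the target and keep the top k (sketch)'''
	if not feature_selection or X.shape[1] <= k:
		return X, 'all', headers
	pearsons = []
	for i in range(X.shape[1]):
		col = np.squeeze(np.asarray(X[:, i]))
		pearsons.append(abs(pearsonr(col, y)[0]) if col.any() else 0)
	best_features = np.array(pearsons).argsort()[-k:][::-1]
	return X[:, best_features], best_features, [headers[i] for i in best_features]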
Example #5
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, target_id, feature_selection):
	'''execute learning task using the specified algorithm'''

	# feature selection
	k = 50
	if feature_selection and X.shape[1] >= k:
		print '  ...performing feature selection'

		# subset for feature selection
		# print '   ...subsetting pos'
		# print y.shape, y[0:5], type(y[0]), type(y[1])
		# pos_target_indices = y[y==1]
		# # print pos_target_indices, pos_target_indices.shape
		# sub_pos_X = X[pos_target_indices,:]
		# sub_pos_y = np.squeeze(y[pos_target_indices])

		# # print '    ..neg'
		# neg_target_indices = y[y==0]
		# # print y[pos_target_indices].shape
		# # print y[neg_target_indices].shape

		# # print '     ..choices'
		# neg_choices = np.random.choice(y[neg_target_indices], size=sum(pos_target_indices)*50, replace=False)
		# # print neg_choices
		# sub_neg_X = X[neg_choices,:]
		# sub_neg_y = np.squeeze(y[neg_choices])

		# # print '    ... adding, ', sub_pos_X.shape[1]
		# cols = sub_pos_X.shape[1]
		# rows_pos = sub_pos_X.shape[0]
		# rows = sub_pos_X.shape[0] + sub_neg_X.shape[0]
		# sub_X = np.empty( (rows, cols) )
		# sub_X[0:rows_pos, :] = sub_pos_X
		# sub_X[rows_pos:, :] = sub_neg_X
		# sub_y = np.append(sub_pos_y, sub_neg_y)
		# # print sub_pos_X.shape
		# # print sub_neg_X.shape
		# # print sub_X.shape
		# # print sub_y.shape

		# # FS
		# transformer = SelectKBest(f_classif, k)
		# # transformer = VarianceThreshold(0.005)
		# new_sub_X = transformer.fit_transform(sub_X, sub_y)
		# best_features = np.array(transformer.scores_).argsort()[-k:][::-1]
		
		
		pearsons = []
		for i in range(X.shape[1]):
			p = pearsonr(np.squeeze(np.asarray(X[:,i])), y)
			pearsons.append(abs(p[0]))
		best_features = np.array(pearsons).argsort()[-k:][::-1]

		# print best_features
		headers = [headers[i] for i in best_features]
		new_X = X[:,best_features]
		# print new_X.shape
		# print y.shape

	else:
		new_X = X
		best_features = 'all'

	# execute algorithm
	if alg == 'DT':
		results, model = ML.CART(new_X, y, best_features, out_dir+"{}.dot".format(fname), headers)
	elif alg == 'RF':
		results, features, model = ML.RF(new_X, y, best_features, n_estimators=100)
	elif alg == 'RFsmall':
		results, features, model = ML.RF(new_X, y, best_features, n_estimators=10)
	elif alg == 'SVM':
		results, model = ML.SVM(new_X, y, best_features)
	elif alg == 'LR':
		results, features, model = ML.LR(new_X, y, best_features)
	else:
		print 'Unknown algorithm: {}'.format(alg)
		return

	if not results:
		return

	# set2model_instance[fname] = (model, best_features)

	# export results
	# results_list.append([fname] + results[0:3])

	in_out.save_results(out_dir+fname+'.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y),len(y)])
	if 'features' in locals():
		features = features.flatten()
		in_out.save_features(out_dir+"features_" + fname + '.csv', zip(headers[1:-1], features))
	
	return model, best_features, [fname] + results[0:3]
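
The commented-out block above hints at a univariate alternative using `SelectKBest`; a minimal working sketch of that idea (not the author's final method, and `select_k_best` is a hypothetical helper name):

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

def select_k_best(X, y, headers, k=50):
	'''univariate ANOVA F-test selection, analogous to the Pearson ranking above (sketch)'''
	k = min(k, X.shape[1])
	transformer = SelectKBest(f_classif, k=k).fit(X, y)
	best_features = transformer.get_support(indices=True)  # column indices of the k best features
	return X[:, best_features], best_features, [headers[i] for i in best_features]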
Example #6
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the k-NN learning task on the data in in_dir.
       The results are written to out_dir and subdirectories,
       and the record_ and target_ids are used to differentiate attributes and non-attributes'''
    print '### executing learning algorithms on... ###'

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print 'No appropriate csv files found. Select an input directory with appropriate files'
        return

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm

    # run algorithm alg for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print ' ...{}'.format(fname)

        # get data, split into features/target; exit if the import failed
        X, y, headers = in_out.import_data(
            f, record_id, target_id,
            True)  # assumption: first column is patientnumber
        if type(X) == bool: return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []

        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!

        # Select the right day and normalize the columns
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, day_index] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)

                if not row[0] in IDs:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                else:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]
        X = new_X

        # Remove columns with only a single value or all nans

        non_singular_rows = [
            i for i in range(0, X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]
        #print str(len(non_singular_rows)) + ' ' + str(X.shape[1])
        #print non_singular_rows

        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)

        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print 'difference of zero encountered in ' + str(i)
                print 'Max values: ' + str(max_values[i])
                print 'Min values: ' + str(min_values[i])
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order of magnitude
        scaled_X = (X - min_values) / (max_values - min_values)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        new_IDrows = {}
        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print '  ...instances: {}, attributes: {}'.format(
            X.shape[0], X.shape[1])

        # Now we are going to build the similarity matrix. We also store for how many
        # attributes we were actually able to make a comparison.

        similarity_matrix = np.zeros((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))

        for i in range(0, len(IDs)):
            for j in range(i + 1, len(IDs)):
                for attr in range(0, len(new_headers)):
                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    #print i_data
                    #print j_data
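                    # NOTE: dtw_attr (list of time-series attributes) and window (DTW window size)
                    # are assumed to be defined at module level; they are not defined in this snippet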
                    if new_headers[attr] in dtw_attr:
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        # print dtw_distance
                        if not dtw_distance == -1:
                            similarity_matrix[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                    else:
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            similarity_matrix[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                similarity_matrix[j, i] = similarity_matrix[i, j]
                matching_number_matrix[j, i] = matching_number_matrix[i, j]

        similarity_matrix = similarity_matrix / matching_number_matrix  # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)

        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print '## Learning Finished ##'
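
The time-series comparison above relies on a project-local `dtw.lb_keogh`; the call site suggests it returns -1 when no comparison is possible and a non-negative distance otherwise. A minimal sketch of an LB_Keogh-style lower bound under those assumptions:

import math

def lb_keogh(s1, s2, window):
    '''LB_Keogh lower bound between two sequences (sketch of the assumed interface)'''
    if not s1 or not s2:
        return -1  # mirror the "-1 means no comparison possible" convention used above
    total = 0.0
    for i, value in enumerate(s1):
        envelope = s2[max(0, i - window):i + window + 1]
        if not envelope:
            continue  # s1 extends beyond the reachable part of s2
        lower, upper = min(envelope), max(envelope)
        if value > upper:
            total += (value - upper) ** 2
        elif value < lower:
            total += (value - lower) ** 2
    return math.sqrt(total)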
Example #7
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the k-NN learning task on the data in in_dir.
       The results are written to out_dir and subdirectories,
       and the record_ and target_ids are used to differentiate attributes and non-attributes'''
    print '### executing learning algorithms on... ###'

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print 'No appropriate csv files found. Select an input directory with appropriate files'
        return

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm

    # run algorithm alg for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print ' ...{}'.format(fname)

        # get data, split into features/target; exit if the import failed
        X, y, headers = in_out.import_data(
            f, record_id, target_id,
            True)  # assumption: first column is patientnumber
        if type(X) == bool: return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []

        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!

        #  		features_to_be_removed   =    [ "pvc_bin","pnc_bin","pac_bin","ect_freq_bin","full_code_bin","comfort_meas_bin","other_code_bin","no_cpr_bin",
        # 										"dnr_bin","dni_bin","fall_risk_bin","orientation_ord","orient_unable_ass_bin","riker_sas_ord","vent_bin",
        # 										"vent_mode_ord","pacemaker_bin","trach_bin","flush_skin_bin","jaundice_skin_bin","pale_skin_bin","impaired_skin_bin",
        # 										"iabp_ord","iabp_bin","svnsicu_bin","svcsicu_bin","svcsru_bin","svmicu_bin","svmsicu_bin","svother_bin","svccu_bin",
        # 										"gender"]

        exclude = [
            146, 140, 95, 123, 88, 133, 22, 65, 49, 114, 178, 55, 133, 138, 34,
            186, 20, 73
        ]
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, day_index] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)

                if not row[0] in IDs and not row[0] in exclude:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                elif not row[0] in exclude:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        ID_column = new_X[:, 0]

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]

        dtw_attr = ['hr', 'resp', 'nbp', 'sbp', 'dbp', 'so2']

        X = new_X
        print len(X)

        non_singular_rows = [
            i for i in range(0, X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]
        #print str(len(non_singular_rows)) + ' ' + str(X.shape[1])
        #print non_singular_rows

        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()
        print str(len(new_headers)) + " headers remaining after removing singular columns"
        print new_headers
        print "Removed columns containing only NaNs or a single value"
        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)

        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print 'difference of zero encountered in ' + str(i)
                print 'Max values: ' + str(max_values[i])
                print 'Min values: ' + str(min_values[i])
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order of magnitude
        scaled_X = (X - min_values) / (max_values - min_values)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        print "Scaling done!"

        new_IDrows = {}
        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print '  ...instances: {}, attributes: {}'.format(
            X.shape[0], X.shape[1])

        # Now we are going to build the similarity matrix. We also store for how many
        # attributes we were actually able to make a comparison.

        similarity_matrix = np.ones((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))

        for attr in range(0, len(new_headers)):
            print 'attribute {}/{} in KNN loop'.format(attr, len(new_headers))

            temp = np.ones((len(IDs), len(IDs)))
            temp[:] = 2
            for i in range(0, len(IDs)):
                for j in range(i + 1, len(IDs)):

                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()

                    if new_headers[attr] in dtw_attr:
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)

                        if not dtw_distance == -1:
                            temp[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[
                                j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
                    else:
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            temp[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[
                                j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]

            if np.max(temp) != 0:
                temp = temp / np.max(temp)
            similarity_matrix += temp

        # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.
        similarity_matrix = (similarity_matrix / matching_number_matrix) + (
            1 / matching_number_matrix)

        print len(IDs)
        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)
        print results
        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

        # notify user
    print '## Learning Finished ##'
    print similarity_matrix
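
`perform_classification` is not shown in this listing; since it receives a pairwise distance matrix and k, one plausible reading is nearest-neighbour classification on precomputed distances. A minimal sketch of that idea (the function name, split arguments, and AUC return value are assumptions):

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

def knn_on_distances(distance_matrix, y, k, train_idx, test_idx):
    '''k-NN on a precomputed pairwise distance matrix (sketch)'''
    clf = KNeighborsClassifier(n_neighbors=k, metric='precomputed')
    clf.fit(distance_matrix[np.ix_(train_idx, train_idx)], y[train_idx])
    # test rows must contain distances to the *training* instances
    y_prob = clf.predict_proba(distance_matrix[np.ix_(test_idx, train_idx)])[:, 1]
    return roc_auc_score(y[test_idx], y_prob)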