#statistics.mean is assumed for the mean() calls below
from functools import partial
from statistics import mean

import knn


def regression_with_cross_validation_and_c_index(inputs, outputs, num_folds, k):
    '''
    Perform k-fold cross-validation with k-nearest-neighbor regression and
    print the C-index for each of the three output variables.

    @param inputs 2d list of the input data
    @param outputs 2d list of the corresponding output rows
    @param num_folds Number of cross-validation folds
    @param k Number of nearest neighbors to use
    '''
    c_total_predictions = []
    cd_predictions = []
    pb_predictions = []
    for fold in range(0, num_folds):
        #How many data objects are in each fold
        len_fold = int(len(inputs) / num_folds)
        #Where the test set is placed in the data
        ix_test_first = fold * len_fold
        ix_test_one_past_last = ix_test_first + len_fold
        test_set = inputs[ix_test_first:ix_test_one_past_last]
        training_set = (inputs[0:ix_test_first] +
                        inputs[ix_test_one_past_last:len(inputs)])
        training_set_outputs = (outputs[0:ix_test_first] +
                                outputs[ix_test_one_past_last:len(outputs)])
        #Get the list of nearest neighbor indices for each object in the
        #test set
        neighbors = knn.compute_nearest_neighbors(test_set, training_set, k)
        for i in range(0, len_fold):
            #Get the neighbors' output values for the i:th test object
            neighbors_c_totals = []
            neighbors_cds = []
            neighbors_pbs = []
            for n in neighbors[i]:
                neighbors_c_totals.append(
                    float(training_set_outputs[n][IX_C_TOTAL]))
                neighbors_cds.append(float(training_set_outputs[n][IX_CD]))
                neighbors_pbs.append(float(training_set_outputs[n][IX_PB]))
            #The mean value of the neighbors is the prediction
            estimate_c_total = mean(neighbors_c_totals)
            estimate_cd = mean(neighbors_cds)
            estimate_pb = mean(neighbors_pbs)
            #Store the predictions
            c_total_predictions.append(estimate_c_total)
            cd_predictions.append(estimate_cd)
            pb_predictions.append(estimate_pb)
    #Note: this assumes len(inputs) is divisible by num_folds so that every
    #row gets exactly one prediction
    c_ix_c_total = knn.c_index([row[IX_C_TOTAL] for row in outputs],
                               c_total_predictions)
    c_ix_cd = knn.c_index([row[IX_CD] for row in outputs], cd_predictions)
    c_ix_pb = knn.c_index([row[IX_PB] for row in outputs], pb_predictions)
    print("C-index for c_totals: " + str(c_ix_c_total))
    print("C-index for cd: " + str(c_ix_cd))
    print("C-index for pb: " + str(c_ix_pb))
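
#A minimal usage sketch, not part of the original module. It assumes the
#column-index constants IX_C_TOTAL, IX_CD and IX_PB are defined at module
#level as 0, 1 and 2, and that knn.compute_nearest_neighbors and knn.c_index
#behave as they are used above; the toy data below is purely hypothetical.
def _example_regression_cv():
    #Four rows of inputs and the matching three-column output rows
    inputs = [[0.1, 0.2], [0.4, 0.5], [0.9, 0.8], [0.3, 0.7]]
    outputs = [[1.0, 0.1, 10.0], [2.0, 0.2, 20.0],
               [3.0, 0.3, 30.0], [4.0, 0.4, 40.0]]
    #Two folds of two rows each, one nearest neighbor per prediction
    regression_with_cross_validation_and_c_index(inputs, outputs,
                                                 num_folds=2, k=1)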

def knn_classification_and_c_index(test_data, training_data, subject_id, k):
    '''
    Classify the test data with k-nearest neighbors, then print and return
    the C-index of the predictions.

    @param test_data Data rows to classify
    @param training_data Data rows used for the neighbor search
    @param subject_id Identifier printed together with the C-index
    @param k Number of nearest neighbors to use
    @return C-index of the predictions
    '''
    test_labels = get_labels(test_data)
    training_labels = get_labels(training_data)
    test_features = get_features(test_data)
    training_features = get_features(training_data)
    #Get the nearest neighbors for every object in the test set
    neighbors = knn.compute_nearest_neighbors(test_features,
                                              training_features, k)
    predictions = []
    actuals = []
    for measurement in range(0, len(neighbors)):
        prediction = knn.majority_class(neighbors[measurement],
                                        training_labels)
        actual = test_labels[measurement]
        predictions.append(prediction)
        actuals.append(actual)
    c_ix = knn.c_index(actuals, predictions)
    print("(" + str(subject_id) + "," + str(c_ix) + ")")
    return c_ix
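
#A minimal leave-one-subject-out sketch around the function above, not part
#of the original module. split_by_subject is a hypothetical helper that would
#return the held-out subject's rows and all remaining rows; get_labels and
#get_features are assumed to exist elsewhere in this module.
def _example_leave_one_subject_out(data, subject_ids, k):
    c_indices = []
    for subject_id in subject_ids:
        #Hypothetical helper: split off the held-out subject's rows
        test_data, training_data = split_by_subject(data, subject_id)
        c_indices.append(
            knn_classification_and_c_index(test_data, training_data,
                                           subject_id, k))
    #Average C-index over all held-out subjects
    return sum(c_indices) / len(c_indices)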

def spatial_loo_cv(inputs, outputs, coordinates, delta, f_predict):
    '''
    Perform spatial leave-one-out cross-validation on the data and call a
    prediction function for each left-out point.

    @param inputs 2d list of the input data
    @param outputs 1d list of the corresponding outputs
    @param coordinates List of 2d coordinates of where the measurements were
                       made
    @param delta Size of the dead zone, in the same unit as the coordinates
    @param f_predict Function used to make the predictions
    @return C-index value of the predictions
    '''
    predictions = []
    for test_ix, test_inputs in enumerate(inputs):
        nearby_pnt_ixs = get_nearby_pnt_indices(inputs, test_ix, coordinates,
                                                delta)
        #Remove the data of nearby points from the training set
        training_outputs = []
        training_inputs = []
        for ix, item in enumerate(inputs):
            if (ix != test_ix) and (ix not in nearby_pnt_ixs):
                training_inputs.append(inputs[ix])
                training_outputs.append(outputs[ix])
        #Make a prediction for the left-out point
        prediction = f_predict([test_inputs], training_inputs,
                               training_outputs)
        predictions.append(prediction)
    return knn.c_index(outputs, predictions)
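
#A minimal sketch of an f_predict callable for spatial_loo_cv above, assuming
#knn.compute_nearest_neighbors returns, for each test row, the indices of its
#k nearest training rows (as it is used elsewhere in this module). The default
#k is an arbitrary choice for illustration, e.g.
#spatial_loo_cv(inputs, outputs, coordinates, 0.1, _example_mean_knn_predict).
def _example_mean_knn_predict(test_inputs, training_inputs, training_outputs,
                              k=3):
    neighbor_ixs = knn.compute_nearest_neighbors(test_inputs,
                                                 training_inputs, k)[0]
    #Predict the mean output of the k nearest training points
    return mean(training_outputs[ix] for ix in neighbor_ixs)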

def loo_cv_with_pairwise_filtering(features, labels, pairs, f_predict):
    '''
    Perform a modified leave-one-out cross-validation in which the test and
    training sets may not share any item of the pairs list. Do the
    classifications and report the C-index.

    @param features 2d list of feature values
    @param labels Correct labels corresponding to the rows in features
    @param pairs List of the pairs of each row in features. These are just
                 names, not e.g. row indices
    @param f_predict Prediction function used for the classifications
    @return Concordance index (C-index) for the made predictions
    '''
    predictions = []
    misclassifications = 0
    for test_ix, test_features in enumerate(features):
        #Potential training features and labels
        training_features = [
            row for i, row in enumerate(features) if i != test_ix
        ]
        training_labels = [row for i, row in enumerate(labels) if i != test_ix]
        #Bind the test index and the pairs list to a function that tells
        #whether or not the pairs of a training object clash with the pairs
        #of this test object
        f_can_add_to_trn_set = partial(pairs_disjoint, pairs=pairs,
                                       ix_1=test_ix)
        #Filter the features and labels
        training_features_filtered = filter_list(training_features,
                                                 f_can_add_to_trn_set)
        training_labels_filtered = filter_list(training_labels,
                                               f_can_add_to_trn_set)
        #Do the prediction with the function that was given as an argument
        prediction = f_predict([test_features], training_features_filtered,
                               training_labels_filtered)
        predictions.append([prediction])
        #See if we guessed correctly
        actual = labels[test_ix][0]
        if actual != prediction:
            misclassifications += 1
    num_rows = len(features)
    if num_rows != 0:
        misclass_rate = float(misclassifications) / num_rows
        print("Classifications: " + str(num_rows))
        print("Misclassifications: " + str(misclassifications))
        print("Misclassification rate: " + str(misclass_rate))
    return knn.c_index(labels, predictions)
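
#A minimal usage sketch for the pairwise-filtered leave-one-out routine above,
#not part of the original module. The toy data assumes one pair of names per
#feature row and labels stored one per row, and relies on the module helpers
#pairs_disjoint and filter_list behaving as they are used above. f_predict can
#be any callable with the (test, training_features, training_labels) signature
#used in this module, e.g. _example_majority_class_predict defined below.
def _example_pairwise_filtered_loo(f_predict):
    features = [[0.1], [0.9], [0.2], [0.8]]
    labels = [[0], [1], [0], [1]]
    #Pair names of each row; rows sharing a name with the held-out row are
    #dropped from its training set
    pairs = [["A", "B"], ["A", "C"], ["B", "D"], ["C", "D"]]
    return loo_cv_with_pairwise_filtering(features, labels, pairs, f_predict)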

def loo_cv(features, labels, f_predict):
    '''
    Perform leave-one-out cross-validation, do the classifications and report
    the C-index.

    @param features 2d list of feature values
    @param labels Correct labels corresponding to the rows in features
    @param f_predict Prediction function used for the classifications
    @return Concordance index (C-index) for the made predictions
    '''
    predictions = []
    misclassifications = 0
    for test_ix, test_features in enumerate(features):
        training_features = [
            row for i, row in enumerate(features) if i != test_ix
        ]
        training_labels = [row for i, row in enumerate(labels) if i != test_ix]
        prediction = f_predict([test_features], training_features,
                               training_labels)
        predictions.append([prediction])
        #See if we guessed correctly
        actual = labels[test_ix][0]
        if actual != prediction:
            misclassifications += 1
    num_rows = len(features)
    if num_rows != 0:
        misclass_rate = float(misclassifications) / num_rows
        print("Classifications: " + str(num_rows))
        print("Misclassifications: " + str(misclassifications))
        print("Misclassification rate: " + str(misclass_rate))
    c_ix = knn.c_index(labels, predictions)
    return c_ix
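
#A minimal sketch of a prediction function that loo_cv (and the filtered
#variant above) can be given, not part of the original module. It assumes
#knn.compute_nearest_neighbors returns, for each test row, the indices of its
#k nearest training rows and that knn.majority_class maps those indices to the
#most common training label, as they are used elsewhere in this module; the
#default k is an arbitrary choice for illustration.
def _example_majority_class_predict(test_features, training_features,
                                    training_labels, k=3):
    neighbor_ixs = knn.compute_nearest_neighbors(test_features,
                                                 training_features, k)[0]
    #Labels are stored one per row here, so take the first column, mirroring
    #actual = labels[test_ix][0] in loo_cv above
    flat_labels = [row[0] for row in training_labels]
    return knn.majority_class(neighbor_ixs, flat_labels)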