def _get_pos_neg_outcomes(self, proc_data, m):
    """
    Computes the probabilities of each attribute taking a value, given outcome 0 or 1.
    The format in which the data is stored is explained in the comments above the dict instantiations.
    """
    print("Starting probability estimates")

    # Keyed by attribute index; each value is a list of probabilities, where
    # each list index is the attr value and each entry is the probability of that attr value given the positive class
    data_pred_pos_outcome = {}

    # Keyed by attribute index; each value is a list of probabilities, where
    # each list index is the attr value and each entry is the probability of that attr value given the negative class
    data_pred_neg_outcome = {}

    class_idx = utils._get_class_idx(proc_data)

    for attr_idx in range(len(proc_data.iloc[0, :])):
        p_data_given_y0 = self._p_data_given_y(proc_data, attr_idx, class_idx, 0, m)
        p_data_given_y1 = self._p_data_given_y(proc_data, attr_idx, class_idx, 1, m)

        # If either return is None, the attribute is a class or index column, which we skip
        if p_data_given_y0 is not None and p_data_given_y1 is not None:
            # Store the list of probabilities for each attr value under the dict key of attr idx
            data_pred_pos_outcome[str(attr_idx)] = p_data_given_y1
            data_pred_neg_outcome[str(attr_idx)] = p_data_given_y0

    return data_pred_pos_outcome, data_pred_neg_outcome
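# --- Illustrative sketch (not part of the original source) -----------------
# _p_data_given_y is assumed to apply m-estimate smoothing when estimating
# P(Xi = xi | Y = y); the standard formula is (n_c + m * p) / (n_y + m), where
# n_c counts examples with Xi = xi and Y = y, n_y counts examples with Y = y,
# and p is the prior (commonly 1 / number-of-values of Xi). The helper below
# is a hypothetical, self-contained version of that formula.
def _m_estimate_probability(n_c, n_y, m, p):
    """Smoothed estimate of P(Xi = xi | Y = y) using the m-estimate."""
    return (n_c + m * p) / (n_y + m)

# Example: 3 of 10 positive examples have Xi = xi, the attribute has 4 values,
# and m = 2  ->  (3 + 2 * 0.25) / (10 + 2) is roughly 0.2917.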
def __init__(self, data, validationType, bins, mEstimate):
    self.validationType = validationType
    self.bins = bins
    self.mEstimate = mEstimate

    proc_data = utils._convert_exampleset_to_dataframe(data)

    # Convert all continuous attributes to classes
    self._convert_data(proc_data, bins)

    # Get P(Xi = xi | Y = y) for all Xi, xi, and y. Specific data structure
    # storage details are discussed in helper methods
    data_pred_pos_outcome, data_pred_neg_outcome = self._get_pos_neg_outcomes(
        proc_data, mEstimate)

    # Store probabilities of attributes
    self.pos_outcomes = data_pred_pos_outcome
    self.neg_outcomes = data_pred_neg_outcome

    # Store class probability
    class_idx = utils._get_class_idx(proc_data)
    class_data = proc_data.iloc[:, class_idx]
    self.p_pos_class = self._get_n_class(class_data, 1) / len(class_data)
def __init__(self, data, validationType, bins, mEstimate, training_weights):
    self.validationType = validationType
    self.bins = bins
    self.mEstimate = mEstimate

    proc_data = data  # already processed in the ensemble version

    # Convert all continuous attributes to classes
    self._convert_data(proc_data, bins)

    # Get P(Xi = xi | Y = y) for all Xi, xi, and y. Specific data structure
    # storage details are discussed in helper methods
    data_pred_pos_outcome, data_pred_neg_outcome = self._get_pos_neg_outcomes(
        proc_data, mEstimate, training_weights)

    # Store probabilities of attributes
    self.pos_outcomes = data_pred_pos_outcome
    self.neg_outcomes = data_pred_neg_outcome

    # Store class probability
    class_idx = utils._get_class_idx(proc_data)
    class_data = proc_data.iloc[:, class_idx]
    self.p_pos_class = self._get_n_class(
        class_data, 1, training_weights) / sum(training_weights)
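# --- Illustrative sketch (not part of the original source) -----------------
# The weighted constructor above generalizes the unweighted one: with every
# training weight equal to 1, sum(training_weights) == len(class_data), so the
# weighted class prior reduces to the plain fraction of positive labels. The
# hypothetical helper below mirrors that prior computation on plain lists.
def _weighted_positive_prior(labels, weights):
    """P(Y = 1) as the weight share of positively labeled examples."""
    pos_weight = sum(w for y, w in zip(labels, weights) if y == 1)
    return pos_weight / sum(weights)

# With uniform weights this matches the unweighted prior:
# 2 positives out of 4 examples -> 0.5 either way.
assert _weighted_positive_prior([1, 0, 1, 0], [1, 1, 1, 1]) == 0.5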
def logreg(schema, exampleSet, validationType, constant, k=5):
    if validationType == 0:
        # k-Fold Stratified CROSS VALIDATION
        folds = stratified_split_data(schema, exampleSet, k)
        print("-------", k, "- Fold Stratified Cross Validation --------")

        total_acc = []
        total_prec = []
        total_recal = []
        total_original_results = []
        total_predictions = []

        for i in range(k):
            # Create the buildSet from every fold except the held-out one
            buildSet = mldata.ExampleSet(schema)
            for j in range(k):
                if i != j:
                    for example in folds[j]:
                        buildSet.append(example)

            print("Fold Iteration:", i)
            test = utils._convert_exampleset_to_dataframe(folds[i])
            class_idx = utils._get_class_idx(test)

            # classifier = NaiveBayes(buildSet, validationType, bins, Mestimate)
            classifier = LogisticRegression(buildSet, constant)
            predictions = classifier.predict(test)

            print("Calculating output of this fold.")
            original_results = []
            for l in range(len(test)):
                original_results.append(test.iloc[l, class_idx])

            # Tally the confusion matrix for this fold
            TruePos = 0
            TrueNeg = 0
            FalsePos = 0
            FalseNeg = 0
            for m in range(len(predictions)):
                if predictions[m][1] == 1 and original_results[m] == 1:
                    TruePos += 1
                elif predictions[m][1] == 0 and original_results[m] == 0:
                    TrueNeg += 1
                elif predictions[m][1] == 1 and original_results[m] == 0:
                    FalsePos += 1
                elif predictions[m][1] == 0 and original_results[m] == 1:
                    FalseNeg += 1
                else:
                    print("Unexpected prediction/label combination in fold:", i)

            assert len(predictions) == (
                TrueNeg + TruePos + FalseNeg + FalsePos
            ), "Sum of confusion-matrix counts does not equal the number of predictions"

            total_acc.append((TrueNeg + TruePos) /
                             (TrueNeg + TruePos + FalseNeg + FalsePos))
            print("Error for fold: " +
                  str(1 - (TrueNeg + TruePos) /
                      (TrueNeg + TruePos + FalseNeg + FalsePos)))

            if TruePos + FalsePos > 0:
                total_prec.append(TruePos / (TruePos + FalsePos))
            elif TruePos + FalsePos + FalseNeg == 0:
                total_prec.append(1)
            else:
                total_prec.append(0)

            if TruePos + FalseNeg > 0:
                total_recal.append(TruePos / (TruePos + FalseNeg))
            elif TruePos + FalsePos + FalseNeg == 0:
                total_recal.append(1)
            else:
                total_recal.append(0)

            if i == 0:
                total_predictions = predictions
                total_original_results = original_results
            else:
                total_predictions = np.concatenate(
                    (total_predictions, predictions), axis=0)
                total_original_results = np.concatenate(
                    (total_original_results, original_results), axis=0)

        # After the folds are done, sweep thresholds to build the ROC curve
        TPR = []
        FPR = []
        increment = 0.1
        threshold = 1.0
        while threshold >= 0:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for i in range(0, len(total_predictions)):
                if total_predictions[i][0] >= threshold and total_original_results[i] == 1:
                    TP += 1
                elif total_predictions[i][0] >= threshold and total_original_results[i] == 0:
                    FP += 1
                elif total_predictions[i][0] < threshold and total_original_results[i] == 1:
                    FN += 1
                elif total_predictions[i][0] < threshold and total_original_results[i] == 0:
                    TN += 1
                else:
                    print("Unexpected prediction/label combination at index:", i,
                          total_predictions[i], total_original_results[i])
            assert len(total_predictions) == (
                TN + TP + FN + FP), "Confusion-matrix counts do not equal the number of predictions"
            TPR.append(TP / (TP + FN))
            FPR.append(FP / (FP + TN))
            threshold -= increment

        print("TPR: ", TPR)
        print("FPR: ", FPR)

        # Approximate the area under the ROC curve with the trapezoid rule
        AUR = 0.0
        for trap in range(0, len(TPR) - 1):
            xDis = FPR[trap + 1] - FPR[trap]
            yDis = (TPR[trap] + TPR[trap + 1]) / 2
            AUR += xDis * yDis
        if AUR < 0.5:
            print("1 - AUR used")
            AUR = 1.0 - AUR

        avg_acc = np.average(total_acc)
        avg_pre = np.average(total_prec)
        avg_rec = np.average(total_recal)
        std_acc = np.std(total_acc)
        std_pre = np.std(total_prec)
        std_rec = np.std(total_recal)

        print("===== Folds Complete =====")
        print("Average Accuracy :", round(avg_acc, 3), round(std_acc, 3))
        print("Average Precision :", round(avg_pre, 3), round(std_pre, 3))
        print("Average Recall :", round(avg_rec, 3), round(std_rec, 3))
        print("Area Under ROC :", round(AUR, 3))

    elif validationType == 1:
        # NO CROSS VALIDATION: train and test on the full example set
        print("------- NO Cross Validation: Running on Full Example Set --------")

        total_acc = 0.0
        total_prec = 0.0
        total_recal = 0.0

        test = utils._convert_exampleset_to_dataframe(exampleSet)
        class_idx = utils._get_class_idx(test)

        # classifier = NaiveBayes(exampleSet, validationType, bins, Mestimate)
        classifier = LogisticRegression(exampleSet, constant)
        predictions = classifier.predict(test)

        print("Calculating output")
        original_results = []
        for l in range(len(test)):
            original_results.append(test.iloc[l, class_idx])

        # Tally the confusion matrix over the full example set
        TruePos = 0
        TrueNeg = 0
        FalsePos = 0
        FalseNeg = 0
        for m in range(len(predictions)):
            if predictions[m][1] == 1 and original_results[m] == 1:
                TruePos += 1
            elif predictions[m][1] == 0 and original_results[m] == 0:
                TrueNeg += 1
            elif predictions[m][1] == 1 and original_results[m] == 0:
                FalsePos += 1
            elif predictions[m][1] == 0 and original_results[m] == 1:
                FalseNeg += 1
            else:
                print("Unexpected prediction/label combination at index:", m)

        assert len(predictions) == (
            TrueNeg + TruePos + FalseNeg + FalsePos
        ), "Sum of confusion-matrix counts does not equal the number of predictions"

        total_acc = (TrueNeg + TruePos) / (TrueNeg + TruePos + FalseNeg + FalsePos)
        total_prec = TruePos / (TruePos + FalsePos)
        total_recal = TruePos / (TruePos + FalseNeg)

        # Sweep thresholds to build the ROC curve
        TPR = []
        FPR = []
        increment = 0.1
        threshold = 1.0
        while threshold >= 0:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for i in range(0, len(predictions)):
                if predictions[i][0] >= threshold and original_results[i] == 1:
                    TP += 1
                elif predictions[i][0] >= threshold and original_results[i] == 0:
                    FP += 1
                elif predictions[i][0] < threshold and original_results[i] == 1:
                    FN += 1
                elif predictions[i][0] < threshold and original_results[i] == 0:
                    TN += 1
                else:
                    print("Unexpected prediction/label combination at index:", i,
                          predictions[i], original_results[i])
            assert len(predictions) == (
                TN + TP + FN + FP), "Confusion-matrix counts do not equal the number of predictions"

            TPR.append(TP / (TP + FN))
            FPR.append(FP / (FP + TN))
            threshold -= increment

        print("TPR: ", TPR)
        print("FPR: ", FPR)

        # Approximate the area under the ROC curve with the trapezoid rule
        AUR = 0.0
        for trap in range(0, len(TPR) - 1):
            xDis = FPR[trap + 1] - FPR[trap]
            yDis = (TPR[trap] + TPR[trap + 1]) / 2
            AUR += xDis * yDis
        if AUR < 0.5:
            print("1 - AUR used")
            AUR = 1.0 - AUR

        print("===== Run Complete =====")
        print("Average Accuracy :", round(total_acc, 3))
        print("Average Precision :", round(total_prec, 3))
        print("Average Recall :", round(total_recal, 3))
        print("Area Under ROC :", round(AUR, 3))

    else:
        print("Incorrect validation type argument given.")
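# --- Illustrative sketch (not part of the original source) -----------------
# The AUR loop above is the composite trapezoid rule applied to the ROC points
# (FPR[t], TPR[t]) collected while sweeping the threshold from 1.0 down to 0.0.
# numpy provides the same computation directly, so a quick cross-check of the
# hand-rolled sum could look like this (hypothetical values, not taken from the
# runs above).
import numpy as np

tpr = [0.0, 0.4, 0.7, 0.9, 1.0]
fpr = [0.0, 0.1, 0.3, 0.6, 1.0]

manual_aur = sum((fpr[t + 1] - fpr[t]) * (tpr[t] + tpr[t + 1]) / 2
                 for t in range(len(tpr) - 1))
numpy_aur = np.trapz(tpr, fpr)
assert abs(manual_aur - numpy_aur) < 1e-12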