def stratified_split_data(schema, exampleSet, numFolds):
    """Split ``exampleSet`` into ``numFolds`` folds, balancing the two labels.

    Examples whose last element equals 1 are treated as positives; everything
    else is a negative. Each group is then dealt out to the folds round-robin
    (always starting again at the first fold), picking a random remaining
    example for every slot.

    The global NumPy RNG is re-seeded with ``PRNG`` on every call, so the
    sequence of random picks is the same for each call.

    Args:
        schema: Passed to every ``mldata.ExampleSet`` created here.
        exampleSet: Iterable of examples; the label is read from the last
            position of each example.
        numFolds: Number of folds to create.

    Returns:
        A list of ``numFolds`` ``mldata.ExampleSet`` objects.
    """
    positives = mldata.ExampleSet(schema)
    negatives = mldata.ExampleSet(schema)
    np.random.seed(PRNG)
    folds = [mldata.ExampleSet(schema) for _ in range(numFolds)]

    # Separate the input by label.
    for example in exampleSet:
        label = example[len(example) - 1]
        target = positives if label == 1 else negatives
        target.append(example)

    def deal_out(pool):
        """Move every example of ``pool`` into the folds, one per fold in turn."""
        while len(pool) > 0:
            for fold in folds:
                if len(pool) == 0:
                    break
                pick = np.random.randint(0, len(pool))  # random remaining index
                fold.append(pool[pick])
                del pool[pick]  # each example is used exactly once

    deal_out(positives)
    deal_out(negatives)
    return folds
def fold_cv(full_dataset, num_folds):
    """Divide ``full_dataset`` into ``num_folds`` stratified folds.

    The data is separated by label (last element of each example), each group
    is shuffled, and fold ``i`` receives the ``i``-th contiguous slice of each
    group, so every fold keeps roughly the same label ratio.

    Slice boundaries are computed with integer arithmetic
    (``i * n // num_folds``). The earlier float form ``int(i * (n / num_folds))``
    could round just below an integer boundary and silently drop an example.

    Args:
        full_dataset: Iterable of examples whose last element is the label.
        num_folds: Number of folds to produce.

    Returns:
        A list of ``num_folds`` ``mldata.ExampleSet`` objects.
    """
    # ``==`` is kept on purpose: it matches both booleans and 0/1 labels, and
    # examples whose label equals neither are left out of both groups.
    true_set = mldata.ExampleSet(
        ex for ex in full_dataset if ex[-1] == True)  # noqa: E712
    false_set = mldata.ExampleSet(
        ex for ex in full_dataset if ex[-1] == False)  # noqa: E712
    shuffle(true_set)
    shuffle(false_set)

    true_len = len(true_set)
    false_len = len(false_set)

    datasets = []
    for i in range(num_folds):
        dataset = mldata.ExampleSet()
        # Exact integer boundaries: the last fold always ends at the group
        # length, so every example lands in exactly one fold.
        true_start = i * true_len // num_folds
        true_stop = (i + 1) * true_len // num_folds
        for j in range(true_start, true_stop):
            dataset.append(true_set[j])
        false_start = i * false_len // num_folds
        false_stop = (i + 1) * false_len // num_folds
        for j in range(false_start, false_stop):
            dataset.append(false_set[j])
        datasets.append(dataset)
    return datasets
def get_train_test_split(folds: Sequence, test_fold_ind: int) -> Tuple:
    """Creates the training and test sets from dataset folds.

    Args:
        folds: A sequence of sequences, each being a fold of the data.
        test_fold_ind: Index of the fold held out as the test set. Negative
            values count from the end, as in ordinary sequence indexing.

    Returns:
        A training set (all examples of the other folds, in fold order) and
        a test set (the examples of the held-out fold).

    Raises:
        IndexError: If test_fold_ind is out of range for folds.
    """
    num_folds = len(folds)
    if not -num_folds <= test_fold_ind < num_folds:
        raise IndexError('test_fold_ind %d is out of range for %d folds' %
                         (test_fold_ind, num_folds))
    # Normalise negative indices; otherwise `i != test_fold_ind` would never
    # exclude the held-out fold and the test data would leak into training.
    held_out = test_fold_ind % num_folds
    # Flattening by iteration also copes with a single fold (empty training
    # set), where reducing an empty list of folds would raise TypeError.
    train_examples = [
        example for i, fold in enumerate(folds) if i != held_out
        for example in fold
    ]
    train_set = mldata.ExampleSet(train_examples)
    test_set = mldata.ExampleSet(folds[held_out])
    return train_set, test_set
def boost_data(data, weights):
    """Build an ExampleSet in which each example is repeated per its weight.

    ``correctratios(weights)`` yields one repeat count per position; the
    example at ``data[i]`` is repeated that many times, keeping the original
    order. (Presumably the counts are non-negative integers - confirm against
    ``correctratios``.)

    Args:
        data: Indexable collection of examples.
        weights: Passed unchanged to ``correctratios``.

    Returns:
        An ``mldata.ExampleSet`` holding the replicated examples.
    """
    repeat_counts = correctratios(weights)
    expanded = []
    for position, repeats in enumerate(repeat_counts):
        expanded += [data[position]] * repeats
    return mldata.ExampleSet(item for item in expanded)
def build_trees(datasets):
    """Train and evaluate one decision tree per fold of a 5-fold split.

    For fold ``i`` the tree is trained on the other four folds (taken in the
    order ``i+1, i+2, i+3, i+4`` modulo 5) and evaluated on fold ``i``. Both
    sets are passed through ``shuffle`` first; ``val_data`` is the very object
    stored in ``datasets[i]``.

    Args:
        datasets: Indexable collection of five folds.

    Returns:
        A tuple ``(trees, sizes, first_features, accs, max_depths)`` of lists
        with one entry per fold.
    """
    trees, sizes, max_depths = [], [], []
    first_features, accs = [], []

    for fold_idx in range(5):
        # Gather the four folds that are not held out.
        train_data = mldata.ExampleSet()
        for offset in range(1, 5):
            for example in datasets[(fold_idx + offset) % 5]:
                train_data.append(example)
        val_data = datasets[fold_idx]
        shuffle(train_data)
        shuffle(val_data)

        tree = build_tree.build_DecisionTree(MAX_DEPTH, EPS, train_data,
                                             ENABLE_GAIN)
        size = tree.get_tree_size()
        max_depth = tree.get_tree_depth()
        root_index = tree.get_root().get_attriIndex()
        first_feature = train_data.schema.features[root_index].name
        acc = tree.classify_dataset(val_data)

        trees.append(tree)
        sizes.append(size)
        max_depths.append(max_depth)
        first_features.append(first_feature)
        accs.append(acc)

        report = ('Tree %d:\n\nAccuracy: %.4f\n\nSize: %d\n\n'
                  'Maximum Depth: %d\n\nFirst Feature: %s')
        print(report % (fold_idx + 1, acc, size, max_depth, first_feature))

    return trees, sizes, first_features, accs, max_depths
def cross_logreg(original_data):
    """Evaluate ``Logistic_Regression`` with 5-fold cross-validation.

    The folds come from ``fold_5_cv(original_data)``. For each fold a model is
    trained on the other four folds and scored on the held-out one; the scores
    are printed and collected.

    Args:
        original_data: Data set handed to ``fold_5_cv``.

    Returns:
        A tuple ``(accuracies, precisions, recalls)`` of per-fold lists.
    """
    datasets = fold_5_cv(original_data)
    accuracies, precisions, recalls = [], [], []

    for fold_idx in range(5):
        train_data = mldata.ExampleSet()
        for offset in range(1, 5):
            for example in datasets[(fold_idx + offset) % 5]:
                train_data.append(example)
        val_data = datasets[fold_idx]
        shuffle(train_data)
        shuffle(val_data)

        lg = Logistic_Regression(lambdaa=LAMBDA,
                                 training_data=train_data,
                                 iteration=ITER,
                                 learning_rate=LR)
        predictions, true_label = lg.classify_data(val_data)
        accuracy, precision, recall = get_results(predictions, true_label)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)

        summary = (fold_idx + 1, accuracy, precision, recall)
        print("Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n"
              % summary)

    return accuracies, precisions, recalls
def standardize(data: mldata.ExampleSet) -> mldata.ExampleSet:
    """Replaces every continuous feature with its z-score.

    Args:
        data: Collection of examples to pre-process.

    Returns:
        A new ExampleSet (same schema) whose continuous features have been
        passed through ``stats.zscore``; all other features are copied
        unchanged. If there are no continuous features, ``data`` itself is
        returned.
    """
    continuous_exs = mlutil.get_feature_examples(data=data,
                                                 feature_types={CONTINUOUS},
                                                 as_dict=True,
                                                 index_as_key=True)
    if len(continuous_exs) == 0:
        return data

    # Feature index -> z-scored column, one value per example.
    z_scores = {}
    for feature_idx, column in continuous_exs.items():
        z_scores[feature_idx] = stats.zscore(column)

    num_features = len(data.schema)
    rebuilt = []
    for row, original in enumerate(data):
        values = []
        for col in range(num_features):
            if col in z_scores:
                values.append(z_scores[col][row])
            else:
                values.append(original[col])
        example = mldata.Example(data.schema)
        example.features = values
        rebuilt.append(example)

    example_set = mldata.ExampleSet(data.schema)
    example_set.extend(rebuilt)
    return example_set
def predict(self, data: mldata.ExampleSet) -> Tuple:
    """Predict every example in ``data`` with the stored model.

    Each example is wrapped in its own single-element ``mldata.ExampleSet``
    and handed to ``self._predict_example`` together with ``self.model``.

    Args:
        data: Examples to predict.

    Returns:
        A tuple with one prediction per example, or an empty tuple when
        ``self.model`` is ``None``.
    """
    if self.model is None:
        return tuple()

    results = []
    for example in data:
        singleton = mldata.ExampleSet([example])
        results.append(self._predict_example(singleton, self.model))
    return tuple(results)
def dtree(exampleSet, validationType, depth, splitCriterion, k=5):
    """Build decision tree(s) on ``exampleSet`` and print evaluation figures.

    Args:
        exampleSet: The data; also used to construct the
            ``ns.EntropySelector`` passed to ``dt.build_tree``.
        validationType: 0 runs k-fold stratified cross-validation; 1 trains
            and evaluates on the full set; anything else prints an error.
        depth: Passed to ``dt.build_tree``.
        splitCriterion: Passed to ``dt.build_tree``.
        k: Number of cross-validation folds (default 5).

    Returns:
        None. All results are printed.
    """
    e = ns.EntropySelector(exampleSet)
    if validationType == 0:
        # k-fold stratified cross-validation.
        # stratified_split_data takes (schema, exampleSet, numFolds); the
        # schema is read from the example set, as this function has no
        # separate schema argument.
        folds = stratified_split_data(exampleSet.schema, exampleSet, k)
        print("-------", k, "- Fold Stratified Cross Validation --------")
        total_acc = 0
        for i in range(k):
            # Training set: every example of every fold except fold i.
            # The examples are appended one by one; appending a fold object
            # itself would add the whole fold as a single element.
            buildSet = mldata.ExampleSet()
            for j in range(k):
                if i != j:
                    for example in folds[j]:
                        buildSet.append(example)

            # Build the tree and report on the held-out fold.
            tree = dt.build_tree(buildSet, e, depth, splitCriterion)
            acc = accuracy(tree, folds[i])
            print("Fold Iteration:", i)
            print("Accuracy :", acc)
            print("Size :", tree.size)
            print("Maximum Depth:", tree.depth)
            print("First Feature:", tree.headnode.name)
            total_acc += acc
        print("Average Accuracy:", total_acc / k)
    elif validationType == 1:
        # No cross-validation: train and evaluate on the same full set.
        print(
            "------- NO Cross Validation: Running on Full Example Set --------"
        )
        tree = dt.build_tree(exampleSet, e, depth, splitCriterion)
        print("Accuracy :", accuracy(tree, exampleSet))
        print("Size :", tree.size)
        print("Maximum Depth:", tree.depth)
        print("First Feature:", tree.headnode.name)
    else:
        print("Incorrect validation type argument given.")
def naive_bayes_cv(datasets, min_and_max):
    """Evaluate the naive Bayes learner over a 5-fold split.

    For each fold ``i`` the model is fitted with
    ``Naive_Bayes.showme_dataset`` on the other four folds and scored on
    fold ``i`` with ``compute_test_results``. Scores are printed per fold.

    Args:
        datasets: Indexable collection of five folds.
        min_and_max: Passed unchanged to ``Naive_Bayes.showme_dataset``.

    Returns:
        A tuple ``(accuracies, precisions, recalls)`` of per-fold lists.
    """
    accuracies, precisions, recalls = [], [], []

    for fold_idx in range(5):
        train_data = mldata.ExampleSet()
        for offset in range(1, 5):
            for example in datasets[(fold_idx + offset) % 5]:
                train_data.append(example)
        val_data = datasets[fold_idx]
        shuffle(train_data)
        shuffle(val_data)

        # The third returned value (thresholds) is not needed for scoring.
        fitted = Naive_Bayes.showme_dataset(train_data, NUM_BINS, M,
                                            min_and_max)
        label_ratio, save_all_prob, _thresholds = fitted
        accuracy, precision, recall = compute_test_results(
            label_ratio, save_all_prob, val_data)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)

        summary = (fold_idx + 1, accuracy, precision, recall)
        print("Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n"
              % summary)

    return accuracies, precisions, recalls
def remove_near_zero_variance(data: mldata.ExampleSet,
                              cut_off: float = 0.1) -> mldata.ExampleSet:
    """Drops binary/nominal features whose encoded variance is too small.

    Each discrete feature's values are mapped to integer codes (one code per
    distinct value, in set-iteration order) and the sample variance of those
    codes is compared with ``cut_off``.

    Args:
        data: Collection of examples to pre-process.
        cut_off: Features whose variance is at or below this value are
            removed.

    Returns:
        A new ExampleSet without the near-zero-variance features, or ``data``
        itself when it has no binary/nominal features.
    """

    def discrete_var(values: Sequence) -> float:
        # Integer-encode the distinct values, then take the sample variance.
        encoding = {}
        for code, category in enumerate(set(values)):
            encoding[category] = code
        return statistics.variance(encoding[v] for v in values)

    discrete_exs = mlutil.get_feature_examples(data=data,
                                               feature_types={BINARY, NOMINAL},
                                               as_dict=True,
                                               index_as_key=True)
    if len(discrete_exs) == 0:
        return data

    near_zeros = set()
    for feature_idx, column in discrete_exs.items():
        if discrete_var(column) <= cut_off:
            near_zeros.add(feature_idx)

    # Surviving features, remembered both by position and by feature object.
    kept_indices = []
    subset_schema = []
    for position, feature in enumerate(data.schema):
        if position not in near_zeros:
            kept_indices.append(position)
            subset_schema.append(feature)

    filtered = []
    for original in data:
        example = mldata.Example(subset_schema)
        example.features = [original[position] for position in kept_indices]
        filtered.append(example)

    subset = mldata.ExampleSet()
    subset.extend(filtered)
    return subset
def _logreg_confusion_counts(predictions, actual, fold=None):
    """Count (TP, TN, FP, FN) from predicted labels ``predictions[m][1]``.

    Pairs that are not a 0/1 combination are reported with a printed message
    (the fold index when ``fold`` is given, otherwise the example index) and
    then trip the sanity assertion.
    """
    true_pos = true_neg = false_pos = false_neg = 0
    for m in range(len(predictions)):
        predicted = predictions[m][1]
        truth = actual[m]
        if predicted == 1 and truth == 1:
            true_pos += 1
        elif predicted == 0 and truth == 0:
            true_neg += 1
        elif predicted == 1 and truth == 0:
            false_pos += 1
        elif predicted == 0 and truth == 1:
            false_neg += 1
        else:
            print("YOU MESSED UP:", m if fold is None else fold)
    assert len(predictions) == (
        true_neg + true_pos + false_neg + false_pos
    ), "...OH NO, Sum of results doesn't equal num of results..."
    return true_pos, true_neg, false_pos, false_neg


def _logreg_precision_recall(true_pos, false_pos, false_neg):
    """Return (precision, recall), using 1 or 0 when a ratio is undefined.

    An undefined ratio counts as 1 when there were no positives at all
    (neither predicted nor actual) and as 0 otherwise.
    """
    nothing_positive = true_pos + false_pos + false_neg == 0
    if true_pos + false_pos > 0:
        precision = true_pos / (true_pos + false_pos)
    else:
        precision = 1 if nothing_positive else 0
    if true_pos + false_neg > 0:
        recall = true_pos / (true_pos + false_neg)
    else:
        recall = 1 if nothing_positive else 0
    return precision, recall


def _logreg_roc_area(scored, actual):
    """Print TPR/FPR at thresholds 1.0, 0.9, ..., 0.0 and return the ROC area.

    ``scored[i][0]`` is compared against each threshold. The area is the
    trapezoid sum over the sampled points; a value below 0.5 is replaced by
    ``1 - area``. Raises ZeroDivisionError when ``actual`` contains no 1s or
    no 0s, because the corresponding rate is undefined.
    """
    tpr = []
    fpr = []
    # Thresholds are derived from an integer counter so the last one is
    # exactly 0.0; repeatedly subtracting 0.1 drifts and ends near 1.4e-16.
    for tenths in range(10, -1, -1):
        threshold = tenths / 10
        tp = fp = tn = fn = 0
        for i in range(len(scored)):
            confidence = scored[i][0]
            truth = actual[i]
            if confidence >= threshold and truth == 1:
                tp += 1
            elif confidence >= threshold and truth == 0:
                fp += 1
            elif confidence < threshold and truth == 1:
                fn += 1
            elif confidence < threshold and truth == 0:
                tn += 1
            else:
                print("YOU MESSED UP:", i, scored[i], actual[i])
        assert len(scored) == (
            tn + tp + fn + fp), "...OH NO, pred doens't equal original..."
        tpr.append(tp / (tp + fn))
        fpr.append(fp / (fp + tn))
    print("TPR: ", tpr)
    print("FPR: ", fpr)

    area = 0.0
    for trap in range(len(tpr) - 1):
        width = fpr[trap + 1] - fpr[trap]
        mean_height = (tpr[trap] + tpr[trap + 1]) / 2
        area += width * mean_height
    if area < 0.5:
        print("1 - AUR used")
        area = 1.0 - area
    return area


def logreg(schema, exampleSet, validationType, constant, k=5):
    """Train/evaluate ``LogisticRegression`` and print summary metrics.

    Each prediction returned by the classifier is indexed as
    ``prediction[0]`` (score compared against ROC thresholds) and
    ``prediction[1]`` (predicted 0/1 label).

    Args:
        schema: Schema used for the training ``mldata.ExampleSet`` objects
            and for ``stratified_split_data``.
        exampleSet: The full data set.
        validationType: 0 runs k-fold stratified cross-validation; 1 trains
            and evaluates on the full set; anything else prints an error.
        constant: Second argument of ``LogisticRegression`` (presumably the
            regularisation constant - confirm against that class).
        k: Number of cross-validation folds (default 5).

    Returns:
        None. All results are printed.
    """
    if validationType == 0:
        # k-fold stratified cross-validation.
        folds = stratified_split_data(schema, exampleSet, k)
        print("-------", k, "- Fold Stratified Cross Validation --------")
        total_acc = []
        total_prec = []
        total_recal = []
        total_original_results = []
        total_predictions = []
        for i in range(k):
            # Training set: every example of every fold except fold i.
            buildSet = mldata.ExampleSet(schema)
            for j in range(k):
                if i != j:
                    for example in folds[j]:
                        buildSet.append(example)
            print("Fold Iteration:", i)
            test = utils._convert_exampleset_to_dataframe(folds[i])
            class_idx = utils._get_class_idx(test)
            classifier = LogisticRegression(buildSet, constant)
            predictions = classifier.predict(test)
            print("Calculating output of this fold.")
            original_results = [
                test.iloc[row, class_idx] for row in range(len(test))
            ]

            true_pos, true_neg, false_pos, false_neg = (
                _logreg_confusion_counts(predictions, original_results,
                                         fold=i))
            fold_acc = (true_neg + true_pos) / (true_neg + true_pos +
                                                false_neg + false_pos)
            total_acc.append(fold_acc)
            print("Error for fold: " + str(1 - fold_acc))
            precision, recall = _logreg_precision_recall(
                true_pos, false_pos, false_neg)
            total_prec.append(precision)
            total_recal.append(recall)

            # Pool the predictions of all folds for a single ROC curve.
            if i == 0:
                total_predictions = predictions
                total_original_results = original_results
            else:
                total_predictions = np.concatenate(
                    (total_predictions, predictions), axis=0)
                total_original_results = np.concatenate(
                    (total_original_results, original_results), axis=0)

        # After all folds are done.
        AUR = _logreg_roc_area(total_predictions, total_original_results)
        avg_acc = np.average(total_acc)
        avg_pre = np.average(total_prec)
        avg_rec = np.average(total_recal)
        std_acc = np.std(total_acc)
        std_pre = np.std(total_prec)
        std_rec = np.std(total_recal)
        print("===== Folds Complete =====")
        print("Average Accuracy :", round(avg_acc, 3), round(std_acc, 3))
        print("Average Precision :", round(avg_pre, 3), round(std_pre, 3))
        print("Average Recall :", round(avg_rec, 3), round(std_rec, 3))
        print("Area Under ROC :", round(AUR, 3))
    elif validationType == 1:
        # No cross-validation: train and evaluate on the same full set.
        print(
            "------- NO Cross Validation: Running on Full Example Set --------"
        )
        test = utils._convert_exampleset_to_dataframe(exampleSet)
        class_idx = utils._get_class_idx(test)
        classifier = LogisticRegression(exampleSet, constant)
        predictions = classifier.predict(test)
        print("Calculating output")
        original_results = [
            test.iloc[row, class_idx] for row in range(len(test))
        ]

        true_pos, true_neg, false_pos, false_neg = _logreg_confusion_counts(
            predictions, original_results)
        total_acc = (true_neg + true_pos) / (true_neg + true_pos + false_neg +
                                             false_pos)
        # Same undefined-ratio handling as the cross-validation branch, so a
        # run without positive predictions no longer divides by zero.
        total_prec, total_recal = _logreg_precision_recall(
            true_pos, false_pos, false_neg)

        AUR = _logreg_roc_area(predictions, original_results)
        print("===== Run Complete =====")
        print("Average Accuracy :", round(total_acc, 3))
        print("Average Precision :", round(total_prec, 3))
        print("Average Recall :", round(total_recal, 3))
        print("Area Under ROC :", round(AUR, 3))
    else:
        print("Incorrect validation type argument given.")
def _partition_data(data: mldata.ExampleSet, feature: mldata.Feature,
                    test: Callable[[Any], bool]) -> Tuple:
    """Split ``data`` in two according to ``test`` applied to one feature.

    Args:
        data: Examples to partition.
        feature: Feature whose value is tested; its position is looked up
            with ``mlutil.get_feature_index``.
        test: Predicate applied to the feature value of each example. It is
            evaluated in two separate passes over ``data``.

    Returns:
        ``(left_data, right_data)``: examples for which ``test`` is truthy,
        and examples for which it is falsy.
    """
    idx = mlutil.get_feature_index(data, feature)

    accepted = []
    for example in data:
        if test(example[idx]):
            accepted.append(example)
    left_data = mldata.ExampleSet(accepted)

    rejected = []
    for example in data:
        if not test(example[idx]):
            rejected.append(example)
    right_data = mldata.ExampleSet(rejected)

    return left_data, right_data
def _sample_dtree(self, dataset):
    """Resample ``dataset`` with replacement into a set of the same size.

    ``len(dataset)`` indices are drawn independently with
    ``random.randint(0, len(dataset) - 1)`` and the selected examples are
    appended in draw order.
    NOTE(review): if ``random`` is the standard-library module both bounds
    are inclusive; confirm which ``random`` is imported here.

    Args:
        dataset: Indexable collection of examples.

    Returns:
        A new ``mldata.ExampleSet`` containing the drawn examples.
    """
    resampled = mldata.ExampleSet()
    draws_left = len(dataset)
    while draws_left > 0:
        pick = random.randint(0, len(dataset) - 1)
        resampled.append(dataset[pick])
        draws_left -= 1
    return resampled
def cv(datasets):
    """Run 5-fold cross-validation of the ensemble selected by ``ALGORITHM``.

    For fold ``i`` the training set is the other four folds and the
    validation set is ``datasets[i]`` (the same object, not a copy). When
    ``P`` is non-zero, each training example has its boolean label flipped
    whenever ``random.random() <= P``.
    NOTE(review): unless ``ExampleSet.append`` copies, the flipped examples
    are the ones held in ``datasets`` - confirm.

    ``ALGORITHM`` selects the learner: 1 = boosted decision trees,
    2 = naive Bayes, 3 = logistic regression. Any other value records and
    prints nothing.

    Args:
        datasets: Indexable collection of five folds.

    Returns:
        A tuple ``(accuracies, precisions, recalls)`` of per-fold lists.
    """
    accuracies, precisions, recalls = [], [], []
    classifier_report = (
        "Classifier %d:\nAccuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\n")

    def score_and_record(alpha_list, label_list, val_data):
        """Score the ensemble on ``val_data`` and store the three metrics."""
        f_list = compute_f_list(alpha_list, label_list)
        accuracy, precision, recall = compute_test_results(val_data, f_list)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        return accuracy, precision, recall

    for fold_idx in range(5):
        train_data = mldata.ExampleSet()
        for offset in range(1, 5):
            for example in datasets[(fold_idx + offset) % 5]:
                train_data.append(example)
        val_data = datasets[fold_idx]
        shuffle(train_data)
        shuffle(val_data)

        # Optional label noise on the training examples.
        if P != 0:
            for example in train_data:
                if random.random() <= P:
                    if example[-1] == True:
                        example[-1] = False
                    elif example[-1] == False:
                        example[-1] = True

        if ALGORITHM == 1:
            # Uniform initial weights as an (n, 1) column.
            n_train = len(train_data)
            weight = (1 / n_train * np.ones(n_train)).reshape(-1, 1)
            alpha_list, label_list = build_tree_boosting.boosting(
                MAX_DEPTH, EPS, train_data, val_data, ENABLE_GAIN, ITER,
                weight)
            accuracy, precision, recall = score_and_record(
                alpha_list, label_list, val_data)
            ROC_area = compute_ROC_area()
            print(
                "Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nArea under ROC: %.3f\n"
                % (accuracy, precision, recall, ROC_area))
        elif ALGORITHM == 2:
            alpha_list, label_list = naive_gayes.naive_bayes(
                train_data, val_data, ITER, NUM_BINS, M)
            accuracy, precision, recall = score_and_record(
                alpha_list, label_list, val_data)
            print(classifier_report %
                  (fold_idx + 1, accuracy, precision, recall))
        elif ALGORITHM == 3:
            lg = logreg.Logistic_Regression(lambdaa=LAMBDA,
                                            training_data=train_data,
                                            iteration=100,
                                            learning_rate=LR,
                                            boosting=True)
            lg, alpha_list, label_list = update_lg(lg, val_data)
            accuracy, precision, recall = score_and_record(
                alpha_list, label_list, val_data)
            print(classifier_report %
                  (fold_idx + 1, accuracy, precision, recall))

    return accuracies, precisions, recalls