from functools import reduce
from math import log

from numpy import tile


def classify0(to_test_matrix, training_matrix, labels, k):
    """kNN: return the majority label among the k training rows nearest to to_test_matrix."""
    training_count = training_matrix.shape[0]
    # Repeat the input row so it can be compared against every training row at once.
    input_matrix = tile(to_test_matrix, (training_count, 1))
    distances = distances_of_two_matrixes(input_matrix, training_matrix)
    sorted_distance_indices = distances.argsort()
    top_k_distance_indices = sorted_distance_indices[:k]
    # Tally the labels of the k nearest neighbours and pick the most frequent one.
    label_count_dict = count_by(lambda i: labels[i], top_k_distance_indices)
    sorted_label_count_pairs = sorted(label_count_dict.items(),
                                      key=lambda k_v: k_v[1], reverse=True)
    return sorted_label_count_pairs[0][0]
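
# A minimal usage sketch with hypothetical data; it assumes distances_of_two_matrixes
# computes row-wise Euclidean distances and count_by tallies items into a dict,
# as they are used above.
def example_classify0():
    from numpy import array
    training = array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    # The three nearest neighbours of [0.1, 0.1] are labelled B, B, A -> 'B'.
    return classify0(array([0.1, 0.1]), training, labels, 3)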

def scan_D(D, Ck, min_support):
    """Apriori scan: keep the candidate itemsets of Ck whose support in the
    transaction list D is at least min_support, and return them with their supports."""
    # One entry per occurrence of a candidate itemset inside a transaction row.
    matched_c_items = [frozenset(c) for row in D for c in Ck
                       if set(c).issubset(set(row))]
    item_counts = count_by(identity, matched_c_items)

    def support(item):
        return item_counts[item] / float(len(D))

    # Filter over the distinct candidates (the keys of item_counts) rather than the
    # raw occurrence list, so each surviving itemset appears exactly once in Lk.
    Lk = [item for item in item_counts if support(item) >= min_support]
    support_data = {item: support(item) for item in Lk}
    return [list(item) for item in Lk], support_data
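
# A minimal usage sketch over toy transactions (hypothetical data; identity and
# count_by are assumed to be the helpers used above).
def example_scan_D():
    D = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    C1 = [[1], [2], [3], [4], [5]]
    # [4] occurs in only one of four transactions (support 0.25) and is dropped.
    L1, support_data = scan_D(D, C1, 0.5)
    return L1, support_data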

def calc_shannon_entropy(dataset):
    """Shannon entropy of the class labels stored in the last column of dataset."""
    label_counts = count_by(lambda r: r[-1], dataset)

    def prop(k):
        return label_counts[k] / float(len(dataset))

    # H = -sum over labels k of prop(k) * log2(prop(k)), accumulated with reduce.
    return reduce(lambda r, k: r - prop(k) * log(prop(k), 2), label_counts, 0.0)
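
# A minimal usage sketch (hypothetical rows; only the last column, the class label, matters).
def example_calc_shannon_entropy():
    dataset = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no']]
    # Two 'yes' and one 'no': -(2/3)*log2(2/3) - (1/3)*log2(1/3), roughly 0.918.
    return calc_shannon_entropy(dataset)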

def majority_class(classes):
    class_counts = count_by(lambda c: c, classes)
    sorted_class_count_pairs = sorted(class_counts.items(),
                                      key=lambda k_v: k_v[1], reverse=True)
    return sorted_class_count_pairs[0][0]
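
# A minimal usage sketch: the most frequent label wins (hypothetical labels).
def example_majority_class():
    return majority_class(['yes', 'no', 'yes', 'yes', 'no'])  # -> 'yes'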

def words_to_bag_vector(all_words, test_words):
    """Bag-of-words vector: for each vocabulary word in all_words, how many times
    it occurs in test_words."""
    word_counts = count_by(lambda w: w, test_words)
    # Words absent from test_words simply count as zero.
    return [word_counts.get(word, 0) for word in all_words]
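
# A minimal usage sketch against a three-word vocabulary (hypothetical data).
def example_words_to_bag_vector():
    vocabulary = ['the', 'cat', 'dog']
    return words_to_bag_vector(vocabulary, ['the', 'dog', 'the'])  # -> [2, 0, 1]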