def test_cluster_threshold(self):
    """Expected error for threshold to similarity should be reasonable"""
    n_tests = 50
    dim = 15
    expected_error = 0.20
    total_error = 0
    for _ in range(n_tests):
        # Draw a random pair of sets and compute their true Jaccard similarity.
        pair = (randset(), randset())
        similarity = jaccard_sim(*pair)
        # Sweep candidate thresholds (0.01, 0.06, ...) until the pair no
        # longer lands in the same cluster; accumulate the gap between that
        # threshold and the true similarity.
        for raw in range(1, 100, 5):
            candidate = raw / 100.0
            bandwidth = get_bandwidth(dim, candidate)
            num_bands = int(dim / bandwidth)
            clusterer = Cluster(width=num_bands * bandwidth,
                                bandwidth=bandwidth)
            clusterer.add_item(pair[0])
            clusterer.add_item(pair[1])
            if len(clusterer.get_clusters()) == 2:
                total_error += abs(similarity - candidate)
                break
    avg_err = total_error / float(n_tests)
    self.assertLessEqual(avg_err, expected_error, "Error too large")
def test_same_set(self):
    """A set should be clustered with itself"""
    item = randset()
    clusterer = Cluster(width=10, bandwidth=2)
    # Adding the identical set twice must yield a single cluster.
    for _ in range(2):
        clusterer.add_item(item)
    self.assertEqual(len(clusterer.get_clusters()), 1)
def test_empty(self):
    """Should place the two empty sets into a separate cluster"""
    clusterer = Cluster(width=10, bandwidth=2)
    # Two similar non-empty items plus two empty items -> two clusters.
    for item in ("abcdefg", "abcdefghi", "", ""):
        clusterer.add_item(item)
    self.assertEqual(len(clusterer.get_clusters()), 2)
def test_dissimilar_sets(self):
    """Two non-similar sets should not be clustered"""
    cluster = Cluster(width=10, bandwidth=2)
    cluster.add_item("12yu5abcdef")
    cluster.add_item("1234567890z")
    # Removed leftover debug print of the cluster contents.
    num_clusters = len(cluster.get_clusters())
    self.assertEqual(num_clusters, 2,
                     "Expected 2 clusters, got {}".format(num_clusters))
def test_bills(self):
    """Should return 97 clusters of bills. """
    # Each input line is "label|text".
    with open(get_resource_name('data/bills100.txt'), 'r') as fhandle:
        rows = [line.rstrip().split('|') for line in fhandle]
    cluster = Cluster(width=20, bandwidth=5, seed=SEED)
    shingler = Shingler(span=3, tokenizer=RegexTokenizer())
    for label, text in rows:
        cluster.add_item(shingler.get_shingles(text), label)
    self.assertEqual(len(cluster.get_clusters()), 97)
def test_names(self):
    """Should return 209 clusters of names. """
    # Docstring previously claimed 281 clusters while the assertion checks
    # 209; the docstring now matches the asserted value.
    with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
        data = set(line.rstrip() for line in fhandle)
    cluster = Cluster(width=20, bandwidth=5, seed=SEED)
    shingler = Shingler(3)
    for name in data:
        shingles = shingler.get_shingles(name)
        cluster.add_item(shingles, name)
    clusters = cluster.get_clusters()
    self.assertEqual(len(clusters), 209)
def cluster_from_mac_log(options):
    """Generate a list of clusters from a MAC log and summarize them

    Reads JSON records from ``options.file_path`` (up to ``options.head``
    lines), shingles each post, clusters them, and passes the resulting
    cluster parcels to ``print_mac_stats``.  If ``options.output_path`` is
    set, each parcel is also written to that file as JSON lines.
    """

    def output_clusters(unfiltered_sets, data):
        # Yield one parcel per cluster (largest first), optionally mirroring
        # each parcel to the output file as a JSON line.
        fh = open(options.output_path, 'w') if options.output_path else None
        try:
            for cluster_id, cluster in enumerate(sort_by_length(unfiltered_sets)):
                parcel = {
                    "cluster_id": cluster_id,
                    "length": len(cluster),
                    "posts": [data[pid] for pid in cluster]
                }
                if fh:
                    fh.write(json.dumps(parcel) + "\n")
                yield parcel
        finally:
            # Previously the handle was never closed; close it even if the
            # consumer abandons the generator early.
            if fh:
                fh.close()

    cluster_builder = Cluster(width=options.width, bandwidth=options.bandwidth)
    shingler = MACShingler(options)

    data = {}
    with open(options.file_path) as mac_log:
        for line_num, line in enumerate(islice(mac_log, 0, options.head)):
            # Progress indicator every 10000 lines unless silenced.
            if (not options.quiet) and (not line_num % 10000):
                sys.stderr.write("Processing line " + str(line_num) + "\n")
            json_obj = json.loads(line)
            post_id = mac_get_post_id(json_obj, line_num)
            # NOTE(review): other call sites in this file use
            # Cluster.add_item; confirm add_set is the intended API here.
            cluster_builder.add_set(shingler.shingles_from_mac(json_obj), post_id)
            data[post_id] = json_obj

    clusters = cluster_builder.get_clusters()
    # Fixed misspelled local variable ("transformed_clusers").
    transformed_clusters = output_clusters(clusters, data)
    print_mac_stats(transformed_clusters, options=options)
def test_names_kmin_scheme(self):
    """Should return 176 clusters of names. """
    # Docstring previously claimed 145 clusters while the assertion checks
    # 176; the docstring now matches the asserted value.  Removed a
    # commented-out debug loop that printed each cluster.
    with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
        data = set(line.rstrip() for line in fhandle)
    cluster = Cluster(width=20, bandwidth=5, kmin=2, lsh_scheme="a1",
                      seed=SEED)
    shingler = Shingler(3)
    for name in data:
        shingles = shingler.get_shingles(name)
        cluster.add_item(shingles, name)
    clusters = cluster.get_clusters()
    self.assertEqual(len(clusters), 176)
def run_simulated_manually(filepath, lines_to_read=sys.maxint, cluster_args=None):
    """Cluster labeled lines from a resource file and describe the result.

    Each line is "label text" (space-separated); a label containing ':' is
    treated as a positive. Returns a dict with a 'stats' entry produced by
    describe_clusters.
    """
    if cluster_args is None:
        cluster_args = {}
    with open(get_resource_name(filepath), 'r') as fhandle:
        rows = [line.rstrip().split(' ')
                for line in islice(fhandle, lines_to_read)]

    cluster = Cluster(**cluster_args)
    shingler = Shingler(span=3)
    content_dict = {}
    for fields in rows:
        # Lines without a text portion get an empty string.
        if len(fields) > 1:
            label, text = fields
        else:
            label, text = fields[0], ''
        content_dict[label] = text
        cluster.add_item(shingler.get_shingles(text), label)

    def is_label_positive(lbl):
        return ':' in lbl

    return dict(stats=describe_clusters(cluster.get_clusters(),
                                        is_label_positive))
def test_similar_sets(self):
    """Two similar sets should be clustered"""
    clusterer = Cluster(width=10, bandwidth=2)
    # Two strings sharing a long common prefix should fall into one cluster.
    for item in ("abcdefg", "abcdefghi"):
        clusterer.add_item(item)
    self.assertEqual(len(clusterer.get_clusters()), 1)