Python MinHashCluster Exemples, lsh_hdc.cluster.MinHashCluster Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_cluster.py Projet : Livefyre/lsh-hdc

    def test_cluster_threshold(self):
        """Mean gap between the split threshold and Jaccard similarity is small"""
        trials = 50
        signature_len = 15
        max_mean_error = 0.20

        error_sum = 0
        for _ in range(trials):
            # Draw two random sets and measure how similar they really are
            first = randset()
            second = randset()
            similarity = jaccard_sim(first, second)

            # Sweep thresholds 0.01, 0.06, ..., 0.96 and stop at the first
            # one for which the two sets end up in two separate clusters
            for percent in range(1, 100, 5):
                cutoff = float(percent) / 100
                band_size = get_bandwidth(signature_len, cutoff)
                band_count = int(signature_len / band_size)
                lsh = Cluster(width=band_count * band_size,
                              bandwidth=band_size)
                lsh.add_item(first)
                lsh.add_item(second)
                if len(lsh.get_clusters()) != 2:
                    continue
                error_sum += abs(similarity - cutoff)
                break
        mean_error = float(error_sum) / trials
        self.assertLessEqual(mean_error, max_mean_error, "Error too large")

Exemple #2

0

Afficher le fichier

Fichier : test_files.py Projet : simonemainardi/lsh-hdc

    def run_simulated_manually(filepath, lines_to_read=sys.maxint,
                               cluster_args=None):
        """Cluster the labelled lines of a resource file and summarise them.

        Every line is read as ``<label> <text>``. A line without a space is
        treated as a label whose text is empty.

        :param filepath: resource path, resolved through ``get_resource_name``
        :param lines_to_read: maximum number of lines taken from the file
        :param cluster_args: optional keyword arguments passed to ``Cluster``
        :returns: dict with ``stats`` (result of ``get_stats``) and
            ``uindex`` (result of ``summarize_clusters``)
        """
        with open(get_resource_name(filepath), 'r') as f:
            lines = [line.rstrip() for line in islice(f, lines_to_read)]
        if cluster_args is None:
            cluster_args = dict()
        cluster = Cluster(**cluster_args)
        shingler = Shingler(span=3)
        s = FeatureClusterSummarizer()
        for line in lines:
            # Split at the first space only: text that itself contains
            # spaces stays in one piece (a plain split(' ') followed by a
            # two-name unpack would raise ValueError), and a line with no
            # space yields an empty text.
            label, _, text = line.partition(' ')
            shingles = shingler.get_shingles(text)
            s.add_features(label, shingles)
            cluster.add_set(shingles, label)
        clusters = cluster.get_clusters()

        # Predicate handed to get_stats: a label counts as positive when it
        # contains a colon.
        is_label_positive = lambda lbl: ':' in lbl
        return dict(stats=get_stats(clusters, is_label_positive),
                    uindex=s.summarize_clusters(clusters))

Exemple #3

0

Afficher le fichier

Fichier : test_cluster.py Projet : simonemainardi/lsh-hdc

    def test_cluster_threshold(self):
        """Mean gap between the split threshold and Jaccard similarity is small"""
        trials = 50
        signature_len = 15
        max_mean_error = 0.20

        error_sum = 0
        for _ in range(trials):
            # Draw two random sets and measure how similar they really are
            first = randset()
            second = randset()
            similarity = jaccard_sim(first, second)

            # Sweep thresholds 0.01, 0.06, ..., 0.96 and stop at the first
            # one for which the two sets end up in two separate clusters
            for percent in range(1, 100, 5):
                cutoff = float(percent) / 100
                band_size = get_bandwidth(signature_len, cutoff)
                band_count = int(signature_len / band_size)
                lsh = Cluster(width=band_count * band_size,
                              bandwidth=band_size)
                lsh.add_set(first)
                lsh.add_set(second)
                if len(lsh.get_clusters()) != 2:
                    continue
                error_sum += abs(similarity - cutoff)
                break
        mean_error = float(error_sum) / trials
        self.assertLessEqual(mean_error, max_mean_error, "Error too large")

Exemple #4

0

Afficher le fichier

Fichier : test_cluster.py Projet : Livefyre/lsh-hdc

 def test_same_set(self):
     """Adding one set twice must produce exactly one cluster"""
     sample = randset()
     lsh = Cluster(width=10, bandwidth=2)
     for _ in range(2):
         lsh.add_item(sample)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 1)

Exemple #5

0

Afficher le fichier

Fichier : test_cluster.py Projet : simonemainardi/lsh-hdc

 def test_same_set(self):
     """Adding one set twice must produce exactly one cluster"""
     sample = randset()
     lsh = Cluster(width=10, bandwidth=2)
     for _ in range(2):
         lsh.add_set(sample)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 1)

Exemple #6

0

Afficher le fichier

Fichier : test_files.py Projet : escherba/lsh-hdc

 def test_names(self):
     """Names from data/perrys.csv should fall into 327 clusters.
     """
     names = set()
     with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
         for row in fhandle:
             names.add(row.rstrip())
     lsh = Cluster(width=20, bandwidth=5, seed=SEED)
     shingler = Shingler(3)
     for name in names:
         # Each name is clustered on its shingles and labelled by itself
         lsh.add_item(shingler.get_shingles(name), name)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(327, cluster_count)

Exemple #7

0

Afficher le fichier

 def test_names(self):
     """Names from data/perrys.csv should fall into 209 clusters.
     """
     names = set()
     with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
         for row in fhandle:
             names.add(row.rstrip())
     lsh = Cluster(width=20, bandwidth=5, seed=SEED)
     shingler = Shingler(3)
     for name in names:
         # Each name is clustered on its shingles and labelled by itself
         lsh.add_item(shingler.get_shingles(name), name)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 209)

Exemple #8

0

Afficher le fichier

Fichier : test_files.py Projet : escherba/lsh-hdc

 def test_bills(self):
     """Bills from data/bills100.txt should fall into 96 clusters.
     """
     with open(get_resource_name('data/bills100.txt'), 'r') as fhandle:
         records = [row.rstrip().split('|') for row in fhandle]
     lsh = Cluster(width=20, bandwidth=5, seed=SEED)
     shingler = Shingler(span=3, tokenizer=RegexTokenizer())
     for record in records:
         # Every record is expected to be a "label|text" pair
         label, text = record
         lsh.add_item(shingler.get_shingles(text), label)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(96, cluster_count)

Exemple #9

0

Afficher le fichier

 def test_bills(self):
     """Bills from data/bills100.txt should fall into 97 clusters.
     """
     with open(get_resource_name('data/bills100.txt'), 'r') as fhandle:
         records = [row.rstrip().split('|') for row in fhandle]
     lsh = Cluster(width=20, bandwidth=5, seed=SEED)
     shingler = Shingler(span=3, tokenizer=RegexTokenizer())
     for record in records:
         # Every record is expected to be a "label|text" pair
         label, text = record
         lsh.add_item(shingler.get_shingles(text), label)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 97)

Exemple #10

0

Afficher le fichier

Fichier : test_cluster.py Projet : simonemainardi/lsh-hdc

 def test_empty(self):
     """Two similar strings plus two empty strings should give two clusters"""
     lsh = Cluster(width=10, bandwidth=2)
     for item in ("abcdefg", "abcdefghi", "", ""):
         lsh.add_set(item)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 2)

Exemple #11

0

Afficher le fichier

Fichier : test_cluster.py Projet : Livefyre/lsh-hdc

 def test_empty(self):
     """Two similar strings plus two empty strings should give two clusters"""
     lsh = Cluster(width=10, bandwidth=2)
     for item in ("abcdefg", "abcdefghi", "", ""):
         lsh.add_item(item)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 2)

Exemple #12

0

Afficher le fichier

Fichier : test_cluster.py Projet : escherba/lsh-hdc

 def test_dissimilar_sets(self):
     """Two non-similar sets should not be clustered"""
     cluster = Cluster(width=10, bandwidth=2)
     cluster.add_item("yu5abcdef967")
     cluster.add_item("1234567890z")
     # The clusters are fetched once; the former debug print (Python 2-only
     # statement syntax) only added noise to the test output.
     num_clusters = len(cluster.get_clusters())
     self.assertEqual(2, num_clusters)

Exemple #13

0

Afficher le fichier

 def test_names_kmin_scheme(self):
     """Names clustered with kmin=2 and LSH scheme ``a1`` give 176 clusters.
     """
     names = set()
     with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
         for row in fhandle:
             names.add(row.rstrip())
     lsh = Cluster(width=20, bandwidth=5, kmin=2, lsh_scheme="a1",
                   seed=SEED)
     shingler = Shingler(3)
     for name in names:
         # Each name is clustered on its shingles and labelled by itself
         lsh.add_item(shingler.get_shingles(name), name)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 176)

Exemple #14

0

Afficher le fichier

Fichier : test_files.py Projet : simonemainardi/lsh-hdc

 def test_bills(self):
     """Bills should form 97 clusters with a summary index of 1.0.
     """
     with open(get_resource_name('data/bills100.txt'), 'r') as f:
         records = [row.rstrip().split('|') for row in f]
     lsh = Cluster(width=20, bandwidth=5)
     shingler = Shingler(span=3, tokenizer=RegexTokenizer())
     summarizer = FeatureClusterSummarizer()
     for record in records:
         # Every record is expected to be a "label|text" pair
         label, text = record
         features = shingler.get_shingles(text)
         summarizer.add_features(label, features)
         lsh.add_set(features, label)
     found = lsh.get_clusters()
     index = summarizer.summarize_clusters(found)
     self.assertEqual(len(found), 97)
     self.assertAlmostEqual(index, 1.0)

Exemple #15

0

Afficher le fichier

Fichier : test_files.py Projet : simonemainardi/lsh-hdc

 def test_names(self):
     """Names should form 281 clusters with the expected summary index.
     """
     names = set()
     with open(get_resource_name('data/perrys.csv'), 'r') as f:
         for row in f:
             names.add(row.rstrip())
     lsh = Cluster(width=20, bandwidth=5)
     shingler = Shingler(3)
     summarizer = FeatureClusterSummarizer()
     for name in names:
         # The name serves both as the shingled text and as its own label
         features = shingler.get_shingles(name)
         summarizer.add_features(name, features)
         lsh.add_set(features, name)
     found = lsh.get_clusters()
     index = summarizer.summarize_clusters(found)
     self.assertEqual(len(found), 281)
     self.assertAlmostEqual(index, 0.9780512134223747)

Exemple #16

0

Afficher le fichier

Fichier : test_files.py Projet : simonemainardi/lsh-hdc

 def test_names_kmin(self):
     """Names clustered with kmin=2 should form 252 clusters.
     """
     names = set()
     with open(get_resource_name('data/perrys.csv'), 'r') as f:
         for row in f:
             names.add(row.rstrip())
     lsh = Cluster(width=20, bandwidth=5, kmin=2)
     shingler = Shingler(3)
     summarizer = FeatureClusterSummarizer()
     for name in names:
         # The name serves both as the shingled text and as its own label
         features = shingler.get_shingles(name)
         summarizer.add_features(name, features)
         lsh.add_set(features, name)
     found = lsh.get_clusters()
     index = summarizer.summarize_clusters(found)
     self.assertEqual(len(found), 252)
     self.assertAlmostEqual(index, 0.9732840816954408)

Exemple #17

0

Afficher le fichier

Fichier : strings.py Projet : escherba/lsh-hdc

def get_clusters(args, data):
    """Cluster labelled texts using the settings carried by ``args``.

    :param args: object exposing the attributes ``width``, ``bandwidth``,
        ``lsh_scheme``, ``kmin``, ``hashfun``, ``shingle_span``,
        ``shingle_skip``, ``shingle_kmin`` and ``shingle_uniq``
        (presumably parsed command-line options -- confirm against caller)
    :param data: iterable of ``(label, text)`` pairs
    :returns: the result of ``Cluster.get_clusters()`` after every pair
        has been added
    """
    cluster = Cluster(width=args.width,
                      bandwidth=args.bandwidth,
                      lsh_scheme=args.lsh_scheme,
                      kmin=args.kmin,
                      hashfun=args.hashfun)
    shingler = Shingler(
        span=args.shingle_span,
        skip=args.shingle_skip,
        kmin=args.shingle_kmin,
        unique=bool(args.shingle_uniq)
    )
    # The former label -> text dictionary was filled but never read, so the
    # texts are no longer retained once they have been shingled.
    for label, text in data:
        shingles = shingler.get_shingles(text)
        cluster.add_item(shingles, label)
    return cluster.get_clusters()

Exemple #18

0

Afficher le fichier

Fichier : test_files.py Projet : escherba/lsh-hdc

    def run_simulated_manually(filepath, lines_to_read=sys.maxint,
                               cluster_args=None):
        """Cluster the labelled lines of a resource file.

        Every line is read as ``<label> <text>``. A line without a space is
        treated as a label whose text is empty.

        :param filepath: resource path, resolved through ``get_resource_name``
        :param lines_to_read: maximum number of lines taken from the file
        :param cluster_args: optional keyword arguments passed to ``Cluster``
        :returns: tuple ``(clusters, is_label_positive)`` where the second
            item is a predicate that is true for labels containing a colon
        """
        with open(get_resource_name(filepath), 'r') as fhandle:
            lines = [line.rstrip() for line in islice(fhandle, lines_to_read)]
        if cluster_args is None:
            cluster_args = dict()
        cluster = Cluster(**cluster_args)
        shingler = Shingler(span=3)
        for line in lines:
            # Split at the first space only: text that itself contains
            # spaces stays in one piece (a plain split(' ') followed by a
            # two-name unpack would raise ValueError), and a line with no
            # space yields an empty text.
            label, _, text = line.partition(' ')
            shingles = shingler.get_shingles(text)
            cluster.add_item(shingles, label)
        clusters = cluster.get_clusters()

        is_label_positive = lambda lbl: ':' in lbl
        return clusters, is_label_positive

Exemple #19

0

Afficher le fichier

    def run_simulated_manually(filepath, lines_to_read=sys.maxint,
                               cluster_args=None):
        """Cluster the labelled lines of a resource file and describe them.

        Every line is read as ``<label> <text>``. A line without a space is
        treated as a label whose text is empty.

        :param filepath: resource path, resolved through ``get_resource_name``
        :param lines_to_read: maximum number of lines taken from the file
        :param cluster_args: optional keyword arguments passed to ``Cluster``
        :returns: dict whose ``stats`` entry is the result of
            ``describe_clusters`` for the clusters found
        """
        with open(get_resource_name(filepath), 'r') as fhandle:
            lines = [line.rstrip() for line in islice(fhandle, lines_to_read)]
        if cluster_args is None:
            cluster_args = dict()
        cluster = Cluster(**cluster_args)
        shingler = Shingler(span=3)
        for line in lines:
            # Split at the first space only: text that itself contains
            # spaces stays in one piece (a plain split(' ') followed by a
            # two-name unpack would raise ValueError), and a line with no
            # space yields an empty text.
            label, _, text = line.partition(' ')
            shingles = shingler.get_shingles(text)
            cluster.add_item(shingles, label)
        clusters = cluster.get_clusters()

        # Predicate handed to describe_clusters: a label counts as positive
        # when it contains a colon.
        is_label_positive = lambda lbl: ':' in lbl
        return dict(stats=describe_clusters(clusters, is_label_positive))

Exemple #20

0

Afficher le fichier

Fichier : test_cluster.py Projet : Livefyre/lsh-hdc

 def test_dissimilar_sets(self):
     """Two non-similar sets should not be clustered"""
     cluster = Cluster(width=10, bandwidth=2)
     cluster.add_item("12yu5abcdef")
     cluster.add_item("1234567890z")
     # The clusters are fetched once; the former debug print (Python 2-only
     # statement syntax) only added noise, since the failure message below
     # already reports the count.
     num_clusters = len(cluster.get_clusters())
     self.assertEqual(num_clusters, 2,
                      "Expected 2 clusters, got {}".format(num_clusters))

Exemple #21

0

Afficher le fichier

Fichier : test_cluster.py Projet : simonemainardi/lsh-hdc

 def test_dissimilar_sets(self):
     """Two non-similar sets should not be clustered"""
     cluster = Cluster(width=10, bandwidth=2)
     cluster.add_set("12yu5abcdef")
     cluster.add_set("1234567890z")
     # The clusters are fetched once; the former debug print (Python 2-only
     # statement syntax) only added noise, since the failure message below
     # already reports the count.
     num_clusters = len(cluster.get_clusters())
     self.assertEqual(num_clusters, 2,
                      "Expected 2 clusters, got {}".format(num_clusters))

Exemple #22

0

Afficher le fichier

Fichier : seq_simulator.py Projet : Livefyre/lsh-hdc

def test_simulated(opts, data):
    """Cluster labelled texts and print a JSON report on standard output.

    :param opts: object exposing the attributes ``width``, ``bandwidth``,
        ``lsh_scheme`` and ``shingle_span``
        (presumably parsed command-line options -- confirm against caller)
    :param data: iterable of ``(label, text)`` pairs

    The printed JSON object has three keys: ``stats`` (``dict()`` of the
    ``describe_clusters`` result), ``ratios`` (its precision and recall)
    and ``ti`` (result of ``summarize_clusters``).
    """
    cluster = Cluster(width=opts.width,
                      bandwidth=opts.bandwidth,
                      lsh_scheme=opts.lsh_scheme)
    shingler = Shingler(span=opts.shingle_span)
    s = FeatureClusterSummarizer()
    # The former label -> text dictionary was filled but never read, so the
    # texts are no longer retained once they have been shingled.
    for label, text in data:
        shingles = shingler.get_shingles(text)
        s.add_features(label, shingles)
        cluster.add_item(shingles, label)
    clusters = cluster.get_clusters()

    # A label counts as positive when splitting it on ':' gives more than
    # one part, i.e. when it contains a colon.
    c = describe_clusters(clusters, lambda x: len(x.split(':')) > 1)
    ti = s.summarize_clusters(clusters)
    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(json.dumps(dict(
        stats=c.dict(),
        ratios=dict(
            precision=c.get_precision(),
            recall=c.get_recall()
        ),
        ti=ti
    )))

Exemple #23

0

Afficher le fichier

Fichier : test_cluster.py Projet : Livefyre/lsh-hdc

 def test_similar_sets(self):
     """Two strings sharing a long prefix should land in a single cluster"""
     lsh = Cluster(width=10, bandwidth=2)
     for item in ("abcdefg", "abcdefghi"):
         lsh.add_item(item)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 1)

Exemple #24

0

Afficher le fichier

Fichier : test_cluster.py Projet : simonemainardi/lsh-hdc

 def test_similar_sets(self):
     """Two strings sharing a long prefix should land in a single cluster"""
     lsh = Cluster(width=10, bandwidth=2)
     for item in ("abcdefg", "abcdefghi"):
         lsh.add_set(item)
     cluster_count = len(lsh.get_clusters())
     self.assertEqual(cluster_count, 1)