Ejemplo n.º 1
0
def read_artist_alias(fn):
    """Build an alias -> canonical id mapping from a tab-separated file.

    Every line of ``fn`` that splits on tabs into exactly two integers is
    treated as a pair of equivalent ids and merged in a ``UnionFind``.
    Lines that do not parse that way are skipped.  Within each resulting
    set the smallest id is taken as the canonical one.

    :param fn: path of the file to read
    :returns: mapping from every non-canonical id to the smallest id of
        its set (the canonical id itself is not a key)
    :rtype: dict
    """
    aliases = UnionFind()
    with open(fn, "r") as handle:
        for row in handle:
            try:
                first, second = [int(field) for field in row.split("\t")]
            except ValueError:
                # Non-integer field or wrong number of fields: skip line.
                continue
            aliases.union(first, second)

    canonical = {}
    for group in aliases.sets():
        members = sorted(group)
        smallest = members[0]
        canonical.update((other, smallest) for other in members[1:])
    return canonical
Ejemplo n.º 2
0
 def __init__(self, signer=None, sketch_dist_fn=None, max_dist=0,
              min_support=1, sketch_operator=operator.__and__,
              sketch_bits=0):
     """Store the configuration and create empty clustering state.

     All arguments are stored unchanged on the instance under the same
     names; this method does not interpret them.

     :param signer: stored as ``self.signer``
     :param sketch_dist_fn: stored as ``self.sketch_dist_fn``
     :param max_dist: stored as ``self.max_dist``
     :param min_support: stored as ``self.min_support``
     :param sketch_operator: stored as ``self.sketch_operator``
     :param sketch_bits: stored as ``self.sketch_bits``
     """
     # Configuration, copied verbatim from the arguments.
     self.signer = signer
     self.min_support = min_support
     self.max_dist = max_dist
     self.sketch_bits = sketch_bits
     self.sketch_dist_fn = sketch_dist_fn
     self.sketch_operator = sketch_operator

     # Mutable state, empty at construction time.
     self.buckets = defaultdict(dict)
     self.union_find = UnionFind()
Ejemplo n.º 3
0
 def __init__(self, signer=None, min_support=2):
     """Store the configuration and create empty clustering state.

     :param signer: stored as ``self.signer``
     :param min_support: stored as ``self.min_support``
     """
     # Configuration, copied verbatim from the arguments.
     self.signer = signer
     self.min_support = min_support

     # Mutable state, empty at construction time.
     self.buckets = defaultdict(dict)
     self.union_find = UnionFind()
Ejemplo n.º 4
0
class Cluster(object):
    """Clusters sets with Jaccard similarity above threshold with high
    probability.

    Algorithm based on Rajaraman, "Mining of Massive Datasets":
    1. Generate set signature
    2. Use LSH to map similar signatures to same buckets
    3. Use UnionFind to merge buckets containing same values
    """
    def __init__(self, signer=None, min_support=2):
        """Create an empty clustering.

        :param signer: optional object whose ``get_signature(item)``
            returns the LSH keys of an item; when None, ``add_item``
            iterates the item itself as its keys
        :param min_support: minimum number of shared LSH keys required
            before two labels are merged
        """
        self.union_find = UnionFind()
        self.signer = signer
        # Maps LSH key -> {label: sketch} of every label seen with that key
        self.buckets = defaultdict(dict)
        self.min_support = min_support

    def _closeness_measure(self, sketch=None):
        """Return a predicate ``f(support, sketch)`` deciding on a merge.

        The predicate is true when ``support`` reaches ``min_support`` as
        it was at the time of this call.  Both ``sketch`` arguments (here
        and in the predicate) are accepted but currently unused.
        """
        min_support = self.min_support
        return lambda support, sketch: support >= min_support

    def add_item(self, item, label=None, sketch=None):
        """Add an item and merge it with labels sharing enough LSH keys.

        :param item: object to cluster; passed to the signer, or iterated
            directly as the LSH keys when no signer is configured
        :param label: identifier stored for the item (defaults to the
            item itself, which must then be hashable)
        :param sketch: optional extra info stored with the label in
            every bucket it lands in
        """
        # Set default label for this set
        if label is None:
            label = item

        # Add to union-find structure (the lookup itself registers the
        # label -- presumably; depends on UnionFind.__getitem__)
        union_find = self.union_find
        union_find.__getitem__(label)

        # Get signature vector and hash it
        keys = item \
            if self.signer is None \
            else self.signer.get_signature(item)

        # Count, per label, how many of this item's buckets it appears in
        counter = Counter()
        sketches = dict()
        for key in keys:
            bucket = self.buckets[key]
            bucket[label] = sketch
            # Pass the keys, not the dict: Counter.update() on a mapping
            # would add the stored sketches as counts.
            counter.update(bucket.keys())
            sketches.update(bucket)

        # Unite labels with enough LSH keys in common
        is_close = self._closeness_measure()
        for matched_label, support in counter.items():
            if matched_label != label and \
                    is_close(support, sketches[matched_label]):
                union_find.union(matched_label, label)

    def add_key(self, key, label=None, sketch=None):
        """Add one LSH key only (with associated info)

        The support of a match is always one here, so labels sharing the
        key are merged only when ``min_support`` is at most one.

        :param key: the single LSH key to file the label under
        :param label: identifier stored for the key (defaults to the key)
        :param sketch: optional extra info stored with the label
        """
        # Set default label for this set
        if label is None:
            label = key

        # Add to union-find structure
        union_find = self.union_find
        union_find.__getitem__(label)

        # Unite labels with same LSH keys
        bucket = self.buckets[key]
        bucket[label] = sketch

        is_close = self._closeness_measure()
        for matched_label in bucket:
            if matched_label != label:
                matched_sketch = bucket[matched_label]
                # Note: large improvement in precision when also ensuring that
                # distance > 0 below:
                # A single shared key means a support of exactly one.
                if is_close(1, matched_sketch):
                    union_find.union(matched_label, label)

    def get_clusters(self):
        """Returns a list of sets representing clusters

        :rtype: list
        """
        return self.union_find.sets()
Ejemplo n.º 5
0
 def test_simple_cluster(self):
     """Three chained unions must leave exactly one set of four items."""
     links = UnionFind()
     for left, right in ((0, 1), (2, 3), (3, 0)):
         links.union(left, right)
     expected = [[0, 1, 2, 3]]
     self.assertEqual(links.sets(), expected)