Python NearNeighborLookup Examples

Programming Language: Python

Namespace/Package Name: catch.utils.lsh

Method/Function: NearNeighborLookup

Examples at hotexamples.com: 2

Python NearNeighborLookup — 2 examples found. These are the top-rated real-world Python examples of catch.utils.lsh.NearNeighborLookup, extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

    def _filter(self, input):
        """Collapse near-duplicate probes using an arbitrary LSH family.

        A near neighbor lookup is built from self.lsh_family, self.k,
        self.dist_thres, self.dist_fn and self.reporting_prob. Two probes
        are treated as near-duplicates when the lookup reports one of them
        as a neighbor of the other.

        Distinct probes are visited from most to least frequent in the
        input; each visited probe that has not yet been discarded is kept,
        and every neighbor of it that is not already kept is discarded.

        Args:
            input: collection of probes to filter (may contain repeats)

        Returns:
            list holding a subset of the distinct probes in input; its
            order is that of iterating a set, so it is not meaningful
        """
        # Tally how often each distinct probe occurs
        multiplicity = defaultdict(int)
        for probe in input:
            multiplicity[probe] += 1

        # Distinct probes ranked by multiplicity, highest first; the sort
        # is stable, so ties stay in first-seen order
        ranked = sorted(multiplicity.items(), key=operator.itemgetter(1),
                        reverse=True)
        by_multiplicity = [probe for probe, _ in ranked]

        # Only distinct probes are loaded into the hash tables
        distinct = list(set(input))

        # Hash tables used to find the near neighbors of a probe
        lookup = lsh.NearNeighborLookup(self.lsh_family, self.k,
                                        self.dist_thres, self.dist_fn,
                                        self.reporting_prob)
        lookup.add(distinct)

        # Greedy pass: probes with higher multiplicity are reached first,
        # so they are the ones that survive among a group of neighbors
        kept = set()
        dropped = set()
        for probe in by_multiplicity:
            # by_multiplicity has no repeats, so probe cannot be kept yet
            assert probe not in kept

            if probe not in dropped:
                kept.add(probe)
                # Discard every neighbor that has not itself been kept
                # (the query may report probe itself, which is now kept)
                dropped.update(neighbor for neighbor in lookup.query(probe)
                               if neighbor not in kept)

        # Sanity checks: each distinct probe ended up in exactly one of
        # the two sets
        assert len(kept.union(dropped)) == len(by_multiplicity)
        assert kept.isdisjoint(dropped)

        return list(kept)

Example #2

Show file

    def test_varied_k(self):
        """Query results should be the same for each of several values of k.

        For every k, a fresh lookup is built over four sequences and then
        queried with one sequence that was added and one that was not.
        """
        seq_a = 'ATCGATATGGGCACTGCTAT'
        seq_b = str(seq_a)  # equal to seq_a
        seq_c = 'ATCGACATGGGCACTGGTAT'  # two mismatches against seq_a
        seq_d = 'AGTTGTCACCCTTGACGATA'  # expected not to neighbor seq_a
        seq_e = 'AGTTGTCACCCTTGACGATA'  # same sequence as seq_d

        # seq_a == seq_b, so this set literal has just two members
        # ({seq_a, seq_c}); seq_b and seq_c are expected to be within
        # self.dist_thres of seq_a, and seq_d is not
        neighbors_of_a = {seq_a, seq_b, seq_c}
        # seq_e is never added, yet querying it should still find seq_d
        neighbors_of_e = {seq_d}

        for k in (2, 5, 10):
            lookup = lsh.NearNeighborLookup(self.family, k,
                                            self.dist_thres, self.dist_fn,
                                            0.95)
            lookup.add([seq_a, seq_b, seq_c, seq_d])

            self.assertCountEqual(lookup.query(seq_a), neighbors_of_a)
            self.assertCountEqual(lookup.query(seq_e), neighbors_of_e)