Example #1
0
    def _filter(self, input):
        """Filter near-duplicate probes using an arbitrary LSH family.

        This performs near neighbor lookups using self.lsh_family. It only
        calls probes near-duplicates if their distance, according to
        self.dist_fn, is within self.dist_thres.

        Probes are considered greedily in descending order of multiplicity
        (the number of times each occurs in input). A probe is kept unless
        it was reported as a near neighbor of a probe that was kept
        earlier.

        Args:
            input: iterable of probes to filter; it may contain exact
                duplicates. It is traversed exactly once, so a one-shot
                iterator is acceptable.

        Returns:
            list of the distinct probes from input that were kept, in the
            order in which they were selected (descending multiplicity,
            with ties in order of first appearance in input)
        """
        # Count the multiplicity of each distinct probe in a single pass.
        # Everything below works from these counts rather than re-reading
        # input, so an exhausted iterator cannot silently yield an empty
        # lookup structure.
        occurrences = defaultdict(int)
        for p in input:
            occurrences[p] += 1

        # Sort the distinct probes by their multiplicity (descending). The
        # sort is stable, so probes with equal multiplicity keep their
        # order of first appearance.
        input_sorted = sorted(occurrences, key=occurrences.get, reverse=True)

        # Construct a collection of hash tables for looking up
        # near neighbors of each probe; input_sorted already has exact
        # duplicates removed
        nnl = lsh.NearNeighborLookup(self.lsh_family, self.k, self.dist_thres,
            self.dist_fn, self.reporting_prob)
        nnl.add(input_sorted)

        # Iterate through all probes in order; for each p, remove others
        # that are near-duplicates (neighbors) of p. Since we iterate
        # in sorted order by multiplicity, the ones with higher
        # multiplicity appear earlier and will be included in the
        # filtered output
        to_include = set()
        to_exclude = set()
        for p in input_sorted:
            if p in to_exclude:
                # p is already being filtered out
                continue

            # Include p in the output and exclude all near-duplicates of it
            # (never excluding a probe that was already included)
            to_include.add(p)
            for near_dup in nnl.query(p):
                if near_dup not in to_include:
                    to_exclude.add(near_dup)

        # Check that every probe is either included or excluded and
        # that none are both included and excluded
        assert len(to_include | to_exclude) == len(input_sorted)
        assert len(to_include & to_exclude) == 0

        # Emit the kept probes in selection order rather than in set
        # iteration order, so that the output is reproducible across runs
        return [p for p in input_sorted if p in to_include]
Example #2
0
    def test_varied_k(self):
        """Query a NearNeighborLookup built with several values of k.

        For each k, four sequences are added and two queries are checked:
        one for an added sequence and one for a sequence that was never
        added but has the same value as an added one.
        """
        seq_a = 'ATCGATATGGGCACTGCTAT'
        seq_a_copy = str(seq_a)  # identical to seq_a
        seq_near_a = 'ATCGACATGGGCACTGGTAT'  # similar to seq_a
        seq_d = 'AGTTGTCACCCTTGACGATA'  # not similar to seq_a
        # Never added to the lookup; this literal is the same sequence
        # as seq_d
        seq_query_only = 'AGTTGTCACCCTTGACGATA'

        # seq_a_copy and seq_near_a are expected to be within
        # self.dist_thres of seq_a, so only these should be returned
        # (along with seq_a). Because seq_a == seq_a_copy, this set
        # literal collapses to two elements.
        expected_for_a = {seq_a, seq_a_copy, seq_near_a}
        # Although seq_query_only is not added, a query for it should
        # return seq_d
        expected_for_query_only = {seq_d}

        for k in (2, 5, 10):
            nnl = lsh.NearNeighborLookup(self.family, k, self.dist_thres,
                                         self.dist_fn, 0.95)
            nnl.add([seq_a, seq_a_copy, seq_near_a, seq_d])

            neighbors_of_a = nnl.query(seq_a)
            self.assertCountEqual(neighbors_of_a, expected_for_a)

            neighbors_of_query_only = nnl.query(seq_query_only)
            self.assertCountEqual(neighbors_of_query_only,
                                  expected_for_query_only)