def _filter(self, input):
    """Filter with an arbitrary LSH family.

    This performs near neighbor lookups using self.lsh_family. It only
    calls probes near-duplicates if their distance, according to
    self.dist_fn, is within self.dist_thres.

    Probes are visited in descending order of multiplicity (the number
    of times each occurs in input). A visited probe that has not
    already been excluded is kept, and every neighbor the lookup
    reports for it is excluded unless that neighbor was already kept.

    Args:
        input: iterable of hashable probes to filter; it may contain
            exact duplicates and is traversed only once, so a one-shot
            iterator is acceptable

    Returns:
        list of the kept probes (a duplicate-free subset of input),
        ordered by descending multiplicity
    """
    # Count the multiplicity of each probe in a single pass. The keys
    # of the dict are then exactly the distinct probes, in first-seen
    # order (dicts preserve insertion order on Python 3.7+).
    occurrences = defaultdict(int)
    for p in input:
        occurrences[p] += 1
    unique_probes = list(occurrences)

    # Sort the probes by their multiplicity (descending); the sort is
    # stable, so probes with equal multiplicity keep first-seen order
    input_sorted = sorted(unique_probes, key=occurrences.get,
                          reverse=True)

    # Construct a collection of hash tables for looking up
    # near neighbors of each probe
    nnl = lsh.NearNeighborLookup(self.lsh_family, self.k,
                                 self.dist_thres, self.dist_fn,
                                 self.reporting_prob)
    nnl.add(unique_probes)

    # Iterate through all probes in order; for each p, remove others
    # that are near-duplicates (neighbors) of p. Since we iterate
    # in sorted order by multiplicity, the ones that hit more targets
    # should appear earlier and will be included in the filtered output
    kept = []  # same members as to_include, in the order they were kept
    to_include = set()
    to_exclude = set()
    for p in input_sorted:
        # p should not have already been included because input_sorted
        # should not contain duplicates
        assert p not in to_include

        if p in to_exclude:
            # p is already being filtered out
            continue

        # Include p in the output and exclude all near-duplicates of it
        to_include.add(p)
        kept.append(p)
        for near_dup in nnl.query(p):
            if near_dup not in to_include:
                to_exclude.add(near_dup)

    # Check that every probe is either included or excluded and
    # that none are both included and excluded
    assert len(to_include | to_exclude) == len(input_sorted)
    assert len(to_include & to_exclude) == 0

    # Return the ordered list rather than list(to_include) so that the
    # output order does not depend on set iteration order
    return kept
def test_varied_k(self):
    """Check near neighbor queries for several values of k.

    For each k a fresh lsh.NearNeighborLookup is built from
    self.family, self.dist_thres and self.dist_fn with a reporting
    probability of 0.95. It is then queried with a sequence that was
    added and with one that was only passed to the query.
    """
    base = 'ATCGATATGGGCACTGCTAT'
    base_copy = str(base)  # identical to base
    base_near = 'ATCGACATGGGCACTGGTAT'  # similar to base
    other = 'AGTTGTCACCCTTGACGATA'  # not similar to base
    # Used only as a query; it is never added to the lookup.
    # NOTE(review): originally described as "similar to" other, but as
    # written it is the same sequence as other -- confirm whether a
    # mismatching base was intended here.
    other_query = 'AGTTGTCACCCTTGACGATA'

    # base_copy and base_near are within self.dist_thres of base, so
    # only these should be returned (along with base). Because
    # base == base_copy, this set literal has just two members.
    expected_for_base = {base, base_copy, base_near}
    expected_for_other_query = {other}

    for k in (2, 5, 10):
        nnl = lsh.NearNeighborLookup(
            self.family, k, self.dist_thres, self.dist_fn, 0.95)
        nnl.add([base, base_copy, base_near, other])

        # Querying a sequence that was added should report it together
        # with its near neighbors, and nothing else
        neighbors_of_base = nnl.query(base)
        self.assertCountEqual(neighbors_of_base, expected_for_base)

        # Although other_query was not added, a query for it should
        # return other
        neighbors_of_other_query = nnl.query(other_query)
        self.assertCountEqual(neighbors_of_other_query,
                              expected_for_other_query)