def test_errors(self):
    """Check that align_fingerprint raises ValueError for invalid arguments."""
    # Each entry pairs the expected error-message pattern with the
    # positional arguments that should trigger it.
    bad_calls = (
        ("must be a character buffer", (1, 4, 4)),
        ("storage size is too small", ("too long", 4, 4)),
        ("storage size must be positive", ("", 1, 0)),
        ("storage size must be positive", ("X", 1, -12)),
        ("alignment must be a positive power of two", ("1234", 3, 4)),
    )
    for message, args in bad_calls:
        with self.assertRaisesRegexp(ValueError, message):
            _chemfp.align_fingerprint(*args)
def threshold_tanimoto_search_fp(query_fp, target_arena, threshold=0.7):
    """Find the fingerprints in `target_arena` that are at least `threshold` similar to `query_fp`

    The hits in the returned `SearchResult` are in arbitrary order.

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print list(chemfp.search.threshold_tanimoto_search_fp(query_fp, targets, threshold=0.15))

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResult
    """
    _require_matching_fp_size(query_fp, target_arena)

    # Re-align the query to the arena's layout so the faster algorithms can be used.
    aligned = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    start_pad, end_pad, aligned_fp = aligned

    # A single row of results, for the single query fingerprint.
    hits = SearchResults(1, target_arena.arena_ids)
    _chemfp.threshold_tanimoto_arena(
        threshold,
        target_arena.num_bits,
        # Query side. NOTE(review): the trailing "0, 1" looks like the
        # half-open range of query indices to search -- confirm.
        start_pad, end_pad, target_arena.storage_size, aligned_fp, 0, 1,
        # Target side.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        # Output. NOTE(review): the final 0 is presumably the offset of
        # the first row to fill in `hits` -- confirm.
        hits, 0)
    return hits[0]
def count_tanimoto_hits_fp(query_fp, target_arena, threshold=0.7):
    """Count the number of hits in `target_arena` at least `threshold` similar to the `query_fp`

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print chemfp.search.count_tanimoto_hits_fp(query_fp, targets, threshold=0.1)

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: an integer count
    """
    _require_matching_fp_size(query_fp, target_arena)

    # Improve the alignment so the faster algorithms can be used
    query_start_padding, query_end_padding, query_fp = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)

    # Only counts[0] is read back, and the "0, 1" arguments below appear to
    # select a single query, so one counter is enough. The previous code
    # allocated one counter per byte of the padded query fingerprint.
    counts = array.array("i", [0])
    _chemfp.count_tanimoto_arena(
        threshold, target_arena.num_bits,
        query_start_padding, query_end_padding,
        target_arena.storage_size, query_fp, 0, 1,
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        counts)
    return counts[0]
def test_identical(self):
    """An already-aligned fingerprint is returned as the same object, with no padding."""
    # A 4-byte string checked against alignment 4 and storage size 4:
    # the test expects no new string to be created.
    original = "blah"
    result = _chemfp.align_fingerprint(original, 4, 4)
    start_padding, end_padding, aligned = result
    self.assertEquals(start_padding, 0)
    self.assertEquals(end_padding, 0)
    self.assertIs(original, aligned)
def count_tanimoto_hits_fp(query_fp, target_arena, threshold=0.7):
    """Count the number of hits in `target_arena` at least `threshold` similar to the `query_fp`

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print chemfp.search.count_tanimoto_hits_fp(query_fp, targets, threshold=0.1)

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: an integer count
    """
    _require_matching_fp_size(query_fp, target_arena)

    # Improve the alignment so the faster algorithms can be used
    query_start_padding, query_end_padding, query_fp = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)

    # Only counts[0] is read back, and the "0, 1" arguments below appear to
    # select a single query, so one counter is enough. The previous code
    # allocated one counter per byte of the padded query fingerprint.
    counts = array.array("i", [0])
    _chemfp.count_tanimoto_arena(
        threshold, target_arena.num_bits,
        query_start_padding, query_end_padding,
        target_arena.storage_size, query_fp, 0, 1,
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        counts)
    return counts[0]
def test_errors(self):
    """Check that align_fingerprint raises ValueError for invalid arguments."""
    # (expected message pattern, positional arguments) for each bad call,
    # exercised in order.
    failing_cases = [
        ("must be a character buffer", (1, 4, 4)),
        ("storage size is too small", ("too long", 4, 4)),
        ("storage size must be positive", ("", 1, 0)),
        ("storage size must be positive", ("X", 1, -12)),
        ("alignment must be a positive power of two", ("1234", 3, 4)),
    ]
    for pattern, call_args in failing_cases:
        with self.assertRaisesRegexp(ValueError, pattern):
            _chemfp.align_fingerprint(*call_args)
def count_tanimoto_hits_fp(query_fp, target_arena, threshold):
    """Count the fingerprints in `target_arena` that match `query_fp` at `threshold`

    :param query_fp: the query fingerprint
    :param target_arena: the target arena to search
    :param threshold: the similarity threshold passed to the search
    :returns: the count stored for the query (element 0 of the counts array)
    """
    require_matching_fp_size(query_fp, target_arena)

    # Improve the alignment so the faster algorithms can be used
    query_start_padding, query_end_padding, query_fp = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)

    # Only counts[0] is read back, and the "0, 1" arguments below appear to
    # select a single query, so one counter is enough. The previous code
    # allocated one counter per byte of the padded query fingerprint.
    counts = array.array("i", [0])
    _chemfp.count_tanimoto_arena(
        threshold, target_arena.num_bits,
        query_start_padding, query_end_padding,
        target_arena.storage_size, query_fp, 0, 1,
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        counts)
    return counts[0]
def threshold_tanimoto_search_fp(query_fp, target_arena, threshold):
    """Search `target_arena` for fingerprints matching `query_fp` at `threshold`

    :param query_fp: the query fingerprint
    :param target_arena: the target arena to search
    :param threshold: the similarity threshold passed to the search
    :returns: the first (and only) row of the one-row SearchResults
    """
    require_matching_fp_size(query_fp, target_arena)

    # Re-align the query to the arena's layout so the faster algorithms can be used.
    aligned = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    start_pad, end_pad, aligned_fp = aligned

    # A single row of results, for the single query fingerprint.
    hits = SearchResults(1)
    _chemfp.threshold_tanimoto_arena(
        threshold,
        target_arena.num_bits,
        # Query side. NOTE(review): the trailing "0, 1" looks like the
        # half-open range of query indices to search -- confirm.
        start_pad, end_pad, target_arena.storage_size, aligned_fp, 0, 1,
        # Target side.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        # Output. NOTE(review): the final 0 is presumably the offset of
        # the first row to fill in `hits` -- confirm.
        hits, 0)
    return hits[0]
def knearest_tanimoto_search_fp(query_fp, target_arena, k, threshold):
    """Search `target_arena` for up to `k` nearest matches to `query_fp` at `threshold`

    :param query_fp: the query fingerprint
    :param target_arena: the target arena to search
    :param k: the number of nearest neighbors to find; must not be negative
    :param threshold: the similarity threshold passed to the search
    :returns: the first (and only) row of the one-row SearchResults
    :raises ValueError: if `k` is negative
    """
    require_matching_fp_size(query_fp, target_arena)

    aligned = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    start_pad, end_pad, aligned_fp = aligned

    # Kept after the alignment step so that alignment errors are still
    # reported before an invalid `k`.
    if k < 0:
        raise ValueError("k must be non-negative")

    # A single row of results, for the single query fingerprint.
    hits = SearchResults(1)
    _chemfp.knearest_tanimoto_arena(
        k, threshold,
        target_arena.num_bits,
        # Query side. NOTE(review): the trailing "0, 1" looks like the
        # half-open range of query indices to search -- confirm.
        start_pad, end_pad, target_arena.storage_size, aligned_fp, 0, 1,
        # Target side.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        # Output. NOTE(review): the final 0 is presumably the offset of
        # the first row to fill in `hits` -- confirm.
        hits, 0)

    # Post-process result rows 0..1 (i.e. the single row) before returning.
    _chemfp.knearest_results_finalize(hits, 0, 1)
    return hits[0]
def test_different_cases(self):
    """Check the layout of the string returned by align_fingerprint.

    For each (fingerprint, alignment, storage_size) case, verify that the
    fingerprint data starts on an `alignment`-byte boundary, that it is
    NUL-padded out to `storage_size` bytes, and that the start and end
    padding regions contain only NUL bytes.
    """
    cases = (
        ("1", 4, 8),
        ("12", 8, 8),
        ("123", 16, 16),
        ("abcd", 4, 8),
        ("abcd", 8, 8),
        ("abcd", 16, 16),
    )
    for query in cases:
        fp, alignment, storage_size = query
        result = _chemfp.align_fingerprint(*query)
        start_padding, end_padding, s = result

        # The fingerprint data begins `start_padding` bytes into the string.
        address = _addressof(s) + start_padding
        self.assertEqual(address % alignment, 0, (query, result))

        # Slice with an explicit end index. The negative-index forms
        # `s[a:-end_padding]` and `s[-end_padding:]` select the wrong
        # bytes when end_padding is 0, because -0 == 0.
        data_end = len(s) - end_padding
        expected = fp + "\0" * (storage_size - len(fp))
        self.assertEqual(s[start_padding:data_end], expected,
                         (query, expected, result))
        self.assertEqual(s[:start_padding], "\0" * start_padding)
        self.assertEqual(s[data_end:], "\0" * end_padding)
def knearest_tanimoto_search_fp(query_fp, target_arena, k=3, threshold=0.7):
    """Find the `k`-nearest hits in `target_arena` that are at least `threshold` similar to `query_fp`

    The hits in the `SearchResults` are ordered by decreasing similarity score.

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print list(chemfp.search.knearest_tanimoto_search_fp(query_fp, targets, k=3, threshold=0.0))

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: non-negative integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResult
    :raises ValueError: if `k` is negative
    """
    _require_matching_fp_size(query_fp, target_arena)

    aligned = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    start_pad, end_pad, aligned_fp = aligned

    # Kept after the alignment step so that alignment errors are still
    # reported before an invalid `k`.
    if k < 0:
        raise ValueError("k must be non-negative")

    # A single row of results, for the single query fingerprint.
    hits = SearchResults(1, target_arena.arena_ids)
    _chemfp.knearest_tanimoto_arena(
        k, threshold,
        target_arena.num_bits,
        # Query side. NOTE(review): the trailing "0, 1" looks like the
        # half-open range of query indices to search -- confirm.
        start_pad, end_pad, target_arena.storage_size, aligned_fp, 0, 1,
        # Target side.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        # Output. NOTE(review): the final 0 is presumably the offset of
        # the first row to fill in `hits` -- confirm.
        hits, 0)

    # Post-process result rows 0..1 (i.e. the single row) before returning.
    _chemfp.knearest_results_finalize(hits, 0, 1)
    return hits[0]
def test_different_cases(self):
    """Check the layout of the string returned by align_fingerprint.

    For each (fingerprint, alignment, storage_size) case, verify that the
    fingerprint data starts on an `alignment`-byte boundary, that it is
    NUL-padded out to `storage_size` bytes, and that the start and end
    padding regions contain only NUL bytes.
    """
    cases = (
        ("1", 4, 8),
        ("12", 8, 8),
        ("123", 16, 16),
        ("abcd", 4, 8),
        ("abcd", 8, 8),
        ("abcd", 16, 16),
    )
    for query in cases:
        fp, alignment, storage_size = query
        result = _chemfp.align_fingerprint(*query)
        start_padding, end_padding, s = result

        # The fingerprint data begins `start_padding` bytes into the string.
        address = _addressof(s) + start_padding
        self.assertEqual(address % alignment, 0, (query, result))

        # Slice with an explicit end index. The negative-index forms
        # `s[a:-end_padding]` and `s[-end_padding:]` select the wrong
        # bytes when end_padding is 0, because -0 == 0.
        data_end = len(s) - end_padding
        expected = fp + "\0" * (storage_size - len(fp))
        self.assertEqual(s[start_padding:data_end], expected,
                         (query, expected, result))
        self.assertEqual(s[:start_padding], "\0" * start_padding)
        self.assertEqual(s[data_end:], "\0" * end_padding)