def knearest_tanimoto_search_symmetric(arena, k=3, threshold=0.7, batch_size=100):
    """Search for the `k`-nearest hits in the `arena` at least `threshold` similar to the fingerprints in the arena

    The computation can take a long time. Python won't check for a ^C
    until the function finishes. This can be irritating. Instead,
    process only `batch_size` rows at a time before checking for a ^C.

    The hits in the `SearchResults` are ordered by decreasing similarity score.

    Example::

        arena = chemfp.load_fingerprints("queries.fps")
        results = chemfp.search.knearest_tanimoto_search_symmetric(arena, k=3, threshold=0.8)
        for (query_id, hits) in zip(arena.ids, results):
            print query_id, "->", ", ".join(("%s %.2f" % hit) for hit in hits.get_ids_and_scores())

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: non-negative integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :returns: a SearchResults instance
    :raises ValueError: if `batch_size` is not positive or `k` is negative
    """
    N = len(arena)

    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    # Reject a negative k here, matching knearest_tanimoto_search_fp, instead
    # of handing it to the C extension.
    if k < 0:
        raise ValueError("k must be non-negative")

    results = SearchResults(N, arena.arena_ids)

    if N:
        # Break it up into batch_size groups in order to let Python's
        # interrupt handler check for a ^C, which is otherwise
        # suppressed until the function finishes.
        for query_start in xrange(0, N, batch_size):
            query_end = min(query_start + batch_size, N)
            _chemfp.knearest_tanimoto_arena_symmetric(
                k, threshold, arena.num_bits,
                arena.start_padding, arena.end_padding, arena.storage_size, arena.arena,
                query_start, query_end, 0, N,
                arena.popcount_indices,
                results)
        # Finalize once, after every batch has contributed its hits.
        _chemfp.knearest_results_finalize(results, 0, N)

    return results
def knearest_tanimoto_search_symmetric(arena, k, threshold):
    """Run a k-nearest Tanimoto search of `arena` against itself in one extension call

    The whole row range ``0 .. len(arena)`` is handed to
    ``_chemfp.knearest_tanimoto_arena_symmetric`` at once, after which the
    results are passed through ``_chemfp.knearest_results_finalize``.

    NOTE(review): a function with this same name is defined earlier in this
    file. Because this definition comes later, it is the one the name ends
    up bound to -- confirm that is intended.

    :param arena: the set of fingerprints
    :param k: forwarded to the extension as the number of neighbors
    :param threshold: forwarded to the extension as the minimum score
    :returns: the SearchResults created for `arena`, sized ``len(arena)``
    """
    size = len(arena)
    hits = SearchResults(size, arena.ids)

    _chemfp.knearest_tanimoto_arena_symmetric(
        k, threshold, arena.num_bits,
        arena.start_padding, arena.end_padding, arena.storage_size, arena.arena,
        0, size,    # query rows
        0, size,    # target rows
        arena.popcount_indices,
        hits)
    _chemfp.knearest_results_finalize(hits, 0, size)

    return hits
def knearest_tanimoto_search(query_arena, target_arena, k, threshold):
    """Run a k-nearest Tanimoto search of every query fingerprint against `target_arena`

    The two arenas are first checked with ``require_matching_sizes``; the
    search itself is done by ``_chemfp.knearest_tanimoto_arena`` and the
    results are then passed through ``_chemfp.knearest_results_finalize``.

    :param query_arena: the query fingerprints
    :param target_arena: the target fingerprints
    :param k: forwarded to the extension as the number of neighbors
    :param threshold: forwarded to the extension as the minimum score
    :returns: the SearchResults created for the queries, sized
        ``len(query_arena)`` and built with ``target_arena.ids``
    """
    require_matching_sizes(query_arena, target_arena)

    query_count = len(query_arena)
    hits = SearchResults(query_count, target_arena.ids)

    # Collect the positional arguments in the order the extension expects them.
    search_options = (k, threshold, target_arena.num_bits)
    query_layout = (
        query_arena.start_padding, query_arena.end_padding,
        query_arena.storage_size, query_arena.arena,
        query_arena.start, query_arena.end,
    )
    target_layout = (
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
    )
    trailing = (target_arena.popcount_indices, hits, 0)

    _chemfp.knearest_tanimoto_arena(
        *(search_options + query_layout + target_layout + trailing))
    _chemfp.knearest_results_finalize(hits, 0, query_count)

    return hits
def knearest_tanimoto_search_fp(query_fp, target_arena, k, threshold):
    """Run a k-nearest Tanimoto search of a single fingerprint against `target_arena`

    The query is checked with ``require_matching_fp_size`` and then passed
    through ``_chemfp.align_fingerprint`` using the target arena's alignment
    and storage size before the search is run.

    NOTE(review): a function with this same name is defined later in this
    file and replaces this one once the module has loaded -- confirm which
    of the two is meant to be kept.

    :param query_fp: the query fingerprint
    :param target_arena: the target fingerprints
    :param k: the number of nearest neighbors to find; must not be negative
    :param threshold: forwarded to the extension as the minimum score
    :returns: the single entry (index 0) of the SearchResults
    :raises ValueError: if `k` is negative
    """
    require_matching_fp_size(query_fp, target_arena)
    query_start_padding, query_end_padding, query_fp = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)

    if k < 0:
        raise ValueError("k must be non-negative")

    # Give the results the target ids, as the arena-vs-arena search does;
    # the original built SearchResults(1) with no ids at all.
    results = SearchResults(1, target_arena.ids)
    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        query_start_padding, query_end_padding, target_arena.storage_size, query_fp, 0, 1,
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena, target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        results, 0)
    _chemfp.knearest_results_finalize(results, 0, 1)

    # Only one query was searched, so hand back its lone result row.
    return results[0]
def knearest_tanimoto_search_arena(query_arena, target_arena, k=3, threshold=0.7):
    """Search for the `k` nearest hits in the `target_arena` at least `threshold` similar to the fingerprints in `query_arena`

    The hits in the `SearchResults` are ordered by decreasing similarity score.

    Example::

        queries = chemfp.load_fingerprints("queries.fps")
        targets = chemfp.load_fingerprints("targets.fps")
        results = chemfp.search.knearest_tanimoto_search_arena(queries, targets, k=3, threshold=0.5)
        for query_id, query_hits in zip(queries.ids, results):
            if len(query_hits) >= 2:
                print query_id, "->", ", ".join(query_hits.get_ids())

    :param query_arena: The query fingerprints.
    :type query_arena: a FingerprintArena
    :param target_arena: The target fingerprints.
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: non-negative integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResults instance
    :raises ValueError: if `k` is negative
    """
    # Reject a negative k here, matching knearest_tanimoto_search_fp, instead
    # of handing it to the C extension.
    if k < 0:
        raise ValueError("k must be non-negative")

    _require_matching_sizes(query_arena, target_arena)

    num_queries = len(query_arena)
    results = SearchResults(num_queries, target_arena.arena_ids)

    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        query_arena.start_padding, query_arena.end_padding,
        query_arena.storage_size, query_arena.arena, query_arena.start, query_arena.end,
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena, target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        results, 0)
    _chemfp.knearest_results_finalize(results, 0, num_queries)

    return results
def knearest_tanimoto_search_fp(query_fp, target_arena, k=3, threshold=0.7):
    """Find the `k` nearest hits in `target_arena` that are at least `threshold` similar to `query_fp`

    The hits in the returned result are ordered by decreasing similarity score.

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print list(chemfp.search.knearest_tanimoto_search_fp(query_fp, targets, k=3, threshold=0.0))

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: non-negative integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResult
    :raises ValueError: if `k` is negative
    """
    _require_matching_fp_size(query_fp, target_arena)

    # Re-pack the query using the target arena's alignment and storage size.
    aligned = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    start_pad, end_pad, aligned_fp = aligned

    if k < 0:
        raise ValueError("k must be non-negative")

    single = SearchResults(1, target_arena.arena_ids)
    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        # the query side: one fingerprint, occupying row range 0..1
        start_pad, end_pad, target_arena.storage_size, aligned_fp, 0, 1,
        # the target side
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        single, 0)
    _chemfp.knearest_results_finalize(single, 0, 1)

    # There was only one query, so return its row rather than the container.
    return single[0]