def partial_threshold_tanimoto_search_symmetric(results, arena, threshold,
                                                query_start=0, query_end=None,
                                                target_start=0, target_end=None):
    """Compute a portion of the symmetric Tanimoto search results

    Searches the rectangle query_start:query_end by
    target_start:target_end of `arena` against itself and stores the
    hits which are at least `threshold` similar in `results`.

    An end value of None, or one larger than the arena, means the end
    of the arena. Nothing is computed when the arena is empty.

    :param results: the container which receives the hits
    :type results: a SearchResults instance with at least `query_end` rows
    :param arena: the fingerprints; must have popcount indices
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :param query_start: the query start row
    :param query_end: the query end row, or None to mean the last query row
    :param target_start: the target start row
    :param target_end: the target end row, or None to mean the last target row
    :raises ValueError: if `results` has fewer rows than the query range needs
    :returns: nothing
    """
    assert arena.popcount_indices
    N = len(arena)

    # A missing or too-large end means "up to the end of the arena".
    if query_end is None or query_end > N:
        query_end = N
    if target_end is None or target_end > N:
        target_end = N

    # The previous range checks compared the already-clamped ends against
    # N, so they could never fire. What actually needs validating is that
    # `results` has a row for every query row which may receive hits.
    if len(results) < query_end:
        raise ValueError("results is too small for the given query range")

    if N:
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold, arena.num_bits,
            arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena,
            query_start, query_end, target_start, target_end,
            arena.popcount_indices,
            results)
def threshold_tanimoto_search_symmetric(arena, threshold=0.7,
                                        include_lower_triangle=True,
                                        batch_size=100):
    """Find the fingerprints in `arena` which are at least `threshold` similar to each other

    The search computes the upper-triangle similarities. When
    `include_lower_triangle` is True those hits are then copied to
    give the full set of results. When it is False only the upper
    triangle is returned.

    A search can take a long time, and Python does not check for a ^C
    until the underlying function call finishes. To keep the search
    interruptible, only `batch_size` rows are processed per call.

    The hits in the returned `SearchResults` are in arbitrary order.

    Example::

        arena = chemfp.load_fingerprints("queries.fps")
        full_result = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=0.2)
        upper_triangle = chemfp.search.threshold_tanimoto_search_symmetric(
                  arena, threshold=0.2, include_lower_triangle=False)
        assert sum(map(len, full_result)) == sum(map(len, upper_triangle))*2

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param include_lower_triangle:
        if False, compute only the upper triangle, otherwise use symmetry to compute the full matrix
    :type include_lower_triangle: boolean
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :raises ValueError: if `batch_size` is not positive
    :returns: a SearchResults instance
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    num_fingerprints = len(arena)
    results = SearchResults(num_fingerprints, arena.arena_ids)
    if not num_fingerprints:
        # Nothing to compare; hand back the empty result set.
        return results

    # One extension call per band of rows, so the interpreter gets a
    # chance to deliver a pending KeyboardInterrupt between bands.
    for first_row in xrange(0, num_fingerprints, batch_size):
        last_row = min(first_row + batch_size, num_fingerprints)
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold, arena.num_bits,
            arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena,
            first_row, last_row, 0, num_fingerprints,
            arena.popcount_indices,
            results)

    if include_lower_triangle:
        _chemfp.fill_lower_triangle(results, num_fingerprints)

    return results
def threshold_tanimoto_search_symmetric(arena, threshold, include_lower_triangle=True):
    """Search `arena` against itself for hits which are at least `threshold` similar

    The whole arena is processed in a single call to the extension
    function. When `include_lower_triangle` is true the results are
    then passed to `fill_lower_triangle`.

    :param arena: the fingerprints; must have popcount indices
    :param threshold: The minimum score threshold.
    :param include_lower_triangle: if true, also fill in the lower triangle
    :returns: a SearchResults instance
    """
    assert arena.popcount_indices

    size = len(arena)
    results = SearchResults(size, arena.ids)
    if not size:
        # An empty arena has nothing to compare.
        return results

    _chemfp.threshold_tanimoto_arena_symmetric(
        threshold, arena.num_bits,
        arena.start_padding, arena.end_padding,
        arena.storage_size, arena.arena,
        0, size, 0, size,
        arena.popcount_indices,
        results)

    if include_lower_triangle:
        _chemfp.fill_lower_triangle(results, size)

    return results
def partial_threshold_tanimoto_search_symmetric(results, arena, threshold=0.7,
                                                query_start=0, query_end=None,
                                                target_start=0, target_end=None):
    """Compute a portion of the symmetric Tanimoto search results

    For most cases, use threshold_tanimoto_search_symmetric instead of this
    function!

    This function is only useful for thread-pool implementations. In
    that case, set the number of OpenMP threads to 1.

    `results` is a SearchResults instance which is at least as large
    as the arena. It should be reused for successive updates.

    The function adds hits to results[query_start:query_end] based on
    computing the upper-triangle portion contained in the rectangle
    query_start:query_end and target_start:target_end.

    It does not fill in the lower triangle. To get the full matrix,
    call `fill_lower_triangle`.

    Here's the bare minimum example of how to use it correctly to
    process 10 rows at a time using up to 4 threads::

        import chemfp
        import chemfp.search
        from chemfp import futures

        chemfp.set_num_threads(1)

        arena = chemfp.load_fingerprints("targets.fps")
        n = len(arena)
        results = chemfp.search.SearchResults(n, arena.ids)

        with futures.ThreadPoolExecutor(max_workers=4) as executor:
            for row in xrange(0, n, 10):
                executor.submit(chemfp.search.partial_threshold_tanimoto_search_symmetric,
                                results, arena, threshold=0.2,
                                query_start=row, query_end=min(row+10, n))

        chemfp.search.fill_lower_triangle(results)

    The hits in the `SearchResults` are in arbitrary order.

    :param results: the intermediate search results
    :type results: a SearchResults instance
    :param arena: the fingerprints.
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param query_start: the query start row
    :type query_start: an integer
    :param query_end: the query end row
    :type query_end: an integer, or None to mean the last query row
    :param target_start: the target start row
    :type target_start: an integer
    :param target_end: the target end row
    :type target_end: an integer, or None to mean the last target row
    :raises ValueError: if `results` has fewer rows than the query range needs
    :returns: nothing
    """
    assert arena.popcount_indices
    N = len(arena)

    # A missing or too-large end means "up to the end of the arena".
    if query_end is None or query_end > N:
        query_end = N
    if target_end is None or target_end > N:
        target_end = N

    # The previous range checks compared the already-clamped ends against
    # N, so they could never fire. Hits are added to
    # results[query_start:query_end], so that is the range which must fit.
    if len(results) < query_end:
        raise ValueError("results is too small for the given query range")

    if N:
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold, arena.num_bits,
            arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena,
            query_start, query_end, target_start, target_end,
            arena.popcount_indices,
            results)
def partial_threshold_tanimoto_search_symmetric(results, arena, threshold=0.7,
                                                query_start=0, query_end=None,
                                                target_start=0, target_end=None):
    """Compute a portion of the symmetric Tanimoto search results

    For most cases, use threshold_tanimoto_search_symmetric instead of this
    function!

    This function is only useful for thread-pool implementations. In
    that case, set the number of OpenMP threads to 1.

    `results` is a SearchResults instance which is at least as large
    as the arena. It should be reused for successive updates.

    The function adds hits to results[query_start:query_end] based on
    computing the upper-triangle portion contained in the rectangle
    query_start:query_end and target_start:target_end.

    It does not fill in the lower triangle. To get the full matrix,
    call `fill_lower_triangle`.

    Here's the bare minimum example of how to use it correctly to
    process 10 rows at a time using up to 4 threads::

        import chemfp
        import chemfp.search
        from chemfp import futures

        chemfp.set_num_threads(1)

        arena = chemfp.load_fingerprints("targets.fps")
        n = len(arena)
        results = chemfp.search.SearchResults(n, arena.ids)

        with futures.ThreadPoolExecutor(max_workers=4) as executor:
            for row in xrange(0, n, 10):
                executor.submit(chemfp.search.partial_threshold_tanimoto_search_symmetric,
                                results, arena, threshold=0.2,
                                query_start=row, query_end=min(row+10, n))

        chemfp.search.fill_lower_triangle(results)

    The hits in the `SearchResults` are in arbitrary order.

    :param results: the intermediate search results
    :type results: a SearchResults instance
    :param arena: the fingerprints.
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param query_start: the query start row
    :type query_start: an integer
    :param query_end: the query end row
    :type query_end: an integer, or None to mean the last query row
    :param target_start: the target start row
    :type target_start: an integer
    :param target_end: the target end row
    :type target_end: an integer, or None to mean the last target row
    :raises ValueError: if `results` has fewer rows than the query range needs
    :returns: nothing
    """
    assert arena.popcount_indices
    N = len(arena)

    # Treat None and out-of-range ends alike: search up to the last row.
    if query_end is None or query_end > N:
        query_end = N
    if target_end is None or target_end > N:
        target_end = N

    # Comparing the clamped ends against N again could never fail, so the
    # old range checks were dead code. Validate the output container
    # instead: hits are added to results[query_start:query_end].
    if len(results) < query_end:
        raise ValueError("results is too small for the given query range")

    if N:
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold, arena.num_bits,
            arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena,
            query_start, query_end, target_start, target_end,
            arena.popcount_indices,
            results)