Esempio n. 1
0
def threshold_tanimoto_search_symmetric(arena, threshold=0.7, include_lower_triangle=True, batch_size=100):
    """Find the fingerprints in `arena` which are at least `threshold` similar to each other

    This is a self-search: the arena is used as both the queries and
    the targets. Only the upper triangle of the similarity matrix is
    actually computed. If `include_lower_triangle` is True then the
    upper-triangle hits are copied into the lower triangle to give the
    full set of results; if it is False the results contain only the
    upper triangle.

    A search can run for a long time, and Python does not check for a
    ^C until the underlying search call returns. To keep the search
    interruptible, the rows are processed `batch_size` at a time, which
    gives Python a chance to handle a ^C between batches.

    The hits in the returned `SearchResults` are in arbitrary order.

    Example::

        arena = chemfp.load_fingerprints("queries.fps")
        full_result = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=0.2)
        upper_triangle = chemfp.search.threshold_tanimoto_search_symmetric(
                  arena, threshold=0.2, include_lower_triangle=False)
        assert sum(map(len, full_result)) == sum(map(len, upper_triangle))*2

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param include_lower_triangle:
        if False, compute only the upper triangle, otherwise use symmetry to compute the full matrix
    :type include_lower_triangle: boolean
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :returns: a SearchResults instance
    :raises ValueError: if `batch_size` is not positive
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    num_fingerprints = len(arena)
    results = SearchResults(num_fingerprints, arena.arena_ids)

    # Nothing to search in an empty arena; hand back the empty results.
    if not num_fingerprints:
        return results

    # Search one band of rows per call. Returning to Python between
    # bands lets the interrupt handler notice a ^C, which is otherwise
    # held back until the search call finishes.
    for row_start in xrange(0, num_fingerprints, batch_size):
        row_end = row_start + batch_size
        if row_end > num_fingerprints:
            # The final band may be shorter than batch_size.
            row_end = num_fingerprints

        # Each band of query rows is compared against the full target range.
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold,
            arena.num_bits,
            arena.start_padding,
            arena.end_padding,
            arena.storage_size,
            arena.arena,
            row_start,
            row_end,
            0,
            num_fingerprints,
            arena.popcount_indices,
            results)

    # Only the upper triangle was computed; mirror it on request.
    if include_lower_triangle:
        _chemfp.fill_lower_triangle(results, num_fingerprints)

    return results
Esempio n. 2
0
def partial_threshold_tanimoto_search_symmetric(results, arena, threshold=0.7,
                                                query_start=0, query_end=None,
                                                target_start=0, target_end=None):
    """Compute a portion of the symmetric Tanimoto search results

    For most cases, use threshold_tanimoto_search_symmetric instead of this
    function!

    This function is only useful for thread-pool implementations. In
    that case, set the number of OpenMP threads to 1.

    `results` is a SearchResults instance which is at least as large
    as the arena. It should be reused for successive updates.

    The function adds hits to results[query_start:query_end] based
    on computing the upper-triangle portion contained in the rectangle
    query_start:query_end and target_start:target_end.

    It does not fill in the lower triangle. To get the full matrix,
    call `fill_lower_triangle`.

    This is fairly complicated to use. Here is a minimal example of
    how to use it correctly to process 10 rows at a time using up to
    4 threads::

        import chemfp
        import chemfp.search
        from chemfp import futures
        import array

        chemfp.set_num_threads(1)

        arena = chemfp.load_fingerprints("targets.fps")
        n = len(arena)
        results = chemfp.search.SearchResults(n, arena.ids)

        with futures.ThreadPoolExecutor(max_workers=4) as executor:
            for row in xrange(0, n, 10):
                executor.submit(chemfp.search.partial_threshold_tanimoto_search_symmetric,
                                results, arena, threshold=0.2,
                                query_start=row, query_end=min(row+10, n))

        chemfp.search.fill_lower_triangle(results)

    The hits in the `SearchResults` are in arbitrary order.

    :param results: the intermediate search results
    :type results: a SearchResults instance
    :param arena: the fingerprints.
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param query_start: the query start row
    :type query_start: an integer
    :param query_end: the query end row
    :type query_end: an integer, or None to mean the last query row
    :param target_start: the target start row
    :type target_start: an integer
    :param target_end: the target end row
    :type target_end: an integer, or None to mean the last target row
    :returns: nothing
    :raises ValueError: if `results` has fewer rows than the query or
        target end row (after clamping to the size of the arena)
    """
    # The search call below is given the arena's popcount indices, so
    # refuse an arena which does not have any.
    assert arena.popcount_indices, "arena must have popcount indices"
    N = len(arena)

    # An end row of None, or one past the end of the arena, means
    # "through the last row of the arena".
    if query_end is None or query_end > N:
        query_end = N
    if target_end is None or target_end > N:
        target_end = N

    # `results` must have a row for every row index in the requested
    # ranges. (Comparing against N here would be pointless because
    # both end rows were just clamped to N.)
    num_result_rows = len(results)
    if query_end > num_result_rows:
        raise ValueError("results is too small for the given query range")
    if target_end > num_result_rows:
        raise ValueError("results is too small for the given target range")

    # An empty arena has nothing to search.
    if N:
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold, arena.num_bits,
            arena.start_padding, arena.end_padding, arena.storage_size, arena.arena,
            query_start, query_end, target_start, target_end,
            arena.popcount_indices,
            results)