예제 #1
0
def partial_count_tanimoto_hits_symmetric(counts,
                                          arena,
                                          threshold,
                                          query_start=0,
                                          query_end=None,
                                          target_start=0,
                                          target_end=None):
    """Compute the symmetric Tanimoto hit counts for one block of the arena

    The end rows are clipped to ``len(arena)`` (``None`` means "through
    the last row"), `counts` is checked to be long enough for both
    clipped end rows, and the work is then handed to
    ``_chemfp.count_tanimoto_hits_arena_symmetric`` with `counts` as
    its final argument.

    :param counts: the array handed to the C routine for the counts
    :type counts: an object supporting ``len()``
    :param arena: the fingerprints
    :type arena: an object with the arena attributes read below
    :param threshold: the minimum score threshold
    :param query_start: the query start row
    :param query_end: the query end row, or None to mean the last query row
    :param target_start: the target start row
    :param target_end: the target end row, or None to mean the last target row
    :raises ValueError: if `counts` is shorter than the clipped query
        or target end row
    :returns: nothing
    """
    num_rows = len(arena)

    # A missing or oversized end row means "up to the end of the arena".
    if query_end is None or query_end > num_rows:
        query_end = num_rows
    if target_end is None or target_end > num_rows:
        target_end = num_rows

    # Refuse to hand the C routine a counts array shorter than the
    # row ranges it has been asked to cover.
    available = len(counts)
    if query_end > available:
        raise ValueError("counts array is too small for the given query range")
    if target_end > available:
        raise ValueError(
            "counts array is too small for the given target range")

    _chemfp.count_tanimoto_hits_arena_symmetric(
        threshold,
        arena.num_bits,
        arena.start_padding,
        arena.end_padding,
        arena.storage_size,
        arena.arena,
        query_start,
        query_end,
        target_start,
        target_end,
        arena.popcount_indices,
        counts)
예제 #2
0
def count_tanimoto_hits_symmetric(arena, threshold=0.7, batch_size=100):
    """For each fingerprint in the `arena`, count the number of other fingerprints at least `threshold` similar to it

    A fingerprint never matches itself.

    The computation can take a long time, and Python won't check for
    a ^C while a call into the C extension is running. This can be
    irritating. Instead, only `batch_size` rows are processed per call,
    which lets Python check for a ^C between batches.

    Example::

        arena = chemfp.load_fingerprints("targets.fps")
        counts = chemfp.search.count_tanimoto_hits_symmetric(arena, threshold=0.2)
        print(counts[:10])

    The result object is implementation specific. You'll always be able to
    get its length and do an index lookup to get an integer
    count. Currently it's a ctypes array of C ints, but it could be an
    array.array or Python list in the future.

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :raises ValueError: if `batch_size` is not positive
    :returns: an array of counts
    """
    N = len(arena)

    # Validate before allocating so a bad argument fails immediately.
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    counts = (ctypes.c_int * N)()

    # A single call covering rows 0..N would spend the entire time in C,
    # which means ^C won't work until it finishes. While that is
    # theoretically slightly higher performance, the difference was not
    # measurable, and it's much better to let people interrupt the program.
    #
    # Process `batch_size` rows at a time, which lets Python handle ^C
    # between calls. Since the code processes a triangle, early on there
    # will be more time between ^C checks than later.
    #
    # A plain `while` loop is used instead of xrange()/range() so the same
    # code runs on Python 2 and Python 3 without building a list of starts.
    query_start = 0
    while query_start < N:
        query_end = min(query_start + batch_size, N)
        _chemfp.count_tanimoto_hits_arena_symmetric(
            threshold, arena.num_bits,
            arena.start_padding, arena.end_padding, arena.storage_size, arena.arena,
            query_start, query_end, 0, N,
            arena.popcount_indices,
            counts)
        query_start = query_end

    return counts
예제 #3
0
def count_tanimoto_hits_symmetric(arena, threshold=0.7, batch_size=100):
    """For each fingerprint in the `arena`, count the number of other fingerprints at least `threshold` similar to it

    A fingerprint never matches itself.

    The computation can take a long time, and Python won't check for
    a ^C while a call into the C extension is running. This can be
    irritating. Instead, only `batch_size` rows are processed per call,
    which lets Python check for a ^C between batches.

    Example::

        arena = chemfp.load_fingerprints("targets.fps")
        counts = chemfp.search.count_tanimoto_hits_symmetric(arena, threshold=0.2)
        print(counts[:10])

    The result object is implementation specific. You'll always be able to
    get its length and do an index lookup to get an integer
    count. Currently it's a ctypes array of C ints, but it could be an
    array.array or Python list in the future.

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :raises ValueError: if `batch_size` is not positive
    :returns: an array of counts
    """
    N = len(arena)

    # Validate before allocating so a bad argument fails immediately.
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    counts = (ctypes.c_int * N)()

    # Doing everything in one call (rows 0..N against 0..N) would keep the
    # interpreter inside C until the whole computation finishes, so ^C
    # would not work until then. Any performance gain from that was not
    # measurable, and being able to interrupt the program matters more.
    #
    # Process `batch_size` rows at a time, which lets Python handle ^C
    # between calls. Since the code processes a triangle, early on there
    # will be more time between ^C checks than later.
    #
    # The explicit `while` loop replaces xrange(), which does not exist on
    # Python 3; it behaves the same on Python 2 and allocates nothing.
    query_start = 0
    while query_start < N:
        query_end = min(query_start + batch_size, N)
        _chemfp.count_tanimoto_hits_arena_symmetric(
            threshold, arena.num_bits, arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena, query_start, query_end, 0, N,
            arena.popcount_indices, counts)
        query_start = query_end

    return counts
예제 #4
0
def count_tanimoto_hits_symmetric(arena, threshold):
    """Count the symmetric Tanimoto hits across the whole `arena` in one call

    Allocates a zero-initialized ctypes array of ``len(arena)`` C ints,
    passes it to ``_chemfp.count_tanimoto_hits_arena_symmetric`` with
    both the query and target ranges set to the full arena, and
    returns the array.

    :param arena: the set of fingerprints
    :param threshold: the minimum score threshold
    :returns: the ctypes array of counts
    """
    size = len(arena)
    hit_counts = (ctypes.c_int * size)()

    # Query rows and target rows both span the full arena: 0..size.
    _chemfp.count_tanimoto_hits_arena_symmetric(
        threshold,
        arena.num_bits,
        arena.start_padding,
        arena.end_padding,
        arena.storage_size,
        arena.arena,
        0, size,
        0, size,
        arena.popcount_indices,
        hit_counts)

    return hit_counts
예제 #5
0
def partial_count_tanimoto_hits_symmetric(counts,
                                          arena,
                                          threshold=0.7,
                                          query_start=0,
                                          query_end=None,
                                          target_start=0,
                                          target_end=None):
    """Compute one portion of the symmetric Tanimoto counts

    Most code should call count_tanimoto_hits_symmetric rather than
    this function!

    This function only makes sense as a building block for
    thread-pool implementations. When used that way, set the number of
    OpenMP threads to 1.

    `counts` is a contiguous array of integers. Initialize it to
    zeros and pass the same array to every successive call.

    Each call adds counts for counts[query_start:query_end]. It
    computes the part of the upper triangle which falls inside the
    rectangle query_start:query_end by target_start:target_end, and
    uses symmetry to fill in the lower half.

    This is easy to get wrong, so here is a minimal example which
    processes 10 rows at a time using up to 4 threads::

        import chemfp
        import chemfp.search
        from chemfp import futures
        import array

        chemfp.set_num_threads(1)  # Globally disable OpenMP

        arena = chemfp.load_fingerprints("targets.fps")  # Load the fingerprints
        n = len(arena)
        counts = array.array("i", [0]*n)

        with futures.ThreadPoolExecutor(max_workers=4) as executor:
            for row in range(0, n, 10):
                executor.submit(chemfp.search.partial_count_tanimoto_hits_symmetric,
                                counts, arena, threshold=0.2,
                                query_start=row, query_end=min(row+10, n))

        print(counts)

    :param counts: the accumulated Tanimoto counts
    :type counts: a contiguous block of integer
    :param arena: the fingerprints.
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param query_start: the query start row
    :type query_start: an integer
    :param query_end: the query end row
    :type query_end: an integer, or None to mean the last query row
    :param target_start: the target start row
    :type target_start: an integer
    :param target_end: the target end row
    :type target_end: an integer, or None to mean the last target row
    :raises ValueError: if `counts` is too short for the query or target range
    :returns: nothing
    """
    arena_size = len(arena)

    # None and any value past the end both mean "stop at the last row".
    query_end = (arena_size
                 if query_end is None or query_end > arena_size
                 else query_end)
    target_end = (arena_size
                  if target_end is None or target_end > arena_size
                  else target_end)

    # The counts array must cover every row index up to each end row.
    counts_size = len(counts)
    if counts_size < query_end:
        raise ValueError("counts array is too small for the given query range")
    if counts_size < target_end:
        raise ValueError(
            "counts array is too small for the given target range")

    _chemfp.count_tanimoto_hits_arena_symmetric(
        threshold,
        arena.num_bits, arena.start_padding, arena.end_padding,
        arena.storage_size, arena.arena,
        query_start, query_end,
        target_start, target_end,
        arena.popcount_indices,
        counts)
예제 #6
0
def partial_count_tanimoto_hits_symmetric(counts, arena, threshold=0.7,
                                          query_start=0, query_end=None,
                                          target_start=0, target_end=None):
    """Compute part of the symmetric Tanimoto counts

    Unless you are writing a thread-pool implementation you should
    use count_tanimoto_hits_symmetric instead of this function! If
    you are, set the number of OpenMP threads to 1.

    `counts` is a contiguous array of integers. It should start out
    as all zeros and be reused for each successive call.

    The function adds counts for counts[query_start:query_end]. The
    values come from computing the upper-triangle portion contained
    in the rectangle query_start:query_end and
    target_start:target_end, with symmetry used to fill in the lower
    half.

    Because this is rather complicated, here is the bare minimum
    needed to process 10 rows at a time using up to 4 threads::

        import chemfp
        import chemfp.search
        from chemfp import futures
        import array

        chemfp.set_num_threads(1)  # Globally disable OpenMP

        arena = chemfp.load_fingerprints("targets.fps")  # Load the fingerprints
        n = len(arena)
        counts = array.array("i", [0]*n)

        with futures.ThreadPoolExecutor(max_workers=4) as executor:
            for row in range(0, n, 10):
                executor.submit(chemfp.search.partial_count_tanimoto_hits_symmetric,
                                counts, arena, threshold=0.2,
                                query_start=row, query_end=min(row+10, n))

        print(counts)

    :param counts: the accumulated Tanimoto counts
    :type counts: a contiguous block of integer
    :param arena: the fingerprints.
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param query_start: the query start row
    :type query_start: an integer
    :param query_end: the query end row
    :type query_end: an integer, or None to mean the last query row
    :param target_start: the target start row
    :type target_start: an integer
    :param target_end: the target end row
    :type target_end: an integer, or None to mean the last target row
    :raises ValueError: if `counts` is too short for the query or target range
    :returns: nothing
    """
    last_row = len(arena)

    # Clip each end row to the arena; None stands for "through the end".
    if query_end is None or query_end > last_row:
        query_end = last_row
    if target_end is None or target_end > last_row:
        target_end = last_row

    # Both ranges index into `counts`, so it must reach each end row.
    capacity = len(counts)
    if query_end > capacity:
        raise ValueError("counts array is too small for the given query range")
    if target_end > capacity:
        raise ValueError("counts array is too small for the given target range")

    _chemfp.count_tanimoto_hits_arena_symmetric(
        threshold, arena.num_bits, arena.start_padding, arena.end_padding,
        arena.storage_size, arena.arena,
        query_start, query_end, target_start, target_end,
        arena.popcount_indices, counts)