Example #1
0
def knearest_tanimoto_search_symmetric(arena, k=3, threshold=0.7, batch_size=100):
    """Find the `k` nearest neighbors, at least `threshold` similar, of each fingerprint in `arena`

    The arena is searched against itself. The hits in the returned
    `SearchResults` are ordered by decreasing similarity score.

    A full search can take a long time, and Python does not check for
    a ^C until the underlying search call returns. To keep the search
    interruptible, only `batch_size` rows are processed per call, which
    gives Python a chance to notice a ^C between calls.

    Example::

        arena = chemfp.load_fingerprints("queries.fps")
        results = chemfp.search.knearest_tanimoto_search_symmetric(arena, k=3, threshold=0.8)
        for (query_id, hits) in zip(arena.ids, results):
            print query_id, "->", ", ".join(("%s %.2f" % hit) for hit in  hits.get_ids_and_scores())

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: positive integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :returns: a SearchResults instance
    :raises ValueError: if `batch_size` is not positive
    """
    num_fingerprints = len(arena)
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    results = SearchResults(num_fingerprints, arena.arena_ids)
    if not num_fingerprints:
        # Empty arena: there is nothing to search and nothing to finalize.
        return results

    # Each call handles the query rows [batch_start, batch_end) against
    # the whole arena. Returning to Python between calls is what lets
    # the interrupt handler see a pending ^C.
    for batch_start in xrange(0, num_fingerprints, batch_size):
        batch_end = min(batch_start + batch_size, num_fingerprints)
        _chemfp.knearest_tanimoto_arena_symmetric(
            k, threshold,
            arena.num_bits,
            arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena,
            batch_start, batch_end,
            0, num_fingerprints,
            arena.popcount_indices,
            results)

    # Finalize every row only after all of the batches have been searched.
    _chemfp.knearest_results_finalize(results, 0, num_fingerprints)
    return results
Example #2
0
def knearest_tanimoto_search_symmetric(arena,
                                       k=3,
                                       threshold=0.7,
                                       batch_size=100):
    """Search `arena` against itself for the `k`-nearest hits which are at least `threshold` similar

    The hits in the `SearchResults` are ordered by decreasing similarity score.

    Because the computation can take a long time, and Python won't
    check for a ^C until the search call finishes, the rows are handed
    to the search code `batch_size` at a time. A ^C is then checked for
    after each group of rows rather than only at the very end.

    Example::

        arena = chemfp.load_fingerprints("queries.fps")
        results = chemfp.search.knearest_tanimoto_search_symmetric(arena, k=3, threshold=0.8)
        for (query_id, hits) in zip(arena.ids, results):
            print query_id, "->", ", ".join(("%s %.2f" % hit) for hit in  hits.get_ids_and_scores())

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: positive integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :returns: a SearchResults instance
    :raises ValueError: if `batch_size` is zero or negative
    """
    num_rows = len(arena)
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    results = SearchResults(num_rows, arena.arena_ids)

    if not num_rows:
        # No fingerprints means no rows to search or finalize.
        return results

    # Walk the rows one group at a time so that control returns to
    # Python (and its interrupt handler) between groups.
    for row_start in xrange(0, num_rows, batch_size):
        row_end = row_start + batch_size
        if row_end > num_rows:
            # The last group may hold fewer than batch_size rows.
            row_end = num_rows
        _chemfp.knearest_tanimoto_arena_symmetric(
            k, threshold, arena.num_bits, arena.start_padding, arena.end_padding,
            arena.storage_size, arena.arena, row_start, row_end, 0, num_rows,
            arena.popcount_indices, results)

    _chemfp.knearest_results_finalize(results, 0, num_rows)
    return results
Example #3
0
def knearest_tanimoto_search_symmetric(arena, k, threshold):
    """Search `arena` against itself for the `k`-nearest hits at least `threshold` similar

    The whole arena is handled in a single call to the underlying
    search function, after which the results are finalized.

    :param arena: the set of fingerprints
    :param k: the number of nearest neighbors to find
    :param threshold: the minimum score threshold
    :returns: a SearchResults instance with one row per fingerprint in `arena`
    """
    num_fingerprints = len(arena)
    results = SearchResults(num_fingerprints, arena.ids)

    # Both index ranges cover the entire arena, [0, num_fingerprints).
    _chemfp.knearest_tanimoto_arena_symmetric(
        k, threshold,
        arena.num_bits,
        arena.start_padding, arena.end_padding,
        arena.storage_size, arena.arena,
        0, num_fingerprints,
        0, num_fingerprints,
        arena.popcount_indices,
        results)
    _chemfp.knearest_results_finalize(results, 0, num_fingerprints)
    return results
Example #4
0
def knearest_tanimoto_search(query_arena, target_arena, k, threshold):
    """Search `target_arena` for the `k`-nearest hits to each fingerprint in `query_arena`

    Only hits which are at least `threshold` similar are kept. The two
    arenas are first passed to `require_matching_sizes`.

    :param query_arena: the query fingerprints
    :param target_arena: the target fingerprints
    :param k: the number of nearest neighbors to find
    :param threshold: the minimum score threshold
    :returns: a SearchResults instance with one row per query fingerprint
    """
    require_matching_sizes(query_arena, target_arena)

    query_count = len(query_arena)
    results = SearchResults(query_count, target_arena.ids)

    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        # The query side of the search.
        query_arena.start_padding, query_arena.end_padding,
        query_arena.storage_size, query_arena.arena,
        query_arena.start, query_arena.end,
        # The target side of the search.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        # NOTE(review): the trailing 0 is presumably the offset of the
        # first result row to fill -- confirm against _chemfp.
        results, 0)
    _chemfp.knearest_results_finalize(results, 0, query_count)
    return results
Example #5
0
def knearest_tanimoto_search_fp(query_fp, target_arena, k, threshold):
    """Search `target_arena` for the `k`-nearest hits at least `threshold` similar to `query_fp`

    The query fingerprint is aligned to the target arena's alignment
    and storage size before the search is made.

    :param query_fp: the query fingerprint
    :param target_arena: the target arena
    :param k: the number of nearest neighbors to find
    :param threshold: the minimum score threshold
    :returns: the first (and only) row of a one-row SearchResults
    :raises ValueError: if `k` is negative
    """
    require_matching_fp_size(query_fp, target_arena)
    alignment_info = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    fp_start_padding, fp_end_padding, aligned_fp = alignment_info

    if k < 0:
        raise ValueError("k must be non-negative")

    results = SearchResults(1)
    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        # The single aligned query fingerprint, as the range [0, 1). It
        # was aligned using the target's storage size, so that size is
        # passed for the query as well.
        fp_start_padding, fp_end_padding,
        target_arena.storage_size, aligned_fp,
        0, 1,
        # The target side of the search.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        results, 0)
    _chemfp.knearest_results_finalize(results, 0, 1)

    # Only one query was made, so hand back its row instead of the container.
    return results[0]
Example #6
0
def knearest_tanimoto_search_arena(query_arena, target_arena, k=3, threshold=0.7):
    """For each fingerprint in `query_arena`, find its `k` nearest hits in `target_arena` which are at least `threshold` similar

    The hits in the `SearchResults` are ordered by decreasing similarity score.

    Example::

        queries = chemfp.load_fingerprints("queries.fps")
        targets = chemfp.load_fingerprints("targets.fps")
        results = chemfp.search.knearest_tanimoto_search_arena(queries, targets, k=3, threshold=0.5)
        for query_id, query_hits in zip(queries.ids, results):
            if len(query_hits) >= 2:
                print query_id, "->", ", ".join(query_hits.get_ids())

    :param query_arena: The query fingerprints.
    :type query_arena: a FingerprintArena
    :param target_arena: The target fingerprints.
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: positive integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResults instance
    """
    _require_matching_sizes(query_arena, target_arena)

    query_count = len(query_arena)
    # One result row per query; the hits refer to the target arena's ids.
    results = SearchResults(query_count, target_arena.arena_ids)

    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        # The query side of the search.
        query_arena.start_padding, query_arena.end_padding,
        query_arena.storage_size, query_arena.arena,
        query_arena.start, query_arena.end,
        # The target side of the search.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        results, 0)
    _chemfp.knearest_results_finalize(results, 0, query_count)
    return results
Example #7
0
def knearest_tanimoto_search_arena(query_arena,
                                   target_arena,
                                   k=3,
                                   threshold=0.7):
    """Search `target_arena` for the `k` nearest hits which are at least `threshold` similar to each of the fingerprints in `query_arena`

    The hits in the `SearchResults` are ordered by decreasing similarity score.

    Example::

        queries = chemfp.load_fingerprints("queries.fps")
        targets = chemfp.load_fingerprints("targets.fps")
        results = chemfp.search.knearest_tanimoto_search_arena(queries, targets, k=3, threshold=0.5)
        for query_id, query_hits in zip(queries.ids, results):
            if len(query_hits) >= 2:
                print query_id, "->", ", ".join(query_hits.get_ids())

    :param query_arena: The query fingerprints.
    :type query_arena: a FingerprintArena
    :param target_arena: The target fingerprints.
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: positive integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResults instance
    """
    _require_matching_sizes(query_arena, target_arena)

    n_queries = len(query_arena)
    results = SearchResults(n_queries, target_arena.arena_ids)

    # Argument layout: search parameters, then the query arena's
    # fields, then the target arena's fields, then the output.
    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits, query_arena.start_padding,
        query_arena.end_padding, query_arena.storage_size, query_arena.arena,
        query_arena.start, query_arena.end, target_arena.start_padding,
        target_arena.end_padding, target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end, target_arena.popcount_indices,
        results, 0)

    # Finalize one row for each of the queries.
    _chemfp.knearest_results_finalize(results, 0, n_queries)
    return results
Example #8
0
def knearest_tanimoto_search_fp(query_fp, target_arena, k=3, threshold=0.7):
    """Find the `k`-nearest hits in `target_arena` whose similarity to `query_fp` is at least `threshold`

    The hits in the `SearchResult` are ordered by decreasing similarity score.

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print list(chemfp.search.knearest_tanimoto_search_fp(query_fp, targets, k=3, threshold=0.0))

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: non-negative integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResult
    :raises ValueError: if `k` is negative
    """
    _require_matching_fp_size(query_fp, target_arena)
    # Align the query to the target arena's alignment and storage size.
    alignment_info = _chemfp.align_fingerprint(
        query_fp, target_arena.alignment, target_arena.storage_size)
    fp_start_padding, fp_end_padding, aligned_fp = alignment_info

    if k < 0:
        raise ValueError("k must be non-negative")

    results = SearchResults(1, target_arena.arena_ids)
    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        # The single aligned query fingerprint, as the range [0, 1).
        fp_start_padding, fp_end_padding,
        target_arena.storage_size, aligned_fp,
        0, 1,
        # The target side of the search.
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        results, 0)
    _chemfp.knearest_results_finalize(results, 0, 1)

    # Only one query was made, so return its row rather than the container.
    return results[0]
Example #9
0
def knearest_tanimoto_search_fp(query_fp, target_arena, k=3, threshold=0.7):
    """Search `target_arena` for the `k`-nearest hits which are at least `threshold` similar to the single fingerprint `query_fp`

    The hits in the `SearchResult` are ordered by decreasing similarity score.

    Example::

        query_id, query_fp = chemfp.load_fingerprints("queries.fps")[0]
        targets = chemfp.load_fingerprints("targets.fps")
        print list(chemfp.search.knearest_tanimoto_search_fp(query_fp, targets, k=3, threshold=0.0))

    :param query_fp: the query fingerprint
    :type query_fp: a byte string
    :param target_arena: the target arena
    :type target_arena: a FingerprintArena
    :param k: the number of nearest neighbors to find.
    :type k: non-negative integer
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :returns: a SearchResult
    :raises ValueError: if `k` is negative
    """
    _require_matching_fp_size(query_fp, target_arena)

    # The aligned fingerprint comes back together with the padding on
    # either side of it; the search call needs all three values.
    start_pad, end_pad, padded_fp = _chemfp.align_fingerprint(
        query_fp,
        target_arena.alignment,
        target_arena.storage_size)

    if k < 0:
        raise ValueError("k must be non-negative")

    # A one-row container, since there is exactly one query.
    single_row = SearchResults(1, target_arena.arena_ids)
    _chemfp.knearest_tanimoto_arena(
        k, threshold, target_arena.num_bits,
        start_pad, end_pad, target_arena.storage_size, padded_fp, 0, 1,
        target_arena.start_padding, target_arena.end_padding,
        target_arena.storage_size, target_arena.arena,
        target_arena.start, target_arena.end,
        target_arena.popcount_indices,
        single_row, 0)
    _chemfp.knearest_results_finalize(single_row, 0, 1)

    return single_row[0]