Example #1
0
def threshold_tanimoto_search_fp(query_fp, target_reader, threshold):
    """Search every block of target_reader for fingerprints similar to query_fp.

    A target is reported when it is at least `threshold` similar to the
    query fingerprint.

    Returns an FPSSearchResult built from two parallel lists: the ids of
    the matching targets and their scores, in the order the hits were
    reported by the search routine.

    Raises the exception built by _chemfp_error() when the underlying
    search call reports a non-zero error code.
    """
    hit_ids = []
    hit_scores = []
    query_size = len(query_fp)
    # Eight bits for each item of query_fp.
    query_bits = query_size * 8

    # Scratch buffer which the search routine fills with hits on each call.
    buffer_size = 1000
    cell_buffer = (TanimotoCell * buffer_size)()

    # Running line number, used only for error reporting.
    current_lineno = target_reader._first_fp_lineno

    for block in target_reader.iter_blocks():
        block_end = len(block)
        offset = 0
        finished = False
        # Search at least once per block, then keep resuming from the
        # returned offset until it reaches the end of the block.
        # (Presumably a single call stops early when the cell buffer
        # fills up -- confirm against the C implementation.)
        while not finished:
            status, offset, lines_read, hit_count = (
                _chemfp.fps_threshold_tanimoto_search(
                    query_bits, 0, 0, query_size, query_fp, 0, -1,
                    block, offset, block_end,
                    threshold, cell_buffer))

            # Advance the line counter before checking for errors so that
            # the reported line number includes the lines just processed.
            current_lineno += lines_read
            if status:
                raise _chemfp_error(status, current_lineno,
                                    target_reader._filename)

            # Only the first hit_count cells were filled in by this call.
            for hit in itertools.islice(cell_buffer, hit_count):
                # Each id is the slice of the block which the cell points to.
                hit_ids.append(block[hit.id_start:hit.id_end])
                hit_scores.append(hit.score)

            finished = (offset == block_end)

    return FPSSearchResult(hit_ids, hit_scores)
Example #2
0
def threshold_tanimoto_search_arena(query_arena, target_reader, threshold):
    """Search target_reader for fingerprints similar to each query in query_arena.

    A target is reported for a query when it is at least `threshold`
    similar to that query fingerprint.

    Returns an FPSSearchResults wrapping one FPSSearchResult per query
    fingerprint, in the same order as the fingerprints in query_arena.
    An empty query_arena gives FPSSearchResults([]).

    require_matching_sizes() is called first to check the arena against
    the reader. The exception built by _chemfp_error() is raised when the
    underlying search call reports a non-zero error code.
    """
    require_matching_sizes(query_arena, target_reader)

    if not query_arena:
        return FPSSearchResults([])

    num_queries = len(query_arena)
    per_query_results = [FPSSearchResult([], []) for _ in xrange(num_queries)]

    # Scratch buffer for hits: never fewer than 10,000 cells, and 100 cells
    # per query once the arena holds more than 100 fingerprints.
    # NOTE(review): a previous comment described 10,000 as an upper bound
    # (about 200K of memory), but max() makes it a lower bound -- confirm
    # the intended sizing.
    buffer_size = max(10000, num_queries * 100)
    cell_buffer = (TanimotoCell * buffer_size)()

    # Running line number, used only for error reporting.
    current_lineno = target_reader._first_fp_lineno

    for block in target_reader.iter_blocks():
        block_end = len(block)
        offset = 0
        finished = False
        # Search at least once per block, then keep resuming from the
        # returned offset until it reaches the end of the block.
        while not finished:
            status, offset, lines_read, hit_count = (
                _chemfp.fps_threshold_tanimoto_search(
                    query_arena.metadata.num_bits,
                    query_arena.start_padding, query_arena.end_padding,
                    query_arena.storage_size, query_arena.arena, 0, -1,
                    block, offset, block_end,
                    threshold, cell_buffer))

            # Advance the line counter before checking for errors so that
            # the reported line number includes the lines just processed.
            current_lineno += lines_read
            if status:
                raise _chemfp_error(status, current_lineno,
                                    target_reader._filename)

            # Only the first hit_count cells were filled in by this call.
            for hit in itertools.islice(cell_buffer, hit_count):
                target_id = block[hit.id_start:hit.id_end]
                # Route the hit to the result of the query it belongs to.
                query_result = per_query_results[hit.query_index]
                query_result.ids.append(target_id)
                query_result.scores.append(hit.score)

            finished = (offset == block_end)

    return FPSSearchResults(per_query_results)