def test_partial_threshold_search(self):
    """Tiled partial symmetric threshold searches, followed by
    fill_lower_triangle, must agree with the one-shot symmetric search."""
    threshold = 0.1
    N = len(fps)
    result = search.SearchResults(N, fps.arena_ids)
    expected = [[] for _ in range(N)]
    # Cover the upper triangle with 13-row by 8-column tiles.
    for i in range(0, N, 13):
        for j in range(i, N, 8):
            search.partial_threshold_tanimoto_search_symmetric(
                result, fps, threshold, i, i + 13, j, j + 8)
            slow_threshold_search(expected, fps, threshold, i, i + 13, j, j + 8)
    _compare_search_results(self, result, expected)
    # BUGFIX: materialize the counts with list().  Under Python 3,
    # map() returns a lazy iterator, so assertNotEqual would compare two
    # distinct map objects (always unequal - a vacuous test) and
    # assertSequenceEqual below would then fail on the exhausted iterator.
    # list(map(...)) behaves identically under Python 2.
    counts_before = list(map(len, result))
    search.fill_lower_triangle(result)
    counts_after = list(map(len, result))
    self.assertNotEqual(counts_before, counts_after)
    self.assertSequenceEqual(
        counts_after, search.count_tanimoto_hits_symmetric(fps, threshold))
    normal = search.threshold_tanimoto_search_symmetric(fps, threshold)
    _compare_search_results(self, result, list(normal.iter_indices_and_scores()))
def test_symmetric(self):
    """The symmetric count must equal the full NxN count minus the self-match."""
    # query[i] always matches target[i] so x[i] will be at least one
    x = search.count_tanimoto_hits_arena(fps, fps, 0.6)
    # This only processes the upper-triangle, and not the diagonal
    y = search.count_tanimoto_hits_symmetric(fps, 0.6)
    # BUGFIX: assertEquals is a deprecated alias of assertEqual
    # (removed in Python 3.12); use the canonical spelling.
    self.assertEqual(len(x), len(y))
    # Each symmetric count omits exactly the diagonal self-match.
    for xi, yi in zip(x, y):
        self.assertEqual(xi - 1, yi)
def test_symmetric(self):
    """Symmetric counts are the full arena-vs-arena counts without the diagonal."""
    # query[i] always matches target[i] so x[i] will be at least one
    x = search.count_tanimoto_hits_arena(fps, fps, 0.6)
    # This only processes the upper-triangle, and not the diagonal
    y = search.count_tanimoto_hits_symmetric(fps, 0.6)
    # BUGFIX: assertEquals is deprecated (and removed in Python 3.12);
    # assertEqual is the supported name.
    self.assertEqual(len(x), len(y))
    # Dropping the diagonal removes exactly one hit per row.
    for full_count, sym_count in zip(x, y):
        self.assertEqual(full_count - 1, sym_count)
def test_partial_counts(self):
    """Accumulating partial symmetric counts over tiles must reproduce both
    the slow reference counts and the one-shot symmetric count."""
    threshold = 0.2
    size = len(fps)
    partial = array.array("i", [0] * size)
    reference = [0] * size
    # Walk the upper triangle in 10-row by 8-column tiles; the reference
    # implementation and the partial C implementation see identical tiles.
    row = 0
    while row < size:
        col = row
        while col < size:
            slow_counts(reference, fps, threshold, row, row + 10, col, col + 8)
            search.partial_count_tanimoto_hits_symmetric(
                partial, fps, threshold, row, row + 10, col, col + 8)
            col += 8
        row += 10
    self.assertSequenceEqual(partial, reference)
    normal = search.count_tanimoto_hits_symmetric(fps, threshold)
    self.assertSequenceEqual(normal, reference)
def test_partial_counts(self):
    """Partial symmetric count tiles must sum to the full symmetric count."""
    threshold = 0.2
    num_fps = len(fps)
    actual = array.array("i", [0] * num_fps)
    expected = [0] * num_fps
    for row_start in range(0, num_fps, 10):
        for col_start in range(row_start, num_fps, 8):
            # Same tile bounds for the reference and the tested code path.
            bounds = (row_start, row_start + 10, col_start, col_start + 8)
            slow_counts(expected, fps, threshold, *bounds)
            search.partial_count_tanimoto_hits_symmetric(
                actual, fps, threshold, *bounds)
    self.assertSequenceEqual(actual, expected)
    normal = search.count_tanimoto_hits_symmetric(fps, threshold)
    self.assertSequenceEqual(normal, expected)
def test_partial_threshold_search(self):
    """Partial symmetric searches over tiles plus fill_lower_triangle
    must match the one-shot symmetric threshold search."""
    threshold = 0.1
    N = len(fps)
    result = search.SearchResults(N, fps.arena_ids)
    expected = [[] for _ in range(N)]
    # Tile the upper triangle: 13 rows by 8 columns per call.
    for i in range(0, N, 13):
        for j in range(i, N, 8):
            search.partial_threshold_tanimoto_search_symmetric(
                result, fps, threshold, i, i + 13, j, j + 8)
            slow_threshold_search(expected, fps, threshold, i, i + 13, j, j + 8)
    _compare_search_results(self, result, expected)
    # BUGFIX: wrap map() in list().  On Python 3 map() is lazy, so the
    # assertNotEqual below would compare two iterator objects (always
    # unequal, so the test would pass without checking anything) and
    # assertSequenceEqual would choke on the spent iterator.  On Python 2
    # list(map(...)) is equivalent to the original.
    counts_before = list(map(len, result))
    search.fill_lower_triangle(result)
    counts_after = list(map(len, result))
    self.assertNotEqual(counts_before, counts_after)
    self.assertSequenceEqual(
        counts_after, search.count_tanimoto_hits_symmetric(fps, threshold))
    normal = search.threshold_tanimoto_search_symmetric(fps, threshold)
    _compare_search_results(self, result, list(normal.iter_indices_and_scores()))
def do_NxN_searches(args, k, threshold, target_filename):
    """Run an NxN (all-against-all) Tanimoto search over the fingerprints
    in `target_filename` and write the report to `args.output`.

    `k` is either an int (k-nearest search) or the string "all"
    (threshold search); `args.count` selects count-only output.
    """
    t1 = time.time()
    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        # Tag each fingerprint with its input position so the popcount
        # reordering done by load_fingerprints() can be undone later.
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    # targets.ids[j] holds the original input index of arena entry j.
    # (The original code also built an unused current_index_to_original_id
    # dict here; it has been removed as dead code.)
    original_index_to_current_index = dict(
        zip(targets.ids, xrange(len(targets))))
    t2 = time.time()

    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        # FIX: renamed from 'type', which shadowed the builtin.  Also
        # dropped the unused 'max_score' key from the format arguments.
        report_type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold)
        if args.count:
            report_type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": report_type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(
                targets, threshold, batch_size=args.batch_size)
            # Report in the original input order, not arena order.
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(
                targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(new_indices_and_scores), original_id))
                # FIX: the hit loop previously rebound 'original_id',
                # shadowing the outer loop variable; use a distinct name.
                for (new_index, score) in new_indices_and_scores:
                    hit_id = original_ids[new_index]
                    outfile.write(hit_formatter % (hit_id, score))
                outfile.write("\n")
        # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" %
                         (t2 - t1, t3 - t2, t3 - t1))
def do_NxN_searches(args, k, threshold, target_filename):
    """Run an NxN (all-against-all) Tanimoto search on `target_filename`
    and write a simsearch report to `args.output`.

    `k` is an int for k-nearest search or the string "all" for a pure
    threshold search; `args.count` selects count-only output.
    """
    t1 = time.time()
    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_index = {0:2, 1:0, 2:1}
    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        # Replace each id with its input position so the popcount
        # reordering done by load_fingerprints() can be undone later;
        # the real ids are collected into original_ids as a side effect.
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    # targets.ids[j] is the original input index of arena entry j.
    original_index_to_current_index = dict(
        zip(targets.ids, xrange(len(targets))))
    # NOTE(review): this mapping is never used below - dead code?
    current_index_to_original_id = dict(
        (i, original_ids[original_index])
        for i, original_index in enumerate(targets.ids))
    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        # NOTE(review): 'type' shadows the builtin, and 'max_score' is
        # not referenced by the format string.
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)
        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)
        write_simsearch_header(
            outfile, {
                "num_bits": targets.metadata.num_bits,
                "software": SOFTWARE,
                "type": type,
                "targets": target_filename,
                "target_sources": targets.metadata.sources
            })
        if args.count:
            # Count-only report: one "count<TAB>id" line per input record,
            # emitted in the original input order.
            counts = search.count_tanimoto_hits_symmetric(
                targets, threshold, batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(
                targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)
            # One line per query, in input order: hit count, query id,
            # then (hit id, score) pairs.
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[
                    current_index].get_ids_and_scores()
                outfile.write("%d\t%s" %
                              (len(new_indices_and_scores), original_id))
                # NOTE(review): this loop rebinds 'original_id', shadowing
                # the outer loop variable (harmless here, but fragile).
                for (new_index, score) in new_indices_and_scores:
                    original_id = original_ids[new_index]
                    outfile.write(hit_formatter % (original_id, score))
                outfile.write("\n")
        # XXX flush?
    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" %
                         (t2 - t1, t3 - t2, t3 - t1))
def test_zeros(self):
    """Symmetric counts over the all-zeros arena at two thresholds."""
    cases = (
        (0.9, [0, 0, 0, 0, 1, 1]),
        (0.001, [0, 0, 1, 2, 2, 3]),
    )
    for threshold, expected in cases:
        y = search.count_tanimoto_hits_symmetric(zeros, threshold)
        self.assertSequenceEqual(y, expected)
def test_zeros(self):
    """Spot-check symmetric hit counts on the zeros fixture."""
    # High threshold: only the identical pair matches.
    high = search.count_tanimoto_hits_symmetric(zeros, 0.9)
    self.assertSequenceEqual(high, [0, 0, 0, 0, 1, 1])
    # Near-zero threshold: most off-diagonal pairs match.
    low = search.count_tanimoto_hits_symmetric(zeros, 0.001)
    self.assertSequenceEqual(low, [0, 0, 1, 2, 2, 3])