Example #1
0
    def test_partial_threshold_search(self):
        """Build symmetric threshold-search results piecewise and verify them.

        The search is run in 13 x 8 index blocks through
        ``search.partial_threshold_tanimoto_search_symmetric`` while
        ``slow_threshold_search`` fills ``expected`` for the same blocks.
        The accumulated result is then compared with ``expected``, the
        lower triangle is filled in, and the final result is checked
        against the hit counts and the one-shot symmetric search.
        """
        threshold = 0.1
        N = len(fps)
        result = search.SearchResults(N, fps.arena_ids)
        expected = [[] for _ in range(N)]

        for i in range(0, N, 13):
            for j in range(i, N, 8):
                search.partial_threshold_tanimoto_search_symmetric(
                    result, fps, threshold, i, i + 13, j, j + 8)
                slow_threshold_search(expected, fps, threshold, i, i + 13, j,
                                      j + 8)

        _compare_search_results(self, result, expected)

        # Materialize the counts as real lists. A lazy map() object (what
        # Python 3 returns) would make assertNotEqual pass trivially and
        # would not be accepted by assertSequenceEqual.
        counts_before = [len(hits) for hits in result]
        search.fill_lower_triangle(result)
        counts_after = [len(hits) for hits in result]

        # Filling the lower triangle must have added hits somewhere.
        self.assertNotEqual(counts_before, counts_after)
        self.assertSequenceEqual(
            counts_after, search.count_tanimoto_hits_symmetric(fps, threshold))

        normal = search.threshold_tanimoto_search_symmetric(fps, threshold)
        _compare_search_results(self, result,
                                list(normal.iter_indices_and_scores()))
    def test_symmetric(self):
        """Compare the symmetric hit counts with the arena-vs-itself counts.

        Every count from ``count_tanimoto_hits_arena(fps, fps, ...)`` must
        be exactly one larger than the matching count from
        ``count_tanimoto_hits_symmetric``.
        """
        # query[i] always matches target[i] so each arena count is at least one
        arena_counts = search.count_tanimoto_hits_arena(fps, fps, 0.6)

        # This only processes the upper-triangle, and not the diagonal
        symmetric_counts = search.count_tanimoto_hits_symmetric(fps, 0.6)

        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(len(arena_counts), len(symmetric_counts))
        for arena_count, symmetric_count in zip(arena_counts, symmetric_counts):
            self.assertEqual(arena_count - 1, symmetric_count)
Example #3
0
    def test_symmetric(self):
        """Compare the symmetric hit counts with the arena-vs-itself counts.

        Every count from ``count_tanimoto_hits_arena(fps, fps, ...)`` must
        be exactly one larger than the matching count from
        ``count_tanimoto_hits_symmetric``.
        """
        # query[i] always matches target[i] so each arena count is at least one
        arena_counts = search.count_tanimoto_hits_arena(fps, fps, 0.6)

        # This only processes the upper-triangle, and not the diagonal
        symmetric_counts = search.count_tanimoto_hits_symmetric(fps, 0.6)

        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(len(arena_counts), len(symmetric_counts))
        for arena_count, symmetric_count in zip(arena_counts, symmetric_counts):
            self.assertEqual(arena_count - 1, symmetric_count)
    def test_partial_counts(self):
        """Accumulate symmetric hit counts block by block and verify them.

        For every 10 x 8 index block, ``slow_counts`` updates ``expected``
        and ``search.partial_count_tanimoto_hits_symmetric`` updates
        ``counts``; the two must agree after each block. At the end the
        one-shot symmetric count must match ``expected`` as well.
        """
        threshold = 0.2
        num_fps = len(fps)
        expected = [0] * num_fps
        counts = array.array("i", [0] * num_fps)

        for outer_start in range(0, num_fps, 10):
            outer_end = outer_start + 10
            for inner_start in range(outer_start, num_fps, 8):
                inner_end = inner_start + 8
                slow_counts(expected, fps, threshold,
                            outer_start, outer_end, inner_start, inner_end)
                search.partial_count_tanimoto_hits_symmetric(
                    counts, fps, threshold,
                    outer_start, outer_end, inner_start, inner_end)
                # The running totals must agree after every block.
                self.assertSequenceEqual(counts, expected)

        full_counts = search.count_tanimoto_hits_symmetric(fps, threshold)
        self.assertSequenceEqual(full_counts, expected)
Example #5
0
    def test_partial_counts(self):
        """Accumulate symmetric hit counts block by block and verify them.

        For every 10 x 8 index block, ``slow_counts`` updates ``expected``
        and ``search.partial_count_tanimoto_hits_symmetric`` updates
        ``counts``; the two must agree after each block. At the end the
        one-shot symmetric count must match ``expected`` as well.
        """
        threshold = 0.2
        num_fps = len(fps)
        expected = [0] * num_fps
        counts = array.array("i", [0] * num_fps)

        for outer_start in range(0, num_fps, 10):
            outer_end = outer_start + 10
            for inner_start in range(outer_start, num_fps, 8):
                inner_end = inner_start + 8
                slow_counts(expected, fps, threshold,
                            outer_start, outer_end, inner_start, inner_end)
                search.partial_count_tanimoto_hits_symmetric(
                    counts, fps, threshold,
                    outer_start, outer_end, inner_start, inner_end)
                # The running totals must agree after every block.
                self.assertSequenceEqual(counts, expected)

        full_counts = search.count_tanimoto_hits_symmetric(fps, threshold)
        self.assertSequenceEqual(full_counts, expected)
    def test_partial_threshold_search(self):
        """Build symmetric threshold-search results piecewise and verify them.

        The search is run in 13 x 8 index blocks through
        ``search.partial_threshold_tanimoto_search_symmetric`` while
        ``slow_threshold_search`` fills ``expected`` for the same blocks.
        The accumulated result is then compared with ``expected``, the
        lower triangle is filled in, and the final result is checked
        against the hit counts and the one-shot symmetric search.
        """
        threshold = 0.1
        N = len(fps)
        result = search.SearchResults(N, fps.arena_ids)
        expected = [[] for _ in range(N)]

        for i in range(0, N, 13):
            for j in range(i, N, 8):
                search.partial_threshold_tanimoto_search_symmetric(result, fps, threshold,
                                                                   i, i+13, j, j+8)
                slow_threshold_search(expected, fps, threshold, i, i+13, j, j+8)

        _compare_search_results(self, result, expected)

        # Materialize the counts as real lists. A lazy map() object (what
        # Python 3 returns) would make assertNotEqual pass trivially and
        # would not be accepted by assertSequenceEqual.
        counts_before = [len(hits) for hits in result]
        search.fill_lower_triangle(result)
        counts_after = [len(hits) for hits in result]

        # Filling the lower triangle must have added hits somewhere.
        self.assertNotEqual(counts_before, counts_after)
        self.assertSequenceEqual(counts_after,
                                 search.count_tanimoto_hits_symmetric(fps, threshold))

        normal = search.threshold_tanimoto_search_symmetric(fps, threshold)
        _compare_search_results(self, result, list(normal.iter_indices_and_scores()))
Example #7
0
def do_NxN_searches(args, k, threshold, target_filename):
    """Run an NxN Tanimoto search of a fingerprint file against itself.

    The fingerprints in *target_filename* are loaded, searched, and one
    record per fingerprint is written to the output opened from
    ``args.output``, in the same order as the input file.

    Parameters:
      args -- options object; this function reads ``args.output``,
          ``args.count``, ``args.batch_size`` and ``args.times``.
      k -- the string "all" for a threshold search, otherwise the value
          passed as ``k`` to the k-nearest search.
      threshold -- minimum similarity passed to the search functions.
      target_filename -- fingerprint file passed to ``chemfp.open``.

    When ``args.count`` is true only hit counts are written; otherwise
    each record lists the hit ids and scores. When ``args.times`` is true
    the elapsed load/search/total times are written to stderr.
    """
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}

    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(reader):
        # Load each fingerprint under its input position instead of its
        # real id, and remember the real id for output.
        for i, (fp_id, fp) in enumerate(reader):
            original_ids.append(fp_id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    # targets.ids[current_index] is the input position of that fingerprint.
    original_index_to_current_index = dict(
        (original_index, current_index)
        for current_index, original_index in enumerate(targets.ids))

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        if args.count:
            search_type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            search_type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
                k=k, threshold=threshold)
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": search_type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                outfile.write("%d\t%s\n" % (counts[current_index], original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(targets, threshold,
                                                                     batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(targets, k, threshold,
                                                                    batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                hits = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(hits), original_id))
                # The "ids" of the hits are the input positions assigned in
                # get_index_to_id, so they index directly into original_ids.
                for (hit_original_index, score) in hits:
                    hit_id = original_ids[hit_original_index]
                    outfile.write(hit_formatter % (hit_id, score))
                outfile.write("\n") # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))
Example #8
0
def do_NxN_searches(args, k, threshold, target_filename):
    """Run an NxN Tanimoto search of a fingerprint file against itself.

    The fingerprints in *target_filename* are loaded, searched, and one
    record per fingerprint is written to the output opened from
    ``args.output``, in the same order as the input file.

    Parameters:
      args -- options object; this function reads ``args.output``,
          ``args.count``, ``args.batch_size`` and ``args.times``.
      k -- the string "all" for a threshold search, otherwise the value
          passed as ``k`` to the k-nearest search.
      threshold -- minimum similarity passed to the search functions.
      target_filename -- fingerprint file passed to ``chemfp.open``.

    When ``args.count`` is true only hit counts are written; otherwise
    each record lists the hit ids and scores. When ``args.times`` is true
    the elapsed load/search/total times are written to stderr.
    """
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}

    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(reader):
        # Load each fingerprint under its input position instead of its
        # real id, and remember the real id for output.
        for i, (fp_id, fp) in enumerate(reader):
            original_ids.append(fp_id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    # targets.ids[current_index] is the input position of that fingerprint.
    original_index_to_current_index = dict(
        (original_index, current_index)
        for current_index, original_index in enumerate(targets.ids))

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        if args.count:
            search_type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            search_type = (
                "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
                    k=k, threshold=threshold))
            write_simsearch_magic(outfile)

        write_simsearch_header(
            outfile, {
                "num_bits": targets.metadata.num_bits,
                "software": SOFTWARE,
                "type": search_type,
                "targets": target_filename,
                "target_sources": targets.metadata.sources
            })

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(
                targets, threshold, batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                outfile.write("%d\t%s\n" %
                              (counts[current_index], original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(
                targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                hits = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(hits), original_id))
                # The "ids" of the hits are the input positions assigned in
                # get_index_to_id, so they index directly into original_ids.
                for (hit_original_index, score) in hits:
                    hit_id = original_ids[hit_original_index]
                    outfile.write(hit_formatter % (hit_id, score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" %
                         (t2 - t1, t3 - t2, t3 - t1))
 def test_zeros(self):
     """Check the symmetric hit counts of ``zeros`` at two thresholds."""
     # (threshold, expected per-fingerprint hit counts)
     cases = (
         (0.9, [0, 0, 0, 0, 1, 1]),
         (0.001, [0, 0, 1, 2, 2, 3]),
     )
     for threshold, expected_counts in cases:
         hit_counts = search.count_tanimoto_hits_symmetric(zeros, threshold)
         self.assertSequenceEqual(hit_counts, expected_counts)
Example #10
0
 def test_zeros(self):
     """Check the symmetric hit counts of ``zeros`` at two thresholds."""
     # (threshold, expected per-fingerprint hit counts)
     cases = (
         (0.9, [0, 0, 0, 0, 1, 1]),
         (0.001, [0, 0, 1, 2, 2, 3]),
     )
     for threshold, expected_counts in cases:
         hit_counts = search.count_tanimoto_hits_symmetric(zeros, threshold)
         self.assertSequenceEqual(hit_counts, expected_counts)