Example #1
    def test_upper_only(self):
        # query[i] always matches target[i] so x[i] will always contain i
        x = search.threshold_tanimoto_search_arena(fps, fps, 0.9)
        x = list(x)

        # This only processes the upper-triangle, and not the diagonal
        y = search.threshold_tanimoto_search_symmetric(fps, 0.9, include_lower_triangle=False)

        rows = [row.get_indices_and_scores() for row in y]
        row_sizes = list(map(len, rows))  # snapshot the row lengths before appending below
        # Move elements to the lower triangle
        for rowno, (row, row_size) in enumerate(zip(rows, row_sizes)):
            for (colno, score) in row[:row_size]:
                assert colno > rowno, (rowno, colno)
                rows[colno].append((rowno, score))

            # Fill in the diagonal
            row.append((rowno, 1.0))

            # Put into a consistent order
            row.sort()
            
            # Match with the NxM algorithm
            expected_row = x[rowno]
            expected_row.reorder("increasing-index")

            self.assertEqual(row, list(expected_row), rowno)
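These tests rely on a module-level arena named fps that is built elsewhere in the test file. A minimal sketch of such a fixture, using made-up fingerprints (any data works, as long as every query is also a target so that query[i] matches target[i] with score 1.0):

import chemfp
from chemfp import search

# Hypothetical 16-bit fingerprints; the real test module builds its own data.
fps = chemfp.load_fingerprints(
    [("ID1", b"\x12\x34"), ("ID2", b"\x12\x35"), ("ID3", b"\x32\x34")],
    chemfp.Metadata(num_bits=16))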
Example #2
def main(args=None):
    args = parser.parse_args(args)

    if args.profile and psutil is None:
        sys.stderr.write(
            "WARNING: Must install the 'psutil' module to see memory statistics.\n"
        )

    # Load the fingerprints
    start_stats = get_profile_stats()
    try:
        arena = chemfp.load_fingerprints(args.fingerprint_filename)
    except IOError as err:
        sys.stderr.write("Cannot open fingerprint file: %s" % (err, ))
        raise SystemExit(2)

    # Make sure I can generate output before doing the heavy calculations
    outfile, outfile_close = open_output(parser, args.output)

    try:
        load_stats = get_profile_stats()

        # Generate the NxN similarity matrix for the given threshold
        similarity_table = search.threshold_tanimoto_search_symmetric(
            arena, threshold=args.threshold)
        similarity_stats = get_profile_stats()

        # Do the clustering
        cluster_results = taylor_butina_cluster(similarity_table)
        cluster_stats = get_profile_stats()

        # Report the results
        report_cluster_results(cluster_results, arena, outfile)

        # Report the time and memory use.
        if args.profile:
            print("#fingerprints:",
                  len(arena),
                  "#bits/fp:",
                  arena.num_bits,
                  "threshold:",
                  args.threshold,
                  "#matches:",
                  similarity_table.count_all(),
                  file=sys.stderr)
            profile_report("Load", start_stats, load_stats)
            profile_report("Similarity", load_stats, similarity_stats)
            profile_report("Clustering", similarity_stats, cluster_stats)
            profile_report("Total", start_stats, get_profile_time())
    finally:
        outfile_close()
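The profiling helpers are defined elsewhere in the script, as are taylor_butina_cluster, report_cluster_results, and open_output. A minimal sketch of what the two profiling helpers could look like, assuming psutil supplies the memory numbers (hypothetical implementations, not chemfp API):

import sys
import time

try:
    import psutil
except ImportError:
    psutil = None

def get_profile_stats():
    # Capture (wall-clock seconds, resident set size in bytes); report the
    # memory as 0 when psutil is unavailable.
    rss = psutil.Process().memory_info().rss if psutil is not None else 0
    return (time.time(), rss)

def profile_report(title, start, end):
    sys.stderr.write("%s: %.2f sec, %+.1f MB\n"
                     % (title, end[0] - start[0], (end[1] - start[1]) / 1e6))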
Example #3
    def test_upper_and_lower(self):
        # query[i] always matches target[i] so x[i] will always contain i
        x = search.threshold_tanimoto_search_arena(fps, fps, 0.9)

        # This processes both the upper and lower triangles, but not the diagonal
        y = search.threshold_tanimoto_search_symmetric(fps, 0.9)

        for i, (x_row, y_row) in enumerate(zip(x, y)):
            x_row = x_row.get_indices_and_scores()
            y_row = y_row.get_indices_and_scores()
            y_row.append((i, 1.0))
            x_row.sort()
            y_row.sort()

            self.assertEqual(x_row, y_row)
Example #4
import numpy as np
from chemfp import search

def distance_matrix(arena):
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal
    similarities = np.identity(n, "d")

    # Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = search.threshold_tanimoto_search_symmetric(arena, threshold=0.0)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities
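One way to use the square matrix (a sketch, assuming arena is an already-loaded fingerprint arena) is SciPy hierarchical clustering, much like the clusmidf() example further down:

from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import fcluster, linkage

condensed = squareform(distance_matrix(arena))  # square -> condensed 1-D form
labels = fcluster(linkage(condensed, method='complete'), 0.4, 'distance')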
Example #5
import sys
import time

from chemfp import search

def distance_matrix_1d(arena):
    print("Start calculating distance matrix")
    start_time = time.time()
    n = len(arena)

    # Compute only the upper triangle: no diagonal terms and, with
    # include_lower_triangle=False, no mirrored copy in the lower triangle.
    results = search.threshold_tanimoto_search_symmetric(arena, threshold=0.0,
                                                         include_lower_triangle=False)
    # Hits within a row are not guaranteed to come back in column order, so
    # sort them so the flattened list has a row-major, upper-triangle layout.
    results.reorder_all("increasing-index")

    dists = []

    for row in results.iter_indices_and_scores():
        dists.extend(1.0 - score for _index, score in row)
        print(sys.getsizeof(dists))

    print "time taken to calculate ", n, " : ", time.time() - start_time
    # Return the distance matrix using the similarity matrix
    return dists
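With the rows in increasing-index order, the flattened list matches SciPy's condensed distance layout, so it can be expanded back into a square matrix when needed (a sketch, assuming arena is already loaded):

import numpy as np
from scipy.spatial.distance import squareform

condensed = np.asarray(distance_matrix_1d(arena))
square = squareform(condensed)  # n x n symmetric matrix with a zero diagonal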
Example #6
import time

import numpy
from chemfp import search

def distance_matrix(arena):
    start_time = time.time()
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    similarities = numpy.identity(n, "d")

    # Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = search.threshold_tanimoto_search_symmetric(arena, threshold=0.0,
                                                         include_lower_triangle=True)

    # Convert each row of scores into distances. Note: the rows exclude the
    # diagonal and hits are not guaranteed to be in column order, so this
    # builds a ragged list of distances rather than an aligned matrix.
    distance_list = [[1 - score for score in scores]
                     for scores in results.iter_scores()]
    # Alternative: copy the scores into the NumPy identity matrix above.
    #for row_index, row in enumerate(results.iter_indices_and_scores()):
    #    for target_index, target_score in row:
    #        similarities[row_index, target_index] = target_score

    print "time taken to calculate ", n, " : ", time.time() - start_time
    # Return the distance matrix using the similarity matrix
    return similarity_list
Example #7
    def test_partial_threshold_search(self):
        threshold = 0.1
        N = len(fps)
        result = search.SearchResults(N, fps.arena_ids)
        expected = [[] for i in range(N)]

        for i in range(0, N, 13):
            for j in range(i, N, 8):
                search.partial_threshold_tanimoto_search_symmetric(result, fps, threshold,
                                                                   i, i+13, j, j+8)
                slow_threshold_search(expected, fps, threshold, i, i+13, j, j+8)

        _compare_search_results(self, result, expected)

        counts_before = list(map(len, result))
        search.fill_lower_triangle(result)
        counts_after = list(map(len, result))
        self.assertNotEqual(counts_before, counts_after)
        self.assertSequenceEqual(counts_after,
                                 search.count_tanimoto_hits_symmetric(fps, threshold))

        normal = search.threshold_tanimoto_search_symmetric(fps, threshold)
        _compare_search_results(self, result, list(normal.iter_indices_and_scores()))
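The pattern this test exercises, in miniature: run the upper-triangle search over index blocks, then mirror the hits into the lower triangle so every row is complete. A sketch using the same chemfp calls:

result = search.SearchResults(len(fps), fps.arena_ids)
search.partial_threshold_tanimoto_search_symmetric(result, fps, 0.1,
                                                   0, len(fps), 0, len(fps))
search.fill_lower_triangle(result)
# Each row now holds all neighbours at or above the threshold.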
Example #8
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_id = {0:"ID_C", 1:"ID_A", 2:"ID_B"}
    
    original_ids = []
    fps = chemfp.open(target_filename)
    def get_index_to_id(fps):
        for i, (fp_id, fp) in enumerate(fps):
            original_ids.append(fp_id)
            yield i, fp
    
    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(zip(targets.ids, range(len(targets))))
    current_index_to_original_id = dict((i, original_ids[original_index])
                                            for i, original_index in enumerate(targets.ids))
    
    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(targets, threshold,
                                                                     batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(targets, k, threshold,
                                                                    batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    hit_id = original_ids[new_index]
                    outfile.write(hit_formatter % (hit_id, score))
                outfile.write("\n") # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))
Example #9
    ]
    #percentage_list = [1]
    if BUTINA:
        if not RDKIT:
            print "CHEMFP"
            arena_actives = read_chemfp("dataset/actives_final.sdf")
            arena_all = read_chemfp("dataset/merged.sdf")

            for perc in percentage_list:
                print "Clustering for ", perc, " molecules "
                arena_actives_subset, arena_subset = get_arena_percentage(
                    perc, arena_actives, arena_all)
                start_time = time.time()
                print "Actives Subset length ", len(arena_actives_subset)
                print "Merged Subset length ", len(arena_subset)
                similarity_table = search.threshold_tanimoto_search_symmetric(
                    arena_subset, threshold=0.5)

                #centroid_table = sorted(((len(indices), i, indices)
                #                         for (i, indices) in enumerate(similarity_table.iter_indices())),
                #                        reverse=True)
                tuple_list = sorted(((len(indices), i) for (
                    i, indices) in enumerate(similarity_table.iter_indices())),
                                    reverse=True)
                neighbours_list = [
                    indices
                    for (i,
                         indices) in enumerate(similarity_table.iter_indices())
                ]
                print "time taken to calculate neighbours: ", time.time(
                ) - start_time
                clusters = ButinaClustering(tuple_list, neighbours_list,
Example #10
import chemfp
from chemfp import search

def butina(args):
    """
        Taylor-Butina clustering from the chemfp help.
    """
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)

    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    #for (size, fp_idx, members) in results:
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen

        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # this is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    len_cluster = len(clusters)
    #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
    #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )

    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len_cluster)

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" %
                  (centroid_name, len(members), " ".join(arena.ids[idx]
                                                         for idx in members)))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
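unix_sort() is a helper defined elsewhere in the tool (presumably named for shelling out to the Unix sort command). A pure-Python stand-in with the same contract might be:

def unix_sort(results):
    # Produce (neighbour count, fingerprint index) pairs, largest first,
    # which is the order the centroid-picking loop above consumes.
    return sorted(((len(hits), idx) for idx, hits in enumerate(results)),
                  reverse=True)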
Example #11
import time

import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import fcluster, linkage
from chemfp import search

def clusmidf(smidf, th=0.8, method='butina', arena=None):

    if method not in ('butina', 'cl'):
        print('Please select butina or cl')
        return None

    # Init time counter
    start = time.time()

    # Get the arena
    if arena is None:
        arena = smidf2arena(smidf)

    # Do the clustering
    if method == 'butina':
        # Generate the similarity table
        similarity_table = search.threshold_tanimoto_search_symmetric(
            arena, threshold=th)

        # Cluster the data
        clus_res = taylor_butina_cluster(similarity_table)

        # Output
        out = []
        # Re-sort the clusters: creating them does not yield a monotonically decreasing size order
        cs_sorted = sorted([(len(c[1]), c[1], c[0])
                            for c in clus_res.clusters],
                           reverse=True)
        for i in range(len(cs_sorted)):
            cl = []
            c = cs_sorted[i]
            cl.append(
                arena.ids[c[2]]
            )  # Retrieve the arenaid of the centroid and add to the cluster
            cl.extend([
                arena.ids[x] for x in c[1]
            ])  # Retrieve the arenaid of the neighbors and add to cluster
            out.append(cl)
        for i in range(len(clus_res.false_singletons)):
            cl = [arena.ids[clus_res.false_singletons[i]]]
            out.append(cl)
        for i in range(len(clus_res.true_singletons)):
            cl = [arena.ids[clus_res.true_singletons[i]]]
            out.append(cl)

    elif method == 'cl':
        # Generate the condensed distance table
        distances = ssd.squareform(distance_matrix(arena))

        # Cluster the data
        clus_res = fcluster(linkage(distances, method='complete'), th,
                            'distance')

        # Output
        aids = arena.ids
        out = []
        for i in np.unique(clus_res):
            cl = [aids[j] for j in np.where(clus_res == i)[0]]
            out.append(cl)
        out = [
            x[2] for x in sorted([(len(x), i, x) for (i, x) in enumerate(out)],
                                 reverse=True)
        ]

    # End time count and report
    end = time.time()
    elapsed_time = end - start
    print('Clustering time: ' +
          time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    # Return cluster results
    return out
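Possible usage (a sketch; smidf and smidf2arena() come from the surrounding module, and the thresholds are arbitrary):

arena = smidf2arena(smidf)  # build the arena once and reuse it
butina_clusters = clusmidf(smidf, th=0.6, method='butina', arena=arena)
cl_clusters = clusmidf(smidf, th=0.4, method='cl', arena=arena)
print(len(butina_clusters[0]))  # clusters come back largest first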
Example #12
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_id = {0:"ID_C", 1:"ID_A", 2:"ID_B"}

    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        for i, (fp_id, fp) in enumerate(fps):
            original_ids.append(fp_id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(
        zip(targets.ids, range(len(targets))))
    current_index_to_original_id = dict(
        (i, original_ids[original_index])
        for i, original_index in enumerate(targets.ids))

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(
            outfile, {
                "num_bits": targets.metadata.num_bits,
                "software": SOFTWARE,
                "type": type,
                "targets": target_filename,
                "target_sources": targets.metadata.sources
            })

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(
                targets, threshold, batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(
                targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[
                    current_index].get_ids_and_scores()
                outfile.write("%d\t%s" %
                              (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    hit_id = original_ids[new_index]
                    outfile.write(hit_formatter % (hit_id, score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" %
                         (t2 - t1, t3 - t2, t3 - t1))
Example #13
import chemfp
from chemfp import search

def butina( args ):
    """
        Taylor-Butina clustering from the chemfp help.
    """
    out = args.output_path
    targets = chemfp.open( args.input_path, format='fps' )
    arena = chemfp.load_fingerprints( targets )

    chemfp.set_num_threads( args.processors )
    results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    #for (size, fp_idx, members) in results:
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append( fp_idx )
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen

        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # this is a new cluster
        clusters.append( (fp_idx, unassigned) )
        seen.update(unassigned)

    len_cluster = len(clusters)
    #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
    #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )

    out.write( "#%s true singletons\n" % len(true_singletons) )
    out.write( "#%s false singletons\n" % len(false_singletons) )
    out.write( "#clusters: %s\n" % len_cluster )

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members)))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()