def find_similarity_matches(
    query: chemfp.arena.FingerprintArena,
    target: Optional[chemfp.arena.FingerprintArena],
    threshold: float = 0.95,
    n_threads: int = 1,
) -> Mapping[str, Mapping[str, float]]:
    """Map each query id to the {target_id: score} hits at or above *threshold*.

    When *target* is None the search is run symmetrically against *query*
    itself (upper triangle only); queries with no hits are omitted.
    """
    chemfp.set_num_threads(n_threads)
    if target is None:
        match_res = chemfp.search.threshold_tanimoto_search_symmetric(
            query, threshold=threshold, include_lower_triangle=False)
    else:
        match_res = chemfp.search.threshold_tanimoto_search_arena(
            query, target, threshold=threshold)
    matches = {}
    for q_id, hits in zip(query.ids, match_res.iter_ids_and_scores()):
        if hits:
            matches[q_id] = dict(hits)
    return matches
예제 #2
0
def distance_matrix(arena, tanimoto_threshold = 0.0):
    """Return the full pairwise Tanimoto distance matrix (1.0 - similarity).

    Pairs whose similarity falls below *tanimoto_threshold* keep a
    similarity of 0.0, i.e. a distance of 1.0.
    """
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    try:
        similarities = numpy.identity(n, "d")
    except MemoryError:
        # Only a failed allocation means the dataset is too big; a bare
        # ``except`` would also swallow KeyboardInterrupt and real bugs.
        raise Exception('Input dataset is too large!')
    # NOTE(review): relies on a module-level ``args`` object — confirm it
    # exists wherever this function is used.
    chemfp.set_num_threads( args.processors )

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities
예제 #3
0
def distance_matrix(arena, tanimoto_threshold = 0.0):
    """Return the full pairwise Tanimoto distance matrix (1.0 - similarity).

    Pairs whose similarity falls below *tanimoto_threshold* keep a
    similarity of 0.0, i.e. a distance of 1.0.
    """
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    try:
        similarities = numpy.identity(n, "d")
    except MemoryError:
        # Only a failed allocation means the dataset is too big; a bare
        # ``except`` would also swallow KeyboardInterrupt and real bugs.
        raise Exception('Input dataset is too large!')
    # NOTE(review): relies on a module-level ``args`` object — confirm it
    # exists wherever this function is used.
    chemfp.set_num_threads( args.processors )

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities
예제 #4
0
def butina(args):
    """Taylor-Butina clustering (adapted from the chemfp documentation)."""
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)

    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Every fingerprint ends up in exactly one bucket: a true singleton
    # (no neighbours at all), a false singleton (all neighbours already
    # claimed), or a cluster.
    true_singletons = []
    false_singletons = []
    clusters = []
    seen = set()

    for size, fp_idx in sorted_ids:
        members = results[fp_idx].get_indices()
        if fp_idx in seen:
            # An already-assigned fingerprint cannot serve as a centroid.
            continue
        seen.add(fp_idx)

        if size == 0:
            # The exclusion sphere contains nothing but the fingerprint itself.
            true_singletons.append(fp_idx)
            continue

        # Neighbours not yet claimed by an earlier centroid.
        unassigned = set(members) - seen
        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len(clusters))

    # Biggest cluster first; ties broken by the alphabetically smallest id.
    clusters.sort(key=lambda cluster: (-len(cluster[1]), arena.ids[cluster[0]]))

    for centroid_idx, members in clusters:
        member_names = " ".join(arena.ids[idx] for idx in members)
        out.write("%s\t%s\t%s\n" %
                  (arena.ids[centroid_idx], len(members), member_names))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
예제 #5
0
def butina( args ):
    """Taylor-Butina clustering (adapted from the chemfp documentation)."""
    out = args.output_path
    targets = chemfp.open( args.input_path, format='fps' )
    arena = chemfp.load_fingerprints( targets )

    chemfp.set_num_threads( args.processors )
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Partition every fingerprint into one of three buckets.
    true_singletons = []   # no neighbour at all inside the sphere
    false_singletons = []  # neighbours existed but were claimed earlier
    clusters = []          # (centroid index, set of member indices)
    seen = set()

    for size, fp_idx in sorted_ids:
        members = results[fp_idx].get_indices()
        if fp_idx in seen:
            # Already assigned; cannot be used as a centroid.
            continue
        seen.add(fp_idx)

        if size == 0:
            true_singletons.append(fp_idx)
            continue

        unassigned = set(members) - seen
        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    out.write( "#%s true singletons\n" % len(true_singletons) )
    out.write( "#%s false singletons\n" % len(false_singletons) )
    out.write( "#clusters: %s\n" % len(clusters) )

    # Largest cluster first, ties broken by the alphabetically smallest id.
    clusters.sort(key=lambda cluster: (-len(cluster[1]), arena.ids[cluster[0]]))

    for centroid_idx, members in clusters:
        names = " ".join(arena.ids[idx] for idx in members)
        out.write("%s\t%s\t%s\n" % (arena.ids[centroid_idx], len(members), names))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
예제 #6
0
 def test_set_beyond_max(self):
     """Requests above the maximum are clamped to get_max_threads()."""
     chemfp.set_num_threads(chemfp.get_max_threads()+1)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), chemfp.get_max_threads())
예제 #7
0
 def test_set_to_two(self):
     """Setting two threads is reported back unchanged."""
     chemfp.set_num_threads(2)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), 2)
예제 #8
0
 def tearDown(self):
     """Restore the previously recorded thread count (presumably saved in setUp — confirm)."""
     chemfp.set_num_threads(self._num_threads)
예제 #9
0
 def test_set_to_zero(self):
     """Setting zero threads falls back to one thread."""
     chemfp.set_num_threads(0)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), 1)
예제 #10
0
 def tearDown(self):
     """Restore the previously recorded thread count (presumably saved in setUp — confirm)."""
     chemfp.set_num_threads(self._num_threads)
예제 #11
0
 def test_set_beyond_max(self):
     """Requests above the maximum are clamped to get_max_threads()."""
     chemfp.set_num_threads(chemfp.get_max_threads() + 1)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), chemfp.get_max_threads())
예제 #12
0
 def test_set_to_two(self):
     """Setting two threads is reported back unchanged."""
     chemfp.set_num_threads(2)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), 2)
예제 #13
0
 def test_set_to_one(self):
     """Setting one thread is reported back unchanged."""
     chemfp.set_num_threads(1)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), 1)
예제 #14
0
 def test_set_to_zero(self):
     """Setting zero threads falls back to one thread."""
     chemfp.set_num_threads(0)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), 1)
예제 #15
0
# Create the symlink directly instead of shelling out to ``ln -s``:
# os.system with %-interpolated paths is shell-injection-prone and
# silently ignores failures; os.symlink raises OSError on error.
os.symlink(os.path.realpath(sys.argv[1]), temp_link)


chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])


def get_hit_indicies(hits):
    """Return just the index of every (index, score) pair in *hits*.

    Name (including the 'indicies' spelling) kept for existing callers.
    """
    # ``idx`` rather than ``id`` — avoids shadowing the builtin.
    return [idx for (idx, score) in hits]

# Load the fingerprints and run an all-against-all threshold search.
out = open(outfile, 'w')
dataset = chemfp.load_fingerprints( chemfp_fingerprint_file )

chemfp.set_num_threads( processors )
# NOTE(review): this local ``search`` shadows the ``chemfp.search`` module
# (see the commented-out alternative below) — confirm nothing later needs
# the module under that name.
search = dataset.threshold_tanimoto_search_arena(dataset, threshold = tanimoto_threshold)
#search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)

# Reorder so the centroid with the most hits comes first.
# (That's why I do a reverse search.)
# Ignore the arbitrariness of breaking ties by fingerprint index
results = sorted( (  (len(hits), i, hits) for (i, hits) in enumerate(search.iter_indices_and_scores())  ),reverse=True)


# Determine the true/false singletons and the clusters
true_singletons = []
false_singletons = []
clusters = []

# Fingerprint indices already assigned to a cluster or singleton bucket.
seen = set()
예제 #16
0
 def test_set_to_one(self):
     """Setting one thread is reported back unchanged."""
     chemfp.set_num_threads(1)
     # assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(chemfp.get_num_threads(), 1)
예제 #17
0
# Create the symlink directly instead of shelling out to ``ln -s``:
# os.system with %-interpolated paths is shell-injection-prone and
# silently ignores failures; os.symlink raises OSError on error.
os.symlink(os.path.realpath(sys.argv[1]), temp_link)

chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])


def get_hit_indicies(hits):
    """Return just the index of every (index, score) pair in *hits*.

    Name (including the 'indicies' spelling) kept for existing callers.
    """
    # ``idx`` rather than ``id`` — avoids shadowing the builtin.
    return [idx for (idx, score) in hits]


# Load the fingerprints and run an all-against-all threshold search.
out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

chemfp.set_num_threads(processors)
# NOTE(review): this local ``search`` shadows the ``chemfp.search`` module
# (see the commented-out alternative below) — confirm nothing later needs
# the module under that name.
search = dataset.threshold_tanimoto_search_arena(dataset,
                                                 threshold=tanimoto_threshold)
#search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)

# Reorder so the centroid with the most hits comes first.
# (That's why I do a reverse search.)
# Ignore the arbitrariness of breaking ties by fingerprint index
results = sorted(
    ((len(hits), i, hits)
     for (i, hits) in enumerate(search.iter_indices_and_scores())),
    reverse=True)

# Determine the true/false singletons and the clusters
true_singletons = []
false_singletons = []