from typing import Mapping, Optional

import chemfp
import chemfp.arena
import chemfp.search


def find_similarity_matches(
    query: chemfp.arena.FingerprintArena,
    target: Optional[chemfp.arena.FingerprintArena],
    threshold: float = 0.95,
    n_threads: int = 1,
) -> Mapping[str, Mapping[str, float]]:
    chemfp.set_num_threads(n_threads)
    if target is not None:
        match_res = chemfp.search.threshold_tanimoto_search_arena(
            query, target, threshold=threshold)
    else:
        # Symmetric search of the query arena against itself; skip the
        # lower triangle so each pair is reported only once.
        match_res = chemfp.search.threshold_tanimoto_search_symmetric(
            query, threshold=threshold, include_lower_triangle=False)
    return {
        q_id: {t: s for t, s in targets}
        for q_id, targets in zip(query.ids, match_res.iter_ids_and_scores())
        if len(targets) > 0
    }
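# Hedged usage sketch (not from the original source): the file names and the
# 0.8 threshold are made up, but the loading call mirrors the other snippets.
queries = chemfp.load_fingerprints("queries.fps")   # hypothetical FPS file
targets = chemfp.load_fingerprints("targets.fps")   # hypothetical FPS file

matches = find_similarity_matches(queries, targets, threshold=0.8, n_threads=4)
for query_id, hits in matches.items():
    for target_id, score in hits.items():
        print(query_id, target_id, score)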
def distance_matrix(arena, tanimoto_threshold=0.0, processors=1):
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    try:
        similarities = numpy.identity(n, "d")
    except MemoryError:
        raise Exception("Input dataset is too large!")
    chemfp.set_num_threads(processors)

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into the lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix derived from the similarity matrix
    return 1.0 - similarities
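# Hedged usage sketch (assumed file name): build the distance matrix for a
# small arena. With threshold 0.0 every pair is computed; with a higher
# threshold, pairs below it keep similarity 0.0 and hence distance 1.0.
arena = chemfp.load_fingerprints("fingerprints.fps")   # hypothetical FPS file
distances = distance_matrix(arena, tanimoto_threshold=0.0, processors=2)
# distances[i, j] == 1.0 - Tanimoto(i, j); the diagonal is 0.0.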
def butina(args):
    """Taylor-Butina clustering from the chemfp help."""
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)
    chemfp.set_num_threads(args.processors)

    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")
    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen

        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # This is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    len_cluster = len(clusters)
    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len_cluster)

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members),
                                    " ".join(arena.ids[idx] for idx in members)))
        # ToDo: len(members) needs to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
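# The butina() snippet above relies on a unix_sort(results) helper that is not
# shown. Judging from how its output is consumed, it yields (neighbor_count,
# fingerprint_index) pairs ordered so the most-connected fingerprints come
# first. A hypothetical in-Python stand-in under that assumption:
def unix_sort(results):
    counts = ((len(results[fp_idx]), fp_idx) for fp_idx in range(len(results)))
    # Sort by descending neighbor count (ties broken by higher index).
    return sorted(counts, reverse=True)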
def test_set_beyond_max(self):
    chemfp.set_num_threads(chemfp.get_max_threads() + 1)
    self.assertEqual(chemfp.get_num_threads(), chemfp.get_max_threads())
def test_set_to_two(self):
    chemfp.set_num_threads(2)
    self.assertEqual(chemfp.get_num_threads(), 2)
def tearDown(self):
    chemfp.set_num_threads(self._num_threads)
def test_set_to_zero(self):
    chemfp.set_num_threads(0)
    self.assertEqual(chemfp.get_num_threads(), 1)
def test_set_to_one(self):
    chemfp.set_num_threads(1)
    self.assertEqual(chemfp.get_num_threads(), 1)
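# The test methods above appear to be excerpted from a unittest.TestCase. A
# minimal harness consistent with the tearDown() shown (the class name and the
# setUp body are assumptions) could be:
import unittest

import chemfp


class NumThreadsTestCase(unittest.TestCase):
    def setUp(self):
        # Remember the current thread count so tearDown() can restore it.
        self._num_threads = chemfp.get_num_threads()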
os.system('ln -s %s %s' % (os.path.realpath(sys.argv[1]), temp_link))
chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])


def get_hit_indicies(hits):
    return [id for (id, score) in hits]


out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)
chemfp.set_num_threads(processors)

search = dataset.threshold_tanimoto_search_arena(dataset, threshold=tanimoto_threshold)
#search = chemfp.search.threshold_tanimoto_search_symmetric(dataset, threshold=tanimoto_threshold)

# Reorder so the centroid with the most hits comes first.
# (That's why I do a reverse sort.)
# Ignore the arbitrariness of breaking ties by fingerprint index
results = sorted(((len(hits), i, hits)
                  for (i, hits) in enumerate(search.iter_indices_and_scores())),
                 reverse=True)

# Determine the true/false singletons and the clusters
true_singletons = []
false_singletons = []
clusters = []

seen = set()