def test_upper_only(self):
    # query[i] always matches target[i] so x[i] will always contain i
    x = search.threshold_tanimoto_search_arena(fps, fps, 0.9)
    x = list(x)

    # This only processes the upper-triangle, and not the diagonal
    y = search.threshold_tanimoto_search_symmetric(fps, 0.9, include_lower_triangle=False)
    rows = [row.get_indices_and_scores() for row in y]
    row_sizes = [len(row) for row in rows]

    # Move elements to the lower triangle
    for rowno, (row, row_size) in enumerate(zip(rows, row_sizes)):
        # row[:row_size] limits the scan to the original upper-triangle
        # entries, even as later rows grow from the appends below.
        for (colno, score) in row[:row_size]:
            assert colno > rowno, (rowno, colno)
            rows[colno].append((rowno, score))

        # Fill in the diagonal
        row.append((rowno, 1.0))
        # Put into a consistent order
        row.sort()

        # Match with the NxM algorithm
        expected_row = x[rowno]
        expected_row.reorder("increasing-index")
        self.assertEqual(row, list(expected_row), rowno)
def main(args=None):
    args = parser.parse_args(args)
    if args.profile and psutil is None:
        sys.stderr.write(
            "WARNING: Must install the 'psutil' module to see memory statistics.\n")

    # Load the fingerprints
    start_stats = get_profile_stats()
    try:
        arena = chemfp.load_fingerprints(args.fingerprint_filename)
    except IOError as err:
        sys.stderr.write("Cannot open fingerprint file: %s\n" % (err,))
        raise SystemExit(2)

    # Make sure I can generate output before doing the heavy calculations
    outfile, outfile_close = open_output(parser, args.output)
    try:
        load_stats = get_profile_stats()

        # Generate the NxN similarity matrix for the given threshold
        similarity_table = search.threshold_tanimoto_search_symmetric(
            arena, threshold=args.threshold)
        similarity_stats = get_profile_stats()

        # Do the clustering
        cluster_results = taylor_butina_cluster(similarity_table)
        cluster_stats = get_profile_stats()

        # Report the results
        report_cluster_results(cluster_results, arena, outfile)

        # Report the time and memory use.
        if args.profile:
            print("#fingerprints:", len(arena), "#bits/fp:", arena.num_bits,
                  "threshold:", args.threshold,
                  "#matches:", similarity_table.count_all(),
                  file=sys.stderr)
            profile_report("Load", start_stats, load_stats)
            profile_report("Similarity", load_stats, similarity_stats)
            profile_report("Clustering", similarity_stats, cluster_stats)
            profile_report("Total", start_stats, get_profile_stats())
    finally:
        outfile_close()
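The snippet above calls taylor_butina_cluster(), which is defined elsewhere in the example it comes from. As a reading aid, here is a minimal sketch of what that helper might look like, reconstructed from the butina() example later in this collection; the ClusterResults field names match the attributes used by clusmidf() below, but the original implementation may differ.

from collections import namedtuple

# Sketch reconstructed from the butina() example below; not the original.
ClusterResults = namedtuple("ClusterResults",
                            ["true_singletons", "false_singletons", "clusters"])

def taylor_butina_cluster(similarity_table):
    # Visit candidate centroids in order of decreasing neighbor count.
    centroid_table = sorted(
        ((len(indices), i, indices)
         for (i, indices) in enumerate(similarity_table.iter_indices())),
        reverse=True)

    true_singletons = []
    false_singletons = []
    clusters = []
    seen = set()
    for (size, fp_idx, members) in centroid_table:
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Keep only the neighbors which haven't yet been assigned
        unassigned = set(members) - seen
        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # This is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    return ClusterResults(true_singletons, false_singletons, clusters)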
def test_upper_and_lower(self):
    # query[i] always matches target[i] so x[i] will always contain i
    x = search.threshold_tanimoto_search_arena(fps, fps, 0.9)
    # This processes both the upper- and lower-triangles, but not the diagonal
    y = search.threshold_tanimoto_search_symmetric(fps, 0.9)
    for i, (x_row, y_row) in enumerate(zip(x, y)):
        x_row = x_row.get_indices_and_scores()
        y_row = y_row.get_indices_and_scores()
        y_row.append((i, 1.0))
        x_row.sort()
        y_row.sort()
        self.assertEqual(x_row, y_row)
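Both tests rely on a module-level `fps` arena that is not shown. A minimal setup sketch, assuming a fingerprint file (the "queries.fps" name is hypothetical):

import chemfp
from chemfp import search

fps = chemfp.load_fingerprints("queries.fps")  # hypothetical input file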
def distance_matrix(arena):
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal
    similarities = np.identity(n, "d")

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into the lower-triangle. It does not include
    # terms for the diagonal.
    results = search.threshold_tanimoto_search_symmetric(arena, threshold=0.0)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix derived from the similarity matrix
    return 1.0 - similarities
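The square matrix returned above can be fed to SciPy's hierarchical clustering, as the clusmidf() example later in this collection does. A minimal usage sketch, assuming `arena` is already loaded:

import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage, fcluster

# Condense the symmetric NxN distance matrix, then complete-linkage
# cluster and cut the dendrogram at distance 0.2 (threshold is illustrative).
condensed = ssd.squareform(distance_matrix(arena))
cluster_ids = fcluster(linkage(condensed, method="complete"), 0.2, "distance")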
def distance_matrix_1d(arena):
    print("Start calculating distance matrix")
    start_time = time.time()
    n = len(arena)

    # Compute only the upper-triangle of the similarity matrix. It does
    # not include terms for the diagonal or the lower-triangle.
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=0.0, include_lower_triangle=False)

    dists = []
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        scores = [target_score for target_index, target_score in row]
        dists.extend([1 - x for x in scores])

    print(sys.getsizeof(dists))
    print("time taken to calculate", n, ":", time.time() - start_time)

    # Return the distances as a flat list built from the similarity scores
    return dists
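With threshold=0.0 every pair is kept, so the flat list above has n*(n-1)/2 entries, which is the length SciPy expects for a "condensed" distance matrix. A usage sketch under that assumption, and under the further assumption that each row's hits come back ordered by column index (call results.reorder_all("increasing-index") before flattening if that is not guaranteed):

from scipy.cluster.hierarchy import linkage

# Assumes row-major upper-triangle ordering; see the caveat above.
dists = distance_matrix_1d(arena)
Z = linkage(dists, method="complete")  # condensed input, no squareform() needed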
def distance_matrix(arena):
    start_time = time.time()
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal
    similarities = numpy.identity(n, "d")

    # Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into the lower-triangle. It does not include
    # terms for the diagonal.
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=0.0, include_lower_triangle=True)

    # Copy the results into the NumPy array. (Using iter_scores() alone
    # would lose the column positions and skip the diagonal entries.)
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    print("time taken to calculate", n, ":", time.time() - start_time)

    # Return the distance matrix derived from the similarity matrix
    return 1.0 - similarities
def test_partial_threshold_search(self):
    threshold = 0.1
    N = len(fps)
    result = search.SearchResults(N, fps.arena_ids)
    expected = [[] for i in range(N)]
    for i in range(0, N, 13):
        for j in range(i, N, 8):
            search.partial_threshold_tanimoto_search_symmetric(
                result, fps, threshold, i, i + 13, j, j + 8)
            slow_threshold_search(expected, fps, threshold, i, i + 13, j, j + 8)
    _compare_search_results(self, result, expected)

    counts_before = list(map(len, result))
    search.fill_lower_triangle(result)
    counts_after = list(map(len, result))
    self.assertNotEqual(counts_before, counts_after)
    self.assertSequenceEqual(counts_after,
                             search.count_tanimoto_hits_symmetric(fps, threshold))

    normal = search.threshold_tanimoto_search_symmetric(fps, threshold)
    _compare_search_results(self, result, list(normal.iter_indices_and_scores()))
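The test compares the partial searches against slow_threshold_search(), a brute-force reference that is not shown here. A minimal sketch of what it might look like, assuming chemfp's bitops.byte_tanimoto() and the (id, fingerprint) tuples that arena indexing returns, and matching the upper-triangle-only convention of the symmetric search:

from chemfp import bitops

def slow_threshold_search(expected, fps, threshold, query_start, query_end,
                          target_start, target_end):
    # Hypothetical reference implementation: score every (query, target)
    # pair in the given ranges, keeping only the upper triangle
    # (target index > query index), as the symmetric search does.
    for i in range(query_start, min(query_end, len(fps))):
        _, fp_i = fps[i]
        for j in range(max(target_start, i + 1), min(target_end, len(fps))):
            _, fp_j = fps[j]
            score = bitops.byte_tanimoto(fp_i, fp_j)
            if score >= threshold:
                expected[i].append((j, score))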
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount.
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003 ID_A
    #   010a ID_B
    #   1000 ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_index = {0:2, 1:0, 2:1}
    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(zip(targets.ids, range(len(targets))))
    current_index_to_original_id = dict(
        (i, original_ids[original_index])
        for i, original_index in enumerate(targets.ids))

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(
                targets, threshold, batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    hit_id = original_ids[new_index]
                    outfile.write(hit_formatter % (hit_id, score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n"
                         % (t2 - t1, t3 - t2, t3 - t1))
]
#percentage_list = [1]

if BUTINA:
    if not RDKIT:
        print("CHEMFP")
        arena_actives = read_chemfp("dataset/actives_final.sdf")
        arena_all = read_chemfp("dataset/merged.sdf")
        for perc in percentage_list:
            print("Clustering for", perc, "molecules")
            arena_actives_subset, arena_subset = get_arena_percentage(
                perc, arena_actives, arena_all)
            start_time = time.time()
            print("Actives Subset length", len(arena_actives_subset))
            print("Merged Subset length", len(arena_subset))
            similarity_table = search.threshold_tanimoto_search_symmetric(
                arena_subset, threshold=0.5)
            #centroid_table = sorted(((len(indices), i, indices)
            #                         for (i, indices) in enumerate(similarity_table.iter_indices())),
            #                        reverse=True)
            tuple_list = sorted(
                ((len(indices), i)
                 for (i, indices) in enumerate(similarity_table.iter_indices())),
                reverse=True)
            neighbours_list = [
                indices
                for (i, indices) in enumerate(similarity_table.iter_indices())
            ]
            print("time taken to calculate neighbours:", time.time() - start_time)
            clusters = ButinaClustering(tuple_list, neighbours_list,
def butina(args):
    """
    Taylor-Butina clustering from the chemfp help.
    """
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)
    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")
    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []
    seen = set()
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen
        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # This is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len(clusters))

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members),
                                    " ".join(arena.ids[idx] for idx in members)))
        # ToDo: len(members) needs to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))
    out.close()
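butina() depends on a unix_sort() helper that is not shown here. A minimal in-memory stand-in, on the assumption that the original shells out to the UNIX sort command to keep memory use low; this version produces the same ordering for inputs that fit in RAM:

def unix_sort(results):
    # Hypothetical stand-in for the missing helper: pair each fingerprint
    # index with its neighbor count and sort so the fingerprint with the
    # most neighbors comes first, i.e. the (size, fp_idx) ordering the
    # loop above consumes.
    counts = [(len(indices), i)
              for (i, indices) in enumerate(results.iter_indices())]
    counts.sort(reverse=True)
    return counts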
def clusmidf(smidf, th=0.8, method='butina', arena=None):
    if method != 'butina' and method != 'cl':
        print('Please select butina or cl')
        return None

    # Init time counter
    start = time.time()

    # Get the arena
    if arena is None:
        arena = smidf2arena(smidf)

    # Do the clustering
    if method == 'butina':
        # Generate the similarity table
        similarity_table = search.threshold_tanimoto_search_symmetric(
            arena, threshold=th)

        # Cluster the data
        clus_res = taylor_butina_cluster(similarity_table)

        # Output. Re-sort the clusters, since creating them does not
        # produce a monotonically decreasing size ordering.
        out = []
        cs_sorted = sorted([(len(c[1]), c[1], c[0]) for c in clus_res.clusters],
                           reverse=True)
        for i in range(len(cs_sorted)):
            cl = []
            c = cs_sorted[i]
            # Retrieve the arena id of the centroid and add it to the cluster
            cl.append(arena.ids[c[2]])
            # Retrieve the arena ids of the neighbors and add them to the cluster
            cl.extend([arena.ids[x] for x in c[1]])
            out.append(cl)
        for i in range(len(clus_res.false_singletons)):
            out.append([arena.ids[clus_res.false_singletons[i]]])
        for i in range(len(clus_res.true_singletons)):
            out.append([arena.ids[clus_res.true_singletons[i]]])
    elif method == 'cl':
        # Generate the condensed distance table
        distances = ssd.squareform(distance_matrix(arena))

        # Cluster the data
        clus_res = fcluster(linkage(distances, method='complete'), th, 'distance')

        # Output
        aids = arena.ids
        out = []
        for i in np.unique(clus_res):
            cl = [aids[j] for j in np.where(clus_res == i)[0]]
            out.append(cl)
        out = [x[2] for x in
               sorted([(len(x), i, x) for (i, x) in enumerate(out)], reverse=True)]

    # End time count and report
    end = time.time()
    elapsed_time = end - start
    print('Clustering time: ' +
          time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    # Return cluster results
    return out
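A hypothetical usage sketch (smidf2arena() and the smidf input come from the same project and are not shown). Note that th means different things per method: a Tanimoto similarity threshold for 'butina', a linkage distance cutoff for 'cl':

butina_clusters = clusmidf(smidf, th=0.7, method='butina')  # similarity >= 0.7
cl_clusters = clusmidf(smidf, th=0.3, method='cl')          # distance cutoff 0.3
print(len(butina_clusters), "clusters; largest has", len(butina_clusters[0]), "members")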