def load_fps(strucfile):
    fp_fname = os.path.splitext(strucfile)[0] + "_%s_new.fps" % fpname
    if os.path.isfile(fp_fname):
        #print "%s exists!" % fp_fname
        return chemfp.load_fingerprints(fp_fname)
    else:
        fparena = chemfp.load_fingerprints(
            chemfp.read_structure_fingerprints(fptype, source=strucfile))
        fparena.save(fp_fname)
        print "%s saved!" % fp_fname
        return fparena
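# A minimal usage sketch for load_fps. The function relies on two
# module-level globals that are not shown above; the values here are
# assumptions for illustration only.
fpname = "rdmaccs"               # hypothetical label used in the cache filename
fptype = "RDMACCS-RDKit/1"       # hypothetical chemfp fingerprint type string
arena = load_fps("compounds.sdf")   # first call computes and saves the .fps cache
arena = load_fps("compounds.sdf")   # second call loads compounds_rdmaccs_new.fps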
def test_arena_copy(self):
    data = "ABCD\t0\n" * 200   # 200 FPS records: fingerprint ABCD, id "0"
    from cStringIO import StringIO
    arena = chemfp.load_fingerprints(StringIO(data))
    def make_subarena_copy():
        # Make and discard a subarena copy; memory_growth() checks for leaks.
        arena[1:].copy()
    memory_growth(make_subarena_copy)
def test_1_alignment(self):
    a = chemfp.load_fingerprints(StringIO(zeros), reorder=True, alignment=1)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 0)
    self.assertEquals(a.arena, "\x00\x00\x00\x00\x00\x10")
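# `zeros` is defined elsewhere in the test module. A definition consistent
# with the assertions here and with the reorder=False test below would be
# three two-byte fingerprints where only the middle record has a bit set
# (the ids are placeholders):
#
#     zeros = "0000\tA\n0010\tB\n0000\tC\n"
#
# With reorder=True the 0010 record sorts last by popcount, giving the
# "...\x00\x10" arena checked above; with reorder=False it stays second,
# giving "\x00\x00\x00\x10\x00\x00".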
def distance_matrix(self, t):
    fingerprints_file = self.compound_group.fp_file
    arena = chemfp.load_fingerprints(fingerprints_file)
    n = len(arena)
    # The Tanimoto search computes all of the scores when threshold=0.0.
    # The SearchResult contains sparse data, so I set all values
    # now to 1.0 so you can experiment with higher thresholds.
    distances = numpy.ones((n, n), numpy.float64)
    # Keep track of where the query subarena is in the query
    query_row = 0
    for query_arena in arena.iter_arenas():
        results = arena.threshold_tanimoto_search_arena(query_arena, threshold=t)
        for q_i, hits in enumerate(results.iter_indices_and_scores()):
            query_idx = query_row + q_i
            for target_idx, score in hits:
                distances[query_idx, target_idx] = 1 - score
        query_row += len(query_arena)

    #############################
    self.data.new_file()
    self.data.write(distances.tostring())
    self.data.close()
    self.save(validate=False, cascade=False)
    self.calc_knn_fp(arena)
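# A worked example of why `distances` is initialized to 1.0: with, say,
# threshold t=0.4 the sparse SearchResult only reports pairs scoring at
# or above 0.4, so
#
#     score 0.80 reported   -> distances[q, t] = 1 - 0.80 = 0.20
#     score 0.35 unreported -> distances[q, t] stays at the initial 1.0
#
# Unreported pairs therefore read as "maximally distant" rather than as
# the bogus distance 0.0 that numpy.zeros() would have produced.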
def test_2_alignment(self):
    a = chemfp.load_fingerprints(StringIO(zeros), reorder=True, alignment=2)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 1)  # The code overallocates one byte
    self.assertEquals(a.arena, "\x00\x00\x00\x00\x00\x10\x00")
def test_2_alignment(self):
    a = chemfp.load_fingerprints(StringIO(zeros), reorder=False, alignment=2)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 0)
    self.assertEquals(a.arena, "\x00\x00\x00\x10\x00\x00")
    self.assertEquals(a.storage_size, 2)
def _load(fingerprints, reorder):
    if len(fingerprints) == 0:
        num_bits = 16
    else:
        num_bits = len(fingerprints[0]) * 8
    id_fps = ((str(i), fp) for (i, fp) in enumerate(fingerprints))
    return chemfp.load_fingerprints(id_fps,
                                    metadata=chemfp.Metadata(num_bits=num_bits),
                                    reorder=reorder,
                                    alignment=1)
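# A quick check of _load: each fingerprint is a raw byte string and the
# ids become the stringified list indices (the byte values here are
# illustrative):
arena = _load(["\x00\x10", "\xff\xff", "\x00\x00"], reorder=False)
assert list(arena.ids) == ["0", "1", "2"]   # input order kept with reorder=False
assert arena.metadata.num_bits == 16        # 2 bytes * 8 bits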
def get_coordinate(request):
    if request.is_ajax():
        # pdb.set_trace()
        mdsres = MDSRes.objects().with_id(request.session.get("mds_id"))
        Clusters = []
        compound_group = CompoundCollection.objects().with_id(
            request.session.get("collection_id"))
        if mdsres is not None:
            data = numpy.fromstring(mdsres.data.read())
            d = len(data)
            datamd = numpy.reshape(data, (d / 3, 3))
            compounds = Compound.objects(
                compound_group=request.session.get("collection_id"))
            if mdsres.simmatrix.method == "FP":
                arena = chemfp.load_fingerprints(
                    mdsres.simmatrix.compound_group.fp_file)
                compounds = arena.ids
            elif mdsres.simmatrix.method == "eden":
                compounds = [compound.name for compound in compounds]
            elif mdsres.simmatrix.method in ("pre_cluster_eden",
                                             "pre_cluster_eden_pca"):
                Clusters = Cluster.objects(
                    collection=request.session.get("collection_id"))
                Clusters = [{"nodes": [node.name for node in cluster.nodes],
                             "id": str(cluster.id),
                             "centriod": cluster.centriod.name,
                             "density": len(cluster.nodes)}
                            for cluster in Clusters]
                compounds = Cluster.get_clusters_centriod(compound_group)
                compounds = [compound.name for compound in compounds]
        else:  # no MDS result: fall back to the PCA result
            pcares = PCARes.objects().with_id(request.session.get("mds_id"))
            data = numpy.fromstring(pcares.data.read())
            d = len(data)
            datamd = numpy.reshape(data, (d / 3, 3))
            Clusters = Cluster.objects(
                collection=request.session.get("collection_id"))
            Clusters = [{"nodes": [node.name for node in cluster.nodes],
                         "id": str(cluster.id),
                         "centriod": cluster.centriod.name,
                         "density": len(cluster.nodes)}
                        for cluster in Clusters]
            compounds = Cluster.get_clusters_centriod(compound_group)
            compounds = [compound.name for compound in compounds]
        # The order of the compounds is not correct!!! Make sure to fix
        # (arena.ids[idx] from the fingerprint file).
        response_data = {}
        response_data['result'] = 'Success'
        response_data['message'] = 'Compounds Coordinate'
        response_data['coord'] = datamd.tolist()
        response_data["clusters"] = Clusters
        response_data['comps'] = []
        for compound in compounds:
            compitem = {"name": compound}
            response_data['comps'].append(compitem)
        return HttpResponse(json.dumps(response_data),
                            mimetype="application/json")
    return HttpResponse({}, mimetype="application/json")
def main(args=None):
    args = parser.parse_args(args)
    if args.profile and psutil is None:
        sys.stderr.write(
            "WARNING: Must install the 'psutil' module to see memory statistics.\n")

    # Load the fingerprints
    start_stats = get_profile_stats()
    try:
        arena = chemfp.load_fingerprints(args.fingerprint_filename)
    except IOError as err:
        sys.stderr.write("Cannot open fingerprint file: %s\n" % (err,))
        raise SystemExit(2)

    # Make sure I can generate output before doing the heavy calculations
    outfile, outfile_close = open_output(parser, args.output)
    try:
        load_stats = get_profile_stats()

        # Generate the NxN similarity matrix for the given threshold
        similarity_table = search.threshold_tanimoto_search_symmetric(
            arena, threshold=args.threshold)
        similarity_stats = get_profile_stats()

        # Do the clustering
        cluster_results = taylor_butina_cluster(similarity_table)
        cluster_stats = get_profile_stats()

        # Report the results
        report_cluster_results(cluster_results, arena, outfile)

        # Report the time and memory use.
        if args.profile:
            print("#fingerprints:", len(arena), "#bits/fp:", arena.num_bits,
                  "threshold:", args.threshold, "#matches:",
                  similarity_table.count_all(), file=sys.stderr)
            profile_report("Load", start_stats, load_stats)
            profile_report("Similarity", load_stats, similarity_stats)
            profile_report("Clustering", similarity_stats, cluster_stats)
            profile_report("Total", start_stats, get_profile_time())
    finally:
        outfile_close()
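# Hypothetical invocation (the script and file names are placeholders;
# the exact flags come from argparse definitions not shown in this excerpt):
#
#   python taylor_butina.py --threshold 0.8 --profile pubchem.fps
#
# With --profile set and psutil installed, per-stage timing and memory
# lines from profile_report() are written to stderr.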
def test_16_alignment(self):
    arenas = [chemfp.load_fingerprints(StringIO(ordered_zeros),
                                       reorder=True, alignment=16)
              for i in range(10)]
    for a in arenas:
        if a.start_padding == 0 and a.end_padding == 0:
            s = a.arena
        else:
            self.assertEquals(a.start_padding + a.end_padding + 1, 16)
            self.assertEquals(a.arena[:a.start_padding], "\x00" * a.start_padding)
            self.assertEquals(a.arena[-a.end_padding:], "\x00" * a.end_padding)
            s = a.arena[a.start_padding:-a.end_padding]
        self.assertEquals(s, ("\x00"*16 + "\x00"*16 + "\x00\x10" + "\x00"*14))
def view_mds_result(request):
    comps = Compound.objects
    mdsres = MDSRes.objects()
    data = numpy.fromstring(mdsres[0].data.read())
    d = len(data)
    datamd = numpy.reshape(data, (d / 3, 3))
    arena = chemfp.load_fingerprints(mdsres[0].simmatrix.fp)
    alldata = zip(datamd.tolist(), arena.ids)
    return render_to_response('view_mds.html', {
        "view_titel": "View MDS Result",
        "data": alldata,
        "Compounds": comps
    }, context_instance=RequestContext(request))
def test_8_alignment(self):
    arenas = [chemfp.load_fingerprints(StringIO(zeros), reorder=False, alignment=8)
              for i in range(10)]
    for a in arenas:
        if a.start_padding == a.end_padding == 0:
            s = a.arena
        else:
            self.assertEquals(a.arena[:a.start_padding], "\x00" * a.start_padding)
            self.assertEquals(a.arena[-a.end_padding:], "\x00" * a.end_padding)
            s = a.arena[a.start_padding:-a.end_padding]
        self.assertEquals(s,
                          "\x00\x00\x00\x00\x00\x00\x00\x00"
                          "\x00\x10\x00\x00\x00\x00\x00\x00"
                          "\x00\x00\x00\x00\x00\x00\x00\x00")
        self.assertEquals(a.storage_size, 8)
def make_fingerprint_arena(
    mols: Union[Molmap, Mapping[str, str]],
    fingerprint_type: str = "morgan",
    fingerprint_args: Mapping[str, Any] = {},
) -> chemfp.arena.FingerprintArena:
    fp_maker = chemfp_fingerprint_functions[fingerprint_type](
        fingerprint_args).make_fingerprinter()
    if isinstance(next(iter(mols.values())), Mol):
        # Values are RDKit Mol objects: compute the fingerprints on the fly.
        fp_generator = ((str(n), fp_maker(m)) for n, m in mols.items())
        fp = fp_maker(next(iter(mols.values())))
    else:
        # Values are already fingerprint strings: just encode them to bytes.
        fp_generator = ((str(n), fp.encode()) for n, fp in mols.items())
        fp = next(iter(mols.values()))
    arena = chemfp.load_fingerprints(
        fp_generator, metadata=chemfp.Metadata(num_bits=len(fp) * 8))
    return arena
def search_pubchem(MACCS_bit, thr):
    """Search the local PubChem .fps chunks for fingerprints within thr."""
    out = []
    converted = chemfp.encodings.from_binary_lsb(MACCS_bit)
    for i in range(5323):
        a = ("./fps/Compound_" + str(i * 25000 + 1).zfill(9) + "_" +
             str((i + 1) * 25000).zfill(9) + ".fps")
        try:
            arena = chemfp.load_fingerprints(a, reorder=False, format="fps")
            out.extend(
                chemfp.search.threshold_tanimoto_search_fp(
                    converted[1], arena, thr).get_ids_and_scores())
        except IOError:
            # Some chunk files may be missing; skip them.
            #print "No such file or directory: " + a
            pass
    return out
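# chemfp.encodings.from_binary_lsb() returns a two-element tuple whose
# second element is the fingerprint byte string, which is why the code
# above passes converted[1] as the query. A short illustration, assuming
# the input is a '0'/'1' string such as a 166-bit MACCS pattern (treating
# the first element as the bit count is an assumption):
num_bits, query_fp = chemfp.encodings.from_binary_lsb("0" * 166)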
def smidf2arena(smidf, reorder=True):
    # Write df of smiles, id
    smidf.to_csv('smidf.smi', header=False, sep=' ', index=False)

    # Generate fps file
    sp.call(['rdkit2fps', './smidf.smi', '-o', 'smidf.fps'])

    ## Load the FPs into an arena
    try:
        arena = chemfp.load_fingerprints('./smidf.fps', reorder=reorder)
    except IOError as err:
        sys.stderr.write("Cannot open fingerprint file: %s\n" % (err,))
        raise SystemExit(2)

    # Remove files
    sp.call(['rm', './smidf.smi', './smidf.fps'])

    # Return arena
    return arena
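# Usage sketch: smidf is assumed to be a two-column pandas DataFrame with
# the SMILES column first and the identifier second, since rdkit2fps
# expects "SMILES id" lines (the import and data are illustrative):
import pandas as pd
smidf = pd.DataFrame([("c1ccccc1", "benzene"), ("CCO", "ethanol")],
                     columns=["smiles", "id"])
arena = smidf2arena(smidf)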
from __future__ import absolute_import, with_statement

import unittest2

from support import fullpath

import chemfp
import chemfp.bitops
import _chemfp

set_alignment_method = chemfp.bitops.set_alignment_method
get_alignment_method = chemfp.bitops.get_alignment_method

CHEBI_TARGETS = fullpath("chebi_rdmaccs.fps")
CHEBI_QUERIES = fullpath("chebi_queries.fps.gz")

targets = chemfp.load_fingerprints(CHEBI_TARGETS, alignment=8)
targets_64 = chemfp.load_fingerprints(CHEBI_TARGETS, alignment=64)

available_methods = chemfp.bitops.get_methods()
alignment_methods = chemfp.bitops.get_alignment_methods()

all_methods = dict.fromkeys(
    "LUT8-1 LUT8-4 LUT16-4 Lauradoux POPCNT Gillies ssse3".split())


class TestMethods(unittest2.TestCase):
    def test_no_duplicates(self):
        methods = chemfp.bitops.get_methods()
        self.assertEquals(len(methods), len(set(methods)))

    def test_for_unknown_methods(self):
        for method in chemfp.bitops.get_methods():
def read_chemfp(input_file):
    reader = chemfp.read_molecule_fingerprints("RDKit-Morgan fpSize=1024", input_file)
    arena_all = chemfp.load_fingerprints(reader, reorder=False)
    return arena_all
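# read_molecule_fingerprints() parses the structure file and computes
# 1024-bit RDKit Morgan fingerprints on the fly; load_fingerprints() then
# materializes them into an in-memory arena, keeping the input order
# because reorder=False. Usage sketch (the filename is a placeholder):
arena = read_chemfp("compounds.sdf")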
parser.add_argument("-c", "--cluster", dest="cluster_image", help="Path to the output cluster image.") parser.add_argument("-s", "--smatrix", dest="similarity_matrix", help="Path to the similarity matrix output file.") parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float, default=0.0, help="Tanimoto threshold [0.0]") parser.add_argument("--oformat", default='png', help="Output format (png, svg)") parser.add_argument('-p', '--processors', type=int, default=4) args = parser.parse_args() targets = chemfp.open( args.input_path, format='fps' ) arena = chemfp.load_fingerprints( targets ) distances = distance_matrix( arena, args.tanimoto_threshold ) if args.similarity_matrix: numpy.savetxt(args.similarity_matrix, distances) if args.cluster_image: linkage = hcluster.linkage(distances, method="single", metric="euclidean") hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) pylab.savefig(args.cluster_image, format=args.oformat)
def butina(args):
    """
    Taylor-Butina clustering from the chemfp help.
    """
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)
    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")
    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    #for (size, fp_idx, members) in results:
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        #print arena.ids[fp_idx], [arena.ids[m] for m in members]
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen
        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # This is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    len_cluster = len(clusters)
    #out.write("#%s true singletons: %s\n" % (len(true_singletons),
    #    " ".join(sorted(arena.ids[idx] for idx in true_singletons))))
    #out.write("#%s false singletons: %s\n" % (len(false_singletons),
    #    " ".join(sorted(arena.ids[idx] for idx in false_singletons))))
    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len_cluster)

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members),
                                    " ".join(arena.ids[idx] for idx in members)))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
def _open(self, name):
    return chemfp.load_fingerprints(name, reorder=True)
# Test the symmetric code

import unittest2
from cStringIO import StringIO
import array

import chemfp
from chemfp import search, bitops
from support import fullpath, PUBCHEM_SDF, PUBCHEM_SDF_GZ

fps = chemfp.load_fingerprints(fullpath("queries.fps"))

zeros = chemfp.load_fingerprints(StringIO("""\
0000\tA
0000\tB
0001\tC
0002\tD
FFFE\tE
FFFF\tF
"""))


def slow_counts(counts, fps, threshold,
                query_start, query_end,
                target_start, target_end):
    N = len(fps)
    query_end = min(N, query_end)
    target_end = min(N, target_end)
    for row in range(query_start, query_end):
        row_fp = fps[row][1]
def test_4_alignment(self):
    a = chemfp.load_fingerprints(StringIO(ordered_zeros), reorder=True, alignment=4)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 0)
    self.assertEquals(a.arena, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00")
# An implementation of Taylor-Butina clustering
# See http://www.chemomine.co.uk/dbclus-paper.pdf
# and http://www.redbrick.dcu.ie/~noel/R_clustering.html

import chemfp

THRESHOLD = 0.80

dataset = chemfp.load_fingerprints("docs/pubchem_targets.fps")
print "Clustering", len(dataset), "fingerprints"

# I'll make a list with tuples containing:
# - the number of hits
# - an arbitrary and not very good tie-breaker value (larger values go first)
# - the fingerprint index
# - the list of fingerprint indices within THRESHOLD of that fingerprint


def tie_breaker_value(hits):
    # This is pretty arbitrary; it's the largest non-1.0 score, or 1.0.
    # Noel references better work on tie breaking by John MacCuish at Mesa Analytics
    try:
        tie_breaker = max(score for (idx, score) in hits if score != 1.0)
    except ValueError:
        tie_breaker = 1.0
    return tie_breaker


def hit_members(hits):
    return [idx for (idx, score) in hits]
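# A worked example of the tie-breaker: hits are (index, score) pairs, the
# self-match scoring 1.0 is excluded, and an all-1.0 hit list falls back
# to 1.0 via the ValueError branch:
assert tie_breaker_value([(0, 1.0), (3, 0.95), (7, 0.85)]) == 0.95
assert tie_breaker_value([(0, 1.0)]) == 1.0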
raise SystemExit(1)

batch_size = args.batch_size
query_arena_iter = queries.iter_arenas(batch_size)

t1 = time.time()

first_query_arena = None
for first_query_arena in query_arena_iter:
    break

if args.scan:
    # Leave the targets as-is
    pass
elif args.memory:
    targets = chemfp.load_fingerprints(targets)

if not first_query_arena:
    # No input. Leave as-is
    pass
elif len(first_query_arena) < min(10, batch_size):
    # Figure out the optimal search. If there is a
    # small number of inputs (< ~10) then a scan
    # of the FPS file is faster than an arena search.
    pass
else:
    targets = chemfp.load_fingerprints(targets)

problems = chemfp.check_metadata_problems(queries.metadata, targets.metadata)
for (severity, error, msg_template) in problems:
    msg = msg_template % dict(metadata1="queries", metadata2="targets")
    if severity == "error":
temp_link = "%s.%s" % (temp_file.name, 'fps') temp_file.close() os.system('ln -s %s %s' % (os.path.realpath(sys.argv[1]), temp_link) ) chemfp_fingerprint_file = temp_link tanimoto_threshold = float(sys.argv[2]) outfile = sys.argv[3] processors = int(sys.argv[4]) def get_hit_indicies(hits): return [id for (id, score) in hits] out = open(outfile, 'w') dataset = chemfp.load_fingerprints( chemfp_fingerprint_file ) chemfp.set_num_threads( processors ) search = dataset.threshold_tanimoto_search_arena(dataset, threshold = tanimoto_threshold) #search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold) # Reorder so the centroid with the most hits comes first. # (That's why I do a reverse search.) # Ignore the arbitrariness of breaking ties by fingerprint index results = sorted( ( (len(hits), i, hits) for (i, hits) in enumerate(search.iter_indices_and_scores()) ),reverse=True) # Determine the true/false singletons and the clusters true_singletons = [] false_singletons = [] clusters = []
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_index = {0:2, 1:0, 2:1}
    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(zip(targets.ids, xrange(len(targets))))
    current_index_to_original_id = dict((i, original_ids[original_index])
                                        for i, original_index in enumerate(targets.ids))
    t2 = time.time()

    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)
        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    original_id = original_ids[new_index]
                    outfile.write(hit_formatter % (original_id, score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2 - t1, t3 - t2, t3 - t1))
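# The index bookkeeping from the comment above, demonstrated standalone.
# For fingerprints 0003/ID_A (2 bits set), 010a/ID_B (3 bits), 1000/ID_C
# (1 bit), the popcount sort puts ID_C first, so the arena "ids" (which
# here are the original input indices) come out as [2, 0, 1]:
original_ids = ["ID_A", "ID_B", "ID_C"]
arena_ids = [2, 0, 1]  # what targets.ids holds after load_fingerprints sorts
original_index_to_current_index = dict(zip(arena_ids, range(len(arena_ids))))
assert original_index_to_current_index == {2: 0, 0: 1, 1: 2}
assert [original_ids[i] for i in arena_ids] == ["ID_C", "ID_A", "ID_B"]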
temp_link = "%s.%s" % (temp_file.name, 'fps') temp_file.close() os.system('ln -s %s %s' % (os.path.realpath(sys.argv[1]), temp_link)) chemfp_fingerprint_file = temp_link tanimoto_threshold = float(sys.argv[2]) outfile = sys.argv[3] processors = int(sys.argv[4]) def get_hit_indicies(hits): return [id for (id, score) in hits] out = open(outfile, 'w') dataset = chemfp.load_fingerprints(chemfp_fingerprint_file) chemfp.set_num_threads(processors) search = dataset.threshold_tanimoto_search_arena(dataset, threshold=tanimoto_threshold) #search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold) # Reorder so the centroid with the most hits comes first. # (That's why I do a reverse search.) # Ignore the arbitrariness of breaking ties by fingerprint index results = sorted( ((len(hits), i, hits) for (i, hits) in enumerate(search.iter_indices_and_scores())), reverse=True) # Determine the true/false singletons and the clusters