def test_row_iteration(self):
    reader = chemfp.open(CHEBI_TARGETS)
    num = sum(1 for x in reader.iter_rows())
    self.assertEqual(num, 2000)

    row_reader = chemfp.open(CHEBI_TARGETS).iter_rows()
    fields = [next(row_reader) for i in range(5)]
    self.assertEqual(fields,
                     [['00000000000000008200008490892dc00dc4a7d21e', 'CHEBI:776'],
                      ['000000000000200080000002800002040c0482d608', 'CHEBI:1148'],
                      ['0000000000000221000800111601017000c1a3d21e', 'CHEBI:1734'],
                      ['00000000000000000000020000100000000400951e', 'CHEBI:1895'],
                      ['0000000002001021820a00011681015004cdb3d21e', 'CHEBI:2303']])
def test_reiter_open_handle_arena_search(self):
    reader = chemfp.open(CHEBI_TARGETS)
    # The main goal is to prevent people from searching a partially
    # read file. This reflects an implementation problem; the
    # iterator should be shared across all instances.
    it = iter(reader)
    arena = next(it)
    for method in (reader.id_threshold_tanimoto_search,
                   reader.id_knearest_tanimoto_search):
        with self.assertRaisesRegexp(TypeError, "FPS file is not at the start"):
            for x in method(arena):
                break
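# Hedged sketch (not part of the test suite): the supported pattern is to
# load the whole FPS file into an arena before searching, instead of
# searching a reader whose iterator has already advanced. This assumes the
# arena exposes the same id_threshold_tanimoto_search used elsewhere here.
def _example_arena_search():
    targets = chemfp.load_fingerprints(chemfp.open(CHEBI_TARGETS))
    for query_id, hits in targets.id_threshold_tanimoto_search(QUERY_ARENA,
                                                               threshold=0.7):
        pass  # process each (query id, hits) pair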
def test_iter_blocks(self):
    reader = chemfp.open(CHEBI_TARGETS)
    line_counts = 0
    has_776 = False
    has_17582 = False
    for block in reader.iter_blocks():
        line_counts += block.count("\n")
        if "00000000000000008200008490892dc00dc4a7d21e\tCHEBI:776" in block:
            has_776 = True
        if "00000000020012008008000104000064844ca2521c\tCHEBI:17582" in block:
            has_17582 = True
    self.assertEqual(line_counts, 2000)
    self.assertTrue(has_776, "Missing CHEBI:776")
    self.assertTrue(has_17582, "Missing CHEBI:17582")
def test_id_threshold_tanimoto_search_0_on_0(self):
    zeros = ("0000\tfirst\n"
             "0010\tsecond\n"
             "0000\tthird\n")
    query_arena = next(chemfp.open(StringIO(zeros)).iter_arenas())
    self.assertEqual(query_arena.ids, ["first", "second", "third"])

    targets = self._open(StringIO(zeros))
    result = targets.id_threshold_tanimoto_search(query_arena, threshold=0.0)
    ids, hits = zip(*result)
    self.assertSequenceEqual(ids, query_arena.arena_ids)
    self.assertEqual(map(len, hits), [3, 3, 3])

    targets = self._open(StringIO(zeros))
    result = targets.id_threshold_tanimoto_search(query_arena, threshold=0.000001)
    ids, hits = zip(*result)
    self.assertSequenceEqual(ids, query_arena.arena_ids)
    self.assertEqual(map(len, hits), [0, 1, 0])
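# The test above hinges on chemfp's convention that the Tanimoto score of
# two all-zero fingerprints is 0.0, so threshold=0.0 matches everything
# while any positive threshold leaves only "second" matching itself. A
# quick check of that convention via chemfp.bitops, using the same hex
# strings as the test:
from chemfp import bitops
a = "0000".decode("hex")          # all-zero fingerprint
b = "0010".decode("hex")          # one bit set
print bitops.byte_tanimoto(a, a)  # 0.0 (0/0 is defined as 0.0)
print bitops.byte_tanimoto(b, b)  # 1.0
print bitops.byte_tanimoto(a, b)  # 0.0 (no bits in common)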
def main(args=None):
    args = parser.parse_args(args)
    target_filename = args.target_filename[0]

    threshold = args.threshold
    k = args.k_nearest

    if args.count and k is not None and k != "all":
        parser.error("--count search does not support --k-nearest")

    # People should not use this without setting parameters. On the
    # other hand, I don't want an error message if there are no
    # parameters. This solution seems to make sense.
    if threshold is None:
        if k is None:
            # If nothing is set, use the defaults --threshold 0.7 -k 3
            threshold = 0.7
            k = 3
        else:
            # Only k is set; search over all possible matches
            threshold = 0.0
    else:
        if k is None:
            # Only threshold is set; search for all hits above that threshold
            k = "all"

    if k == "all":
        pass
    elif k < 0:
        parser.error("--k-nearest must be non-negative or 'all'")

    if not (0.0 <= threshold <= 1.0):
        parser.error("--threshold must be between 0.0 and 1.0, inclusive")

    if args.batch_size < 1:
        parser.error("--batch-size must be positive")

    bitops.use_environment_variables()

    if args.NxN:
        if args.scan:
            parser.error("Cannot specify --scan with an --NxN search")
        if args.hex_query:
            parser.error("Cannot specify --hex-query with an --NxN search")
        if args.queries:
            parser.error("Cannot specify --queries with an --NxN search")
        do_NxN_searches(args, k, threshold, target_filename)
        return

    if args.scan and args.memory:
        parser.error("Cannot specify both --scan and --memory")

    if args.hex_query and args.queries:
        parser.error("Cannot specify both --hex-query and --queries")

    if args.hex_query:
        query_id = args.query_id
        for c, name in (("\t", "tab"), ("\n", "newline"),
                        ("\r", "control-return"), ("\0", "NUL")):
            if c in query_id:
                parser.error("--query-id must not contain the %s character" % (name,))

    # Open the target file. This reads just enough to get the header.
    try:
        targets = chemfp.open(target_filename, format=args.target_format)
    except (IOError, ValueError, chemfp.ChemFPError), err:
        sys.stderr.write("Cannot open targets file: %s\n" % err)
        raise SystemExit(1)
    if args.hex_query is not None:
        try:
            query_fp = args.hex_query.decode("hex")
        except ValueError, err:
            parser.error("--hex-query is not a hex string: %s" % (err,))

        for (severity, error, msg_template) in chemfp.check_fp_problems(query_fp, targets.metadata):
            if severity == "error":
                parser.error(msg_template % dict(fp="query",
                                                 metadata=repr(target_filename)))

        num_bits = targets.metadata.num_bits
        if num_bits is None:
            num_bits = len(query_fp) * 8
        query_metadata = chemfp.Metadata(num_bits=num_bits, num_bytes=len(query_fp))
        queries = chemfp.Fingerprints(query_metadata, [(query_id, query_fp)])
        query_filename = None
    else:
        query_filename = args.queries
        try:
            queries = chemfp.open(query_filename, format=args.query_format)
        except (ValueError, IOError, chemfp.ChemFPError), err:
            sys.stderr.write("Cannot open queries file: %s\n" % (err,))
            raise SystemExit(1)

    batch_size = args.batch_size
    query_arena_iter = queries.iter_arenas(batch_size)

    t1 = time.time()

    first_query_arena = None
    for first_query_arena in query_arena_iter:
        break

    if args.scan:
        # Leave the targets as-is
parser.add_argument("-c", "--cluster", dest="cluster_image", help="Path to the output cluster image.") parser.add_argument("-s", "--smatrix", dest="similarity_matrix", help="Path to the similarity matrix output file.") parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float, default=0.0, help="Tanimoto threshold [0.0]") parser.add_argument("--oformat", default='png', help="Output format (png, svg)") parser.add_argument('-p', '--processors', type=int, default=4) args = parser.parse_args() targets = chemfp.open( args.input_path, format='fps' ) arena = chemfp.load_fingerprints( targets ) distances = distance_matrix( arena, args.tanimoto_threshold ) if args.similarity_matrix: numpy.savetxt(args.similarity_matrix, distances) if args.cluster_image: linkage = hcluster.linkage(distances, method="single", metric="euclidean") hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) pylab.savefig(args.cluster_image, format=args.oformat)
def butina(args):
    """
    Taylor-Butina clustering, based on the chemfp documentation example.
    """
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)

    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen

        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # This is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len(clusters))

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members),
                                    " ".join(arena.ids[idx] for idx in members)))
        # TODO: maybe limit the members written to the largest ~90% of each cluster

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
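# unix_sort is called above but not defined in this excerpt. A hypothetical
# pure-Python stand-in that returns (size, index) pairs ordered with the
# largest exclusion sphere first, which is the order the loop above expects:
def unix_sort(results):
    sizes = [(len(hits), fp_idx)
             for fp_idx, hits in enumerate(results.iter_indices())]
    sizes.sort(reverse=True)
    return sizes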
def _open(self, name):
    reader = chemfp.open(name)
    return SlowFingerprints(reader.metadata, list(reader))
assert DBL_MIN > 0.0

CHEBI_TARGETS = fullpath("chebi_rdmaccs.fps")
CHEBI_QUERIES = fullpath("chebi_queries.fps.gz")
MACCS_SMI = fullpath("maccs.smi")

# Backwards compatibility for Python 2.5
try:
    next
except NameError:
    def next(it):
        return it.next()

QUERY_ARENA = next(chemfp.open(CHEBI_QUERIES).iter_arenas(10))


class CommonReaderAPI(object):
    _open = None

    def _check_target_metadata(self, metadata):
        self.assertEqual(metadata.num_bits, 166)
        self.assertEqual(metadata.num_bytes, 21)
        self.assertEqual(metadata.software, "OEChem/1.7.4 (20100809)")
        self.assertEqual(metadata.type, "RDMACCS-OpenEye/1")
        self.assertEqual(metadata.sources,
                         ["/Users/dalke/databases/ChEBI_lite.sdf.gz"])
        self.assertEqual(metadata.date, "2011-09-16T13:49:04")
        self.assertEqual(metadata.aromaticity, "mmff")
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount.
    # I want the output to be in the same order as the input,
    # which means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2: 0, 0: 1, 1: 2}
    #   current_index_to_original_index = {0: 2, 1: 0, 2: 1}
    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(zip(targets.ids, xrange(len(targets))))
    current_index_to_original_id = dict((i, original_ids[original_index])
                                        for i, original_index in enumerate(targets.ids))
    t2 = time.time()

    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    original_id = original_ids[new_index]
                    outfile.write(hit_formatter % (original_id, score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n"
                         % (t2 - t1, t3 - t2, t3 - t1))
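# Example of driving the NxN mode programmatically via main() (a sketch;
# "targets.fps" is a placeholder filename, and the flags mirror the options
# main() checks: --NxN, --threshold, and a positional targets file):
#
#     main(["--NxN", "--threshold", "0.8", "targets.fps"])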
def main(args=None):
    args = parser.parse_args(args)
    target_filename = args.target_filename[0]

    threshold = args.threshold
    k = args.k_nearest

    if args.count and k is not None and k != "all":
        parser.error("--count search does not support --k-nearest")

    # People should not use this without setting parameters. On the
    # other hand, I don't want an error message if there are no
    # parameters. This solution seems to make sense.
    if threshold is None:
        if k is None:
            # If nothing is set, use the defaults --threshold 0.7 -k 3
            threshold = 0.7
            k = 3
        else:
            # Only k is set; search over all possible matches
            threshold = 0.0
    else:
        if k is None:
            # Only threshold is set; search for all hits above that threshold
            k = "all"

    if args.scan and args.memory:
        parser.error("Cannot specify both --scan and --memory")

    if args.hex_query and args.queries:
        parser.error("Cannot specify both --hex-query and --queries")

    if args.hex_query:
        query_id = args.query_id
        for c, name in (("\t", "tab"), ("\n", "newline"),
                        ("\r", "control-return"), ("\0", "NUL")):
            if c in query_id:
                parser.error("--query-id must not contain the %s character" % (name,))

    if k == "all":
        pass
    elif k < 0:
        parser.error("--k-nearest must be non-negative or 'all'")

    if not (0.0 <= threshold <= 1.0):
        parser.error("--threshold must be between 0.0 and 1.0, inclusive")

    if args.batch_size < 1:
        parser.error("--batch-size must be positive")
    batch_size = args.batch_size

    bitops.use_environment_variables()

    # Open the target file. This reads just enough to get the header.
    targets = chemfp.open(target_filename)

    if args.hex_query is not None:
        try:
            query_fp = args.hex_query.decode("hex")
        except ValueError, err:
            parser.error("--hex-query is not a hex string: %s" % (err,))

        for (severity, error, msg_template) in chemfp.check_fp_problems(query_fp, targets.metadata):
            if severity == "error":
                parser.error(msg_template % dict(fp="query",
                                                 metadata=repr(target_filename)))

        num_bits = targets.metadata.num_bits
        if num_bits is None:
            num_bits = len(query_fp) * 8
        query_metadata = chemfp.Metadata(num_bits=num_bits, num_bytes=len(query_fp))
        queries = chemfp.Fingerprints(query_metadata, [(query_id, query_fp)])
        query_filename = None