Example #1
    def test_row_iteration(self):
        reader = chemfp.open(CHEBI_TARGETS)
        num = sum(1 for x in reader.iter_rows())
        self.assertEqual(num, 2000)

        row_reader = chemfp.open(CHEBI_TARGETS).iter_rows()
        fields = [next(row_reader) for i in range(5)]
        self.assertEqual(
            fields,
            [['00000000000000008200008490892dc00dc4a7d21e', 'CHEBI:776'],
             ['000000000000200080000002800002040c0482d608', 'CHEBI:1148'],
             ['0000000000000221000800111601017000c1a3d21e', 'CHEBI:1734'],
             ['00000000000000000000020000100000000400951e', 'CHEBI:1895'],
             ['0000000002001021820a00011681015004cdb3d21e', 'CHEBI:2303']])
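
Each row from iter_rows() is a [hex_fingerprint, id] pair of plain strings taken
straight from the FPS file, so the rows drop easily into ordinary Python data
structures. A minimal sketch, assuming the same CHEBI_TARGETS fixture as the
tests (the id_to_hex name is just for illustration):

import chemfp

reader = chemfp.open(CHEBI_TARGETS)
# Build an id -> hex-fingerprint lookup from the raw text rows.
id_to_hex = dict((row[1], row[0]) for row in reader.iter_rows())
print id_to_hex["CHEBI:776"]   # 00000000000000008200008490892dc00dc4a7d21e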
Example #2
    def test_reiter_open_handle_arena_search(self):
        reader = chemfp.open(CHEBI_TARGETS)
        # The main goal is to prevent people from searching a
        # partially read file.  This reflects an implementation
        # problem; the iterator should be shared across all instances.
        it = iter(reader)
        arena = next(it)
        for method in (reader.id_threshold_tanimoto_search,
                       reader.id_knearest_tanimoto_search):
            with self.assertRaisesRegexp(TypeError,
                                         "FPS file is not at the start"):
                for x in method(arena):
                    break
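
In other words, once the handle has been partially consumed, its search methods
refuse to run. A minimal sketch of the intended pattern, using the same
fixtures: re-open the file so the reader is back at the start before searching.

import chemfp

query_arena = next(chemfp.open(CHEBI_QUERIES).iter_arenas(10))
reader = chemfp.open(CHEBI_TARGETS)   # fresh handle, still at the start
for query_id, hits in reader.id_threshold_tanimoto_search(query_arena):
    pass   # consuming the results is fine; no "not at the start" error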
Example #3
    def test_iter_blocks(self):
        reader = chemfp.open(CHEBI_TARGETS)
        line_counts = 0
        has_776 = False
        has_17582 = False
        for block in reader.iter_blocks():
            line_counts += block.count("\n")
            if "00000000000000008200008490892dc00dc4a7d21e\tCHEBI:776" in block:
                has_776 = True
            if "00000000020012008008000104000064844ca2521c\tCHEBI:17582" in block:
                has_17582 = True

        self.assertEqual(line_counts, 2000)
        self.assertTrue(has_776, "Missing CHEBI:776")
        self.assertTrue(has_17582, "Missing CHEBI:17582")
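
iter_blocks() yields raw chunks of FPS text rather than parsed records, which
makes cheap substring scans possible. A minimal sketch over the same fixture
(note this is a plain substring test, so an id that is a prefix of a longer id
could false-match):

import chemfp

reader = chemfp.open(CHEBI_TARGETS)
found = any("\tCHEBI:776" in block for block in reader.iter_blocks())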
Example #4
    def test_id_threshold_tanimoto_search_0_on_0(self):
        zeros = ("0000\tfirst\n" "0010\tsecond\n" "0000\tthird\n")
        query_arena = next(chemfp.open(StringIO(zeros)).iter_arenas())
        self.assertEqual(query_arena.ids, ["first", "second", "third"])

        targets = self._open(StringIO(zeros))
        result = targets.id_threshold_tanimoto_search(query_arena,
                                                      threshold=0.0)
        ids, hits = zip(*result)
        self.assertSequenceEqual(ids, query_arena.arena_ids)
        self.assertEqual(map(len, hits), [3, 3, 3])

        targets = self._open(StringIO(zeros))
        result = targets.id_threshold_tanimoto_search(query_arena,
                                                      threshold=0.000001)
        ids, hits = zip(*result)
        self.assertSequenceEqual(ids, query_arena.arena_ids)
        self.assertEqual(map(len, hits), [0, 1, 0])
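
The two assertions pin down an edge case: at threshold=0.0 every target passes
for every query, including pairs of all-zero fingerprints, while any strictly
positive threshold keeps only the genuine self-match of the "0010" record. A
standalone sketch of the same behavior, assuming the plain chemfp.open() reader
rather than the test's self._open factory:

from StringIO import StringIO
import chemfp

zeros = "0000\tfirst\n0010\tsecond\n0000\tthird\n"
query_arena = next(chemfp.open(StringIO(zeros)).iter_arenas())
targets = chemfp.open(StringIO(zeros))
for query_id, hits in targets.id_threshold_tanimoto_search(query_arena, threshold=0.0):
    print query_id, len(hits)   # each query matches all 3 targets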
Example #5
def main(args=None):
    args = parser.parse_args(args)
    target_filename = args.target_filename[0]

    threshold = args.threshold
    k = args.k_nearest

    if args.count and k is not None and k != "all":
        parser.error("--count search does not support --k-nearest")

    # People should not use this without setting parameters.  On the
    # other hand, I don't want an error message if there are no
    # parameters. This solution seems to make sense.

    if threshold is None:
        if k is None:
            # If nothing is set, use defaults of --threshold 0.7 -k 3
            threshold = 0.7
            k = 3
        else:
            # only k is set; search over all possible matches
            threshold = 0.0
    else:
        if k is None:
            # only threshold is set; search for all hits above that threshold
            k = "all"

    if k == "all":
        pass
    elif k < 0:
        parser.error("--k-nearest must be non-negative or 'all'")

    if not (0.0 <= threshold <= 1.0):
        parser.error("--threshold must be between 0.0 and 1.0, inclusive")

    if args.batch_size < 1:
        parser.error("--batch-size must be positive")

    bitops.use_environment_variables()

    if args.NxN:
        if args.scan:
            parser.error("Cannot specify --scan with an --NxN search")
        if args.hex_query:
            parser.error("Cannot specify --hex-query with an --NxN search")
        if args.queries:
            parser.error("Cannot specify --queries with an --NxN search")
        do_NxN_searches(args, k, threshold, target_filename)
        return
            
    if args.scan and args.memory:
        parser.error("Cannot specify both --scan and --memory")

    if args.hex_query and args.queries:
        parser.error("Cannot specify both --hex-query and --queries")
    if args.hex_query:
        query_id = args.query_id
        for c, name in ( ("\t", "tab"),
                         ("\n", "newline"),
                         ("\r", "control-return"),
                         ("\0", "NUL")):
            if c in query_id:
                parser.error("--query-id must not contain the %s character" %
                             (name,))
    

    # Open the target file. This reads just enough to get the header.

    try:
        targets = chemfp.open(target_filename, format=args.target_format)
    except (IOError, ValueError, chemfp.ChemFPError), err:
        sys.stderr.write("Cannot open targets file: %s\n" % err)
        raise SystemExit(1)
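
The defaulting rules above reduce to four cases. A tiny standalone sketch of
the same logic (resolve_search_params is a hypothetical helper, not part of
simsearch):

def resolve_search_params(threshold, k):
    if threshold is None and k is None:
        return 0.7, 3            # nothing set: --threshold 0.7 -k 3
    if threshold is None:
        return 0.0, k            # only -k set: rank over all possible matches
    if k is None:
        return threshold, "all"  # only --threshold set: keep every hit above it
    return threshold, k

assert resolve_search_params(None, None) == (0.7, 3)
assert resolve_search_params(None, 10) == (0.0, 10)
assert resolve_search_params(0.9, None) == (0.9, "all")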
Example #6
        for (severity, error, msg_template) in chemfp.check_fp_problems(query_fp, targets.metadata):
            if severity == "error":
                parser.error(msg_template % dict(fp="query", metadata=repr(target_filename)))
            
        num_bits = targets.metadata.num_bits
        if num_bits is None:
            num_bits = len(query_fp) * 8
        query_metadata = chemfp.Metadata(num_bits=num_bits, num_bytes=len(query_fp))
        queries = chemfp.Fingerprints(query_metadata,
                                      [(query_id, query_fp)])
        query_filename = None
    else:
        query_filename = args.queries
        try:
            queries = chemfp.open(query_filename, format=args.query_format)
        except (ValueError, IOError, chemfp.ChemFPError), err:
            sys.stderr.write("Cannot open queries file: %s\n" % (err,))
            raise SystemExit(1)

    batch_size = args.batch_size
    query_arena_iter = queries.iter_arenas(batch_size)
    
    t1 = time.time()

    first_query_arena = None
    for first_query_arena in query_arena_iter:
        break

    if args.scan:
        # Leave the targets as-is
Example #7
    parser.add_argument("-c", "--cluster", dest="cluster_image",
                    help="Path to the output cluster image.")

    parser.add_argument("-s", "--smatrix", dest="similarity_matrix",
                    help="Path to the similarity matrix output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", 
                    type=float, default=0.0,
                    help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg)")

    parser.add_argument('-p', '--processors', type=int, 
        default=4)

    args = parser.parse_args()

    targets = chemfp.open( args.input_path, format='fps' )
    arena = chemfp.load_fingerprints( targets )
    distances  = distance_matrix( arena, args.tanimoto_threshold )

    if args.similarity_matrix:
        numpy.savetxt(args.similarity_matrix, distances)

    if args.cluster_image:
        linkage = hcluster.linkage(distances, method="single", metric="euclidean")
        hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.)
        pylab.savefig(args.cluster_image, format=args.oformat)

Example #8
def butina(args):
    """
        Taylor-Butina clustering from the chemfp help.
    """
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)

    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    #for (size, fp_idx, members) in results:
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen

        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # this is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    len_cluster = len(clusters)
    #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
    #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )

    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len_cluster)

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" %
                  (centroid_name, len(members), " ".join(arena.ids[idx]
                                                         for idx in members)))
        # TODO: len(members) should perhaps be limited to the largest clusters (top 90% or so)

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
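
One pass over the popcount-sorted candidates is enough: each unseen fingerprint
with unassigned neighbors becomes a centroid and claims those neighbors. A
hypothetical driver showing how butina() might be invoked (argparse.Namespace
stands in for the real command-line parser; the attribute names mirror the ones
used above):

import argparse

args = argparse.Namespace(
    input_path="targets.fps",              # assumed FPS file
    output_path=open("clusters.txt", "w"), # butina() closes this when done
    tanimoto_threshold=0.8,
    processors=4,
)
butina(args)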
Example #9
    def _open(self, name):
        reader = chemfp.open(name)
        return SlowFingerprints(reader.metadata, list(reader))
Example #10
assert DBL_MIN > 0.0

CHEBI_TARGETS = fullpath("chebi_rdmaccs.fps")
CHEBI_QUERIES = fullpath("chebi_queries.fps.gz")
MACCS_SMI = fullpath("maccs.smi")

# Backwards compatibility for Python 2.5
try:
    next
except NameError:

    def next(it):
        return it.next()


QUERY_ARENA = next(chemfp.open(CHEBI_QUERIES).iter_arenas(10))


class CommonReaderAPI(object):
    _open = None

    def _check_target_metadata(self, metadata):
        self.assertEqual(metadata.num_bits, 166)
        self.assertEqual(metadata.num_bytes, 21)
        self.assertEqual(metadata.software, "OEChem/1.7.4 (20100809)")
        self.assertEqual(metadata.type, "RDMACCS-OpenEye/1")
        self.assertEqual(metadata.sources,
                         ["/Users/dalke/databases/ChEBI_lite.sdf.gz"])
        self.assertEqual(metadata.date, "2011-09-16T13:49:04")
        self.assertEqual(metadata.aromaticity, "mmff")
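
All of the expected values come from the FPS header, which a fresh reader
exposes through its metadata attribute. A minimal sketch:

import chemfp

reader = chemfp.open(CHEBI_TARGETS)
print reader.metadata.num_bits   # 166
print reader.metadata.type       # RDMACCS-OpenEye/1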
Example #11
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_index = {0:2, 1:0, 2:1}

    original_ids = []
    fps = chemfp.open(target_filename)

    def get_index_to_id(fps):
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(
        zip(targets.ids, xrange(len(targets))))
    current_index_to_original_id = dict(
        (i, original_ids[original_index])
        for i, original_index in enumerate(targets.ids))

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(
            outfile, {
                "num_bits": targets.metadata.num_bits,
                "software": SOFTWARE,
                "type": type,
                "targets": target_filename,
                "target_sources": targets.metadata.sources
            })

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(
                targets, threshold, batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(
                targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(
                    targets, threshold, batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(
                    targets, k, threshold, batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[
                    current_index].get_ids_and_scores()
                outfile.write("%d\t%s" %
                              (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    original_id = original_ids[new_index]
                    outfile.write(hit_formatter % (original_id, score))
                outfile.write("\n")  # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" %
                         (t2 - t1, t3 - t2, t3 - t1))
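
The comment block at the top of do_NxN_searches() is the heart of the
bookkeeping. A standalone sketch of the two mappings on the same toy example
(plain dicts, no chemfp required):

original_ids = ["ID_A", "ID_B", "ID_C"]
targets_ids = [2, 0, 1]   # original indices, in popcount-sorted order

original_index_to_current_index = dict(zip(targets_ids, range(len(targets_ids))))
current_index_to_original_id = dict((i, original_ids[oi])
                                    for i, oi in enumerate(targets_ids))

assert original_index_to_current_index == {2: 0, 0: 1, 1: 2}
assert current_index_to_original_id == {0: "ID_C", 1: "ID_A", 2: "ID_B"}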
Example #12
def main(args=None):
    args = parser.parse_args(args)
    target_filename = args.target_filename[0]

    threshold = args.threshold
    k = args.k_nearest

    if args.count and k is not None and k != "all":
        parser.error("--count search does not support --k-nearest")

    # People should not use this without setting parameters.  On the
    # other hand, I don't want an error message if there are no
    # parameters. This solution seems to make sense.

    if threshold is None:
        if k is None:
            # If nothing is set, use defaults of --threshold 0.7 -k 3
            threshold = 0.7
            k = 3
        else:
            # only k is set; search over all possible matches
            threshold = 0.0
    else:
        if k is None:
            # only threshold is set; search for all hits above that threshold
            k = "all"

    if args.scan and args.memory:
        parser.error("Cannot specify both --scan and --memory")

    if args.hex_query and args.queries:
        parser.error("Cannot specify both --hex-query and --queries")
    if args.hex_query:
        query_id = args.query_id
        for c, name in (("\t", "tab"), ("\n", "newline"),
                        ("\r", "control-return"), ("\0", "NUL")):
            if c in query_id:
                parser.error("--query-id must not contain the %s character" %
                             (name, ))

    if k == "all":
        pass
    elif k < 0:
        parser.error("--k-nearest must non-negative or 'all'")

    if not (0.0 <= threshold <= 1.0):
        parser.error("--threshold must be between 0.0 and 1.0, inclusive")

    if args.batch_size < 1:
        parser.error("--batch-size must be positive")

    batch_size = args.batch_size

    bitops.use_environment_variables()

    # Open the target file. This reads just enough to get the header.

    targets = chemfp.open(target_filename)

    if args.hex_query is not None:
        try:
            query_fp = args.hex_query.decode("hex")
        except ValueError, err:
            parser.error("--hex-query is not a hex string: %s" % (err, ))

        for (severity, error,
             msg_template) in chemfp.check_fp_problems(query_fp,
                                                       targets.metadata):
            if severity == "error":
                parser.error(msg_template %
                             dict(fp="query", metadata=repr(target_filename)))

        num_bits = targets.metadata.num_bits
        if num_bits is None:
            num_bits = len(query_fp) * 8
        query_metadata = chemfp.Metadata(num_bits=num_bits,
                                         num_bytes=len(query_fp))
        queries = chemfp.Fingerprints(query_metadata, [(query_id, query_fp)])
        query_filename = None
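
The hex-query branch decodes the command-line string into raw fingerprint bytes
and, when the target metadata leaves num_bits unset, falls back to eight bits
per byte. A minimal sketch of that decode-and-size step (the hex string is just
an example value):

hex_query = "00000000000000008200008490892dc00dc4a7d21e"
query_fp = hex_query.decode("hex")   # Python 2 hex codec; fails on malformed input
num_bits = len(query_fp) * 8         # fallback when metadata.num_bits is None
print num_bits                       # 168 (21 bytes)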