def run_from_cmdline():
    parser = argparse.ArgumentParser(
        description='NOTICE: You probably want run.py rather than this script.')
    parser.add_argument(
        '--dataset',
        choices=DATASETS.keys(),
        help='Dataset to benchmark on.',
        required=True)
    parser.add_argument(
        '--algorithm',
        help='Name of algorithm for saving the results.',
        required=True)
    parser.add_argument(
        '--module',
        help='Python module containing algorithm. '
             'E.g. "ann_benchmarks.algorithms.annoy"',
        required=True)
    parser.add_argument(
        '--constructor',
        help='Constructor to load from module. E.g. "Annoy"',
        required=True)
    parser.add_argument(
        '--count',
        help='K: Number of nearest neighbours for the algorithm to return.',
        required=True,
        type=int)
    parser.add_argument(
        '--runs',
        help='Number of times to run the algorithm. '
             'Will use the fastest run-time over the bunch.',
        required=True,
        type=int)
    parser.add_argument(
        '--batch',
        help='If flag included, algorithms will be run in batch mode, '
             'rather than "individual query" mode.',
        action='store_true')
    parser.add_argument(
        'build',
        help='JSON of arguments to pass to the constructor. '
             'E.g. ["angular", 100]')
    parser.add_argument(
        'queries',
        help='JSON of arguments to pass to the queries. E.g. [100]',
        nargs='*',
        default=[])
    args = parser.parse_args()
    algo_args = json.loads(args.build)
    print(algo_args)
    query_args = [json.loads(q) for q in args.queries]
    definition = Definition(
        algorithm=args.algorithm,
        docker_tag=None,  # not needed
        module=args.module,
        constructor=args.constructor,
        arguments=algo_args,
        query_argument_groups=query_args,
        disabled=False)
    run(definition, args.dataset, args.count, args.runs, args.batch)

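# A hypothetical invocation of the entry point above (the script name,
# dataset, module and JSON arguments are illustrative; run.py normally
# builds this command for you):
#
#   python run_algorithm.py --dataset glove-100-angular --algorithm annoy \
#       --module ann_benchmarks.algorithms.annoy --constructor Annoy \
#       --count 10 --runs 3 '["angular", 100]' '[100]'
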
def run_from_cmdline():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', choices=DATASETS.keys(), required=True)
    parser.add_argument('--algorithm', required=True)
    parser.add_argument('--module', required=True)
    parser.add_argument('--constructor', required=True)
    parser.add_argument('--count', required=True, type=int)
    parser.add_argument('--runs', required=True, type=int)
    parser.add_argument('--batch', action='store_true')
    parser.add_argument('build')
    parser.add_argument('queries', nargs='*', default=[])
    args = parser.parse_args()
    algo_args = json.loads(args.build)
    query_args = [json.loads(q) for q in args.queries]
    definition = Definition(
        algorithm=args.algorithm,
        docker_tag=None,  # not needed
        module=args.module,
        constructor=args.constructor,
        arguments=algo_args,
        query_argument_groups=query_args,
        disabled=False)
    run(definition, args.dataset, args.count, args.runs, args.batch)

def run_from_cmdline():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', choices=DATASETS.keys(), required=True)
    parser.add_argument('--algorithm', required=True)
    parser.add_argument('--module', required=True)
    parser.add_argument('--constructor', required=True)
    parser.add_argument('--count', required=True, type=int)
    parser.add_argument('--json-args', action='store_true')
    parser.add_argument('-a', '--arg', dest='args', action='append')
    args = parser.parse_args()
    if args.json_args:
        algo_args = [json.loads(arg) for arg in args.args]
    else:
        algo_args = args.args
    definition = Definition(
        algorithm=args.algorithm,
        docker_tag=None,  # not needed
        module=args.module,
        constructor=args.constructor,
        arguments=algo_args)
    run(definition, args.dataset, args.count)

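# Illustration of the two modes of the -a/--arg variant above: with
# --json-args each collected value is decoded as JSON, otherwise the raw
# strings are passed through. The sample values below are made up.
import json

raw = ['"angular"', '100']
print([json.loads(a) for a in raw])  # --json-args given: ['angular', 100]
print(raw)                           # flag omitted: ['"angular"', '100']
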
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        '-k', '--count',
        default=10,
        type=positive_int,
        help='the number of near neighbours to search for')
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=2)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yaml',
        action='store_true')
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition, query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]  # drop the version part of the tag
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, \
                "%s.%s(%s): error: the module '%s' does not expose the named constructor" % (
                    df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("%s.%s(%s): warning: the module '%s' could not be loaded; skipping" % (
                    df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()

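# The filtering loop above relies on Definition behaving like a namedtuple,
# so _replace returns a copy with only query_argument_groups swapped out.
# A tiny self-contained illustration of that pattern (field names abbreviated):
from collections import namedtuple

D = namedtuple('D', ['algorithm', 'query_argument_groups'])
d = D('annoy', [[100], [200]])
print(d._replace(query_argument_groups=[[200]]))
# D(algorithm='annoy', query_argument_groups=[[200]])
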
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        '-k', '--count',
        default=10,
        type=positive_int,
        help='the number of near neighbours to search for')
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if hasattr(args, "list_algorithms"):
        list_algorithms(args.definitions)
        sys.exit(0)

    # See which Docker images we have available
    docker_client = docker.from_env()
    docker_tags = set()
    for image in docker_client.images.list():
        for tag in image.tags:
            tag = tag.split(':')[0]  # drop the version part of the tag
            docker_tags.add(tag)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # TODO(erikbern): should make this a helper function somewhere
    definitions = [
        definition for definition in definitions
        if not os.path.exists(
            get_result_filename(args.dataset, args.count, definition))]

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if args.docker_tag:
        print('running only', args.docker_tag)
        definitions = [d for d in definitions if d.docker_tag == args.docker_tag]

    if set(d.docker_tag for d in definitions).difference(docker_tags):
        print('not all docker images available, only:', set(docker_tags))
        print('missing docker images:',
              set(d.docker_tag for d in definitions).difference(docker_tags))
        definitions = [d for d in definitions if d.docker_tag in docker_tags]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    print('order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()

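# The --list-algorithms flag in the variant above uses default=argparse.SUPPRESS,
# so the attribute only exists on the parsed namespace when the flag was
# actually passed, which is what the hasattr() check exploits. A tiny
# self-contained demonstration of that argparse pattern:
import argparse

p = argparse.ArgumentParser()
p.add_argument('--list', action='store_true', default=argparse.SUPPRESS)
print(hasattr(p.parse_args([]), 'list'))           # False: flag omitted
print(hasattr(p.parse_args(['--list']), 'list'))   # True: flag given
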
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        '-k', '--count',
        default=10,
        type=positive_int,
        help='the number of near neighbours to search for')
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only'
             ' the best result',
        default=5)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1'
             ' if no timeout should be set',
        default=2 * 3600)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--batch',
        action='store_true',
        help='If set, algorithms get all queries at once')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yaml',
        action='store_true')
    parser.add_argument(
        '--parallelism',
        type=positive_int,
        help='Number of Docker containers in parallel',
        default=1)
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("annb")

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset, dimension = get_dataset(args.dataset)
    point_type = dataset.attrs.get('point_type', 'float')
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition,
                query_arguments, args.batch)
            if args.force or not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        logger.info(f'running only {args.algorithm}')
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            logger.info(f'running only {args.docker_tag}')
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            logger.info(
                f'not all docker images available, only: {set(docker_tags)}')
            logger.info(
                f'missing docker images: '
                f'{set(d.docker_tag for d in definitions).difference(docker_tags)}')
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            if status == InstantiationStatus.NO_CONSTRUCTOR:
                raise Exception(
                    "%s.%s(%s): error: the module '%s' does not"
                    " expose the named constructor" % (
                        df.module, df.constructor, df.arguments, df.module))
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because
                # of a missing dependency), print a warning and remove
                # this definition from the list of things to be run
                logger.warning(
                    "%s.%s(%s): the module '%s' could not be"
                    " loaded; skipping" % (
                        df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            logger.info(
                f'Not running disabled algorithms '
                f'{[d for d in definitions if d.disabled]}')
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        logger.info(f'Order: {definitions}')

    if args.parallelism > multiprocessing.cpu_count() - 1:
        raise Exception('Parallelism larger than %d! (CPU count minus one)' %
                        (multiprocessing.cpu_count() - 1))

    # Multiprocessing magic to farm this out to all CPUs
    queue = multiprocessing.Queue()
    for definition in definitions:
        queue.put(definition)

    if args.batch and args.parallelism > 1:
        raise Exception(
            f'Batch mode uses all available CPU resources, --parallelism '
            f'should be set to 1. (Was: {args.parallelism})')

    workers = [
        multiprocessing.Process(target=run_worker, args=(i + 1, args, queue))
        for i in range(args.parallelism)]
    [worker.start() for worker in workers]
    [worker.join() for worker in workers]

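# run_worker is defined elsewhere in this module; below is a minimal sketch
# of the consuming side of the queue built above, with the signature inferred
# from the Process(target=run_worker, args=(i + 1, args, queue)) call. The
# body is an assumption for illustration, not the project's implementation.
# Since the queue is fully populated before any worker starts, draining it
# until queue.Empty is a safe termination condition:
def run_worker_sketch(cpu, args, queue):
    import queue as queue_lib  # for the Empty exception
    while True:
        try:
            definition = queue.get(timeout=1)
        except queue_lib.Empty:
            return  # queue drained; let the process exit
        run_docker(definition, args.dataset, args.count, args.runs)
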
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        '-k', '--count',
        default=10,
        type=positive_int,
        help='the number of near neighbours to search for')
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yaml',
        action='store_true')
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition, query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]  # drop the version part of the tag
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, \
                "%s.%s(%s): error: the module '%s' does not expose the named constructor" % (
                    df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("%s.%s(%s): warning: the module '%s' could not be loaded; skipping" % (
                    df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()

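# A sketch of the kind of check algorithm_status performs for the local-mode
# _test above. The helper name and string return values are illustrative;
# the real InstantiationStatus enum lives with the definition-loading code:
import importlib

def probe_constructor(module_name, constructor_name):
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        return 'NO_MODULE'       # cf. InstantiationStatus.NO_MODULE
    if not hasattr(module, constructor_name):
        return 'NO_CONSTRUCTOR'  # cf. InstantiationStatus.NO_CONSTRUCTOR
    return 'OK'                  # module and constructor both found
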
import argparse

from ann_benchmarks.datasets import DATASETS, get_dataset_fn

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', choices=DATASETS.keys(), required=True)
    args = parser.parse_args()
    fn = get_dataset_fn(args.dataset)
    DATASETS[args.dataset](fn)

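# Hypothetical invocation of the download script above (the file name
# create_dataset.py and the dataset choice are illustrative); each entry in
# DATASETS maps a dataset name to a function that writes the HDF5 file
# returned by get_dataset_fn:
#
#   python create_dataset.py --dataset glove-100-angular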