Example #1
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print("type D: ", type(D))
    print("type x_train: ", type(X_train))
    print("type x_test: ", type(X_test))
    print("type distance: ", type(distance))
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    X_train = dataset_transform[distance](X_train)
    X_test = dataset_transform[distance](X_test)

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(algo, X_train, X_test,
                                                       distance, count,
                                                       run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm,
                                                    batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
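
In Example #1, X_train and X_test are passed through a dataset_transform mapping that is not shown in the snippet. Below is a minimal sketch of what such a mapping might look like, assuming the common convention of unit-normalizing vectors for angular distance and passing other metrics through unchanged; the keys and lambdas are illustrative, not the project's actual table.

import numpy

# Illustrative sketch only: map each distance metric to a preprocessing step.
# Angular (cosine) distance is assumed to be served by normalizing every row
# to unit length; the remaining metrics leave the data untouched.
dataset_transform = {
    'angular': lambda X: X / numpy.linalg.norm(X, axis=1, keepdims=True),
    'euclidean': lambda X: X,
    'hamming': lambda X: X,
    'jaccard': lambda X: X,
}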
Example #2
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition,
                          query_arguments, descriptor, results, batch)
    finally:
        algo.done()
Example #3
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
            or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    if algo.builds_graph():
        # Test data first to avoid converting test set index to graph index
        X_train = numpy.concatenate((X_test, X_train))
        # The protocol expects the count to be given at query time, so it has
        # to be set as a parameter beforehand.
        algo.set_count(count)
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)

        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                    (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            if algo.builds_graph():
                descriptor, results = check_graph(algo, X_train, X_test, distance, count)
            else:
                descriptor, results = run_individual_query(algo, X_train, X_test,
                    distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm, batch)
            descriptor["dataset"] = dataset
            descriptor["count"] = int(count)
            descriptor["batch_mode"] = batch
            store_results(dataset, count, definition,
                    query_arguments, descriptor, results, batch)
    finally:
        algo.done()
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        metavar='NAME',
                        help='the dataset to load training points from',
                        default='glove-100-angular')
    parser.add_argument("-k",
                        "--count",
                        default=10,
                        type=positive_int,
                        help="the number of near neighbours to search for")
    parser.add_argument('--definitions',
                        metavar='FILE',
                        help='load algorithm definitions from FILE',
                        default='algos.yaml')
    parser.add_argument('--algorithm',
                        metavar='NAME',
                        help='run only the named algorithm',
                        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help=
        'run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help=
        'Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument('--single',
                        help='run only a single algorithm instance at a time',
                        action='store_true')
    parser.add_argument('--batch',
                        help='Provide Queryset as Batch',
                        action='store_true')
    parser.add_argument('--no_save_index',
                        help='do not save indices',
                        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print('The following algorithms are supported...')
        for point in definitions:
            print('\t... for the point type "%s"...' % point)
            for metric in definitions[point]:
                print('\t\t... and the distance metric "%s":' % metric)
                for algorithm in definitions[point][metric]:
                    print('\t\t\t%s' % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    X_train = dataset['train']
    X_test = dataset['test']
    distance = dataset.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.count, distance):
            algos_already_run.add((run.attrs["library"], run.attrs["name"]))

    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
              [algo for algo in algos[args.algorithm] if algo.name == args.sub_algorithm]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(target=run_algo,
                                    args=(args.count, X_train, X_test, library,
                                          algo, distance, send_pipe, args.runs,
                                          args.single, args.batch))

        p.start()
        send_pipe.close()

        timed_out = False
        try:
            r = recv_pipe.poll(args.timeout)
            if r:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                attrs, results = recv_pipe.recv()
                if "expect_extra" in attrs:
                    if attrs["expect_extra"]:
                        attrs["extra"] = recv_pipe.recv()
                    del attrs["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(attrs, results, args.dataset, args.count, distance)
        elif timed_out:
            print('algorithm worker process took too long')
        else:
            print('algorithm worker process stopped unexpectedly')
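
The main() in Example #4 expects the worker spawned via run_algo to send an (attrs, results) pair over the pipe, followed by an optional second message when attrs["expect_extra"] is set. run_algo itself is not part of the snippet; the following is only a hedged sketch of the sending side of that protocol, with the body reduced to a bare minimum and every algo method assumed from the other examples.

def run_algo(count, X_train, X_test, library, algo, distance,
             send_pipe, run_count, force_single, use_batch_query):
    # Hypothetical sketch of the worker side of the pipe protocol used above:
    # build the index, run the queries, then send (attrs, results) and, if
    # attrs["expect_extra"] is true, a second message with the extra payload.
    try:
        algo.fit(X_train)
        results = [algo.query(v, count) for v in X_test]
        attrs = {"name": algo.name,
                 "expect_extra": hasattr(algo, "query_verbose")}
        send_pipe.send((attrs, results))
        if attrs["expect_extra"]:
            send_pipe.send([algo.query_verbose(v, count) for v in X_test])
    finally:
        algo.done()
        send_pipe.close()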
Example #5
def run(definition,
        dataset,
        count,
        run_count=3,
        force_single=False,
        use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            n_items_processed = [0]  # a bit dumb, but it can't be a scalar because of Python's scoping rules

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates
                ]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' %
                          (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print(
                        'warning: algorithm %s returned %d results, but count is only %d'
                        % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                result = algo.batch_query(X, count)
                total = (time.time() - start)
                candidates = [[
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in single_results
                ] for v, single_results in zip(X, result)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(
                len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
            "distance": distance,
            "count": int(count),
            "algo": definition.algorithm,
            "dataset": dataset
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
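
Example #5 computes candidate distances through a metrics table of the form metrics[distance]['distance'](a, b), which is not shown in the snippet. A minimal, assumed sketch with plain cosine and Euclidean distance functions:

import numpy

# Illustrative sketch of the metrics table used by single_query/batch_query:
# every entry exposes a 'distance' callable taking two vectors and
# returning a float.
metrics = {
    'angular': {
        'distance': lambda a, b: 1 - numpy.dot(a, b) /
            (numpy.linalg.norm(a) * numpy.linalg.norm(b)),
    },
    'euclidean': {
        'distance': lambda a, b: numpy.linalg.norm(
            numpy.asarray(a) - numpy.asarray(b)),
    },
}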
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        metavar='NAME',
                        help='the dataset to load training points from',
                        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help=
        'load query points from another dataset instead of choosing them randomly from the training dataset',
        default=None)
    parser.add_argument("-k",
                        "--count",
                        default=10,
                        type=positive_int,
                        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help=
        'the maximum number of points to load from the dataset, or -1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument('--definitions',
                        metavar='FILE',
                        help='load algorithm definitions from FILE',
                        default='algos.yaml')
    parser.add_argument('--algorithm',
                        metavar='NAME',
                        help='run only the named algorithm',
                        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help=
        'run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help=
        'Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument('--single',
                        help='run only a single algorithm instance at a time',
                        action='store_true')
    parser.add_argument('--no_save_index',
                        help='do not save indices',
                        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print "The following algorithms are supported..."
        for point in definitions:
            print "\t... for the point type '%s'..." % point
            for metric in definitions[point]:
                print "\t\t... and the distance metric '%s':" % metric
                for algorithm in definitions[point][metric]:
                    print "\t\t\t%s" % algorithm
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(args.dataset, args.count, args.limit,
                                      args.distance, args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count, X_train, X_test)
        with open(queries_fn, 'wb') as f:
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
              [algo for algo in algos[args.algorithm] if algo.name == args.sub_algorithm]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(target=run_algo,
                                    args=(args.count, X_train, queries,
                                          library, algo, args.distance,
                                          send_pipe, args.runs, args.single))

        p.start()
        send_pipe.close()

        timed_out = False
        try:
            results = recv_pipe.poll(args.timeout)
            if results:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print "(algorithm worker process took too long)"
        else:
            print "(algorithm worker process stopped unexpectedly)"
Example #7
def run(definition, dataset, count, run_count=3, force_single=False, use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i+1, run_count))
            n_items_processed = [0]  # a bit dumb, but it can't be a scalar because of Python's scoping rules

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx])))
                              for idx in candidates]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' % (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print('warning: algorithm %s returned %d results, but count is only %d' % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                result = algo.batch_query(X, count)
                total = (time.time() - start)
                candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx])))
                               for idx in single_results]
                              for v, single_results in zip(X, result)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
Example #8
def main():
    parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            '--dataset',
            metavar='NAME',
            help='the dataset to load training points from',
            default='glove')
    parser.add_argument(
            '--query-dataset',
            metavar='NAME',
            help='load query points from another dataset instead of choosing them randomly from the training dataset',
            default=None)
    parser.add_argument(
            "-k", "--count",
            default=10,
            type=positive_int,
            help="the number of near neighbours to search for")
    parser.add_argument(
            '--distance',
            help='the metric used to calculate the distance between points',
            default='angular')
    parser.add_argument(
            '--limit',
            help='the maximum number of points to load from the dataset, or -1 to load all of them',
            type=int,
            default=-1)
    parser.add_argument(
            '--definitions',
            metavar='FILE',
            help='load algorithm definitions from FILE',
            default='algos.yaml')
    parser.add_argument(
            '--algorithm',
            metavar='NAME',
            help='run only the named algorithm',
            default=None)
    parser.add_argument(
            '--sub-algorithm',
            metavar='NAME',
            help='run only the named instance of an algorithm (requires --algo)',
            default=None)
    parser.add_argument(
            '--list-algorithms',
            help='print the names of all known algorithms and exit',
            action='store_true',
            default=argparse.SUPPRESS)
    parser.add_argument(
            '--force',
            help='''re-run algorithms even if their results already exist''',
            action='store_true')
    parser.add_argument(
            '--runs',
            metavar='COUNT',
            type=positive_int,
            help='run each algorithm instance %(metavar)s times and use only the best result',
            default=3)
    parser.add_argument(
            '--timeout',
            type=int,
            help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
            default=-1)
    parser.add_argument(
            '--single',
            help='run only a single algorithm instance at a time',
            action='store_true')
    parser.add_argument(
            '--no_save_index',
            help='do not save indices',
            action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print "The following algorithms are supported..."
        for point in definitions:
            print "\t... for the point type '%s'..." % point
            for metric in definitions[point]:
                print "\t\t... and the distance metric '%s':" % metric
                for algorithm in definitions[point][metric]:
                    print "\t\t\t%s" % algorithm
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
                X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(
        args.dataset, args.count, args.limit, args.distance, args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count, X_train, X_test)
        with open(queries_fn, 'wb') as f:
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors,
        len(X_train[0]), point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
              [algo for algo in algos[args.algorithm] if algo.name == args.sub_algorithm]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))

        p.start()
        send_pipe.close()

        timed_out = False
        try:
            results = recv_pipe.poll(args.timeout)
            if results:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(results, args.dataset, args.limit,
                    args.count, args.distance, args.query_dataset)
        elif timed_out:
            print "(algorithm worker process took too long)"
        else:
            print "(algorithm worker process stopped unexpectedly)"