Example 1
def determine_context(args: argparse.Namespace,
                      exit_stack: ExitStack) -> List[mx.Context]:
    """
    Determine the context we should run on (CPU or GPU).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :return: A list with the context(s) to run on.
    """
    if args.use_cpu:
        logger.info("Training Device: CPU")
        context = [mx.cpu()]
    else:
        num_gpus = utils.get_num_gpus()
        check_condition(
            num_gpus >= 1,
            "No GPUs found, consider running on the CPU with --use-cpu "
            "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
            "binary isn't on the path).")
        if args.disable_device_locking:
            context = utils.expand_requested_device_ids(args.device_ids)
        else:
            context = exit_stack.enter_context(
                utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir))
        if args.batch_type == C.BATCH_TYPE_SENTENCE:
            check_condition(
                args.batch_size % len(context) == 0,
                "When using multiple devices the batch size must be "
                "divisible by the number of devices. Choose a batch "
                "size that is a multiple of %d." % len(context))
        logger.info("Training Device(s): GPU %s", context)
        context = [mx.gpu(gpu_id) for gpu_id in context]
    return context
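A minimal invocation sketch, assuming the argparse fields the function reads above; the field values and the driver code are purely illustrative and not part of the original example.

# Hypothetical driver code for determine_context().
from argparse import Namespace
from contextlib import ExitStack

args = Namespace(use_cpu=True, device_ids=[-1], disable_device_locking=False,
                 lock_dir="/tmp", batch_type="sentence", batch_size=64)

with ExitStack() as exit_stack:
    context = determine_context(args, exit_stack)
    # GPU locks taken via acquire_gpus() are released when the ExitStack
    # closes, so training should run inside this block.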
Example 2
def main():
    logging.basicConfig(level=logging.INFO)
    args = build_parser().parse_args()

    data_loader = Inferred(args.dataset[0], args.dataset[1:])
    params = {
        'data_loader': data_loader,
        'scale': args.scale,
        'dropout': args.dropout,
        'backbone': args.backbone,
        'learning_rate': args.learning_rate,
        'optimizer': args.optimizer,
        'grad_clip_norm': args.grad_clip_norm
    }

    num_gpus = utils.get_num_gpus()
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=num_gpus) if num_gpus > 1 else None
    config = tf.estimator.RunConfig(train_distribute=distribution,
                                    model_dir=args.experiment,
                                    save_summary_steps=500,
                                    save_checkpoints_steps=500)

    estimator = tf.estimator.Estimator(model_fn, params=params, config=config)

    for epoch in range(args.epochs):
        print('epoch {}'.format(epoch))
        estimator.train(train_input_fn)
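The Estimator call above assumes `model_fn` and `train_input_fn` are defined elsewhere in the repository; a minimal, purely illustrative TF 1.x-style skeleton of the shapes it expects might look like this:

# Hypothetical stand-ins for the pieces the snippet above assumes exist.
import tensorflow as tf

def train_input_fn():
    # Toy dataset in place of the real data_loader pipeline.
    features = {"x": [[1.0], [2.0], [3.0], [4.0]]}
    labels = [[2.0], [4.0], [6.0], [8.0]]
    return tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(2)

def model_fn(features, labels, mode, params):
    predictions = tf.layers.dense(features["x"], 1)
    loss = tf.losses.mean_squared_error(labels, predictions)
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)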
Example 3
def determine_decode_and_evaluate_context(
        args: argparse.Namespace, exit_stack: ExitStack,
        train_context: List[mx.Context]) -> Tuple[int, Optional[mx.Context]]:
    """
    Determine the number of sentences to decode and the context we should run on (CPU or GPU).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :param train_context: Context for training.
    :return: The number of sentences to decode and a list with the context(s) to run on.
    """
    num_to_decode = args.decode_and_evaluate
    if args.optimized_metric == C.BLEU and num_to_decode == 0:
        logger.info(
            "You chose BLEU as the optimized metric, will turn on BLEU monitoring during training. "
            "To control how many validation sentences are used for calculating bleu use "
            "the --decode-and-evaluate argument.")
        num_to_decode = -1

    if num_to_decode == 0:
        return 0, None

    if args.use_cpu or args.decode_and_evaluate_use_cpu:
        context = mx.cpu()
    elif args.decode_and_evaluate_device_id is not None:
        # decode device is defined from the commandline
        num_gpus = utils.get_num_gpus()
        check_condition(
            num_gpus >= 1,
            "No GPUs found, consider running on the CPU with --use-cpu "
            "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
            "binary isn't on the path).")

        if args.disable_device_locking:
            context = utils.expand_requested_device_ids(
                [args.decode_and_evaluate_device_id])
        else:
            context = exit_stack.enter_context(
                utils.acquire_gpus([args.decode_and_evaluate_device_id],
                                   lock_dir=args.lock_dir))
        context = mx.gpu(context[0])

    else:
        # default decode context is the last training device
        context = train_context[-1]

    logger.info("Decode and Evaluate Device(s): %s", context)
    return num_to_decode, context
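A sketch of how this helper could share one ExitStack with determine_context from Example 1 (hypothetical glue code; `args` as in the earlier sketch).

# Hypothetical composition of the two helpers.
from contextlib import ExitStack

with ExitStack() as exit_stack:
    train_context = determine_context(args, exit_stack)
    num_to_decode, decode_context = determine_decode_and_evaluate_context(
        args, exit_stack, train_context)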
Example 4
def _setup_context(args, exit_stack):
    if args.use_cpu:
        context = mx.cpu()
    else:
        num_gpus = get_num_gpus()
        check_condition(num_gpus >= 1,
                        "No GPUs found, consider running on the CPU with --use-cpu "
                        "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                        "binary isn't on the path).")
        check_condition(len(args.device_ids) == 1, "cannot run on multiple devices for now")
        gpu_id = args.device_ids[0]
        if args.disable_device_locking:
            if gpu_id < 0:
                # without locking and a negative device id we just take the first device
                gpu_id = 0
        else:
            gpu_ids = exit_stack.enter_context(acquire_gpus([gpu_id], lock_dir=args.lock_dir))
            gpu_id = gpu_ids[0]

        context = mx.gpu(gpu_id)
    return context
Example 5
def get_cuda_devices():
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        # Respect an explicit device list; note this returns the ids as strings.
        return os.environ["CUDA_VISIBLE_DEVICES"].split(",")
    else:
        # Otherwise enumerate every GPU the machine reports, as integer ids.
        ngpus = get_num_gpus()
        return list(range(ngpus))
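Note the asymmetry: the environment branch returns the ids as strings, the fallback as ints. Callers such as the tests below normalize the result before re-exporting it, roughly:

# Hypothetical normalization, mirroring the join used in the tests below.
cvd = ",".join(map(str, get_cuda_devices()[:2]))  # e.g. "0,1"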
Example 6
def empty_dataframe():
    import cudf

    return cudf.DataFrame({"a": [1.0], "b": [1.0]}).head(0)


def cupy_obj():
    import cupy

    size = 10 ** 8
    return cupy.arange(size)


@pytest.mark.slow
@pytest.mark.skipif(
    get_num_gpus() <= 2, reason="Machine does not have more than two GPUs"
)
@pytest.mark.parametrize(
    "cuda_obj_generator", [dataframe, empty_dataframe, series, cupy_obj]
)
@pytest.mark.parametrize("comm_api", ["tag", "am"])
def test_send_recv_cu(cuda_obj_generator, comm_api):
    if comm_api == "am" and not ucp._libs.ucx_api.is_am_supported():
        pytest.skip("AM only supported in UCX >= 1.11")

    base_env = os.environ
    env_client = base_env.copy()
    # grab first two devices
    cvd = get_cuda_devices()[:2]
    cvd = ",".join(map(str, cvd))
    # reverse CVD for other worker
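Example 7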
def empty_dataframe():
    import cudf

    return cudf.DataFrame({"a": [1.0], "b": [1.0]}).head(0)


def cupy_obj():
    import cupy

    size = 10**8
    return cupy.arange(size)


@pytest.mark.slow
@pytest.mark.skipif(get_num_gpus() <= 2,
                    reason="Machine does not have more than two GPUs")
@pytest.mark.parametrize("cuda_obj_generator",
                         [dataframe, empty_dataframe, series, cupy_obj])
@pytest.mark.parametrize("comm_api", ["tag", "am"])
def test_send_recv_cu(cuda_obj_generator, comm_api):
    base_env = os.environ
    env_client = base_env.copy()
    # grab first two devices
    cvd = get_cuda_devices()[:2]
    cvd = ",".join(map(str, cvd))
    # reverse CVD for the other worker (string reversal flips e.g. "0,1" to
    # "1,0", which only works for single-digit device ids)
    env_client["CUDA_VISIBLE_DEVICES"] = cvd[::-1]

    port = random.randint(13000, 15500)
    # serialize function and send to the client and server
Example 8
    def __init__(self, worker_type, sched_addr, sched_port, worker_port,
                 num_gpus, run_dir, data_dir, checkpoint_dir, use_mps):
        logger = logging.getLogger('worker')
        logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setFormatter(
            logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT, style='{'))
        logger.addHandler(ch)
        self._logger = logger
        self._logging_handler = ch

        num_available_gpus = utils.get_num_gpus()
        if num_gpus > num_available_gpus:
            raise ValueError('%d GPUs requested active, but only %d total '
                             'GPUs are available' %
                             (num_gpus, num_available_gpus))
        signal.signal(signal.SIGINT, self._signal_handler)
        self._gpu_ids = list(range(num_gpus))
        self._worker_type = worker_type
        self._worker_addr = socket.gethostbyname(socket.gethostname())
        self._worker_port = worker_port
        self._worker_rpc_client = worker_client.WorkerRpcClient(
            self._worker_type, self._worker_addr, self._worker_port,
            sched_addr, sched_port)

        callbacks = {
            'RunJob': self._run_job_callback,
            'KillJob': self._kill_job_callback,
            'Reset': self._reset_callback,
            'Shutdown': self._shutdown_callback,
        }

        self._server_thread = threading.Thread(target=worker_server.serve,
                                               args=(
                                                   worker_port,
                                                   callbacks,
                                               ))
        self._server_thread.daemon = True
        self._server_thread.start()

        self._worker_ids, self._round_duration, error = \
            self._worker_rpc_client.register_worker(len(self._gpu_ids))
        if error:
            raise RuntimeError(error)

        if not os.path.isdir(checkpoint_dir):
            # Set up a new checkpoint directory if it does not already exist.
            os.mkdir(checkpoint_dir)
        else:
            # Clear the checkpoints if they have already been created.
            for dirname in os.listdir(checkpoint_dir):
                if os.path.isdir(os.path.join(checkpoint_dir, dirname)):
                    shutil.rmtree(os.path.join(checkpoint_dir, dirname))

        self._dispatcher = dispatcher.Dispatcher(self._round_duration,
                                                 self._gpu_ids,
                                                 self._worker_rpc_client,
                                                 sched_addr,
                                                 sched_port,
                                                 run_dir,
                                                 data_dir,
                                                 checkpoint_dir,
                                                 use_mps=use_mps)

        self._server_thread.join()
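Stripped of the worker machinery, the GPU-availability guard at the top of this constructor reduces to a small pattern (illustrative sketch only):

# Hypothetical helper distilling the guard above: cap requested GPUs against
# what utils.get_num_gpus() reports.
def allocate_gpu_ids(requested):
    available = utils.get_num_gpus()
    if requested > available:
        raise ValueError('%d GPUs requested, but only %d available'
                         % (requested, available))
    return list(range(requested))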