Example #1
 def prepare_nccl2_env(self, is_local):
     """
     :param is_local:
     :return:
     """
     if not is_local:
         logging.debug("is_distributed: %s" % self.params["is_distributed"])
         if self.params["is_distributed"]:
             trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
             worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
             current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
             worker_endpoints = worker_endpoints_env.split(",")
             trainers_num = len(worker_endpoints)
             logging.debug(
                 "worker_endpoints:{} trainers_num:{} current_endpoint:{} \
                   trainer_id:{}".format(worker_endpoints, trainers_num,
                                         current_endpoint, trainer_id))
             # prepare nccl2 env.
             config = fluid.DistributeTranspilerConfig()
             config.mode = "nccl2"
             t = fluid.DistributeTranspiler(config=config)
             t.transpile(trainer_id,
                         trainers=worker_endpoints_env,
                         current_endpoint=current_endpoint,
                         program=self.train_program if
                         self.params["is_do_train"] else self.test_program,
                         startup_program=self.startup_program)
             self.num_trainers = trainers_num
             self.trainer_id = trainer_id
Example #2
def pserver_prepare(args, train_prog, startup_prog):
    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = args.split_var
    t = fluid.DistributeTranspiler(config=config)
    envs = args.dist_env
    training_role = envs["training_role"]

    t.transpile(envs["trainer_id"],
                program=train_prog,
                pservers=envs["pserver_endpoints"],
                trainers=envs["num_trainers"],
                sync_mode=not args.async_mode,
                startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(envs["current_endpoint"])
        pserver_startup_program = t.get_startup_program(
            envs["current_endpoint"],
            pserver_program,
            startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )
Example #3
 def transpile2dist():
     # pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
     # eplist = []
     # port = os.getenv("PADDLE_INIT_PORT")
     # for ip in pserver_ips.split(","):
     #     eplist.append(':'.join([ip, port]))
     # pserver_endpoints = ",".join(eplist)
     pserver_endpoints = os.getenv("PSERVERS")
     print("pserver endpoints: ", pserver_endpoints)
     trainers = int(os.getenv("TRAINERS"))  # total trainer count
     print("trainers total: ", trainers)
     trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", "0"))
     # current_endpoint = os.getenv(
     #         "POD_IP") + ":" + port  # current pserver endpoint
     current_endpoint = os.getenv("SERVER_ENDPOINT")
     role = os.getenv(
         "TRAINING_ROLE",
         "TRAINER")  # get the training role: trainer/pserver
     t = fluid.DistributeTranspiler()
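     # NOTE: optimize_ops and params_grads are assumed to come from
     # optimizer.minimize() in the enclosing scope (old-style transpile signature).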
     t.transpile(
         optimize_ops,
         params_grads,
         trainer_id,
         pservers=pserver_endpoints,
         trainers=trainers)
     return t, role, current_endpoint, trainer_id
Example #4
def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)

    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, data_list, loss, auc_var, batch_auc_var,
                   1, 0)
    else:
        logger.info("run dist training")
        t = fluid.DistributeTranspiler()
        t.transpile(args.trainer_id,
                    pservers=args.endpoints,
                    trainers=args.trainers)
        if args.role == "pserver":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(args.current_endpoint,
                                            pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, data_list, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
Example #5
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id=trainer_id,
                program=main_program,
                pservers=pserver_endpoints,
                trainers=trainers)
    return t
Example #6
def train(nn_type, use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    if nn_type == 'mlp':
        net_conf = mlp
    else:
        net_conf = conv_net

    prediction, avg_loss, acc = net_conf(img, label)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
                                batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.dataset.mnist.test(),
                               batch_size=BATCH_SIZE)
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)

    def train_loop(main_program):

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        st = fluid.ExecutionStrategy()
        st.num_threads = 1
        st.allow_op_delay = False
        exe = fluid.ParallelExecutor(use_cuda, avg_loss.name, exec_strategy=st)

        for pass_id in range(100):
            for batch_id, data in enumerate(train_reader()):
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
                print(loss)

    # port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
    # eplist = []
    # for ip in pserver_ips.split(","):
    #     eplist.append(':'.join([ip, port]))
    # pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
    pserver_endpoints = os.getenv("PADDLE_PSERVER_ENDPOINTS")
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # current_endpoint = os.getenv("POD_IP") + ":" + port
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
        ps_exe = fluid.Executor(fluid.CPUPlace())
        ps_exe.run(pserver_startup)
        ps_exe.run(pserver_prog)
    elif training_role == "TRAINER":
        train_loop(t.get_trainer_program())
Example #7
def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')

    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)

    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    port = os.getenv("PADDLE_INIT_PORT", port)
    pserver_ips = os.getenv("PADDLE_INIT_PSERVERS", ip)  # ip,ip...
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
    trainers = int(os.getenv("TRAINERS", trainer_count))
    current_endpoint = os.getenv("POD_IP", ip) + ":" + port
    trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", trainer_id))
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id,
                pservers=pserver_endpoints,
                trainers=trainers,
                sync_mode=sync_mode)
    pserver_prog = t.get_pserver_program(current_endpoint)
    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
Example #8
def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
    remove_ps_flag(os.getpid())
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')

    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)

    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    pserver_endpoints = ip + ":" + port
    current_endpoint = ip + ":" + port

    config = fluid.DistributeTranspilerConfig()
    config.sync_mode = sync_mode
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    pserver_prog = t.get_pserver_program(current_endpoint)
    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
Example #9
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = not args.no_split_var
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id,
                program=train_prog,
                pservers=pserver_endpoints,
                trainers=trainers,
                sync_mode=not args.async_mode,
                startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )
Example #10
    def get_transpiler(trainer_id,
                       main_program,
                       pserver_endpoints,
                       trainers,
                       sync_mode,
                       dc_asgd=False,
                       current_endpoint=None,
                       nccl_comm_num=1,
                       hogwild_mode=False):
        # NOTE: import fluid only at runtime (inside this function); otherwise forking processes will cause errors.
        config = fluid.DistributeTranspilerConfig()
        config.enable_dc_asgd = dc_asgd
        config.sync_mode = sync_mode
        config.runtime_split_send_recv = hogwild_mode

        if nccl_comm_num > 1:
            config.nccl_comm_num = nccl_comm_num
        # config.runtime_split_send_recv = True
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id=trainer_id,
                    program=main_program,
                    pservers=pserver_endpoints,
                    trainers=trainers,
                    sync_mode=sync_mode,
                    current_endpoint=current_endpoint)
        return t
Example #11
 def _transpiler_instance(self):
     main = self.get_main_program()
     t = fluid.DistributeTranspiler()
     t.transpile(self.trainer_id,
                 program=main,
                 pservers=self.pserver_eps,
                 trainers=self.trainers)
     return t
Example #12
def nccl2_prepare(trainer_id, startup_prog, main_prog):
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
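    # In "nccl2" mode, trainers takes the comma-separated endpoint list rather than a trainer count.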
    t.transpile(trainer_id,
                trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
                current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
                startup_program=startup_prog,
                program=main_prog)
Example #13
def train():
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)

    loss = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_loss = fluid.layers.mean(loss)
    opt = fluid.optimizer.SGD(learning_rate=0.001)
    opt.minimize(avg_loss)

    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
    exe = fluid.Executor(place)

    # fetch distributed training environment settings
    training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "localhost")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "localhost") + ":" + port

    t = fluid.DistributeTranspiler()
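    # sync_mode=False in the transpile call below selects asynchronous updates between trainers and pservers.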
    t.transpile(trainer_id=trainer_id,
                pservers=pserver_endpoints,
                trainers=trainers,
                sync_mode=False)

    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(startup_prog)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        exe.run(fluid.default_startup_program())

        for epoch in range(EPOCH_NUM):
            for batch_id, batch_data in enumerate(train_reader()):
                avg_loss_value, = exe.run(trainer_prog,
                                          feed=feeder.feed(batch_data),
                                          fetch_list=[avg_loss])
                if (batch_id + 1) % 10 == 0:
                    print("Epoch: {0}, Batch: {1}, loss: {2}".format(
                        epoch, batch_id, avg_loss_value[0]))
        # destroy the resources held for this trainer on the pserver nodes
        exe.close()
    else:
        raise AssertionError(
            "PADDLE_TRAINING_ROLE should be one of [TRAINER, PSERVER]")
Example #14
def nccl2_prepare(args, startup_prog):
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)

    envs = args.dist_env

    t.transpile(envs["trainer_id"],
                trainers=','.join(envs["trainer_endpoints"]),
                current_endpoint=envs["current_endpoint"],
                startup_program=startup_prog)
Example #15
    def _transpiler_instance(self, config=None, sync_mode=True):
        if not self.transpiler:
            main = self.get_main_program()
            self.transpiler = fluid.DistributeTranspiler(config=config)
            self.transpiler.transpile(self.trainer_id,
                                      program=main,
                                      pservers=self.pserver_eps,
                                      trainers=self.trainers,
                                      sync_mode=sync_mode)

        return self.transpiler
Example #16
def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc_var, batch_auc_var, py_reader, _ = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)
    if args.cloud_train:
        # the port of all pservers, needed by both trainer and pserver
        port = os.getenv("PADDLE_PORT", "6174")
        # comma separated ips of all pservers, needed by trainer and pserver
        pserver_ips = os.getenv("PADDLE_PSERVERS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        args.endpoints = ",".join(eplist)
        args.trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        args.current_endpoint = os.getenv("POD_IP", "localhost") + ":" + port
        args.role = os.getenv("TRAINING_ROLE", "TRAINER")
        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        args.is_local = bool(int(os.getenv("PADDLE_IS_LOCAL", 0)))

    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, py_reader, loss, auc_var, batch_auc_var,
                   1, 0)
    else:
        logger.info("run dist training")
        t = fluid.DistributeTranspiler()
        t.transpile(args.trainer_id,
                    pservers=args.endpoints,
                    trainers=args.trainers)
        if args.role == "pserver" or args.role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(args.current_endpoint,
                                            pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer" or args.role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, py_reader, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
        else:
            raise ValueError(
                'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
            )
Example #17
 def _build_trainer_program_for_job(
         self, trainer_id=0, program=None,
         ps_endpoints=[], trainers=0,
         sync_mode=True, startup_program=None,
         job=None):
     transpiler = fluid.DistributeTranspiler()
     transpiler.transpile(trainer_id,
                          program=program,
                          pservers=",".join(ps_endpoints),
                          trainers=trainers,
                          sync_mode=sync_mode,
                          startup_program=startup_program)
     main = transpiler.get_trainer_program(wait_port=False)
     job._trainer_startup_programs.append(startup_program)
     job._trainer_main_programs.append(main)
Example #18
def nccl2_prepare(args, startup_prog, main_prog):
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)

    #envs = args.dist_env
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "127.0.0.1:6170")
    trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "127.0.0.1:6170")

    t.transpile(trainer_id,
                trainers=trainer_endpoints,
                current_endpoint=current_endpoint,
                startup_program=startup_prog,
                program=main_prog)
Example #19
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None

    # the port of all pservers, needed by both trainer and pserver
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # comma separated ips of all pservers, needed by trainer and
    # pserver
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    # total number of workers/trainers in the job, needed by
    # trainer and pserver
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # the IP of the local machine, needed by pserver only
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")
    print("TRAINING_ROLE:", training_role)

    with fluid.program_guard(train_prog, startup_prog):
        config = fluid.DistributeTranspilerConfig()
        config.slice_var_up = not args.no_split_var
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            # NOTE: *MUST* use train_prog, because we are using a program guard
            # to generate different programs for train and test.
            program=train_prog,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=not args.async_mode)
        if training_role == "PSERVER":
            pserver_program = t.get_pserver_program(current_endpoint)
            pserver_startup_program = t.get_startup_program(
                current_endpoint,
                pserver_program,
                startup_program=startup_prog)
            return pserver_program, pserver_startup_program
        elif training_role == "TRAINER":
            train_program = t.get_trainer_program()
            return train_program, startup_prog
        else:
            raise ValueError(
                'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
            )
Example #20
def init_distribuition_env(program):
    if status.mode == DistributionMode.LOCAL:
        log.info('Initializing local training')
    elif status.mode == DistributionMode.NCCL:
        config = F.DistributeTranspilerConfig()
        config.mode = "nccl2"
        F.DistributeTranspiler(config=config).transpile(
            status.replica_id,
            trainers=','.join(status._env),
            current_endpoint=status._this,
            program=program.train_program,
            startup_program=program.startup_program)
        log.info('Initializing distributed training with config %s' %
                 repr(config))
        if status.is_master:
            sleep(30)
Example #21
 def get_transpiler(trainer_id,
                    main_program,
                    pserver_endpoints,
                    trainers,
                    sync_mode,
                    dc_asgd=False):
     # NOTE: import fluid only at runtime (inside this function); otherwise forking processes will cause errors.
     config = fluid.DistributeTranspilerConfig()
     config.enable_dc_asgd = dc_asgd
     t = fluid.DistributeTranspiler(config=config)
     t.transpile(trainer_id=trainer_id,
                 program=main_program,
                 pservers=pserver_endpoints,
                 trainers=trainers,
                 sync_mode=sync_mode)
     return t
Example #22
    def _define_pserver_executor(self, pserver_args, train_program,
                                 startup_program, test_program):
        sync_mode = pserver_args['sync_mode']
        role = pserver_args['role']
        trainer_id = pserver_args['trainer_id']  # get actual trainer id here
        trainers = pserver_args['trainers']
        current_endpoint = pserver_args['current_endpoint']
        endpoints = pserver_args['endpoints']  # ip:port,ip:port or ip,ip;port

        def _process_endpoints(endpoints):
            if ';' not in endpoints:
                return endpoints
            ips, port = endpoints.split(';')
            return ','.join(['%s:%s' % (ip, port) for ip in ips.split(',')])

        endpoints = _process_endpoints(endpoints)

        dist_config = fluid.DistributeTranspilerConfig()
        dist_config.slice_var_up = False
        t = fluid.DistributeTranspiler(config=dist_config)
        t.transpile(trainer_id,
                    program=train_program,
                    startup_program=startup_program,
                    pservers=endpoints,
                    trainers=trainers,
                    sync_mode=sync_mode)

        if role == "PSERVER":
            ps_prog, ps_startup = t.get_pserver_programs(current_endpoint)
            # use CPU to execute pserver
            self.base_exe = fluid.Executor(fluid.CPUPlace())
            self.base_exe.run(ps_startup)
            self.ckp_step = self.load_model(
                self.model_dir
            )  # PSERVER to load model, and TRAINER to save model
            self.base_exe.run(ps_prog)  # blocks here, serving parameter server requests
            exit()

        elif role == "TRAINER":
            place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
            self.base_exe = fluid.Executor(place)
            self.base_exe.run(startup_program)
            train_program = t.get_trainer_program()

            self.ckp_step = self.get_lastest_checkpoint(self.model_dir)

            self._define_parallel_executor(train_program, test_program)
Example #23
def train(args):

    if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    logger.info("run dist training")

    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
    if args.role == "pserver":
        print("run psever")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(args,
                   t.get_trainer_program(), word2vec_reader, py_reader, loss,
                   args.trainer_id, id_frequencys_pow)
Example #24
def append_nccl2_prepare(trainer_id, startup_prog):
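    # NOTE: the trainer_id argument is immediately overridden by the PADDLE_TRAINER_ID env var.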
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    port = os.getenv("PADDLE_PSERVER_PORT")
    worker_ips = os.getenv("PADDLE_TRAINER_IPS")
    worker_endpoints = []
    for ip in worker_ips.split(","):
        worker_endpoints.append(':'.join([ip, port]))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
    num_trainers = len(worker_endpoints)

    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id,
                trainers=','.join(worker_endpoints),
                current_endpoint=current_endpoint,
                startup_program=startup_prog)
    return num_trainers, trainer_id
Example #25
 def _build_server_programs_for_job(
         self, program=None, ps_endpoints=[],
         trainers=0, sync_mode=True,
         startup_program=None, job=None):
     transpiler = fluid.DistributeTranspiler()
     trainer_id = 0
     transpiler.transpile(
         trainer_id,
         program=program,
         pservers=",".join(ps_endpoints),
         trainers=trainers,
         sync_mode=sync_mode,
         startup_program=startup_program)
     job.set_server_endpoints(ps_endpoints)
     for endpoint in ps_endpoints:
         main_prog = transpiler.get_pserver_program(endpoint)
         startup_prog = transpiler.get_startup_program(endpoint, main_prog)
         job._server_startup_programs.append(startup_prog)
         job._server_main_programs.append(main_prog)
Example #26
    def test_nccl2_transpile(self):
        if fluid.core.is_compiled_with_cuda():  #test nccl2 only with cuda
            main = fluid.Program()
            startup = fluid.Program()
            with fluid.program_guard(main, startup):
                self.net_conf()

            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            t = fluid.DistributeTranspiler(config=config)
            t.transpile(0,
                        trainers="127.0.0.1:6174,127.0.0.1:6175",
                        current_endpoint="127.0.0.1:6174",
                        startup_program=startup)
            print([op.type for op in startup.global_block().ops])
            self.assertEqual(startup.global_block().ops[-1].type,
                             "gen_nccl_id")
            self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
        else:
            pass
Example #27
def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
                                 trainer_id):
    remove_ps_flag(os.getpid())
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')

    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)

    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    ps1 = ip + ":" + str(int(port) + 1)
    ps2 = ip + ":" + port
    pserver_endpoints = ps1 + "," + ps2

    config = fluid.DistributeTranspilerConfig()
    config.sync_mode = sync_mode
    config.slice_var_up = False
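    # With slice_var_up disabled, each parameter is placed whole on a single pserver,
    # so one of the two pservers receives no parameters (asserted below for ps2).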

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    pserver_prog = t.get_pserver_program(ps2)

    # pserver2 has no parameters
    assert (len(pserver_prog.blocks) == 2)
    assert (len(pserver_prog.blocks[1].ops) == 0)

    pserver_startup = t.get_startup_program(ps2, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
Example #28
def train(args):
    if args.enable_ce:
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    use_cuda = True if args.use_cuda else False
    parallel = True if args.parallel else False
    print("use_cuda:", use_cuda, "parallel:", parallel)
    train_reader, vocab_size = utils.construct_train_data(
        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
                                    args.hidden_size)
    # Train program
    train_input_data, cos_pos, avg_cost, acc = ssr.train()

    # Optimization to minimize loss
    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)

    print("run distribute training")
    t = fluid.DistributeTranspiler()
    t.transpile(args.trainer_id,
                pservers=args.endpoints,
                trainers=args.trainers)
    if args.role == "pserver":
        print("run psever")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(t.get_trainer_program(), avg_cost, acc, train_input_data,
                   place, args, train_reader)
Example #29
def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
    PASS_NUM = 100
    EMBED_SIZE = 32
    HIDDEN_SIZE = 256
    N = 5
    BATCH_SIZE = 32
    IS_SPARSE = is_sparse

    def __network__(words):
        embed_first = fluid.layers.embedding(input=words[0],
                                             size=[dict_size, EMBED_SIZE],
                                             dtype='float32',
                                             is_sparse=IS_SPARSE,
                                             param_attr='shared_w')
        embed_second = fluid.layers.embedding(input=words[1],
                                              size=[dict_size, EMBED_SIZE],
                                              dtype='float32',
                                              is_sparse=IS_SPARSE,
                                              param_attr='shared_w')
        embed_third = fluid.layers.embedding(input=words[2],
                                             size=[dict_size, EMBED_SIZE],
                                             dtype='float32',
                                             is_sparse=IS_SPARSE,
                                             param_attr='shared_w')
        embed_forth = fluid.layers.embedding(input=words[3],
                                             size=[dict_size, EMBED_SIZE],
                                             dtype='float32',
                                             is_sparse=IS_SPARSE,
                                             param_attr='shared_w')

        concat_embed = fluid.layers.concat(
            input=[embed_first, embed_second, embed_third, embed_forth],
            axis=1)
        hidden1 = fluid.layers.fc(input=concat_embed,
                                  size=HIDDEN_SIZE,
                                  act='sigmoid')
        predict_word = fluid.layers.fc(input=hidden1,
                                       size=dict_size,
                                       act='softmax')
        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
        avg_cost = fluid.layers.mean(cost)
        return avg_cost, predict_word

    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)

    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')

    if not is_parallel:
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])
    else:
        raise NotImplementedError()

    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)

    train_reader = paddle.batch(paddle.dataset.imikolov.train(word_dict, N),
                                BATCH_SIZE)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(
        feed_list=[first_word, second_word, third_word, forth_word, next_word],
        place=place)

    def train_loop(main_program):
        exe.run(fluid.default_startup_program())

        for pass_id in range(PASS_NUM):
            for data in train_reader():
                avg_cost_np = exe.run(main_program,
                                      feed=feeder.feed(data),
                                      fetch_list=[avg_cost])
                if avg_cost_np[0] < 5.0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            ['firstw', 'secondw', 'thirdw', 'forthw'],
                            [predict_word], exe)
                    return
                if math.isnan(float(avg_cost_np[0])):
                    sys.exit("got NaN loss, training failed.")

        raise AssertionError("Cost is too large {0:2.2}".format(
            avg_cost_np[0]))

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
Example #30
def train_main(use_cuda, is_sparse, is_local=True):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder(is_sparse)
    rnn_out = decoder_train(context, is_sparse)
    label = pd.data(name="target_language_next_word",
                    shape=[1],
                    dtype='int64',
                    lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)

    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    train_data = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
                              batch_size=batch_size)

    exe = Executor(place)

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        batch_id = 0
        for pass_id in xrange(1):
            for data in train_data():
                word_data = to_lodtensor(map(lambda x: x[0], data), place)
                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                outs = exe.run(main_program,
                               feed={
                                   'src_word_id': word_data,
                                   'target_language_word': trg_word,
                                   'target_language_next_word': trg_word_next
                               },
                               fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                if batch_id > 3:
                    break
                batch_id += 1

    if is_local:
        train_loop(framework.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())