Example #1
    def test_open_sync_batch_norm(self):
        import paddle.fluid as fluid
        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
        from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

        if not fluid.core.is_compiled_with_cuda():
            # the "gen_nccl_id" operator is only registered in CUDA builds
            return

        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
        hidden = fluid.layers.fc(input=data, size=10)
        loss = fluid.layers.mean(hidden)

        optimizer = fluid.optimizer.AdamOptimizer()

        role = role_maker.UserDefinedCollectiveRoleMaker(0, ['127.0.0.1:6170'])
        fleet.init(role)

        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = True

        dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                     strategy=dist_strategy)
        dist_optimizer.minimize(loss)

        self.assertEqual(dist_strategy.exec_strategy.num_threads, 1)
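
The closing assertion is the point of the test: once sync_batch_norm is enabled, the collective optimizer pins the execution strategy to a single thread. The role maker itself generalizes to multiple workers; a minimal sketch, with the rank and endpoints purely illustrative:

    endpoints = ['127.0.0.1:6170', '127.0.0.1:6171']
    # each process passes its own rank plus the full, ordered endpoint list
    role = role_maker.UserDefinedCollectiveRoleMaker(
        current_id=1, worker_endpoints=endpoints)  # running as the second worker
    fleet.init(role)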
Example #2
    def split_filelist(self, FLAGS):
        """
        split filelist for multi-node or multi gpus.
        """
        if self.is_multi_gpu(FLAGS):
            filelist_arr = FLAGS.file_list.split(',')
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
            num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM"))
            trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
            trainer_endpoints = trainer_endpoints.split(',')

            role = role_maker.UserDefinedCollectiveRoleMaker(
                current_id=trainer_id, worker_endpoints=trainer_endpoints)
            fleet.init(role)

            filelist_arr = fleet.split_files(filelist_arr)
            FLAGS.file_list = ','.join(filelist_arr)
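
fleet.split_files hands each trainer a contiguous, near-equal slice of the list, with earlier ranks absorbing the remainder. A minimal sketch of that splitting scheme (split_files_sketch is a hypothetical stand-in, not Paddle's implementation):

    def split_files_sketch(files, trainer_id, num_trainers):
        """Give each trainer a contiguous, near-equal slice of `files`."""
        block = len(files) // num_trainers
        remainder = len(files) % num_trainers
        start = trainer_id * block + min(trainer_id, remainder)
        end = start + block + (1 if trainer_id < remainder else 0)
        return files[start:end]

    # 5 files across 2 trainers: rank 0 gets 3 files, rank 1 gets 2
    assert split_files_sketch(list("abcde"), 0, 2) == ["a", "b", "c"]
    assert split_files_sketch(list("abcde"), 1, 2) == ["d", "e"]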
Example #3
    def set_optimizer(self, FLAGS, net_output):
        """
        set optimizer
        """
        optimizer = net_output['optimizer']
        if self.is_multi_gpu(FLAGS):
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
            num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM"))
            trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
            logging.info("train_id:%s, num_trainers:%s, trainer_endpoints:%s" % (trainer_id,
                        num_trainers, trainer_endpoints))
            
            trainer_endpoints = trainer_endpoints.split(',')

            role = role_maker.UserDefinedCollectiveRoleMaker(current_id=trainer_id,
                    worker_endpoints=trainer_endpoints)
            fleet.init(role)
            
            dist_strategy = DistributedStrategy()
            #num_nodes = len(set([x.split(':')[0] for x in trainer_endpoints]))
            #if num_nodes == 1:
            #    dist_strategy.use_local_sgd = True
                #dist_strategy.mode = "collective" #multi node is nccl2
                #dist_strategy.collective_mode = "local_sgd" # local_sgd or grad_allreduce
            #    logging.info("use local sgd, not nccl2 for single node.")

            """
            #TODO:
            dist_strategy.enable_inplace = FLAGS.with_inplace
            if FLAGS.fuse_ops:
                dist_strategy.fuse_all_reduce_ops = 1
            dist_strategy.nccl_comm_num = FLAGS.nccl_comm_num
            """
            optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)

        return optimizer.minimize(net_output['loss'])
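
Both snippets read the rank layout from environment variables rather than flags. A sketch of what a launcher such as paddle.distributed.launch typically exports for a single-node, two-GPU run (the values are illustrative):

    import os

    os.environ["PADDLE_TRAINER_ID"] = "0"    # rank of this process
    os.environ["PADDLE_TRAINERS_NUM"] = "2"  # total number of trainers
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6170,127.0.0.1:6171"
    os.environ["FLAGS_selected_gpus"] = "0"  # GPU bound to this rank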
Example #4
    def net(self):
        import os
        import paddle.fluid as fluid
        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
        from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
        # BertConfig, reader, create_model, optimization and dist_env are
        # project-local helpers from the surrounding BERT example code.

        args = self.p_args()
        bert_config = BertConfig("uncased_L-24_H-1024_A-16/bert_config.json")
        bert_config.print_config()
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = 1
        dev_count = 1
        if args.do_train:
            my_dist_env = dist_env()
            worker_endpoints_env = my_dist_env["trainer_endpoints"]
            worker_endpoints = worker_endpoints_env.split(",")
            current_endpoint = my_dist_env["current_endpoint"]
            trainer_id = worker_endpoints.index(current_endpoint)
            # build the collective role from this worker's rank in the
            # ordered endpoint list
            print("current_id: ", trainer_id)
            print("worker_endpoints: ", worker_endpoints)
            role = role_maker.UserDefinedCollectiveRoleMaker(
                current_id=trainer_id, worker_endpoints=worker_endpoints)
            # register this worker's role with fleet
            fleet.init(role)
        exe = fluid.Executor(place)

        # init program
        train_program = fluid.Program()
        startup_prog = fluid.Program()

        if args.random_seed != 0:
            print("set program random seed as: ", args.random_seed)
            startup_prog.random_seed = args.random_seed
            train_program.random_seed = args.random_seed

        task_name = args.task_name.lower()
        processors = {
            'xnli': reader.XnliProcessor,
            'cola': reader.ColaProcessor,
            'mrpc': reader.MrpcProcessor,
            'mnli': reader.MnliProcessor,
        }
        processor = processors[task_name](data_dir=args.data_dir,
                                          vocab_path=args.vocab_path,
                                          max_seq_len=args.max_seq_len,
                                          do_lower_case=args.do_lower_case,
                                          in_tokens=args.in_tokens,
                                          random_seed=args.random_seed)
        num_labels = len(processor.get_labels())

        # NOTE: worker_endpoints is only defined above when args.do_train is
        # set, so this path assumes a training run.
        dev_count = len(worker_endpoints)
        # every trainer in the fleet must use the same shuffle_seed
        print("shuffle_seed: ", args.shuffle_seed)
        self.train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            dev_idx=0,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        # tiny step counts for a quick smoke run; note int(5 * 0.1) == 0,
        # so learning-rate warmup is effectively disabled here
        max_train_steps = 5
        self.warmup_steps = int(5 * 0.1)

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
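        # several NCCL communicators plus hierarchical allreduce help overlap
        # and scale gradient synchronization across many GPUs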
        dist_strategy.nccl_comm_num = 3
        dist_strategy.use_hierarchical_allreduce = True
        #dist_strategy.mode = "collective"
        #dist_strategy.collective_mode = "grad_allreduce"

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr = optimization(loss=self.loss,
                                            warmup_steps=self.warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=False,
                                            loss_scaling=args.loss_scaling,
                                            dist_strategy=dist_strategy)

        exe.run(startup_prog)
        with open("__model__", "wb") as f:
            f.write(fleet._origin_program.desc.serialize_to_string())

        with open("debug_program", "w") as f:
            f.write(str(fleet._origin_program))
        return self.loss
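
dist_env() above is a project-local helper; the snippet only relies on the two keys it reads. A plausible stand-in built on the same PADDLE_* variables the other examples use (the function body and the PADDLE_CURRENT_ENDPOINT default are assumptions):

    import os

    def dist_env():
        """Hypothetical stand-in for the project-local dist_env() helper."""
        return {
            # comma-separated list of every trainer endpoint
            "trainer_endpoints": os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                           "127.0.0.1:6170"),
            # endpoint of this process, as exported by the launcher
            "current_endpoint": os.getenv("PADDLE_CURRENT_ENDPOINT",
                                          "127.0.0.1:6170"),
        }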