Exemple #1
0
def main():
    args = parse_args()
    print_arguments(args)
    print_paddle_envs()
    if args.update_method != 'local':
        args.dist_env = dist_env()
    train_parallel(args)
Exemple #2
0
def main():
    args = parse_args()
    print_arguments(args)
    print_paddle_envs()
    args.dist_env = dist_env()
    train_parallel(args)
Exemple #3
0
    def net(self):
        args = self.p_args()
        bert_config = BertConfig("uncased_L-24_H-1024_A-16/bert_config.json")
        bert_config.print_config()
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = 1
        if args.do_train:
            my_dist_env = dist_env()
            worker_endpoints_env = my_dist_env["trainer_endpoints"]
            worker_endpoints = worker_endpoints_env.split(",")
            current_endpoint = my_dist_env["current_endpoint"]
            trainer_id = worker_endpoints.index(current_endpoint)
            # new rolemaker here
            print("current_id: ", trainer_id)
            print("worker_endpoints: ", worker_endpoints)
            role = role_maker.UserDefinedCollectiveRoleMaker(
                current_id=trainer_id, worker_endpoints=worker_endpoints)
            # Fleet get role of each worker
            fleet.init(role)
        exe = fluid.Executor(place)

        # init program
        train_program = fluid.Program()
        startup_prog = fluid.Program()

        if args.random_seed != 0:
            print("set program random seed as: ", args.random_seed)
            startup_prog.random_seed = args.random_seed
            train_program.random_seed = args.random_seed

        task_name = args.task_name.lower()
        processors = {
            'xnli': reader.XnliProcessor,
            'cola': reader.ColaProcessor,
            'mrpc': reader.MrpcProcessor,
            'mnli': reader.MnliProcessor,
        }
        processor = processors[task_name](data_dir=args.data_dir,
                                          vocab_path=args.vocab_path,
                                          max_seq_len=args.max_seq_len,
                                          do_lower_case=args.do_lower_case,
                                          in_tokens=args.in_tokens,
                                          random_seed=args.random_seed)
        num_labels = len(processor.get_labels())

        dev_count = len(worker_endpoints)
        # we need to keep every trainer of fleet the same shuffle_seed
        print("shuffle_seed: ", args.shuffle_seed)
        self.train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            dev_idx=0,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        max_train_steps = 5
        self.warmup_steps = int(5 * 0.1)

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 3
        dist_strategy.use_hierarchical_allreduce = True
        #dist_strategy.mode = "collective"
        #dist_strategy.collective_mode = "grad_allreduce"

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr = optimization(loss=self.loss,
                                            warmup_steps=self.warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=False,
                                            loss_scaling=args.loss_scaling,
                                            dist_strategy=dist_strategy)

        exe.run(startup_prog)
        with open("__model__", "wb") as f:
            f.write(fleet._origin_program.desc.serialize_to_string())

        with open("debug_program", "w") as f:
            f.write(str(fleet._origin_program))
        return self.loss