Example #1
def evaluate(model, dataloader, eval_metric, args):
    """Evaluation."""
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_output, total_count = 0.0, 0
    total_tokens = 0
    with torch.no_grad():
        # For all the batches in the dataset.
        for iteration, batch in enumerate(dataloader):
            if (iteration + 1) % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(iteration))
            # Forward evaluation.
            output, _, _ = lm_forward_step(batch,
                                           model,
                                           args,
                                           None, [],
                                           eval_metric=eval_metric)
            count = batch['text'].size(0)
            count = torch.cuda.LongTensor([count])
            # Reduce across processes.
            torch.distributed.all_reduce(output,
                                         group=mpu.get_data_parallel_group())
            torch.distributed.all_reduce(count,
                                         group=mpu.get_data_parallel_group())

            total_output += output.item()
            total_count += count.item()
            total_tokens += batch['loss_mask'].sum().item()
    print_rank_0('> evaluated tokens: {}'.format(total_tokens))
    return {eval_metric: total_output}, total_count
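The function above returns the data-parallel-reduced metric total and sample count. A minimal sketch (not from the original repo) of turning those totals into a mean value and, when the metric is a summed natural-log LM loss, a perplexity; the helper name finalize_eval is an assumption.

import math

def finalize_eval(metric_dict, total_count, eval_metric):
    """Average the data-parallel-reduced metric over the evaluated samples."""
    mean_value = metric_dict[eval_metric] / max(total_count, 1)
    # Perplexity is only meaningful when the metric is a summed natural-log LM loss.
    return mean_value, math.exp(min(mean_value, 20.0))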
Example #2
def make_data_loader(dataset, batch_size, args):

    shuffle = args.shuffle
    if shuffle:
        sampler = data_utils.samplers.RandomSampler(dataset, replacement=True, num_samples=batch_size*args.train_iters)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    drop_last = distributed

    if distributed:
        batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler,
                                                                    batch_size,
                                                                    drop_last,
                                                                    rank,
                                                                    world_size)
    else:
        batch_sampler = torch.utils.data.BatchSampler(sampler,
                                                      batch_size,
                                                      drop_last)

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=args.num_workers,
                                              pin_memory=True)

    return data_loader
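A minimal usage sketch for the loader above; it assumes torch.distributed and mpu are already initialized elsewhere, and the dummy dataset and argparse.Namespace below are illustrative stand-ins, not part of the original code.

import argparse
import torch

def build_toy_loader():
    # 1024 dummy samples of one integer each.
    dataset = torch.utils.data.TensorDataset(torch.arange(1024).unsqueeze(-1))
    # Only the attributes read by make_data_loader need to exist on args.
    args = argparse.Namespace(shuffle=False, train_iters=100, num_workers=2)
    return make_data_loader(dataset, batch_size=8, args=args)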
Example #3
def make_data_loader(dataset, tokenizer, batch_size, num_iters, args):
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    if args.transformer_xl:
        batch_sampler = data_utils.samplers.DistributedSequentialSampler(
            len(dataset), num_iters, batch_size, rank, world_size)
    else:
        shuffle = args.shuffle
        if shuffle:
            sampler = data_utils.samplers.RandomSampler(
                dataset,
                replacement=True,
                num_samples=batch_size * args.train_iters)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)
        drop_last = distributed
        # the GPUs in the same model parallel group receive the same data
        if distributed:
            batch_sampler = data_utils.samplers.DistributedBatchSampler(
                sampler,
                batch_size,
                drop_last,
                rank,
                world_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps)
        else:
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last)
    use_block = args.block_lm or args.encoder_decoder
    if use_block:
        strategy = ConstructBlockStrategy(
            args,
            tokenizer,
            args.max_position_embeddings,
            bert_prob=args.bert_prob,
            gap_sentence_prob=args.gap_sentence_prob,
            gpt_infill_prob=args.gpt_infill_prob,
            average_block_length=args.avg_block_length,
            gpt_min_ratio=args.gpt_min_ratio,
            block_mask_prob=args.block_mask_prob,
            context_mask_ratio=args.context_mask_ratio,
            shuffle_blocks=not args.no_shuffle_block,
            block_position_encoding=not args.no_block_position,
            sentinel_token=args.sentinel_token,
            encoder_decoder=args.encoder_decoder,
            task_mask=args.task_mask,
            random_position=args.random_position,
            masked_lm=args.masked_lm)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        collate_fn=strategy.construct_blocks if use_block else None)

    return data_loader
Example #4
def evaluate_ocnli(model, dev_dataloader, device, args):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(dev_dataloader):
            tokens_1, masks_1, tokens_2, masks_2, tokens_3, masks_3, labels = [x.to(device) for x in batch]

            tokens, attention_mask, position_ids = get_batch(tokens_1, args)
            output, _ = model(tokens, position_ids, attention_mask)

            losses = mpu.vocab_parallel_cross_entropy(output[:, :-1, :].contiguous().float(), tokens[:, 1:])

            output_1 = torch.sum(losses * masks_1, 1) / torch.sum(masks_1, -1)

            tensor_list = [torch.zeros_like(output_1) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output_1, mpu.get_data_parallel_group())
            output_1 = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()

            # --------------
            tokens, attention_mask, position_ids = get_batch(tokens_2, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(output[:, :-1, :].contiguous().float(), tokens[:, 1:])

            output_2 = torch.sum(losses * masks_2, 1) / torch.sum(masks_2, -1)

            tensor_list = [torch.zeros_like(output_2) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output_2, mpu.get_data_parallel_group())
            output_2 = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()

            # ---------------

            tokens, attention_mask, position_ids = get_batch(tokens_3, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(output[:, :-1, :].contiguous().float(), tokens[:, 1:])

            output_3 = torch.sum(losses * masks_3, 1) / torch.sum(masks_3, -1)

            tensor_list = [torch.zeros_like(output_3) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output_3, mpu.get_data_parallel_group())
            output_3 = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()


            # --------------

            tensor_list_labels = [torch.zeros_like(labels) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list_labels, labels, mpu.get_data_parallel_group())

            if torch.distributed.get_rank() == 0:
                labels = torch.stack(tensor_list_labels, 0)
                labels = labels.view(-1).cpu().detach().numpy()
                res = [np.argmin(np.array(x)) for x in zip(output_1, output_2, output_3)]
                res = [x==y for x, y in zip(res, labels)]
                correct += sum(res)
                total += len(res)
    
    if torch.distributed.get_rank() == 0:
        print("EVAL", correct, total)
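The three nearly identical gather blocks above could be factored into one helper. This is only a sketch, relying on the same mpu calls the example already uses; the helper name is an assumption.

def gather_per_sample_loss(losses, masks):
    """All-gather the per-sample masked mean loss across the data-parallel group."""
    per_sample = torch.sum(losses * masks, 1) / torch.sum(masks, -1)
    tensor_list = [torch.zeros_like(per_sample)
                   for _ in range(mpu.get_data_parallel_world_size())]
    torch.distributed.all_gather(tensor_list, per_sample,
                                 group=mpu.get_data_parallel_group())
    return torch.stack(tensor_list, 0).view(-1).detach().cpu().numpy()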
Example #5
def evaluate(model, dev_dataloader, all_labels, device, args):
    model.eval()

    if torch.distributed.get_rank() == 0:
        res = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dev_dataloader):
            tokens, masks = [x.to(device) for x in batch]

            tokens, attention_mask, position_ids = get_batch(tokens, args)
            output, _ = model(tokens, position_ids, attention_mask)
            losses = mpu.vocab_parallel_cross_entropy(output[:, :-1, :].contiguous().float(), tokens[:, 1:])

            output = torch.sum(losses * masks, 1) / torch.sum(masks, -1)

            tensor_list = [torch.zeros_like(output) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output, mpu.get_data_parallel_group())
            output = torch.stack(tensor_list, 0).view(-1).cpu().detach().numpy()

            if torch.distributed.get_rank() == 0:
                for v in output:
                    res.append(v)

    if torch.distributed.get_rank() == 0:
        cnt = 0
        label_size = max(all_labels) + 1
        num_inst = len(res) // label_size
        for x in range(num_inst):
            label = all_labels[x]
            cur_res = res[x*label_size:(x+1)*label_size]
            pos = np.argmin(cur_res)
            if pos == label:
                cnt += 1
        print("EVAL", cnt, num_inst)
Example #6
    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        src_rank = mpu.get_model_parallel_rank()
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True,
                             no_scale=False,
                             fp32_allreduce=False):
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print(
                            "WARNING: gloo dist backend for half parameters may be extremely slow."
                            +
                            " It is recommended to use the NCCL backend in this case."
                        )
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        self.hook_handles = []
        self.hooks = []
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)

        #    handle = param.register_hook(allreduce_hook)
        #self.hooks.append(allreduce_hook)
        #self.hook_handles.append(handle)
        self.allreduce_params = allreduce_params
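A minimal training-step sketch showing how this wrapper is typically driven once the backward hooks are left disabled; it assumes the wrapper's forward() sets self.needs_reduction = True (as the Megatron-style wrapper does), and the loss function and optimizer here are placeholders.

def train_step(ddp_model, batch, loss_fn, optimizer):
    ddp_model.needs_reduction = True        # normally set inside forward()
    output = ddp_model.module(*batch)       # run the wrapped module
    loss = loss_fn(output)
    optimizer.zero_grad()
    loss.backward()
    # With hook registration commented out above, gradients are reduced explicitly.
    ddp_model.allreduce_params(reduce_after=False)
    optimizer.step()
    return loss.item()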
Example #7
def build_multi_task_dataset(args, tokenizer):
    task_dirs = {
        "mnli": "MNLI",
        "cola": "CoLA",
        "mrpc": "MRPC",
        "qnli": "QNLI",
        "qqp": "QQP",
        "sst2": "SST-2",
        "agnews": "Agnews",
        "yelp-polarity": "yelp_review_polarity_csv",
        "yelp-full": "yelp_review_full_csv",
        "yahoo": "Yahoo",
        "squad": "SQuAD",
        "race": "RACE"
    }
    train, valid = None, None
    if mpu.get_model_parallel_rank() == 0:
        multi_seq_length = args.seq_length
        if args.multi_seq_length is not None:
            multi_seq_length = args.multi_seq_length
        train_datasets, valid_datasets = [], []
        for task in args.multi_task_data:
            task = task.lower()
            data_dir = os.path.join(args.data_dir, task_dirs[task])
            train_datasets.append(
                SuperGlueDataset(args,
                                 task,
                                 data_dir,
                                 multi_seq_length,
                                 "train",
                                 tokenizer,
                                 pattern_ensemble=True))
            valid_datasets.append(
                SuperGlueDataset(args,
                                 task,
                                 data_dir,
                                 multi_seq_length,
                                 "dev",
                                 tokenizer,
                                 pattern_ensemble=True))
        train = MultiTaskDataset(args.multi_task_data, train_datasets)
        valid = MultiTaskDataset(args.multi_task_data, valid_datasets)
        world_size = torch.distributed.get_world_size(
            group=mpu.get_data_parallel_group())
        multi_batch_size = args.batch_size * world_size
        if args.multi_batch_size is not None:
            multi_batch_size = args.multi_batch_size * world_size
        train = make_data_loader(train,
                                 tokenizer,
                                 multi_batch_size,
                                 args.train_iters,
                                 args,
                                 shuffle=True)
        valid = make_data_loader(valid,
                                 tokenizer,
                                 multi_batch_size,
                                 args.train_iters,
                                 args,
                                 shuffle=True)
    return train, valid
Example #8
def get_model(args, version=None):
    """Build the model."""
    
    print_rank_0('building Bert model ...')
    if version is None:
        model = BertMixtureModel(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      layernorm_epsilon=args.layernorm_epsilon,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      num_experts=args.num_experts,
                      type_vocab_size=2)
    elif version == "v0":
        model = BertMixtureModel_v0(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      layernorm_epsilon=args.layernorm_epsilon,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      num_experts=args.num_experts,
                      type_vocab_size=2)
    else:
        raise ValueError("Unknown model version: {}".format(version))

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    #To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
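The get_model variants in these examples assume a module-level choice between torch's DistributedDataParallel and a local data-parallel wrapper. A sketch of that setup; the import path of the local wrapper varies by repository and is an assumption here.

USE_TORCH_DDP = False

if USE_TORCH_DDP:
    from torch.nn.parallel.distributed import DistributedDataParallel as DDP
else:
    # Local data-parallel wrapper, e.g. the one shown in Example #6 (path assumed).
    from model import DistributedDataParallel as DDP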
Example #9
def evaluate_tnews(args, model, dataloader, device, mode="dev"):
    model.eval()
    all_truth, all_preds = [], []
    with torch.no_grad():
        for batch, no_model_batch in tqdm(dataloader, desc="Evaluating {}".format(mode),
                                          disable=(torch.distributed.get_rank() != 0)):
            for k in batch:
                batch[k] = batch[k].to(device)
            for k in no_model_batch:
                no_model_batch[k] = no_model_batch[k].to(device)

            output = model(**batch)
            output = torch.sum(output * no_model_batch["loss_mask"].unsqueeze(-1), 1) / torch.sum(
                no_model_batch["loss_mask"], -1).unsqueeze(-1)

            # gather the output logits from other gpus
            tensor_list = [torch.zeros_like(output) for _ in range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list, output, mpu.get_data_parallel_group())

            # gather the truth labels from other gpus
            tensor_list_truth = [torch.zeros_like(no_model_batch["truth"], dtype=torch.long) for _ in
                                 range(mpu.get_data_parallel_world_size())]
            torch.distributed.all_gather(tensor_list_truth, no_model_batch["truth"], mpu.get_data_parallel_group())

            if args.model_parallel_size == 1:
                scores = torch.stack(tensor_list, 0).view(-1, 30000)
            else:
                assert args.model_parallel_size == 2, "Currently only model parallel size <= 2 is supported"
                # For convenience of implementation. Note that the truth labels only appear in the first 15000 entries of the logits, i.e. on ranks 0, 2, 4, ...
                scores = torch.stack(tensor_list, 0).view(-1, 15000)

            truth = torch.stack(tensor_list_truth, 0)
            truth = truth.view(-1)
            # scores = scores[:, cand_ids]

            preds = torch.argmax(scores, dim=-1)

            all_truth.extend(truth.detach().cpu().tolist())
            all_preds.extend(preds.detach().cpu().tolist())

    acc = sum([int(p == l) for p, l in zip(all_preds, all_truth)]) / len(all_truth)
    acc = torch.tensor(acc).to(device)

    acc_list = [torch.zeros_like(acc) for _ in range(mpu.get_model_parallel_world_size())]
    torch.distributed.all_gather(acc_list, acc, mpu.get_model_parallel_group())

    return acc_list[0].item(), all_truth, all_preds
Example #10
def make_data_loader(dataset, batch_size, args):

    #shuffle = args.shuffle
    #if shuffle:
    #    sampler = data_utils.samplers.RandomSampler(dataset, replacement=True, num_samples=batch_size*args.train_iters)
    #else:
    #    sampler = torch.utils.data.SequentialSampler(dataset)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    drop_last = distributed

    #if distributed:
    #    batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler,
    #                                                                batch_size,
    #                                                                drop_last,
    #                                                                rank,
    #                                                                world_size)
    #else:
    #    batch_sampler = torch.utils.data.BatchSampler(sampler,
    #                                                  batch_size,
    #                                                  drop_last)

    #data_loader = torch.utils.data.DataLoader(dataset,
    #                                          batch_sampler=batch_sampler,
    #                                          num_workers=args.num_workers,
    #                                          pin_memory=True)
    ###################
    data_loader = torch.utils.data.DataLoader(
        dataset[rank] if len(dataset) == world_size else dataset[0],
        batch_size=batch_size,
        num_workers=args.num_workers,
        pin_memory=False,
        drop_last=drop_last,
        timeout=5,
        persistent_workers=True)
    return data_loader
Example #11
def make_data_loader(dataset, batch_size, args):
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    if args.transformer_xl:
        batch_sampler = data_utils.samplers.DistributedSequentialSampler(
            len(dataset), args.train_iters, batch_size, rank, world_size)
    else:
        shuffle = args.shuffle
        if shuffle:
            sampler = data_utils.samplers.RandomSampler(
                dataset,
                replacement=True,
                num_samples=batch_size * args.train_iters)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)
        drop_last = distributed
        # the GPUs in the same model parallel group receive the same data
        if distributed:
            batch_sampler = data_utils.samplers.DistributedBatchSampler(
                sampler,
                batch_size,
                drop_last,
                rank,
                world_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps)
        else:
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=args.num_workers,
                                              pin_memory=True)

    return data_loader
Example #12
def get_model(args):
    """Build the model."""

    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            if args.ds_type=='BERT':
                model.module.model.bert.embeddings.position_embeddings.float()
            else:
                model.module.model.bert.embeddings.token_position_embeddings.float()
                model.module.model.bert.embeddings.para_position_embeddings.float()
                model.module.model.bert.embeddings.sent_position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model, device_ids=[i], output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
Example #13
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      max_memory_length=args.mem_length,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      relative_encoding=args.transformer_xl)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if hasattr(args, "deepspeed") and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed:
        if USE_TORCH_DDP:
            i = torch.cuda.current_device()
            model = DDP(model,
                        device_ids=[i],
                        output_device=i,
                        process_group=mpu.get_data_parallel_group())
        else:
            model = DDP(model)

    return model
Example #14
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model,
                              device_ids=[i],
                              output_device=i,
                              process_group=mpu.get_data_parallel_group())
    elif args.DDP_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
Example #15
def backward_step(optimizer, model, lm_loss, args, timers):
    """Backward step."""

    # Total loss.
    loss = lm_loss

    # Backward pass.
    if args.deepspeed:
        model.backward(loss)
    else:
        # optimizer.zero_grad()
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()

    reduced_losses = lm_loss.view(1)
    torch.distributed.all_reduce(reduced_losses.data, group=mpu.get_data_parallel_group())
    reduced_losses.data = reduced_losses.data / (args.world_size / args.model_parallel_size)
    lm_loss_reduced = reduced_losses

    if args.deepspeed:
        # DeepSpeed backward propagation already addressed all reduce communication.
        # Reset the timer to avoid breaking timer logs below.
        timers('allreduce').reset()
    else:
        if not args.DDP_impl == 'torch':
            timers('allreduce').start()
            model.allreduce_params(reduce_after=False,
                                   fp32_allreduce=args.fp32_allreduce)
            timers('allreduce').stop()

    # Update master gradients.
    if not args.deepspeed:
        if args.fp16:
            optimizer.update_master_grads()

        # Clipping gradients helps prevent exploding gradients.
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)

    return lm_loss_reduced
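The divisor args.world_size / args.model_parallel_size in the snippet above is simply the data-parallel world size. An equivalent, slightly more explicit reduction could look like this sketch (assuming mpu is initialized as in the other examples; the helper name is an assumption).

def reduce_loss_over_data_parallel(lm_loss):
    """Average a scalar loss across the data-parallel group for logging."""
    reduced = lm_loss.detach().clone().view(1)
    torch.distributed.all_reduce(reduced, group=mpu.get_data_parallel_group())
    return reduced / mpu.get_data_parallel_world_size()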
Example #16
def get_model(args):
    """Build the model."""

    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
        if args.fp32_embedding:
            model.module.model.bert.embeddings.word_embeddings.float()
            model.module.model.bert.embeddings.position_embeddings.float()
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()
        if args.fp32_layernorm:
            for name, _module in model.named_modules():
                if 'LayerNorm' in name:
                    _module.float()

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model,
                    device_ids=[i],
                    output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
Example #17
def get_model(args):
    """Build the model."""

    print_rank_0('building CPM model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model,
                    device_ids=[i],
                    output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
Example #18
def test_initialize_model_parallel(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            model_parallel_size))
    model_parallel_size_ = min(model_parallel_size,
                               torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = model_parallel_size_
    rank = torch.distributed.get_rank() % model_parallel_size_
    assert world_size == mpu.get_model_parallel_world_size()
    assert rank == mpu.get_model_parallel_rank()
    check(mpu.get_model_parallel_group(), world_size, rank)


    # Data parallel.
    world_size = torch.distributed.get_world_size() // model_parallel_size_
    rank = torch.distributed.get_rank() // model_parallel_size_
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
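A launch sketch for the test above, assuming the standard torchrun / env:// setup; the file name test_initialize.py is only illustrative.

# torchrun --nproc_per_node=8 test_initialize.py
import os
import torch

if __name__ == '__main__':
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(int(os.environ['LOCAL_RANK']))
    # Run the test for several model parallel sizes in sequence.
    for size in (1, 2, 4, 8):
        test_initialize_model_parallel(size)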
Example #19
def get_model(args, config, do_fp16=False):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(**config,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and do_fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if do_fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model,
                    device_ids=[i],
                    output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
Example #20
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # get the tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(args.tokenizer_path, 'vocab.json'), os.path.join(args.tokenizer_path, 'chinese_vocab.model'))

    # load train data
    if args.do_train:
        train_dataloader, _ = load_data(args, 'train', tokenizer, 1)
        dev_dataloader, dev_dataset = load_data(args, 'dev', tokenizer, 1)

        with open(args.deepspeed_config, "r") as f:
            deepspeed_conf = json.load(f)

        epoch = args.epoch
        grad_acc = deepspeed_conf["gradient_accumulation_steps"]
        args.train_iters = len(train_dataloader) * epoch // grad_acc

        # Model, optimizer, and learning rate.
        # TODO: maybe need to reinitialize optimizer
    elif args.do_eval:
        # Set an arbitrary positive integer, since the optimizer and the scheduler are not used during evaluation.
        args.train_iters = 1

    model, optimizer, lr_scheduler = setup_model_and_optimizer_C(args)
    device = torch.cuda.current_device()

    # add a time stamp to the results directory
    cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    results_dir = os.path.join(args.results_dir, "{}-{}".format(args.model_name, cur_time))
    os.makedirs(results_dir, exist_ok=True)

    if args.do_train and torch.distributed.get_rank() == 0:

        with open(os.path.join(results_dir, "train_log.txt"), "w") as f:
            f.write("Train losses:\n")

        with open(os.path.join(results_dir, "dev_log.txt"), "w") as f:
            f.write("Dev accs:\n")

    torch.distributed.barrier()

    if args.do_train:
        # cand_ids = torch.tensor(dev_dataset.cand_ids).to(device)
        total_loss, logging_loss, best_acc = 0.0, 0.0, 0.0
        global_step, total_step, best_step = 0, 0, 0
        
        for e in range(epoch):
            model.train()
            for batch, no_model_batch in tqdm(train_dataloader, disable=(torch.distributed.get_rank() != 0)):
                for k in batch:
                    batch[k] = batch[k].to(device)
                for k in no_model_batch:
                    no_model_batch[k] = no_model_batch[k].to(device)

                output = model(**batch)
                # get the loss of the last token
                output = torch.sum(output * no_model_batch["loss_mask"].unsqueeze(-1), 1) / torch.sum(no_model_batch["loss_mask"], -1).unsqueeze(-1)
                # get the label of the last token
                # labels = no_model_batch["labels"].float()
                labels = no_model_batch["truth"].long()
                # labels = (torch.sum(labels * no_model_batch["loss_mask"], 1) / torch.sum(no_model_batch["loss_mask"], -1)).long()
                # cross_entropy loss
                # losses = mpu.vocab_parallel_cross_entropy(output.unsqueeze(1).contiguous().float(), labels.unsqueeze(1))
                losses = CrossEntropyLoss(reduction='none')(output.contiguous().float(), labels)
                loss = torch.mean(losses)

                model.backward(loss)
                model.step()

                torch.distributed.all_reduce(loss.data, group=mpu.get_data_parallel_group())
                loss.data = loss.data / mpu.get_data_parallel_world_size()
                total_loss += loss.item() / grad_acc

                if total_step % grad_acc == 0:
                    global_step += 1
                    if global_step != 0 and global_step % args.log_interval == 0:
                        # logging
                        if torch.distributed.get_rank() == 0:
                            train_log = "Epoch {}, global step {}, total step {}, train lm loss: {}".format(e, global_step, epoch * len(train_dataloader), (total_loss - logging_loss) / args.log_interval)
                            yprint(train_log)
                            with open(os.path.join(results_dir, "train_log.txt"), "a") as f:
                                f.write(train_log + "\n")

                        logging_loss = total_loss
    
                    if global_step != 0 and global_step % args.eval_interval == 0:
                        # evaluate on the dev
                        acc, _, _ = evaluate_tnews(args, model, dev_dataloader, device, mode="dev")
                        dev_results_dir = os.path.join(results_dir, "dev_step-{}".format(global_step))

                        if acc > best_acc:
                            best_acc = acc
                            best_step = global_step

                        if torch.distributed.get_rank() == 0:
                            # we will only write the log file once
                            dev_log = "Epoch: {}, Global step: {}, Acc: {}".format(e, global_step, acc)
                            yprint(dev_log)
                            os.makedirs(dev_results_dir, exist_ok=True)
                            with open(os.path.join(dev_results_dir, "dev_result.txt"), "w") as f:
                                f.write(dev_log + "\n")
                            with open(os.path.join(results_dir, "dev_log.txt"), "a") as f:
                                f.write(dev_log + "\n")

                        torch.distributed.barrier()
                        
                        args.save = dev_results_dir
                        save_checkpoint(global_step, model, optimizer, lr_scheduler, args)

                total_step += 1

        if torch.distributed.get_rank() == 0:
            with open(os.path.join(results_dir, "dev_log.txt"), "a") as f:
                f.write("Best acc: {} Best step: {}\n".format(best_acc, best_step))

    if args.do_eval:
        # evaluate on the test
        test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1)
        cand_ids = torch.tensor(test_dataset.cand_ids).to(device)

        if args.do_train:
            # if we also trained, evaluate the checkpoint with the best dev accuracy.
            eval_ckpt_path = os.path.join(results_dir, "dev_step-{}".format(best_step))
            args.load = eval_ckpt_path
        else:
            # if we only evaluate, use the checkpoint specified by the user.
            args.load = args.eval_ckpt_path            
        
        load_checkpoint(model=model, optimizer=None, lr_scheduler=None, args=args)
        acc, _, _ = evaluate(args, model, test_dataloader, cand_ids, device, mode="test")

        if torch.distributed.get_rank() == 0:
            eval_log = "Checkpoint from {}: Acc: {}".format(args.load, acc)
            yprint(eval_log)
            with open(os.path.join(results_dir, "eval_log"), "w") as f:
                f.write(eval_log + "\n")

        torch.distributed.barrier()
Example #21
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # get the tokenizer
    tokenizer = GPT2Tokenizer(
        os.path.join(args.tokenizer_path, 'vocab.json'),
        os.path.join(args.tokenizer_path, 'chinese_vocab.model'))

    # load data
    test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1)
    # Set an arbitrary positive integer, since the optimizer and the scheduler are not used during evaluation.
    args.train_iters = 1

    # Model
    model, _, _ = setup_model_and_optimizer(args)

    device = torch.cuda.current_device()

    # add a time stamp to the results directory
    cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    results_dir = os.path.join(args.results_dir,
                               "{}-{}".format(args.model_name, cur_time))

    if torch.distributed.get_rank() == 0:
        os.makedirs(results_dir, exist_ok=True)

    model.eval()
    all_sids = []
    all_cids = []
    all_losses = []
    with torch.no_grad():
        for batch, no_model_batch in tqdm(
                test_dataloader,
                desc="Evaluating",
                disable=(torch.distributed.get_rank() != 0)):
            for k in batch:
                batch[k] = batch[k].to(device)
            for k in no_model_batch:
                no_model_batch[k] = no_model_batch[k].to(device)

            output = model(**batch)
            losses = mpu.vocab_parallel_cross_entropy(
                output.contiguous().float(), no_model_batch["labels"])
            loss_mask = no_model_batch["loss_mask"]
            loss = torch.sum(losses * loss_mask,
                             dim=-1) / loss_mask.sum(dim=-1)

            loss_tensor_list = [
                torch.zeros_like(loss).to(device)
                for _ in range(mpu.get_data_parallel_world_size())
            ]
            torch.distributed.all_gather(loss_tensor_list,
                                         loss.data,
                                         group=mpu.get_data_parallel_group())
            all_losses.extend(loss_tensor_list)

            sids = no_model_batch["sids"]
            sid_tensor_list = [
                torch.zeros_like(sids)
                for _ in range(mpu.get_data_parallel_world_size())
            ]
            torch.distributed.all_gather(sid_tensor_list,
                                         sids.data,
                                         group=mpu.get_data_parallel_group())
            all_sids.extend(sid_tensor_list)

            cids = no_model_batch["cids"]
            cid_tensor_list = [
                torch.zeros_like(cids)
                for _ in range(mpu.get_data_parallel_world_size())
            ]
            torch.distributed.all_gather(cid_tensor_list,
                                         cids.data,
                                         group=mpu.get_data_parallel_group())
            all_cids.extend(cid_tensor_list)

    if torch.distributed.get_rank() == 0:
        all_losses = torch.stack(all_losses).view(-1).cpu().detach().numpy()
        all_sids = torch.stack(all_sids).view(-1).cpu().detach().numpy()
        all_cids = torch.stack(all_cids).view(-1).cpu().detach().numpy()

        truth_labels = test_dataset.truth_labels
        preds = [[] for _ in truth_labels]

        for sid, cid, loss in zip(all_sids, all_cids, all_losses):
            preds[sid].append((cid, loss))

        preds = [min(p, key=lambda x: x[1])[0] for p in preds if len(p) > 0]

        yprint("Acc: {}".format(
            sum([int(p == l)
                 for p, l in zip(preds, truth_labels)]) / len(truth_labels)))
        with open(os.path.join(results_dir, "zero-shot_result.txt"), "w") as f:
            f.write("Acc: {}\n".format(
                sum([int(p == l) for p, l in zip(preds, truth_labels)]) /
                len(truth_labels)))

    torch.distributed.barrier()
Example #22
def evaluate(data_iterator,
             model,
             args,
             timers,
             forward_step_func,
             verbose=False):
    """Evaluation."""
    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_lm_loss, total_gpt_loss, total_bert_loss, total_sent_loss, total_multi_loss = 0, 0, 0, 0, 0
    gpt_iters, bert_iters, sent_iters, multi_iters = 0, 0, 0, 0
    mems = []
    with torch.no_grad():
        iteration = 0
        while iteration < args.eval_iters:
            iteration += 1
            if verbose and iteration % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(
                    iteration, args.eval_iters))
            # Forward evaluation.
            lm_loss, mems, mode = forward_step_func(data_iterator,
                                                    model,
                                                    args,
                                                    timers,
                                                    mems=mems)
            '''When contiguous memory optimizations are enabled, the buffers
            allocated by the optimizations are deallocated during the backward pass.
            In the absence of a backward pass, the buffers should be reset after
            each forward pass.'''
            if args.deepspeed and args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

            lm_loss = lm_loss.data.detach().float().item()
            total_lm_loss += lm_loss
            if mode == 'gpt':
                total_gpt_loss += lm_loss
                gpt_iters += 1
            elif mode == 'bert':
                total_bert_loss += lm_loss
                bert_iters += 1
            elif mode == 'sentence':
                total_sent_loss += lm_loss
                sent_iters += 1
            elif mode == 'multi-task':
                total_multi_loss += lm_loss
                multi_iters += 1
    # Move model back to the train mode.
    model.train()
    # Reduce across processes.
    loss_data = torch.cuda.FloatTensor([
        total_lm_loss, total_gpt_loss, total_bert_loss, total_sent_loss,
        total_multi_loss, gpt_iters, bert_iters, sent_iters, multi_iters
    ])
    torch.distributed.all_reduce(loss_data,
                                 group=mpu.get_data_parallel_group())
    loss_data = loss_data.tolist()
    total_lm_loss = loss_data[0] / args.eval_iters / (args.world_size /
                                                      args.model_parallel_size)
    total_gpt_loss = loss_data[1] / loss_data[5] if loss_data[5] > 0 else 0
    total_bert_loss = loss_data[2] / loss_data[6] if loss_data[6] > 0 else 0
    total_sent_loss = loss_data[3] / loss_data[7] if loss_data[7] > 0 else 0
    total_multi_loss = loss_data[4] / loss_data[8] if loss_data[8] > 0 else 0
    return total_lm_loss, total_gpt_loss, total_bert_loss, total_sent_loss, total_multi_loss
Example #23
def get_samples_mapping_(indexed_dataset, data_prefix, num_epochs,
                         max_num_samples, max_seq_length, short_seq_prob, seed,
                         name):
    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0 and \
       not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(
            ' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.
        from data.dataset_utils import compile_helper
        compile_helper()
        from data import helpers
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            short_seq_prob,
            seed,
            verbose)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(
            ' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))
    # This should be a barrier, but the NCCL barrier assumes
    # device_index == rank, which does not hold in the
    # model-parallel case.
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load indexed dataset.
    print_rank_0(
        ' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename,
                              allow_pickle=True,
                              mmap_mode='r')
    print_rank_0(
        '    loaded indexed file in {:3.3f} seconds'.format(time.time() -
                                                            start_time))
    print_rank_0('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
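The final all_reduce on counts doubles as a barrier across the data-parallel group while also checking that every rank reached this point. The same idea as a standalone helper, sketched under the assumption that mpu and the process group are already initialized; the helper name is an assumption.

def data_parallel_barrier_with_check():
    """Block until every data-parallel rank reaches this point, and verify it."""
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    expected = torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
    assert counts[0].item() == expected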
Example #24
def make_loaders(args, tokenizer):
    """makes training/val/test"""

    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    if args.loader_scatter is not None:
        assert world_size % args.loader_scatter == 0
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'filter_english': args.filter_english,
        'pre_tokenize': not args.no_pre_tokenize,
        'tokenizer': tokenizer,
        'save_splits': args.save_splits,
        'load_splits': args.load_splits,
        'save_test_data': args.save_test_data,
        'no_lazy_loader': args.no_lazy_loader,
        'loader_scatter': args.loader_scatter,
        'data_parallel_rank': mpu.get_data_parallel_rank(),
        "non_sentence_start": args.non_sentence_start,
        "half_lazy_loader": args.half_lazy_loader
    }

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make datasets splits and tokenizer
    train, valid, test = None, None, None

    if args.train_data is not None:
        train = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loader
    use_block = args.block_lm or args.encoder_decoder

    if train is not None and args.batch_size > 0:
        train = make_data_loader(train,
                                 tokenizer,
                                 batch_size,
                                 args.train_iters,
                                 args,
                                 shuffle=args.shuffle,
                                 block_collate=use_block)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid,
                                 tokenizer,
                                 eval_batch_size,
                                 args.train_iters,
                                 args,
                                 shuffle=args.shuffle,
                                 block_collate=use_block)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test,
                                tokenizer,
                                eval_batch_size,
                                len(test) // eval_batch_size + 1,
                                args,
                                shuffle=args.shuffle,
                                block_collate=use_block)
        args.do_test = True
    else:
        args.do_test = False

    return train, valid, test
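To make the batch-size arithmetic in make_loaders concrete, here is a tiny sketch with made-up numbers: the per-GPU batch size is multiplied by the data-parallel world size, the eval batch size falls back to the training batch size when none is given, and (mirroring the code above) a negative seq_length is also scaled by world size.

# Hypothetical values standing in for args.* and the data-parallel world size.
per_gpu_batch_size = 4
eval_batch_size_per_gpu = None        # args.eval_batch_size not set
world_size = 8
seq_length = -512                     # negative values are scaled below

batch_size = per_gpu_batch_size * world_size
eval_batch_size = (eval_batch_size_per_gpu * world_size
                   if eval_batch_size_per_gpu is not None else batch_size)
if seq_length < 0:
    seq_length = seq_length * world_size

print(batch_size, eval_batch_size, seq_length)  # 32 32 -4096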
Example #25
0
    def __init__(self,
                 config,
                 batch_slices,
                 seq_slices,
                 distributed_init_method,
                 world_size,
                 data_parallel_size,
                 model_parallel_size,
                 pipeline_parallel_size,
                 rank,
                 local_rank,
                 mixed_precision=False,
                 use_mpi=False,
                 init_process_group=False,
                 checkpoint_gradients=False):
        self.config = config
        self.batch_slices = batch_slices
        self.seq_slices = seq_slices
        torch.cuda.set_device(local_rank)
        if init_process_group:
            dist.init_process_group(
                backend='nccl',
                init_method=distributed_init_method,
                world_size=world_size,
                rank=rank,
            )
        dist.all_reduce(torch.zeros(1).cuda())
        mpu.initialize_model_parallel(model_parallel_size,
                                      pipeline_parallel_size)
        set_random_seed(0)
        mpu.model_parallel_cuda_manual_seed(0)
        self.rank = rank
        self.local_rank = local_rank
        self.world_size = world_size
        self.data_parallel_size = data_parallel_size
        self.model_parallel_size = model_parallel_size
        self.pipeline_parallel_size = pipeline_parallel_size
        self.pipeline_parallel_group_rank = mpu.get_pipeline_parallel_group_rank()
        self.data_parallel_group = mpu.get_data_parallel_group()
        self.model_parallel_group = mpu.get_model_parallel_group()
        self.pipeline_parallel_pred_group = mpu.get_pipeline_parallel_pred_group()
        self.pipeline_parallel_succ_group = mpu.get_pipeline_parallel_succ_group()
        self.model_parallel_src_rank = mpu.get_model_parallel_src_rank()
        self.model_parallel_dst_rank = mpu.get_model_parallel_dst_rank()
        self.model_parallel_next_src_rank = (
            self.model_parallel_src_rank + self.model_parallel_size if
            self.pipeline_parallel_group_rank < self.pipeline_parallel_size - 1
            else None)
        self.model_parallel_prev_dst_rank = (
            self.model_parallel_dst_rank - self.model_parallel_size
            if self.pipeline_parallel_group_rank > 0 else None)

        self.n_layers = (config.n_layers // pipeline_parallel_size +
                         int(rank < config.n_layers % pipeline_parallel_size))
        self.config = config
        self.mixed_precision = mixed_precision
        self.checkpoint_gradients = checkpoint_gradients

        self.layers = []
        for _ in range(self.n_layers):
            l = ModelParallelTransformerLayer(
                self.config.embedding_dim,
                self.config.ffn_embedding_dim,
                self.config.num_attention_heads,
                device="cuda",
                checkpoint_gradients=self.checkpoint_gradients)
            self.layers.append(l.half() if self.mixed_precision else l)

        self.all_parameters = []
        for layer in self.layers:
            self.all_parameters.extend(layer.parameters())
        self.n_params = len(self.all_parameters)

        if self.mixed_precision:
            self.master_parameters = [
                p.clone().detach().float() for p in self.all_parameters
            ]
            for p in self.master_parameters:
                p.requires_grad_()
            self.optimizer = optimizers.FusedAdam(self.master_parameters,
                                                  lr=1e-10)
        else:
            self.optimizer = torch.optim.Adam(self.all_parameters, lr=1e-10)
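The tail of this __init__ implements the classic fp32-master-weights pattern for mixed precision. A single-process sketch (torch.optim.Adam stands in for apex's FusedAdam, and a plain Linear stands in for the transformer layers): fp16 weights are used for forward/backward, while the optimizer updates detached fp32 copies that are then written back.

import torch

# Toy fp16 parameters standing in for the ModelParallelTransformerLayer weights.
layer = torch.nn.Linear(8, 8)
model_parameters = [p.detach().half().requires_grad_() for p in layer.parameters()]

# fp32 master copies, detached from the fp16 weights and owned by the optimizer.
master_parameters = [p.clone().detach().float() for p in model_parameters]
for p in master_parameters:
    p.requires_grad_()
optimizer = torch.optim.Adam(master_parameters, lr=1e-10)

# One update: pretend a backward pass filled the fp16 grads, copy them to fp32,
# step on the masters, then copy the updated masters back into the fp16 weights.
for p in model_parameters:
    p.grad = torch.ones_like(p)
for master, p in zip(master_parameters, model_parameters):
    master.grad = p.grad.detach().float()
optimizer.step()
with torch.no_grad():
    for master, p in zip(master_parameters, model_parameters):
        p.copy_(master.half())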
Example #26
0
def train_step(data_iterator,
               model,
               optimizer,
               lr_scheduler,
               args,
               timers,
               forward_step_func,
               mems=None,
               single_step=False):
    """Single training step."""
    lm_loss_total, count = 0.0, 0
    mems = [] if mems is None else mems
    if not args.deepspeed:
        optimizer.zero_grad()
    while True:
        skipped_iter, complete = 0, False
        # Forward model for one step.
        timers('forward').start()
        lm_loss, mems, _ = forward_step_func(data_iterator, model, args,
                                             timers, mems)
        timers('forward').stop()
        # print_rank_0("Forward step")
        if not args.deepspeed:
            lm_loss /= args.gradient_accumulation_steps

        reduced_loss = lm_loss.detach().clone().view(1)
        torch.distributed.all_reduce(reduced_loss.data,
                                     group=mpu.get_data_parallel_group())
        reduced_loss.data = reduced_loss.data / (args.world_size /
                                                 args.model_parallel_size)

        if not DynamicLossScaler._has_inf_or_nan(reduced_loss):
            lm_loss_total += reduced_loss
            count += 1

            # Calculate gradients, reduce across processes, and clip.
            timers('backward').start()
            backward_step(optimizer, model, lm_loss, args, timers)
            timers('backward').stop()
            # print_rank_0("Backward step")
            # Update parameters.
            timers('optimizer').start()
            if args.deepspeed:
                if model.is_gradient_accumulation_boundary():
                    model.step()
                    complete = True
                    if not (args.fp16 and optimizer.overflow):
                        lr_scheduler.step()
                    else:
                        skipped_iter = 1
                else:
                    model.step()
            else:
                if count == args.gradient_accumulation_steps:
                    optimizer.step()
                    complete = True
                    # Update learning rate.
                    if not (args.fp16 and optimizer.overflow):
                        lr_scheduler.step()
                    else:
                        skipped_iter = 1
            # print_rank_0("Optimizer step")
            timers('optimizer').stop()
            if complete:
                break
        else:
            print_rank_0("Found NaN loss, skip backward")
            del lm_loss, reduced_loss
            mems = []
        if single_step:
            break
    if args.deepspeed:
        lm_loss_total = lm_loss_total / count
    return lm_loss_total, skipped_iter, mems
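A single-process sketch of the non-DeepSpeed branch of train_step (toy model, random data, no all-reduce): each micro-batch loss is divided by gradient_accumulation_steps so the accumulated gradients average over micro-batches, and the optimizer steps only when the accumulation boundary is reached.

import torch

gradient_accumulation_steps = 4
model = torch.nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

optimizer.zero_grad()
lm_loss_total, count = 0.0, 0
while True:
    x, y = torch.randn(8, 16), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss = loss / gradient_accumulation_steps  # scale before backward
    loss.backward()                            # gradients accumulate in .grad
    lm_loss_total += loss.item()
    count += 1
    if count == gradient_accumulation_steps:   # accumulation boundary: update once
        optimizer.step()
        break
print(lm_loss_total)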
Example #27
0
def make_dataset(path,
                 seq_length,
                 text_key,
                 label_key,
                 lazy=False,
                 process_fn=None,
                 split=[1.],
                 delim=',',
                 loose=False,
                 binarize_sent=False,
                 drop_unlabeled=False,
                 tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer',
                 tokenizer_model_path=None,
                 vocab_size=None,
                 model_type='bpe',
                 pad_token=0,
                 character_coverage=1.0,
                 non_binary_cols=None,
                 **kwargs):
    """function to create datasets+tokenizers for common options"""
    if isinstance(process_fn, str):
        process_fn = eval(process_fn)
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols

    def get_dataset_from_path(path_, dataset_len=None):
        if lazy:
            # get lazily loaded dataset
            named_corpora = False
            if supported_corpus(path_):
                named_corpora = True
                name = path_
                path_ = corpora.NAMED_CORPORA[path_].PATH
            if not exists_lazy(path_, data_type='data'):
                # create cached version of dataset for lazy loading if it doesn't exist
                text = get_dataset(name if named_corpora else path_,
                                   text_key=text_key,
                                   label_key=label_key,
                                   binarize_sent=binarize_sent,
                                   delim=delim,
                                   drop_unlabeled=drop_unlabeled,
                                   loose_json=loose)
                make_lazy(path_, text.X, data_type='data')
            text = lazy_array_loader(path_,
                                     data_type='data',
                                     map_fn=process_fn)
        else:
            # get dataset
            text = get_dataset(path_,
                               text_key=text_key,
                               label_key=label_key,
                               binarize_sent=binarize_sent,
                               delim=delim,
                               drop_unlabeled=drop_unlabeled,
                               loose_json=loose,
                               preprocess_fn=process_fn,
                               dataset_len=dataset_len)
        return text

    # get one or multiple datasets and concatenate

    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    if isinstance(path, list) and len(path) == 1 and os.path.isdir(path[0]):
        path = [
            os.path.join(path[0], f) for f in os.listdir(path[0])
            if not os.path.isdir(os.path.join(path[0], f))
        ]
        random.shuffle(path)
        path = [
            path[start::world_size]
            for start in range(min(world_size, len(path)))
        ]
    elif isinstance(path, str):
        path = [[path]]
    elif isinstance(path, list) and len(path) == 1:
        path = [path]
    #print("path= ", path)
    #dataset_lens = []
    #if 'train_file_lens_path' in kwargs and kwargs['train_file_lens_path'] is not None:
    #    path_lens = {}
    #    flens = open(kwargs['train_file_lens_path'], 'r')
    #    for line in flens:
    #        split_line = line.rstrip('\n').split('\t')
    #        path_lens[split_line[0]] = int(split_line[1])
    #    flens.close()
    #    for p in path:
    #        if p in path_lens:
    #            dataset_lens.append(path_lens[p])
    #        else:
    #            dataset_lens.append(int(subprocess.check_output("wc -l " + p, shell=True).split()[0]))
    #else:
    #    for p in path:
    #        dataset_lens.append(int(subprocess.check_output("wc -l " + p, shell=True).split()[0]))

    #datasets = [get_dataset_from_path(p, dlen) for p, dlen in zip(path, dataset_lens)]
    #if len(datasets) == 1:
    #    ds = datasets[0]
    #else:
    #    ds = ConcatDataset(datasets)
    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, None, tokenizer_model_path,
                                   vocab_size, model_type, pad_token,
                                   character_coverage, **kwargs)

    ds_type = ''
    if 'ds_type' in kwargs:
        ds_type = kwargs['ds_type']
    # Split dataset into train/val/test (and wrap bert dataset)
    #if should_split(split):
    #    ds = split_ds(ds, split)
    #    if ds_type.lower() == 'bert':
    #        presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
    #        ds = [binglr_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)  if d is not None else None  for d in ds]
    #    elif ds_type.lower() == 'gpt2':
    #        ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
    #else:

    if ds_type.lower() == 'bert':
        ds = []
        print((len(path), world_size))
        for i in range(min(world_size, len(path))):
            ds_iters = [
                binglr_iterator_dataset(
                    [p],
                    run_once=True,
                    max_seq_len=seq_length,
                    mask_lm_prob=kwargs['mask_lm_prob']
                    if 'mask_lm_prob' in kwargs else 0.15,
                    max_preds_per_seq=kwargs['max_preds_per_seq']
                    if 'max_preds_per_seq' in kwargs else 20,
                    tokenizer=tokenizer,
                    train=kwargs['train'] if 'train' in kwargs else False,
                    num_urls=kwargs['num_urls'] if 'num_urls' in kwargs else 4)
                for p in path[i]
            ]
            ds.append(MyChainDataset(ds_iters))
    elif ds_type.lower() == 'pretrain':
        ds = []
        for i in range(min(world_size, len(path))):
            ds_iters = [
                bert_iterator_dataset(
                    [p],
                    run_once=True,
                    max_seq_len=seq_length,
                    mask_lm_prob=kwargs['mask_lm_prob']
                    if 'mask_lm_prob' in kwargs else 0.15,
                    max_preds_per_seq=kwargs['max_preds_per_seq']
                    if 'max_preds_per_seq' in kwargs else 20,
                    tokenizer=tokenizer,
                    train=kwargs['train'] if 'train' in kwargs else False,
                    num_urls=kwargs['num_urls'] if 'num_urls' in kwargs else 1)
                for p in path[i]
            ]
            ds.append(MyChainDataset0(ds_iters))
        #ds = binglr_iterator_dataset(path, max_seq_len=seq_length, mask_lm_prob=kwargs['mask_lm_prob'] if 'mask_lm_prob' in kwargs else 0.15, max_preds_per_seq=kwargs['max_preds_per_seq'] if 'max_preds_per_seq' in kwargs else 20, tokenizer=tokenizer, train=kwargs['train'] if 'train' in kwargs else False, num_urls=kwargs['num_urls'] if 'num_urls' in kwargs else 4)
    elif ds_type.lower() == 'gpt2':
        ds = GPT2Dataset(ds, max_seq_len=seq_length)
    return ds, tokenizer
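The directory handling near the top of make_dataset stripes a shuffled file list across data-parallel ranks with path[start::world_size]. A small standalone sketch with made-up file names:

import random

world_size = 4
# Hypothetical shard files that os.listdir would return for the data directory.
path = ['shard_{:02d}.json'.format(i) for i in range(10)]

random.shuffle(path)
# One sublist per rank, assigned round-robin by striding through the shuffled list.
path = [path[start::world_size] for start in range(min(world_size, len(path)))]

for rank, files in enumerate(path):
    print(rank, files)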
Example #28
0
def get_model(args, model_type=None, multi_token=True, num_labels=None):
    """Build the model."""
    print_rank_0('building GLM model ...')

    output_predict, parallel_output = True, True
    if (model_type == "multiple_choice" or model_type == "classification") and not args.cloze_eval:
        output_predict = False
    if model_type is not None:
        parallel_output = False

    model = GLMModel(num_layers=args.num_layers,
                     vocab_size=args.vocab_size,
                     hidden_size=args.hidden_size,
                     num_attention_heads=args.num_attention_heads,
                     embedding_dropout_prob=args.hidden_dropout,
                     attention_dropout_prob=args.attention_dropout,
                     output_dropout_prob=args.hidden_dropout,
                     max_sequence_length=args.max_position_embeddings,
                     max_memory_length=args.mem_length,
                     checkpoint_activations=args.checkpoint_activations,
                     checkpoint_num_layers=args.checkpoint_num_layers,
                     parallel_output=parallel_output,
                     relative_encoding=args.transformer_xl,
                     block_position_encoding=args.block_lm and not args.masked_lm,
                     output_predict=output_predict)

    if model_type is not None:
        if model_type == 'cloze':
            if multi_token:
                if args.fast_decode:
                    model = GLMForMultiTokenClozeFast(model, length_penalty=args.length_penalty)
                else:
                    model = GLMForMultiTokenCloze(model, length_penalty=args.length_penalty)
            else:
                model = GLMForSingleTokenCloze(model)
        elif model_type == 'classification':
            model = GLMForSequenceClassification(model, args.hidden_size, args.output_dropout, args.pool_token,
                                                 num_class=num_labels)
        elif model_type == 'generation':
            pass
        else:
            raise NotImplementedError(model_type)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if hasattr(args, "deepspeed") and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed:
        if args.DDP_impl == 'torch':
            i = torch.cuda.current_device()
            model = TorchDDP(model, device_ids=[i], output_device=i,
                             process_group=mpu.get_data_parallel_group())
        else:
            model = LocalDDP(model)

    return model
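The per-rank parameter report in get_model is just a sum of p.nelement() over model.parameters(); a standalone sketch with a toy module (the module and numbers are illustrative only):

import torch

model = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.Linear(20, 5))
n_params = sum(p.nelement() for p in model.parameters())
print(' > number of parameters: {}'.format(n_params))  # 200 + 20 + 100 + 5 = 325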
Example #29
0
def make_loaders(args):
    """makes training/val/test"""

    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'local_rank': args.local_rank,
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'lazy': args.lazy_loader,
        'xl_style': args.transformer_xl,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'pre_tokenize': not args.not_pre_tokenize
    }

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None

    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False

    return (train, valid, test), tokenizer
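A minimal sketch of the eval_set_args construction above (keys and values are placeholders): copy.copy makes a shallow copy of the training argument dict, and only the eval-specific keys are reassigned, so the training settings are left untouched.

import copy

data_set_args = {'path': 'train.json', 'seq_length': 512, 'split': [0.9, 0.05, 0.05]}

eval_set_args = copy.copy(data_set_args)  # shallow copy; reassigned keys stay independent
eval_set_args['split'] = [1.]             # evaluation uses the whole file, no further split
eval_seq_length = 1024                    # stands in for args.eval_seq_length
if eval_seq_length:
    eval_set_args['seq_length'] = eval_seq_length

print(data_set_args['seq_length'], eval_set_args['seq_length'])  # 512 1024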
Example #30
0
def get_model(args,
              model_type=None,
              multi_token=True,
              num_labels=None,
              spell_length=None):
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    if args.pretrained_bert:
        if model_type == "multiple_choice":
            model = BertForMultipleChoice.from_pretrained(
                args.tokenizer_model_type,
                cache_dir=args.cache_dir,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                layernorm_epsilon=args.layernorm_epsilon)
        elif model_type == "classification":
            model = BertForSequenceClassification.from_pretrained(
                args.tokenizer_model_type,
                cache_dir=args.cache_dir,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                layernorm_epsilon=args.layernorm_epsilon,
                num_labels=num_labels)
        else:
            raise NotImplementedError
    else:
        output_predict, parallel_output = True, True
        if (model_type == "multiple_choice"
                or model_type == "classification") and not args.cloze_eval:
            output_predict = False
        if model_type is not None:
            parallel_output = False
        if spell_length is not None:
            print_rank_0(f"Continuous spell length {spell_length}")
        model = GLMModel(num_layers=args.num_layers,
                         vocab_size=args.vocab_size,
                         hidden_size=args.hidden_size,
                         num_attention_heads=args.num_attention_heads,
                         embedding_dropout_prob=args.hidden_dropout,
                         attention_dropout_prob=args.attention_dropout,
                         output_dropout_prob=args.hidden_dropout,
                         max_sequence_length=args.max_position_embeddings,
                         max_memory_length=args.mem_length,
                         checkpoint_activations=args.checkpoint_activations,
                         checkpoint_num_layers=args.checkpoint_num_layers,
                         parallel_output=parallel_output,
                         relative_encoding=args.transformer_xl,
                         block_position_encoding=args.block_lm
                         and not args.masked_lm,
                         output_predict=output_predict,
                         spell_length=spell_length,
                         spell_func=args.prompt_func,
                         attention_scale=args.attention_scale)
        if args.freeze_transformer:
            model.freeze_transformer(
                tune_prefix_layers=args.tune_prefix_layers)
        if model_type is not None:
            if model_type == 'multiple_choice':
                if args.cloze_eval:
                    if multi_token:
                        if args.fast_decode:
                            model = GLMForMultiTokenClozeFast(
                                model, length_penalty=args.length_penalty)
                        else:
                            model = GLMForMultiTokenCloze(
                                model, length_penalty=args.length_penalty)
                    else:
                        model = GLMForSingleTokenCloze(
                            model, take_softmax=args.adapet)
                else:
                    model = GLMForSequenceClassification(model,
                                                         args.hidden_size,
                                                         args.output_dropout,
                                                         args.pool_token,
                                                         num_class=num_labels)
            elif model_type == 'classification':
                model = GLMForSequenceClassification(model,
                                                     args.hidden_size,
                                                     args.output_dropout,
                                                     args.pool_token,
                                                     num_class=num_labels)
            elif model_type == 'generation':
                pass
            else:
                raise NotImplementedError(model_type)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed and (args.train_iters or args.epochs):
        if args.DDP_impl == 'torch':
            i = torch.cuda.current_device()
            model = TorchDDP(model,
                             device_ids=[i],
                             output_device=i,
                             process_group=mpu.get_data_parallel_group())
        elif args.DDP_impl == 'local':
            model = LocalDDP(model)
        else:
            print_rank_0("Skip DDP model")
    return model
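The output_predict / parallel_output decision at the top of the non-BERT branch can be summarised as a small helper (the function name is mine; the logic is copied from the code above): only cloze-style evaluation keeps the language-model output head, and any finetuning head disables parallel output.

def resolve_output_flags(model_type, cloze_eval):
    # Mirrors the flag logic in get_model: both default to True.
    output_predict, parallel_output = True, True
    if model_type in ("multiple_choice", "classification") and not cloze_eval:
        output_predict = False
    if model_type is not None:
        parallel_output = False
    return output_predict, parallel_output

assert resolve_output_flags(None, False) == (True, True)
assert resolve_output_flags("classification", False) == (False, False)
assert resolve_output_flags("multiple_choice", True) == (True, False)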