Example #1
def setup_model_and_optimizer(args,
                              model_type=None,
                              multi_token=True,
                              num_labels=None,
                              spell_length=None):
    """Setup model and optimizer."""

    model = get_model(args,
                      model_type=model_type,
                      multi_token=multi_token,
                      num_labels=num_labels,
                      spell_length=spell_length)
    param_groups = get_optimizer_param_groups(model)

    # Train only when some data source is given and a training duration is configured
    # (the explicit parentheses keep the `or` from being swallowed by the higher-precedence `and`).
    if (args.train_data is not None or args.data_dir is not None) and (
            args.epochs > 0 or args.train_iters > 0):
        if args.deepspeed:
            print_rank_0("DeepSpeed is enabled.")

            model, optimizer, _, _ = deepspeed.initialize(
                model=model,
                model_parameters=param_groups,
                args=args,
                mpu=mpu,
                dist_init_required=False)
        else:
            optimizer = get_optimizer(param_groups, args)
        lr_scheduler = get_learning_rate_scheduler(optimizer, args)
    else:
        optimizer, lr_scheduler = None, None

    return model, optimizer, lr_scheduler
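
A minimal, self-contained sketch of why the explicit grouping in the condition above matters; the variable names are illustrative only:

# `and` binds tighter than `or`, so `A or B and C` parses as `A or (B and C)`.
has_train_data, has_data_dir, will_train = True, False, False
print(has_train_data or has_data_dir and will_train)    # True  -> A or (B and C)
print((has_train_data or has_data_dir) and will_train)  # False -> (A or B) and C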
Example #2
def prepare_tokenizer(args):
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir}
    tokenizer = make_tokenizer(**tokenizer_args)

    num_tokens = tokenizer.num_tokens
    before = num_tokens
    after = before
    multiple = args.make_vocab_size_divisible_by * \
               mpu.get_model_parallel_world_size()
    while (after % multiple) != 0:
        after += 1
    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(before, after - before, after))

    args.tokenizer_num_tokens = after
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id

    # after = tokenizer.num_tokens
    # while after % mpu.get_model_parallel_world_size() != 0:
    #     after += 1

    args.vocab_size = after
    print("prepare tokenizer done", flush=True)

    return tokenizer
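
The padding loop above rounds the vocabulary up to the next multiple of make_vocab_size_divisible_by times the model-parallel world size. A self-contained sketch of the same round-up arithmetic; the concrete numbers are only illustrative:

def pad_vocab_size(num_tokens, multiple):
    # Round up to the next multiple; equivalent to the while-loop above.
    return ((num_tokens + multiple - 1) // multiple) * multiple

# e.g. a GPT-2-style vocabulary of 50257 tokens, divisor 128, model-parallel size 1
assert pad_vocab_size(50257, 128 * 1) == 50304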
Example #3
def evaluate(data_iterator, model, args, timers, verbose=False):
    """Evaluation."""

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_lm_loss = 0
    #total_nsp_loss = 0

    with torch.no_grad():
        iteration = 0
        while iteration < args.eval_iters:
            iteration += 1
            if verbose and iteration % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(iteration, args.eval_iters))
            # Forward evaluation.
            lm_loss = forward_step(data_iterator, model, args, timers)
            # Reduce across processes.
            if isinstance(model, args.DDP_type):
                reduced_losses = lm_loss.view(1)
                torch.distributed.all_reduce(reduced_losses.data)
                reduced_losses.data = reduced_losses.data/args.world_size
                lm_loss = reduced_losses[0]
                #nsp_loss = reduced_losses[1]

            total_lm_loss += lm_loss.data.detach().float().item()
            #total_nsp_loss += nsp_loss.data.detach().float().item()

    # Move model back to the train mode.
    model.train()

    total_lm_loss /= args.eval_iters
    #total_nsp_loss /= args.eval_iters
    return total_lm_loss#, total_nsp_loss
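
The loss averaging above is the usual sum-then-divide all-reduce pattern. A single-process CPU sketch of that pattern, assuming no process group exists yet (in real training the launcher sets this up):

import torch
import torch.distributed as dist

if not dist.is_initialized():
    dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)
loss = torch.tensor([2.5])
dist.all_reduce(loss)             # sums across ranks (a no-op with world_size == 1)
loss /= dist.get_world_size()     # turn the sum into a mean, as in evaluate() above
print(loss.item())                # 2.5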
Example #4
    def __init__(self, args, task_name, data_dir, seq_length, split, tokenizer, for_train=False,
                 pattern_ensemble=False, pattern_text=False):
        self.processor = PROCESSORS[task_name](args)
        args.variable_num_choices = self.processor.variable_num_choices
        print_rank_0(
            f"Creating {task_name} dataset from file at {data_dir} (split={split})"
        )
        self.dataset_name = f"{task_name}-{split}"
        self.cloze_eval = args.cloze_eval
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.pattern_ensemble = pattern_ensemble
        self.pattern_text = pattern_text
        if pattern_text:
            assert self.cloze_eval, "Labeled examples only exist in cloze evaluation"
        self.args = args
        if split == DEV_SET:
            example_list = self.processor.get_dev_examples(data_dir, for_train=for_train)
        elif split == TEST_SET:
            example_list = self.processor.get_test_examples(data_dir)
        elif split == TRUE_DEV_SET:
            example_list = self.processor.get_true_dev_examples(data_dir)
        elif split == TRAIN_SET:
            if task_name == "wsc":
                example_list = self.processor.get_train_examples(data_dir, cloze_eval=args.cloze_eval)
            else:
                example_list = self.processor.get_train_examples(data_dir)
        elif split == UNLABELED_SET:
            example_list = self.processor.get_unlabeled_examples(data_dir)
            for example in example_list:
                example.label = self.processor.get_labels()[0]
        else:
            raise ValueError(f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead")
        if split == TEST_SET:
            self.labeled = False
        else:
            self.labeled = True

        label_distribution = Counter(example.label for example in example_list)
        print_rank_0(
            f"Returning {len(example_list)} {split} examples with label dist.: {list(label_distribution.items())}")
        self.samples = []
        example_list.sort(key=lambda x: x.num_choices)
        self.example_list = example_list
        if self.cloze_eval:
            if self.pattern_ensemble:
                pattern_ids = PVPS[task_name].available_patterns()
                self.pvps = []
                for pattern_id in pattern_ids:
                    self.pvps.append(PVPS[task_name](args, tokenizer, self.processor.get_labels(), seq_length,
                                                     pattern_id=pattern_id, num_prompt_tokens=args.num_prompt_tokens,
                                                     is_multi_token=args.multi_token,
                                                     max_segment_length=args.segment_length,
                                                     fast_decode=args.fast_decode, split=split))
            else:
                self.pvp = PVPS[task_name](args, tokenizer, self.processor.get_labels(), seq_length,
                                           pattern_id=args.pattern_id, num_prompt_tokens=args.num_prompt_tokens,
                                           is_multi_token=args.multi_token, max_segment_length=args.segment_length,
                                           fast_decode=args.fast_decode, split=split)
        self.examples = {example.guid: example for example in example_list}
Example #5
def setup_model_and_optimizer(args,
                              config,
                              need_optim=False,
                              ckpt_path=None,
                              do_fp16=False):
    """Setup model and optimizer."""

    model = get_model(args, config, do_fp16=do_fp16)
    optimizer = get_optimizer(model, args,
                              do_fp16=do_fp16) if need_optim else None

    lr_scheduler = get_learning_rate_scheduler(optimizer,
                                               args) if need_optim else None

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False)

    iteration = 0
    if ckpt_path is not None:
        iteration = load_checkpoint(ckpt_path, model, optimizer, lr_scheduler,
                                    args)

    return model, optimizer, lr_scheduler, iteration
Example #6
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                label = example_json['label'] if 'label' in example_json else None
                idx = example_json['idx']
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json['premise']
                meta = {
                    'choice1': example_json['choice1'],
                    'choice2': example_json['choice2'],
                    'question': example_json['question']
                }
                example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
                examples.append(example)

        if set_type == 'train' or set_type == 'unlabeled':
            mirror_examples = []
            for ex in examples:
                label = 1 if ex.label == 0 else 0
                meta = {
                    'choice1': ex.meta['choice2'],
                    'choice2': ex.meta['choice1'],
                    'question': ex.meta['question']
                }
                mirror_example = InputExample(guid=ex.guid + 'm', text_a=ex.text_a, label=label, meta=meta)
                mirror_examples.append(mirror_example)
            examples += mirror_examples
            print_rank_0(f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...")
        return examples
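
A self-contained sketch of the mirroring trick above on a COPA-style record: swapping the two choices and flipping the label yields a second, equally valid example (the record itself is made up):

example = {"premise": "The man hurt his foot.",
           "choice1": "He put on his shoes.",
           "choice2": "He dropped a hammer on it.",
           "label": 1}
mirror = {"premise": example["premise"],
          "choice1": example["choice2"],
          "choice2": example["choice1"],
          "label": 1 if example["label"] == 0 else 0}
print(mirror["label"], mirror["choice1"])  # 0 He dropped a hammer on it.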
Example #7
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)

                passage_idx = example_json['idx']
                text = punctuation_standardization(example_json['passage']['text'])
                questions = example_json['passage']['questions']
                for question_json in questions:
                    question = punctuation_standardization(question_json["question"])
                    question_idx = question_json['idx']
                    answers = question_json["answers"]
                    for answer_json in answers:
                        label = answer_json["label"] if 'label' in answer_json else None
                        answer_idx = answer_json["idx"]
                        guid = f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}'
                        meta = {
                            'passage_idx': passage_idx,
                            'question_idx': question_idx,
                            'answer_idx': answer_idx,
                            'answer': punctuation_standardization(answer_json["text"])
                        }
                        idx = [passage_idx, question_idx, answer_idx]
                        example = InputExample(guid=guid, text_a=text, text_b=question, label=label, meta=meta, idx=idx)
                        examples.append(example)

        question_indices = list(set(example.meta['question_idx'] for example in examples))
        label_distribution = Counter(example.label for example in examples)
        print_rank_0(
            f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
            f"distribution {list(label_distribution.items())}")
        return examples
Example #8
def setup_model_and_optimizer_C(args, model_cls=GPT2Model_C):
    """Setup model and optimizer."""

    model = get_model_C(args, model_cls)
    optimizer = get_optimizer(model, args)
    lr_scheduler = get_learning_rate_scheduler(optimizer, args)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False
        )

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler, args)
    else:
        args.iteration = 0

    return model, optimizer, lr_scheduler
Example #9
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model
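
The per-rank parameter count printed above is a plain sum over p.nelement(); the same idiom on a toy module, for reference:

import torch

toy = torch.nn.Linear(4, 3)                              # 4*3 weights + 3 biases
num_params = sum(p.nelement() for p in toy.parameters())
print(num_params)                                        # 15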
Example #10
def prepare_tokenizer(args):
    add_sentinel_token = 0
    if args.sentinel_token:
        add_sentinel_token = args.max_position_embeddings
    tokenizer = make_tokenizer(args.tokenizer_type, None, args.tokenizer_path, args.vocab_size,
                               args.tokenizer_model_type, add_block_symbols=args.block_lm, cache_dir=args.cache_dir,
                               add_sentinel_token=add_sentinel_token, add_task_mask=args.task_mask,
                               add_decoder_mask=args.block_mask_prob > 0.0 or args.context_mask_ratio > 0.0,
                               fix_command_token=args.fix_command_token)
    if mpu.get_model_parallel_rank() == 0:
        num_tokens = tokenizer.num_tokens
        eod_token = tokenizer.get_command('eos').Id
        assert eod_token == tokenizer.get_command('pad').Id
        before = num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by
        while (after % multiple) != 0:
            after += 1
        print_rank_0('> padded vocab (size: {}) with {} dummy '
                     'tokens (new size: {})'.format(before, after - before, after))
        print_rank_0('> found end-of-document token: {}'.format(eod_token))
        token_counts = torch.cuda.LongTensor([after, eod_token])
    else:
        token_counts = torch.cuda.LongTensor([0, 0])
    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    eod_token = token_counts[1].item()
    args.vocab_size, args.eod_token = num_tokens, eod_token
    return tokenizer
Example #11
def build_lambada_dataset(tokenizer, args):
    """Build lambada dataset."""
    assert len(args.valid_data) == 1
    val_dataset = LambadaDataset(args, tokenizer, strict=True)
    print_rank_0(' > found {} samples, {} label tokens.'.format(
        len(val_dataset), sum(map(len, val_dataset.labels))))
    return val_dataset
Example #12
    def __init__(self, args, split, tokenizer):
        self.args = args
        self.task, self.data_dir = args.task.lower(), args.data_dir
        self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length
        self.split = split
        self.tokenizer = tokenizer
        self.dataset_name = split
        if self.task in ["gigaword", "cnn_dm", "cnn_dm_original"]:
            self.processor = SummmaryProcessor(self.task, self.data_dir,
                                               tokenizer)
        elif self.task in ["xsum"]:
            self.processor = XSumProcessor(self.data_dir, tokenizer)
        elif self.task in ["squad_generation"]:
            self.processor = SQuADGenerationProcessor(self.data_dir, tokenizer)
        elif self.task in ["squad", "squad_v1"]:
            self.processor = SQuADProcessor(self.data_dir, tokenizer,
                                            self.max_src_length, args)
        elif self.task in ['cmrc']:
            self.processor = CMRCProcessor(self.data_dir, tokenizer)
        else:
            raise NotImplementedError(self.task)
        example_list = self.processor.create_examples(split)
        self.example_list = example_list
        self.examples = {example.guid: example for example in example_list}

        print_rank_0(f"Return {len(self.examples)} {split} examples")
Example #13
def evaluate(model, dataloader, eval_metric, args):
    """Evaluation."""
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_output, total_count = 0.0, 0
    total_tokens = 0
    with torch.no_grad():
        # For all the batches in the dataset.
        for iteration, batch in enumerate(dataloader):
            if (iteration + 1) % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(iteration))
            # Forward evaluation.
            output, _, _ = lm_forward_step(batch,
                                           model,
                                           args,
                                           None, [],
                                           eval_metric=eval_metric)
            count = batch['text'].size(0)
            count = torch.cuda.LongTensor([count])
            # Reduce across processes.
            torch.distributed.all_reduce(output,
                                         group=mpu.get_data_parallel_group())
            torch.distributed.all_reduce(count,
                                         group=mpu.get_data_parallel_group())

            total_output += output.item()
            total_count += count.item()
            total_tokens += batch['loss_mask'].sum().item()
    print(total_tokens)
    return {eval_metric: total_output}, total_count
Example #14
def get_model(args, version=None):
    """Build the model."""
    
    print_rank_0('building Bert model ...')
    if version is None:
        model = BertMixtureModel(num_layers=args.num_layers,
                                 vocab_size=args.vocab_size,
                                 hidden_size=args.hidden_size,
                                 num_attention_heads=args.num_attention_heads,
                                 embedding_dropout_prob=args.hidden_dropout,
                                 attention_dropout_prob=args.attention_dropout,
                                 output_dropout_prob=args.hidden_dropout,
                                 layernorm_epsilon=args.layernorm_epsilon,
                                 max_sequence_length=args.max_position_embeddings,
                                 checkpoint_activations=args.checkpoint_activations,
                                 checkpoint_num_layers=args.checkpoint_num_layers,
                                 parallel_output=True,
                                 num_experts=args.num_experts,
                                 type_vocab_size=2)
    elif version == "v0":
        model = BertMixtureModel_v0(num_layers=args.num_layers,
                                    vocab_size=args.vocab_size,
                                    hidden_size=args.hidden_size,
                                    num_attention_heads=args.num_attention_heads,
                                    embedding_dropout_prob=args.hidden_dropout,
                                    attention_dropout_prob=args.attention_dropout,
                                    output_dropout_prob=args.hidden_dropout,
                                    layernorm_epsilon=args.layernorm_epsilon,
                                    max_sequence_length=args.max_position_embeddings,
                                    checkpoint_activations=args.checkpoint_activations,
                                    checkpoint_num_layers=args.checkpoint_num_layers,
                                    parallel_output=True,
                                    num_experts=args.num_experts,
                                    type_vocab_size=2)
    else:
        raise ValueError(f"Unknown model version: {version}")
    
    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    #To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
Example #15
 def init_indices(self):
     if self.is_lazy:
         lens = np.array([self.ds.get_text_len(idx) for idx in range(len(self.ds))])
     else:
         lens = np.array([len(d['prompt']) + len(d['text']) if isinstance(d, dict) else len(d) for d in self.ds])
     self.indices = list(accumulate(lens))
     print_rank_0(f"Dataset document count {len(lens)}, token count {self.indices[-1]}")
     self.num_samples = self.indices[-1] // self.max_seq_len + 1
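
The cumulative lengths built above make indices[i] the total token count of documents 0..i, so a flat token position can be mapped back to its document with a binary search. A self-contained sketch with made-up lengths (the dataset's actual lookup code is not shown here):

from bisect import bisect_right
from itertools import accumulate

lens = [5, 3, 7]                     # hypothetical per-document token counts
indices = list(accumulate(lens))     # [5, 8, 15]

def doc_of_token(pos):
    # Index of the document containing flat token position `pos`.
    return bisect_right(indices, pos)

print(indices, doc_of_token(0), doc_of_token(5), doc_of_token(14))  # [5, 8, 15] 0 1 2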
Example #16
 def init_weighting(self):
     if self.is_lazy:
         lens = np.array([self.ds.get_text_len(idx) for idx in range(len(self.ds))])
     else:
         lens = np.array([len(d['text']) if isinstance(d, dict) else len(d) for d in self.ds])
     self.total_len = np.sum(lens)
     print_rank_0(
         f"Dataset document count {len(lens)}, token count {self.total_len}, non sentence start{self.non_sentence_start}")
     self.weighting = list(accumulate(lens))
Example #17
 def read_input_to_queue():
     for path in paths:
         print_rank_0(f"Start reading {path}")
         with open(path) as file:
             for row in file:
                 task_queue.put(row)
     print_rank_0("Read input complete")
     for i in range(len(processes)):
         task_queue.put('STOP')
Example #18
def evaluate(data_loader, model, args, timers, num_iterations=None):
    """Evaluation."""

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_lm_loss = 0
    if num_iterations is not None:
        max_iters = num_iterations
    else:
        if mpu.get_model_parallel_rank() == 0:
            max_iters_gpu = torch.cuda.LongTensor([len(data_loader)])
        else:
            max_iters_gpu = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(max_iters_gpu,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        max_iters = max_iters_gpu[0].item()
        print_rank_0('global rank: {} | max iters: {}'.format(
            torch.distributed.get_rank(), max_iters))

    if data_loader is not None:
        data_iterator = iter(data_loader)
    else:
        data_iterator = None

    with torch.no_grad():
        iteration = 0
        while iteration < max_iters:
            if iteration % args.log_interval == 0:
                print_rank_0('global rank: {} | iteration: {}'.format(
                    torch.distributed.get_rank(), iteration))
            # Forward evaluation.
            lm_loss = forward_step(data_iterator, model, args, timers)
            if lm_loss is None:
                break
            # Reduce across processes.
            if isinstance(model, DDP):
                torch.distributed.all_reduce(lm_loss.data)
                if args.cloze_eval:
                    lm_loss.data = lm_loss.data / args.world_size
                else:
                    lm_loss.data = lm_loss.data / args.model_parallel_size

            if not args.cloze_eval:
                total_lm_loss += lm_loss.data.detach().float().item()/(args.num_tokenized_tokens-1)
            else:
                total_lm_loss += lm_loss.data.detach().float().item()

            iteration += 1

    # Move model back to the train mode.
    model.train()

    return total_lm_loss
Example #19
 def print_info(self, info):
     total_dict = defaultdict(int)
     while True:
         try:
             source_dict = info.get(block=False)
             for source, length in source_dict.items():
                 total_dict[source] += length
         except Empty:
             break
     print_rank_0(total_dict)
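
A self-contained sketch of the queue-draining aggregation above, using a plain queue.Queue in place of whatever inter-process queue the worker code presumably uses:

from collections import defaultdict
from queue import Empty, Queue

info = Queue()
for source_dict in ({"wiki": 10, "books": 5}, {"wiki": 7}):
    info.put(source_dict)

total_dict = defaultdict(int)
while True:
    try:
        for source, length in info.get(block=False).items():
            total_dict[source] += length
    except Empty:
        break
print(dict(total_dict))  # {'wiki': 17, 'books': 5}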
Example #20
 def check_and_set_(self, cls_value, sd_value, name):
     if self.override_lr_scheduler:
         print_rank_0(' > overriding {} value to {}'.format(
             name, cls_value))
         return cls_value
     else:
         if not self.use_checkpoint_lr_scheduler:
             assert cls_value == sd_value, 'AnnealingLR: class input value ' \
                 'and checkpoint values for {} do not match'.format(name)
         print_rank_0(' > using checkpoint value {} for {}'.format(
             sd_value, name))
         return sd_value
Example #21
    def __init__(self, args, split, tokenizer, for_train=False):
        task_name = args.task.lower()
        data_dir = args.data_dir
        processor = PROCESSORS[task_name](args)
        print_rank_0(
            f"Creating {task_name} dataset from file at {data_dir} (split={split})"
        )
        self.dataset_name = f"{task_name}-{split}"
        if split == DEV_SET:
            examples = processor.get_dev_examples(data_dir,
                                                  for_train=for_train)
        elif split == TEST_SET:
            examples = processor.get_test_examples(data_dir)
        elif split == TRAIN_SET:
            examples = processor.get_train_examples(data_dir)
        elif split == UNLABELED_SET:
            examples = processor.get_unlabeled_examples(data_dir)
            for example in examples:
                example.label = processor.get_labels()[0]
        else:
            raise ValueError(
                f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead")
        if split == TEST_SET:
            self.labeled = False
        else:
            self.labeled = True

        label_distribution = Counter(example.label for example in examples)
        print_rank_0(
            f"Returning {len(examples)} {split} examples with label dist.: {list(label_distribution.items())}"
        )
        self.samples = []
        examples.sort(key=lambda x: x.num_choices)
        if args.cloze_eval:
            pvp = PVPS[task_name](args,
                                  tokenizer,
                                  processor.get_labels(),
                                  args.seq_length,
                                  pattern_id=args.pattern_id,
                                  fast_decode=args.fast_decode,
                                  continuous_prompt=args.continuous_prompt)
            for example in examples:
                sample = pvp.encode(example)
                self.samples.append(sample)
            print_rank_0(f"Truncate {pvp.num_truncated} examples")
        else:
            for example in examples:
                sample = processor.encode(example, tokenizer, args)
                self.samples.append(sample)
            print_rank_0(f"Truncate {processor.num_truncated} examples")
        print_rank_0(f"Creating {len(self.samples)} samples")
        self.examples = {example.guid: example for example in examples}
Example #22
def evaluate(data_iterator,
             student_model,
             teacher_model,
             args,
             timers,
             verbose=False):
    """Evaluation."""

    # Turn on evaluation mode which disables dropout.
    student_model.eval()
    if teacher_model is not None:
        teacher_model.eval()

    total_losses = defaultdict(int)

    with torch.no_grad():
        for iter in range(args.eval_iters):
            if verbose and iter % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(
                    iter, args.eval_iters))
                save_rank_0(
                    args,
                    'Evaluating iter {}/{}'.format(iter, args.eval_iters))

            # Forward evaluation.
            losses = forward_step(data_iterator, student_model, teacher_model,
                                  args, timers)
            # tot_loss = losses["tot_loss"]
            '''when contiguous memory optimizations are enabled, the buffers
            allocated by the optimizations are deallocated during backward pass
            in the absence of backward pass the buffers should be reset after each
            forward pass'''
            if args.deepspeed and args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

            # Reduce across processes.
            if isinstance(student_model, DDP):
                for k in losses:
                    torch.distributed.all_reduce(losses[k].data)
                    losses[k].data = losses[k].data / args.world_size

            for k in losses:
                total_losses[k] += losses[k].data.detach().float().item()

    # Move model back to the train mode.
    student_model.train()

    for k in total_losses:
        total_losses[k] /= args.eval_iters

    return total_losses
Example #23
def evaluate_and_print_results(prefix,
                               data_iterator,
                               model,
                               args,
                               writer,
                               iteration,
                               timers,
                               verbose=False):
    """Helper function to evaluate and dump results on screen."""
    lm_loss, nsp_loss = evaluate(data_iterator, model, args, timers, verbose)
    val_loss = lm_loss + nsp_loss
    print_rank_0('-' * 100)
    string = ' validation loss at {} | '.format(prefix)
    string += 'LM loss: {:.6E} | '.format(lm_loss)
    string += 'NSP loss: {:.6E} | '.format(nsp_loss)
    string += 'total loss: {:.6E}'.format(val_loss)
    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)

    if writer and args.rank == 0:
        writer.add_scalar('val_lm_loss', lm_loss, iteration)
        writer.add_scalar('val_nsp_loss', nsp_loss, iteration)
        writer.add_scalar('val_total_loss', val_loss, iteration)

    return val_loss
Example #24
 def read_input_to_queue():
     for path in paths:
         print_rank_0(f"Start reading {path}")
         with open(path) as file:
             if self.split_row:
                 for row in file:
                     task_queue.put(row)
             else:
                 items = json.load(file)
                 for item in items["RECORDS"]:
                     task_queue.put(item)
     print_rank_0("Read input complete")
     for i in range(len(processes)):
         task_queue.put('STOP')
Example #25
def report_evaluate_metrics(summary_writer, prefix, loss, ppl, gpt_loss,
                            bert_loss, sent_loss, multi_loss, step):
    string = ' validation loss at {}'.format(prefix)
    string += ' | LM loss: {:.6E}'.format(loss)
    string += ' | LM PPL: {:.6E}'.format(ppl)
    if gpt_loss != 0:
        string += ' | GPT loss: {:.6E}'.format(gpt_loss)
    if bert_loss != 0:
        string += ' | BERT loss: {:.6E}'.format(bert_loss)
    if sent_loss != 0:
        string += ' | Sent loss: {:.6E}'.format(sent_loss)
    if multi_loss != 0:
        string += ' | Multi loss: {:.6E}'.format(multi_loss)
    length = len(string) + 1
    print_rank_0('-' * 100)
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)
    if summary_writer is not None:
        summary_writer.add_scalar(f'Train/valid_ppl', ppl, step)
        summary_writer.add_scalar(f'Train/valid_loss', loss, step)
        if gpt_loss != 0:
            summary_writer.add_scalar(f'Train/valid_gpt_loss', gpt_loss, step)
        if bert_loss != 0:
            summary_writer.add_scalar(f'Train/valid_bert_loss', bert_loss,
                                      step)
        if sent_loss != 0:
            summary_writer.add_scalar(f'Train/valid_sent_loss', sent_loss,
                                      step)
        if multi_loss != 0:
            summary_writer.add_scalar(f'Train/valid_multi_loss', multi_loss,
                                      step)
Example #26
def evaluate_and_print_results(prefix,
                               data_iterator,
                               model,
                               args,
                               timers,
                               verbose=False,
                               writer=None,
                               iteration=0):
    """Helper function to evaluate and dump results on screen."""
    lm_loss = evaluate(data_iterator, model, args, timers, verbose)
    lm_ppl = math.exp(min(20, lm_loss))
    if writer and torch.distributed.is_initialized(
    ) and torch.distributed.get_rank() == 0:
        scalars = {'loss': lm_loss, 'perplexity': lm_ppl}
        for k, v in scalars.items():
            writer.add_scalar(k, v, iteration)
    print_rank_0('-' * 100)
    string = ' validation loss at {} | '.format(prefix)
    string += 'LM loss: {:.6E} | '.format(lm_loss)
    string += 'LM PPL: {:.6E}'.format(lm_ppl)
    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)

    return lm_loss
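
Perplexity above is exp of the mean token loss, with the exponent capped at 20 so a diverged loss still prints a large but finite number. A short sketch of that capping:

import math

def capped_ppl(lm_loss, cap=20.0):
    # exp of the mean token cross-entropy, capped to keep the value finite and readable.
    return math.exp(min(cap, lm_loss))

print(capped_ppl(2.3))   # ~9.97
print(capped_ppl(50.0))  # ~4.85e8 (exp(20)) instead of exp(50) ~ 5.2e21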
Example #27
def evaluate_and_print_results(prefix,
                               data_iterator,
                               student_model,
                               teacher_model,
                               args,
                               timers,
                               verbose=False):
    """Helper function to evaluate and dump results on screen."""
    losses = evaluate(data_iterator, student_model, teacher_model, args,
                      timers, verbose)
    lm_ppl = None
    if "lm_loss" in losses:
        lm_loss = losses["lm_loss"]
        lm_ppl = math.exp(min(20, lm_loss))
    print_rank_0('-' * 100)
    save_rank_0(args, '-' * 100)
    string = ' validation loss at {} | '.format(prefix)
    for k in losses:
        string += '{}: {:.6} | '.format(k, losses[k])
    if lm_ppl is not None:
        string += 'LM PPL: {:.6}'.format(lm_ppl)
    length = len(string) + 1
    print_rank_0('-' * length)
    save_rank_0(args, '-' * 100)
    print_rank_0(string)
    save_rank_0(args, string)
    print_rank_0('-' * length)
    save_rank_0(args, '-' * 100)

    return losses
Example #28
def evaluate_and_print_results(prefix,
                               data_iterator,
                               model,
                               args,
                               writer,
                               iteration,
                               timers,
                               verbose=False):
    """Helper function to evaluate and dump results on screen."""
    lm_loss = evaluate(data_iterator, model, args, timers, verbose)
    lm_ppl = math.exp(min(20, lm_loss))
    print_rank_0('-' * 100)
    string = ' validation loss at {} | '.format(prefix)
    string += 'LM loss: {:.6E} | '.format(lm_loss)
    string += 'LM PPL: {:.6E}'.format(lm_ppl)
    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)

    if writer and args.rank == 0:
        writer.add_scalar('val_loss', lm_loss, iteration)
        writer.add_scalar('val_ppl', lm_ppl, iteration)

    return lm_loss
Example #29
def build_lm_dataset(tokenizer, args):
    documents = []
    num_tokens, num_original_tokens = 0, 0
    with open(args.valid_data[0], encoding='utf-8') as file:
        for line in file:
            tokens = tokenizer.EncodeAsIds(line.strip()).tokenization
            num_tokens += len(tokens)
            num_original_tokens += len(line.strip().split(" "))
            documents.append(tokens)
    val_dataset = LMDataset(args, documents, tokenizer, num_original_tokens, num_tokens)
    print_rank_0(
        ' > number of documents: {}, number of original tokens: {}, number of detokenized tokens: {}'.format(
            len(documents), num_original_tokens, num_tokens))
    return val_dataset
Example #30
 def __init__(self, args, split, tokenizer):
     self.args = args
     task, data_dir = args.task.lower(), args.data_dir
     self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length
     self.split = split
     self.tokenizer = tokenizer
     if split == "train":
         filename = "train"
     elif split == "dev":
         filename = "val"
     elif split == "test":
         filename = "test"
     else:
         raise NotImplementedError(split)
     print_rank_0(f"Creating {task}-{split} dataset from {data_dir}")
     self.dataset_name = split
     if task == "gigaword":
         detokenizer = gigaword_detokenize
     elif task == "cnn_dm":
         detokenizer = cnndm_detokenize
     else:
         detokenizer = None
     source_texts, target_texts = [], []
     with open(os.path.join(data_dir, f"{filename}.source"),
               encoding='utf-8') as file:
         for line in file:
             line = line.strip()
             line = detokenizer(line) if detokenizer else line
             source_texts.append(line)
     with open(os.path.join(data_dir, f"{filename}.target"),
               encoding='utf-8') as file:
         for line in file:
             line = line.strip()
             line = detokenizer(line,
                                is_target=True) if detokenizer else line
             target_texts.append(line)
     assert len(source_texts) == len(target_texts)
     self.examples, self.example_list = {}, []
     for idx, (source_text,
               target_text) in enumerate(zip(source_texts, target_texts)):
         if (idx + 1) % 20000 == 0:
             print_rank_0(f"Complete {idx + 1} examples")
         guid = "%s-%s" % (split, idx)
         meta = {
             "ref":
             tokenizer.DecodeIds(
                 tokenizer.EncodeAsIds(target_text).tokenization)
         }
         example = InputExample(guid=guid,
                                text_a=source_text,
                                text_b=target_text,
                                meta=meta)
         if idx < 10:
             print_rank_0(
                 (source_text.encode('utf-8'), target_text.encode('utf-8'),
                  meta["ref"].encode('utf-8')))
         self.examples[guid] = example
         self.example_list.append(example)
     print_rank_0(f"Return {len(self.examples)} {split} examples")