def __init__(self,
              model_name: str,
              start_tokens: List[str] = None,
              end_tokens: List[str] = None) -> None:
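     # Load the Hugging Face tokenizer for `model_name`; when no start/end tokens are
     # given, fall back to defaults guessed from the model name.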
     self._tokenizer = AutoTokenizer.from_pretrained(model_name)
     default_start_tokens, default_end_tokens = _guess_start_and_end_token_defaults(
         model_name)
     self._start_tokens = start_tokens if start_tokens is not None else default_start_tokens
     self._end_tokens = end_tokens if end_tokens is not None else default_end_tokens
Example #2
 def test_as_array_produces_token_sequence(self):
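     # The indexer should produce exactly the same wordpiece ids as the underlying
     # Hugging Face tokenizer does for the same sentence.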
     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",
                                               do_lowercase=True)
     indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                            do_lowercase=True)
     tokens = tokenizer.tokenize("AllenNLP is great")
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = [Token(token) for token in tokens]
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #3
 def __init__(self,
              model_name: str,
              namespace: str = "tags",
              token_min_padding_length: int = 0) -> None:
     super().__init__(token_min_padding_length)
     self._model_name = model_name
     self.tokenizer = AutoTokenizer.from_pretrained(model_name)
     self._namespace = namespace
     self._added_to_vocabulary = False
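     # Look up the id of the tokenizer's padding token to use as this indexer's
     # padding value (logged below).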
     self._padding_value = self.tokenizer.convert_tokens_to_ids(
         [self.tokenizer.pad_token])[0]
     logger.info(
         f"Using token indexer padding value of {self._padding_value}")
 def __init__(self,
              model_name: str,
              do_lowercase: bool,
              start_tokens: List[str] = None,
              end_tokens: List[str] = None) -> None:
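     # Sanity-check that the requested lowercasing matches the cased/uncased
     # variant implied by the model name.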
     if model_name.endswith("-cased") and do_lowercase:
         logger.warning("Your pretrained model appears to be cased, "
                        "but your tokenizer is lowercasing tokens.")
     elif model_name.endswith("-uncased") and not do_lowercase:
         logger.warning("Your pretrained model appears to be uncased, "
                        "but your tokenizer is not lowercasing tokens.")
     self._tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=do_lowercase)
     default_start_tokens, default_end_tokens = _guess_start_and_end_token_defaults(model_name)
     self._start_tokens = start_tokens if start_tokens is not None else default_start_tokens
     self._end_tokens = end_tokens if end_tokens is not None else default_end_tokens
Example #5
 def __init__(self,
              xlm_model_name: str,
              do_lowercase: bool,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_sent_len: int = 80,
              dataset_field_name: str = "dataset",
              lazy: bool = False) -> None:
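     # Use the pretrained XLM tokenizer for this reader; default to a single-id
     # token indexer when none are supplied.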
     super().__init__(lazy)
     self._tokenizer = AutoTokenizer.from_pretrained(
         xlm_model_name, do_lower_case=do_lowercase)
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._max_sent_len = max_sent_len
     self._dataset_field_name = dataset_field_name
Example #6
 def __init__(self,
              model_name: str,
              do_lowercase: bool,
              namespace: str = "tags",
              token_min_padding_length: int = 0) -> None:
     super().__init__(token_min_padding_length)
     if model_name.endswith("-cased") and do_lowercase:
         logger.warning("Your pretrained model appears to be cased, "
                        "but your indexer is lowercasing tokens.")
     elif model_name.endswith("-uncased") and not do_lowercase:
         logger.warning("Your pretrained model appears to be uncased, "
                        "but your indexer is not lowercasing tokens.")
     self.tokenizer = AutoTokenizer.from_pretrained(
         model_name, do_lower_case=do_lowercase)
     self._namespace = namespace
     self._added_to_vocabulary = False
Example #7
    def __init__(
            self,
            xlm_model_name: str,
            do_lowercase: bool,
            token_indexers: Dict[str, TokenIndexer] = None,
            cuda_device: int = 1,
            max_sent_len: int = 128,
            dataset_field_name: str = "dataset",
            source_fname_prefix: str = "multinli.train.",
            lg_pairs: str = "ar-en bg-en de-en el-en en-es en-fr en-hi en-ru en-sw en-th en-tr en-ur en-vi en-zh",
            target_lang="en",
            scheme: str = "round_robin",
            lazy: bool = False) -> None:
        super().__init__(lazy)
        tokenizer = AutoTokenizer.from_pretrained(xlm_model_name,
                                                  do_lower_case=do_lowercase)
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }

        self._max_sent_len = max_sent_len
        self._dataset_field_name = dataset_field_name
        self._lg_pairs = lg_pairs.split(" ")

        self._scheme = scheme
        self._readers: Dict[str, DatasetReader] = {}
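        # Build one ParaCorpusReader per language pair, all sharing the same XLM
        # tokenizer and reader settings.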
        for pair in self._lg_pairs:
            self._readers[pair] = ParaCorpusReader(
                xlm_tokenizer=tokenizer,
                lang_pair=pair,
                xlm_model_name=xlm_model_name,
                do_lowercase=do_lowercase,
                token_indexers=token_indexers,
                max_sent_len=max_sent_len,
                dataset_field_name=dataset_field_name,
                target_lang=target_lang,
                lazy=lazy,
                source_fname_prefix=source_fname_prefix,
                cuda_device=cuda_device)
Example #8
        os.makedirs(args.output_dir, exist_ok=True)

    # TPU devices
    devices = tpu_xm.get_xla_supported_devices()
    if args.one_tpu:
        devices = [devices[0]]
    n_tpu = len(devices)
    logging.info(f'Found {n_tpu} TPU cores')

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
    logging.info(f"Saving tokenizer to: {args.output_dir}")
    tokenizer.save_pretrained(args.output_dir)

    # load model
    model = AutoModelWithLMHead.from_pretrained(
        args.bert_model)  # Only Masked Language Modeling
    logging.info(f"Saving initial checkpoint to: {args.output_dir}")
    model.save_pretrained(args.output_dir)

    # wrap model with TPU stuff
    model = tpu_dp.DataParallel(model, device_ids=devices)

    # expected total number of updates
    total_num_updates = utils.compute_num_updates_in_epoch(
        num_samples=args.total_num_training_examples,
Example #9
def main():
    parser = utils.get_args_parser_with_general_args()
    parser.add_argument(
        '--one_tpu',
        action='store_true',
        help="Run on one TPU core for debugging. Makes it easy to use break points.")
    parser.add_argument('--tpu_report',
                        action='store_true',
                        help="Print xla metric report")
    args = parser.parse_args()

    utils.init(args)  # set seeds, init logger, prepare output directory

    devices = tpu_xm.get_xla_supported_devices()
    if args.one_tpu:
        devices = [devices[0]]
    n_tpu = len(devices)
    logging.info(f'Found {n_tpu} TPU cores')

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
    tokenizer.save_pretrained(args.output_dir)

    args.start_epoch = utils.prepare_last_checkpoint(args.bert_model)
    model = AutoModelWithLMHead.from_pretrained(
        args.bert_model)  # Only Masked Language Modeling
    logging.info(f"Saving initial checkpoint to: {args.output_dir}")
    model.save_pretrained(args.output_dir)
    model = tpu_dp.DataParallel(model, device_ids=devices)

    num_data_epochs, num_train_optimization_steps = utils.get_dataset_stats(
        args, n_tpu)

    def tpu_training_loop(model, loader, device, context):
        """ Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch"""

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
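        # Apply weight decay to all parameters except biases and LayerNorm weights.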
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01,
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

        # one optimizer and scheduler per TPU core. Both objects are saved in `context` to be reused the next epoch
        optimizer = context.getattr_or(
            'optimizer',
            AdamW(optimizer_grouped_parameters,
                  lr=args.learning_rate,
                  eps=args.adam_epsilon,
                  betas=tuple(args.betas)))

        # derive warmup info
        if args.warmup_proportion is not None:
            warmup_steps = int(args.warmup_proportion *
                               num_train_optimization_steps + 0.5)
        elif args.warmup_steps is not None:
            warmup_steps = args.warmup_steps
        else:
            raise ValueError(
                'Warmup is not configured: specify either warmup_proportion or warmup_steps.')
        scheduler = context.getattr_or(
            'scheduler',
            WarmupLinearSchedule(optimizer,
                                 warmup_steps=warmup_steps,
                                 t_total=num_train_optimization_steps))

        tr_loss = None
        pbar = None
        # All threads are in sync; use the progress bar on only one of them.
        if str(pbar_device) == str(device):
            pbar = tqdm(total=int(pbar_steps),
                        desc=f"device {device}",
                        dynamic_ncols=True)

        tracker = tpu_xm.RateTracker()

        model.train()
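        # Masked-LM training loop with gradient accumulation: the loss is scaled per
        # micro-batch and the optimizer/scheduler step every
        # `gradient_accumulation_steps` batches.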
        for step, batch in loader:
            input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tracker.add(args.train_batch_size)

            tr_loss = (loss * args.gradient_accumulation_steps if step == 0
                       else tr_loss + loss * args.gradient_accumulation_steps)
            if pbar is not None:
                pbar.update(1)
                # pbar.set_description(desc=f'LR: {scheduler.get_lr()}')
            if (step + 1) % args.gradient_accumulation_steps == 0:
                tpu_xm.optimizer_step(optimizer)
                prev_lr = scheduler.get_last_lr()[0]
                scheduler.step()
                curr_lr = scheduler.get_last_lr()[0]
                if args.track_learning_rate:
                    if pbar is not None:
                        pbar.set_description(
                            f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
                optimizer.zero_grad()

        # `.item()` requires a trip from TPU to CPU, which is very slow; use it only once per epoch.
        return tr_loss.item() / step

    for epoch in range(args.start_epoch, args.epochs):
        # Load one training file into memory
        epoch_dataset = utils.PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        pbar_device = devices[0]
        pbar_steps = utils.compute_num_steps_in_epoch(
            num_samples=train_sampler.num_samples,
            batch_size=args.train_batch_size,
            # The pbar steps should not take grad accumulation steps into account.
            grad_accum_steps=1,
            n_tpu=n_tpu)
        logging.info(
            f'start training, epoch {epoch} on {len(devices)} cores for {pbar_steps} steps'
        )
        start = time.time()
        # Calls `tpu_training_loop` multiple times, once per TPU core.
        losses = model(tpu_training_loop, train_dataloader)
        logging.info(
            f'Epoch {epoch} took {round(time.time() - start, 2)} seconds. Average loss: {sum(losses)/len(losses)}'
        )
        utils.save_checkpoint(model._models[0], epoch, args.output_dir)

    if args.tpu_report:
        logging.info(torch_xla._XLAC._xla_metrics_report())
Example #10
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d',
                        '--emb_dim',
                        type=int,
                        default=1024,
                        help="Embeddings size")
    parser.add_argument(
        '--vocab_file',
        type=str,
        default='vocabs/en_50k.vocab',
        help=
        "Path to vocab file with tokens (one per line) to include in output. Should also include <UNK> token. Can use $l as a placeholder for language"
    )

    args = parser.parse_args()

    lang = args.lang

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    vocab = vocabulary.Vocabulary()

    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)

    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = AutoModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
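    # For each requested layer l, anchors[l] accumulates a running mean contextual
    # embedding per vocabulary word, and norms[l] a running mean embedding norm.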
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))

    oov_ind = vocab.get_token_index(vocab._oov_token)
    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        # Pass the attention mask so that padding positions are ignored by the encoder.
        all_encoder_layers = model(input_ids, attention_mask=input_mask)

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]

            for (i, token) in enumerate(feature.tokens):
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue

                n = num_occurrences[w_id]
                for layer_index in layer_indexes:
                    l = layer_index
                    layer_output = all_encoder_layers[int(l)].detach().cpu().numpy()
                    values = layer_output[b][i]

                    anchors[l][w_id, :] = anchors[l][w_id, :] * (n / (n + 1)) + values / (n + 1)
                    norm = np.linalg.norm(values)
                    norms[l] = (norms[l] * (total_words / (total_words + 1))
                                + norm / (total_words + 1))

                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences,
                    args.emb_dim)

    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    json.dump(norm_dict, open(file_path, 'w'))
Example #11
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus',
                        type=str,
                        required=True,
                        help="Path to training corpus in glob format")
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model",
                        type=str,
                        required=True,
                        choices=[
                            "bert-base-uncased", "bert-large-uncased",
                            "bert-large-cased", "bert-base-cased",
                            "bert-base-multilingual-uncased",
                            "bert-base-chinese",
                            "bert-base-multilingual-cased", "roberta-base",
                            "roberta-large"
                        ])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--do_whole_word_mask",
        action="store_true",
        help=
        "Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Reduce memory usage for large datasets by keeping data on disc rather than in memory"
    )

    parser.add_argument("--num_workers",
                        type=int,
                        default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate",
                        type=int,
                        default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument(
        "--short_seq_prob",
        type=float,
        default=0.1,
        help="Probability of making a short sentence as a training example")
    parser.add_argument(
        "--masked_lm_prob",
        type=float,
        default=0.15,
        help="Probability of masking each token for the LM task")
    parser.add_argument(
        "--max_predictions_per_seq",
        type=int,
        default=20,
        help="Maximum number of tokens to mask in each sequence")
    parser.add_argument(
        "--do_next_sent_prediction",
        action="store_true",
        help=
        "Add the next sentence prediction task (as in BERT) or ignore it (as in RoBERTa)"
    )

    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    args.output_dir.mkdir(exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
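    # RoBERTa/GPT-2 style tokenizers expose their vocabulary as `encoder`, while
    # BERT-style tokenizers expose `vocab`; keep a flat token list either way.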
    tokenizer.vocab_list = list(
        (tokenizer.encoder if hasattr(tokenizer, 'encoder') else tokenizer.vocab).keys())

    files = glob.glob(args.train_corpus)
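    # With more than one worker, shard the input files across a multiprocessing Pool;
    # otherwise process them sequentially.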
    if args.num_workers > 1:
        pool = Pool(args.num_workers)
        arguments = [(args, input_file, i, tokenizer, len(files))
                     for i, input_file in enumerate(files)]
        pool.starmap(input_file_to_training_data, arguments)
    else:
        for i, input_file in enumerate(tqdm(files)):
            input_file_to_training_data(args, input_file, i, tokenizer,
                                        len(files))