def classify(
    model: BertForTokenClassification,
    tokenizer: BertTokenizerFast,
    # inputs
    sequence: str,
    labels: List[int] = None,
) -> List[Entity]:
    """Run token classification on a single sequence.

    Given a model, a tokenizer and a sequence:
    - if `labels` is provided (the labeled case below), the loss for the given
      sequence is returned;
    - if `labels` is None, the recognized entities are returned.

    A trained model can be used here directly.
    """
    # Ensure the model is configured to return a dict;
    # otherwise the attribute accesses below would break.
    if not model.config.return_dict:
        raise ValueError(
            'Model should be instantiated with `return_dict=True`')

    # Convert the input sequence (and optional labels) into an inputs bundle.
    inputs, mask = pack_sequence_as_inputs(
        tokenizer=tokenizer,
        sequence=sequence,
        labels=labels,
        max_token_length=model.config.max_position_embeddings,
    )

    # Put data on the GPU (if available).
    # if torch.cuda.is_available():
    #     model.cuda()
    #     inputs = {k: v.cuda() for k, v in inputs.items()}

    # If labels is not None, the caller is interested in the loss value of the
    # given input sequence, so this is done in a grad context.
    if labels is not None:
        return model(**inputs).loss

    # If labels is None, the caller is interested in the entities recognized by
    # the model. In this case, the outputs can be computed without a grad context.
    with torch.no_grad():
        logits = model(**inputs).logits.cpu()

    # Decode the model's output.
    entities = extract_entities(
        sequence=sequence,
        logits=logits[:, 1:-1][mask],
        encode=tokenizer.encode,
        decode=tokenizer.decode,
    )
    entities = realign_extracted_entities(
        sequence=sequence,
        tokens=tokenizer.tokenize(sequence),
        entities=entities,
        vocab=tokenizer.get_vocab(),
    )
    return list(entities)
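
# Usage sketch (illustration only, not part of the original module): shows how
# `classify` might be called for prediction and for loss computation. The
# checkpoint path, example sentence and label ids below are placeholders.
def _classify_usage_example():
    model = BertForTokenClassification.from_pretrained(
        'path/to/finetuned-token-classification-checkpoint', return_dict=True)
    tokenizer = BertTokenizerFast.from_pretrained(
        'path/to/finetuned-token-classification-checkpoint')
    model.eval()

    # Prediction mode: without labels, the recognized entities are returned.
    entities = classify(
        model=model,
        tokenizer=tokenizer,
        sequence='Example input sentence。',
    )
    print(entities)

    # Loss mode: with per-token labels (one id per token of the sequence),
    # the loss tensor is returned instead of the entities.
    # loss = classify(model=model, tokenizer=tokenizer,
    #                 sequence='...', labels=[...])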
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    # Override the architecture hyper-parameters for a smaller, long-input model.
    print("Config before overrides:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overrides:", config)

    # if model_args.tokenizer_name:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     raise ValueError(
    #         "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
    #         "and load it from here, using --tokenizer_name"
    #     )

    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        tokenizer = BertTokenizerFast(model_args.tokenizer_name,
                                      clean_text=True,
                                      lowercase=False,
                                      strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")
    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = (get_dataset(
        data_args,
        evaluate=True,
    ) if training_args.do_eval else None)

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
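
# Entry-point sketch (illustration only; not shown in the excerpt above). The
# script name and flag values are placeholders, and the full flag set depends
# on the ModelArguments / DataTrainingArguments dataclasses:
#
#   python run_language_modeling.py \
#       --tokenizer_name ./vocab.txt \
#       --output_dir ./output \
#       --do_train --mlm \
#       --overwrite_output_dir
if __name__ == "__main__":
    main()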
def batchify_long_inputs(
    tokenizer: BertTokenizerFast,
    inputs: Dict[str, torch.Tensor],
    labels: List[int] = None,
    max_token_length=512,
):
    """
    When the input sequence is longer than `max_token_length` (512 by default)
    tokens, it is split into blocks and packed into a batch, mainly for
    efficiency purposes.
    """
    token_ids = inputs['input_ids'][0, 1:-1].tolist()

    # Get the special token IDs for later use.
    cls = tokenizer.cls_token_id
    sep = tokenizer.sep_token_id
    pad = tokenizer.pad_token_id

    # The token IDs of the split sequence are collected in arrays in
    # preparation for building the matrices below.
    token_blocks, label_blocks = [], []
    for start, end in split_into_blocks(
            token_ids=token_ids,
            separator_token_id=tokenizer.get_vocab().get('。'),
            block_size=max_token_length - 2,
    ):
        # Split the input tokens into blocks (separated by the period token).
        token_blocks.append([cls] + token_ids[start:end] + [sep])
        # Also split the labels into blocks (if provided).
        if labels is not None:
            label_blocks.append([0] + labels[start:end] + [0])

    # Create a matrix vertically stacking the token IDs.
    # The width of this matrix depends on the longest token block.
    # Also, each row of the matrix contains the [CLS] and [SEP] tokens.
    max_block_len = max([len(block) for block in token_blocks])
    input_ids = torch.tensor([
        block + [pad] * (max_block_len - len(block)) for block in token_blocks
    ])
    attention_mask = torch.tensor([[1] * len(block) + [0] *
                                   (max_block_len - len(block))
                                   for block in token_blocks])
    label_ids = torch.tensor([
        block + [pad] * (max_block_len - len(block)) for block in label_blocks
    ]) if labels is not None else None

    # Basically the same as `attention_mask`, except that it does not take the
    # [CLS] and [SEP] positions into account. This is created so that the
    # final logits can be indexed conveniently.
    mask = torch.tensor([[1] * (len(block) - 2) + [0] *
                         (max_block_len - len(block))
                         for block in token_blocks],
                        dtype=torch.bool)

    # Combine the inputs as one batch to be processed at once.
    inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': torch.zeros_like(input_ids),
    }

    # Add the labels tensor (if applicable).
    if labels is not None:
        inputs['labels'] = label_ids

    return inputs, mask
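
# `split_into_blocks` is referenced above but not defined in this module.
# The sketch below is a hypothetical stand-in (note the different name) showing
# one way such a helper could yield (start, end) index pairs: blocks of at most
# `block_size` tokens, preferably cut right after a separator token ('。').
def _split_into_blocks_sketch(token_ids, separator_token_id, block_size):
    start, n = 0, len(token_ids)
    while start < n:
        end = min(start + block_size, n)
        if end < n:
            # Look backwards for the last separator inside the window so that
            # a sentence is not cut mid-way; fall back to a hard cut otherwise.
            for i in range(end - 1, start, -1):
                if token_ids[i] == separator_token_id:
                    end = i + 1
                    break
        yield start, end
        start = end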