Example no. 1
0
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer); "
            f"you are using args.tokenizer = {args.tokenizer}")

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
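
A minimal usage sketch for build_indexers, assuming the parsed config exposes the attributes read above (input_module, tokenizer, char_embs, cove); the SimpleNamespace and the values shown are purely illustrative:

from types import SimpleNamespace

# Hypothetical stand-in for the parsed experiment config (config.Params in the real pipeline).
args = SimpleNamespace(
    input_module="glove",        # word-level branch -> SingleIdTokenIndexer under "words"
    tokenizer="MosesTokenizer",
    char_embs=True,              # also request a character-level indexer under "chars"
    cove=False,
)

indexers = build_indexers(args)
print(sorted(indexers))          # expected for this config: ['chars', 'words']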
Example no. 2
0
    def __init__(self, args):
        super().__init__()
        self.cache_dir = FLAGS.cache_dir
        utils.maybe_make_dir(self.cache_dir)

        self.output_mode = args.transformers_output_mode
        self.input_module = args.input_module
        self.tokenizer_required = input_module_tokenizer_name(
            args.input_module)

        # If set, treat these special tokens as part of input segments other than A/B.
        self._SEG_ID_CLS = None
        self._SEG_ID_SEP = None
        # self.model = transformers.RobertaModel.from_pretrained(
        #     args.input_module, cache_dir=self.cache_dir, output_hidden_states=True
        # )
        if FLAGS.saved_pretrained_model_path:
            self.model = load_pretrained_model_for_SG()
        else:
            self.model = MODEL_MAPPING[FLAGS.model](finetune_stage=True)
        self.max_pos = None

        self.tokenizer = get_my_tokenizer()
        self._sep_id = self.tokenizer.sep_token_id
        self._cls_id = self.tokenizer.cls_token_id
        self._pad_id = self.tokenizer.pad_token_id
        self._unk_id = self.tokenizer.unk_token_id

        self.parameter_setup(args)
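
The constructor above reads the special-token ids straight off the tokenizer returned by the project-specific get_my_tokenizer(). As a stand-alone illustration of the same pattern with a stock Hugging Face tokenizer (roberta-base chosen arbitrarily):

from transformers import AutoTokenizer

# Illustrative only; the module itself uses get_my_tokenizer() instead.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
sep_id = tokenizer.sep_token_id  # id of </s>
cls_id = tokenizer.cls_token_id  # id of <s>
pad_id = tokenizer.pad_token_id  # id of <pad>
unk_id = tokenizer.unk_token_id  # id of <unk>
print(sep_id, cls_id, pad_id, unk_id)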
Example no. 3
0
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info("In add_transformers_vocab: %s", tokenizer_name)
    if (tokenizer_name.startswith("bert-") or "rubert" in tokenizer_name
            or "/bert-" in tokenizer_name):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the "model-before-preprocess"
    # reorganization; we can pass the tokenizer created in the model here, see issue <TBD>

    # Do not use tokenizer.vocab_size; it does not include newly added tokens.
    vocab_size = len(tokenizer)

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
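
A hedged usage sketch: populate a fresh AllenNLP Vocabulary from a pretrained tokenizer's vocabulary, assuming the imports used by the function above (and the jiant helper input_module_tokenizer_name) are available:

from allennlp.data import Vocabulary

vocab = Vocabulary()
add_transformers_vocab(vocab, "roberta-base")

# Tokens are added under the namespace chosen by input_module_tokenizer_name.
namespace = input_module_tokenizer_name("roberta-base")
print(vocab.get_vocab_size(namespace))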
Example no. 4
0
    def __init__(self, args):
        super(HuggingfaceTransformersEmbedderModule, self).__init__()

        self.cache_dir = os.getenv(
            "HUGGINGFACE_TRANSFORMERS_CACHE",
            os.path.join(args.exp_dir, "transformers_cache"))
        utils.maybe_make_dir(self.cache_dir)

        self.output_mode = args.transformers_output_mode
        self.input_module = args.input_module
        self.max_pos = None
        self.tokenizer_required = input_module_tokenizer_name(
            args.input_module)

        # Integer token indices for special symbols.
        self._cls_id = None
        self._sep_id = None
        self._pad_id = None
        self._unk_id = None

        # If set, treat these special tokens as part of input segments other than A/B.
        self._SEG_ID_CLS = None
        self._SEG_ID_SEP = None
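
Subclasses are expected to fill in the placeholders above. A sketch of what that might look like for an XLNet-flavored subclass (the class itself is hypothetical; the segment-id values 2 and 3 follow XLNet's convention for CLS and SEP):

import transformers

class XLNetEmbedderSketch(HuggingfaceTransformersEmbedderModule):
    """Illustrative subclass, not the project's actual implementation."""

    def __init__(self, args):
        super().__init__(args)
        # A stock Hugging Face tokenizer supplies the special-token ids.
        self.tokenizer = transformers.XLNetTokenizer.from_pretrained(
            args.input_module, cache_dir=self.cache_dir)
        self._cls_id = self.tokenizer.cls_token_id
        self._sep_id = self.tokenizer.sep_token_id
        self._pad_id = self.tokenizer.pad_token_id
        self._unk_id = self.tokenizer.unk_token_id
        # XLNet gives CLS and SEP their own segment ids rather than folding them into A/B.
        self._SEG_ID_CLS = 2
        self._SEG_ID_SEP = 3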
Example no. 5
0
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name", task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert not len(set(tokenizer_names.values())) > 1, (
        "Error: mixing tasks with different tokenizers!"
        f" Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    # Per-tokenizer vocab path, so models with different vocabs (e.g. RoBERTa and ALBERT)
    # can share one experiment folder.
    vocab_path = os.path.join(
        args.exp_dir, "vocab", input_module_tokenizer_name(args.input_module))
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, "rb"))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        "Flag reload_indexing was set, but no tasks are set to reindex "
        "(use -o 'args.reindex_tasks = \"task1,task2,...\"')",
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            # Store preprocessed data per indexer, so models that use different indexers
            # can share the same experiment directory.
            indexer = input_module_tokenizer_name(args.input_module)
            relative_path = _get_serialized_record_path(task.name, split, "preproc", indexer)
            # TODO: change the global read-only dir to point to arwen, and the local one
            # to live in one experiment folder shared across different runs.
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path, log_prefix=log_prefix
            )
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir, indexer)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(
                    task, split, indexers, vocab, record_file, model_preprocessing_interface
                )

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        indexer = input_module_tokenizer_name(args.input_module)
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir, indexer=indexer)
        task.test_data = _get_instance_generator(task.name, "test", preproc_dir, indexer=indexer)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.pretrain_data_fraction, indexer=indexer)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.target_train_data_fraction, indexer=indexer)
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
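
Finally, a hedged sketch of how build_tasks is typically driven; load_params and the config path are hypothetical placeholders for whatever the surrounding project uses to construct config.Params:

import torch

args = load_params("path/to/experiment.conf")   # hypothetical config loader
cuda_device = 0 if torch.cuda.is_available() else -1

pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args, cuda_device)
for task in pretrain_tasks:
    # task.train_data is now a lazy generator of indexed instances read from disk.
    for instance in task.train_data:
        break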