def build_indexers(args):
    """Build the token indexers implied by the configured input module.

    Parameters
    ----------
    args : config.Params
        Config map; reads ``input_module``, ``tokenizer``, ``char_embs``
        and ``cove``.

    Returns
    -------
    dict
        Mapping from vocabulary namespace to an AllenNLP ``TokenIndexer``.
    """
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        # ELMo ships with its own character-level handling; only plain or
        # Moses tokenization is compatible here.
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        # BUG FIX: the fragment containing {args.tokenizer} previously lacked
        # the f-prefix, so the actual tokenizer name was never interpolated.
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer); "
            f"you are using args.tokenizer = {args.tokenizer}"
        )

    if input_module_uses_transformers(args.input_module):
        # Transformers models bring their own subword tokenization, which is
        # incompatible with any other indexer's view of the token stream.
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model."
        )
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
def select_tokenizer(args):
    """Resolve the tokenizer name implied by the config.

    A tokenizer of ``"auto"`` maps to the input module's own tokenizer for
    transformers-style modules and to ``"MosesTokenizer"`` otherwise; any
    explicit tokenizer value is returned unchanged.
    """
    if args.tokenizer != "auto":
        return args.tokenizer
    if input_module_uses_transformers(args.input_module):
        return args.input_module
    return "MosesTokenizer"
def _build_vocab(args: config.Params, tasks: List[Task], vocab_path: str):
    """Build a vocabulary from scratch and save it to disk.

    Counts word and character frequencies over all tasks, adds per-task
    label namespaces, optionally folds in the WSJ vocabulary and a
    transformers tokenizer's pre-computed vocabulary, then writes the
    result to ``vocab_path``.

    Note
    ----
    Task-specific target vocabulary should be counted in the task object
    and provided via ``task.all_labels()``, under a task-specific
    namespace (not something generic like "targets").

    Parameters
    ----------
    args : config.Params
        config map
    tasks : List[Task]
        tasks whose data supplies the vocabulary
    vocab_path : str
        vocab file save path
    """
    log.info("\tBuilding vocab from scratch.")
    size_caps = {"word": args.max_word_v_size, "char": args.max_char_v_size}
    word_counts, char_counts = get_words(tasks)
    vocab = get_vocab(word_counts, char_counts, size_caps)
    # Custom label namespaces, one per task.
    # TODO: surface more docs for add_task_label_vocab:
    for task in tasks:
        add_task_label_vocab(vocab, task)
    if args.force_include_wsj_vocabulary:
        # Add WSJ full vocabulary for PTB F1 parsing tasks.
        add_wsj_vocab(vocab, args.data_dir)
    if input_module_uses_transformers(args.input_module):
        # Add pre-computed vocabulary of corresponding tokenizer for transformers models.
        add_transformers_vocab(vocab, args.tokenizer)
    vocab.save_to_files(vocab_path)
    log.info("\tSaved vocab to %s", vocab_path)