def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.fixtures_path, 'vocab_test.txt'), 'r') as fin:
        tokens = fin.read().strip().split('\n')

    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]

    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        sentences.append(
            indexer.pad_token_sequence(
                indices[(k * 50):((k + 1) * 50)],
                desired_num_tokens=50,
                padding_lengths={}
            )
        )
    batch = Variable(torch.from_numpy(numpy.array(sentences)))

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth.  Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output['token_embedding'],
        elmo_token_embedder_output['mask']
    )[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.fixtures_path, 'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, 'r') as fin:
        expected_embeddings = fin['embedding'][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def test_elmo_token_representation_bos_eos(self):
    # The additional <S> and </S> embeddings added by the embedder should be as expected.
    indexer = ELMoTokenCharactersIndexer()

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

    for correct_index, token in [[0, '<S>'], [2, '</S>']]:
        indices = indexer.tokens_to_indices([Token(token)], Vocabulary(), "correct")
        indices = torch.from_numpy(numpy.array(indices["correct"])).view(1, 1, -1)
        embeddings = elmo_token_embedder(indices)['token_embedding']
        assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                              embeddings[0, 1, :].data.numpy())

def batch_to_ids(batch):
    u"""
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {u'character_ids': indexer})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()[u'elmo'][u'character_ids']

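# Hedged usage sketch for the batch_to_ids helper above. The sentences are
# illustrative (not from the original snippet); the 50 in the last dimension is
# ELMo's fixed max word length (ELMoCharacterMapper.max_word_length).
character_ids = batch_to_ids([[u'The', u'sentence', u'.'],
                              [u'Another', u'one']])
# character_ids is a LongTensor of shape (2, 3, 50): two sentences, the longest
# has three tokens, and every token is padded to 50 character ids.
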
def __init__(self,
             bias: torch.Tensor = None,
             num_bias: int = 1,
             contraction: (torch.Tensor, torch.Tensor) = None,
             options_file: str = DEFAULT_OPTIONS_FILE,
             weight_file: str = DEFAULT_WEIGHT_FILE,
             cuda_device: int = -1) -> None:
    """
    Parameters
    ----------
    options_file : ``str``, optional
        A path or URL to an ELMo options file.
    weight_file : ``str``, optional
        A path or URL to an ELMo weights file.
    cuda_device : ``int``, optional, (default=-1)
        The GPU device to run on.
    """
    self.indexer = ELMoTokenCharactersIndexer()

    logger.info("Initializing ELMo.")
    self.elmo_bilm = ElmoBilmDebias(options_file, weight_file)
    if cuda_device >= 0:
        self.elmo_bilm = self.elmo_bilm.cuda(device=cuda_device)
    self.cuda_device = cuda_device

    self.num_bias = num_bias
    self.bias = bias
    self.contraction = contraction
    if cuda_device >= 0:
        if self.bias is not None:
            self.bias = self.bias.cuda(device=cuda_device)
        if self.contraction is not None:
            self.contraction = (self.contraction[0].cuda(device=cuda_device),
                                self.contraction[1].cuda(device=cuda_device))

def get_token_utils(name: str = config.embedder):
    if name == 'elmo':
        from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
        from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

        # the token indexer is responsible for mapping tokens to integers
        token_indexer = ELMoTokenCharactersIndexer()

        def tokenizer(x: str):
            return [w.text for w in
                    SpacyWordSplitter(language='en_core_web_sm',
                                      pos_tags=False).split_words(x)[:config.max_seq_len]]

        return token_indexer, tokenizer

    elif name == 'bert':
        from allennlp.data.token_indexers import PretrainedBertIndexer

        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=config.max_seq_len,
            do_lowercase=True,
        )

        def tokenizer(s: str):
            return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len - 2]

        return token_indexer, tokenizer

def _get_reader(config, skip_labels=False, bert_max_length=None, reader_max_length=150, read_first=None):
    indexers = {}
    for embedder_config in config.embedder.models:
        if embedder_config.name == 'elmo':
            indexers[embedder_config.name] = ELMoTokenCharactersIndexer()
        elif embedder_config.name.endswith('bert'):
            bert_path = os.path.join(config.data.pretrained_models_dir, embedder_config.name)
            indexers[embedder_config.name] = PretrainedTransformerMismatchedIndexer(
                model_name=bert_path,
                tokenizer_kwargs={'do_lower_case': False},
                max_length=bert_max_length)
        elif embedder_config.name == 'char_bilstm':
            indexers[embedder_config.name] = TokenCharactersIndexer()
        else:
            assert False, 'Unknown embedder {}'.format(embedder_config.name)

    return UDDatasetReader(indexers,
                           skip_labels=skip_labels,
                           max_length=reader_max_length,
                           read_first=read_first)

def test_elmo_token_representation_bos_eos(self):
    # The additional <S> and </S> embeddings added by the embedder should be as expected.
    indexer = ELMoTokenCharactersIndexer()

    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo_token_embedder = _ElmoCharacterEncoder(options_file, weight_file)

    for correct_index, token in [[0, '<S>'], [2, '</S>']]:
        indices = indexer.token_to_indices(Token(token), Vocabulary())
        indices = Variable(torch.from_numpy(numpy.array(indices))).view(1, 1, -1)
        embeddings = elmo_token_embedder(indices)['token_embedding']
        assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                              embeddings[0, 1, :].data.numpy())

def elmo(ll):
    # Note: `w`, `re`, `options_file` and `weight_file` are assumed to be defined
    # in the enclosing module scope.
    count = 0
    for k in ll:
        sen_list = w[k]
        count += 1
        sen_s = []
        for s in sen_list:
            sen_s.append(s.split())

        elmo_model = Elmo(options_file, weight_file, 1)
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        for sen in sen_s:
            tokens = [Token(token) for token in sen]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        voca = Vocabulary()
        dataset.index_instances(voca)
        dic = {'elmo': {'num_tokens': 15}}
        character_ids = dataset.as_tensor_dict(dic)['elmo']['character_ids']

        # Take the single output representation and split it into one tensor per sentence.
        result = elmo_model(character_ids)['elmo_representations'][0]
        sth = list(torch.chunk(result, result.shape[0], 0))
        re[k] = sth

def __init__(self, text_name, label_name, sep):
    super().__init__(lazy=False)
    self.sep = sep
    self.text_name = text_name
    self.label_name = label_name
    self.tokeniser = WordTokenizer()
    self.token_indexers = {"character_ids": ELMoTokenCharactersIndexer()}

def test_elmo(self):
    # load the test model
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo = Elmo(options_file, weight_file, 2)

    # Correctness checks are in ElmoBiLm and ScalarMix, here we just add a shallow test
    # to ensure things execute.
    indexer = ELMoTokenCharactersIndexer()
    sentences = [['The', 'sentence', '.'],
                 ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.']]

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)

    dataset = Dataset(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    character_ids = dataset.as_array_dict()['elmo']['character_ids']

    output = elmo(Variable(torch.from_numpy(character_ids)))
    elmo_representations = output['elmo_representations']
    mask = output['mask']

    assert len(elmo_representations) == 2
    assert list(elmo_representations[0].size()) == [2, 7, 32]
    assert list(elmo_representations[1].size()) == [2, 7, 32]
    assert list(mask.size()) == [2, 7]

def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["tokens"]

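# Hedged sketch of feeding the batch_to_ids output above into an Elmo module;
# the options/weight paths below are placeholders (assumptions), not files that
# ship with this code.
from allennlp.modules.elmo import Elmo

options_file = "path/to/options.json"   # placeholder path
weight_file = "path/to/weights.hdf5"    # placeholder path
elmo = Elmo(options_file, weight_file, num_output_representations=1, dropout=0.0)

character_ids = batch_to_ids([["ELMo", "loves", "you"], ["Hello", "world"]])
output = elmo(character_ids)
# output["elmo_representations"] is a list with one tensor of shape
# (batch_size, max_sentence_length, embedding_dim), and output["mask"] flags
# real tokens versus padding.
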
def train_model(parameters, name):
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()} if parameters['use_elmo'] else None
    reader = SSJ500KReader(token_indexer) if parameters["dataset"] == "ssj" else SentiCorefReader(token_indexer)
    train_dataset = reader.read("train")
    validation_dataset = reader.read("test")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    # vocab = Vocabulary() if parameters['use_elmo'] else Vocabulary.from_instances(train_dataset + validation_dataset)
    model = get_model(vocab, parameters)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.Adam(model.parameters(), lr=parameters['lr'], weight_decay=parameters['weight_decay'])
    iterator = BucketIterator(batch_size=parameters['batch_size'], sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=parameters['patience'],
                      num_epochs=parameters['num_epochs'],
                      cuda_device=cuda_device)
    trainer.train()

    metrics = evaluate(model, validation_dataset, iterator, cuda_device, None)
    save_model_and_vocab(model, vocab, metrics, parameters, fname=name)

def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    cuda_device = -1
    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics

def __init__(self, options_file: str, weight_file: str, cuda_device: int):
    from allennlp.modules.elmo import _ElmoBiLm
    from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer

    self.indexer = ELMoTokenCharactersIndexer()
    self.elmo_bilm = _ElmoBiLm(options_file, weight_file)
    if cuda_device >= 0:
        self.elmo_bilm = self.elmo_bilm.cuda(device=cuda_device)
    self.cuda_device = cuda_device

    self.tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=False)

def elmo_evaluate(args, loader, train_frac=0.0):
    args.metadata = None
    train_batcher, test_batcher, train_df, test_df, used_sf_lf_map = loader(
        args, batch_size=args.batch_size, train_frac=train_frac)

    # Create model experiments directory or clear if it already exists
    weights_dir = os.path.join(home_dir, 'weights', 'acronyms', args.experiment)
    if os.path.exists(weights_dir):
        print('Clearing out previous weights in {}'.format(weights_dir))
        rmtree(weights_dir)
    os.mkdir(weights_dir)
    results_dir = os.path.join(weights_dir, 'results')
    os.mkdir(results_dir)
    os.mkdir(os.path.join(results_dir, 'confusion'))

    elmo_model_path = '~/allennlp/{}/model.tar.gz'.format(args.lm_experiment)
    elmo = get_pretrained_elmo(lm_model_file=elmo_model_path)

    device_str = 'cuda' if torch.cuda.is_available() else 'cpu'

    if args.ckpt is not None:
        ckpt_str = 'best' if args.ckpt == 'best' else 'model_state_epoch_{}'.format(args.ckpt)
        ckpt_fp = os.path.join(os.path.expanduser('~'),
                               'allennlp/{}/{}.th'.format(args.lm_experiment, ckpt_str))
        state_dict = torch.load(ckpt_fp)
        model_dict = elmo.state_dict()
        updated_state_dict = {('_lm.' + k): v for k, v in state_dict.items() if '_lm.' + k in model_dict}
        # 2. overwrite entries in the existing state dict
        model_dict.update(updated_state_dict)
        # 3. load the new state dict
        elmo.load_state_dict(model_dict)

    model = ELMoAcronymExpander(elmo).to(device_str)
    indexer = ELMoTokenCharactersIndexer()
    vocab = elmo._lm.vocab

    sf_tokenized_lf_map = defaultdict(list)
    for sf, lf_list in used_sf_lf_map.items():
        for lf in lf_list:
            tokens = lf_tokenizer(lf)
            sf_tokenized_lf_map[sf].append(tokens)

    return elmo_analyze(test_batcher, model, used_sf_lf_map, vocab, sf_tokenized_lf_map, indexer,
                        results_dir=results_dir)

def __init__(self, models_dir='models/allen/sentiment-regression'):
    Service.__init__(self, 'sentiment', 'allen-regression', ['parse'])
    self.models = {}
    self.descriptions = {}
    self.indexer = ELMoTokenCharactersIndexer()
    for lang in os.listdir(models_dir):
        if len(lang) == 2:
            self.models[lang] = self._load_model(os.path.join(models_dir, lang))
            self.descriptions[lang] = _load_model_description(os.path.join(models_dir, lang))

def run_text_input(model_dir, text):
    model, params, _ = load_model_and_vocab(model_dir)
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()} if params['use_elmo'] else None
    reader = SSJ500KReader(token_indexer)
    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict(text)['tag_logits']
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([(w, model.vocab.get_token_from_index(i, 'labels'))
           for w, i in zip(text.split(" "), tag_ids)])

def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.elmo_fixtures_path, u'vocab_test.txt'), u'r') as fin:
        words = fin.read().strip().split(u'\n')

    vocab = Vocabulary()
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token(word) for word in words]
    indices = indexer.tokens_to_indices(tokens, vocab, u"elmo")

    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        char_indices = indices[u"elmo"][(k * 50):((k + 1) * 50)]
        sentences.append(
            indexer.pad_token_sequence({u'key': char_indices},
                                       desired_num_tokens={u'key': 50},
                                       padding_lengths={})[u'key'])
    batch = torch.from_numpy(numpy.array(sentences))

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth.  Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output[u'token_embedding'],
        elmo_token_embedder_output[u'mask'])[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, u'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, u'r') as fin:
        expected_embeddings = fin[u'embedding'][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def __init__(self,
             chunker_path: str,
             segmental_vocabulary: Vocabulary,
             preprocessed_chunk_file: str = None,
             max_span_width: int = 89,
             update_chunker_params: bool = False,
             remove_dropout: bool = False,
             bos_token: str = '<S>',
             eos_token: str = '</S>',
             namespace: str = 'chunky_elmo') -> None:
    self._namespace = namespace
    self._max_span_width = max_span_width

    # First initialize the chunker.
    if preprocessed_chunk_file is not None:
        self.chunks_dict: Dict[str, List[str]] = {}
        self.read_predicted_chunks(preprocessed_chunk_file)
    else:
        self.chunks_dict = None
        logger.info("Reading Chunker from %s", chunker_path)
        from allennlp.models.archival import load_archive
        chunker_archive = load_archive(chunker_path)
        self.chunker = chunker_archive.model

        if not update_chunker_params:
            for param in self.chunker.parameters():
                param.requires_grad_(False)

        if remove_dropout:
            # Setting dropout to 0.0 for all parameters in chunker.
            self.chunker.dropout.p = 0.0
            self.chunker.encoder._module.dropout = 0.0
            self.chunker.text_field_embedder.token_embedder_elmo._elmo._dropout.p = 0.0

    self.elmo_indexer = ELMoTokenCharactersIndexer(namespace='elmo_characters')
    self.token_indexer = SingleIdTokenIndexer()

    self.seglm_vocab = segmental_vocabulary  # load_archive(segmental_path).model.vocab
    self.bos_token = bos_token
    self.eos_token = eos_token

def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"), "r") as fin:
        words = fin.read().strip().split("\n")

    vocab = Vocabulary()
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token(word) for word in words]
    indices = indexer.tokens_to_indices(tokens, vocab)

    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        char_indices = indices["elmo_tokens"][(k * 50):((k + 1) * 50)]
        sentences.append(
            indexer.as_padded_tensor_dict(
                {"elmo_tokens": char_indices},
                padding_lengths={"elmo_tokens": 50})["elmo_tokens"])
    batch = torch.stack(sentences)

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth.  Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output["token_embedding"],
        elmo_token_embedder_output["mask"])[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, "elmo_token_embeddings.hdf5")
    with h5py.File(embedding_file, "r") as fin:
        expected_embeddings = fin["embedding"][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def __init__(self, fold, mode):
    self.mode = mode
    self.fold = fold
    self.instances, self.vocab = load_lm_data(fold=self.fold, mode=self.mode)
    self.dataloader = DataLoader(dataset=self,
                                 batch_size=32,
                                 shuffle=self.mode == 'train',
                                 num_workers=0,
                                 collate_fn=self.collate,
                                 drop_last=self.mode == 'train')
    self.indexer = ELMoTokenCharactersIndexer()

def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo_bilm = _ElmoBiLm(options_file, weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    dataset = Dataset(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
        batch_tensor = Variable(torch.from_numpy(batch['elmo']['character_ids']))
        lm_embeddings = elmo_bilm(batch_tensor)
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings['activations'][2],
            lm_embeddings['mask']
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6
                )
            )

def manually_test_reader():
    token_indexer = ELMoTokenCharactersIndexer()

    def tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

    reader = TextExpDataSetReader(token_indexers=token_indexer, tokenizer=tokenizer)
    instances = reader.read(os.path.join(data_directory, 'test_code_data.csv'))

def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(FIXTURES, 'vocab_test.txt'), 'r') as fin:
        tokens = fin.read().strip().split('\n')

    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]

    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        sentences.append(
            indexer.pad_token_sequence(indices[(k * 50):((k + 1) * 50)],
                                       desired_num_tokens=50,
                                       padding_lengths={}))
    batch = Variable(torch.from_numpy(numpy.array(sentences)))

    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo_token_embedder = _ElmoCharacterEncoder(options_file, weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth.  Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output['token_embedding'],
        elmo_token_embedder_output['mask'])[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(FIXTURES, 'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, 'r') as fin:
        expected_embeddings = fin['embedding'][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)

def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):
    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])
    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(lazy=True,
                                           tokenizer=_tokenizer,
                                           token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"],
                                           max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                             ("doc_neg_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore

def _get_reader(config, skip_labels=False, bert_max_length=None, reader_max_length=150, read_first=None):
    indexer = None
    if config.embedder.name == 'elmo':
        indexer = ELMoTokenCharactersIndexer()
    elif config.embedder.name.endswith('bert'):
        bert_path = os.path.join(config.data.pretrained_models_dir, config.embedder.name)
        indexer = PretrainedTransformerMismatchedIndexer(model_name=bert_path,
                                                         tokenizer_kwargs={'do_lower_case': False},
                                                         max_length=bert_max_length)
    elif config.embedder.name == 'both':
        elmo_indexer = ELMoTokenCharactersIndexer()
        bert_path = os.path.join(config.data.pretrained_models_dir, 'ru_bert')
        bert_indexer = PretrainedTransformerMismatchedIndexer(model_name=bert_path,
                                                              tokenizer_kwargs={'do_lower_case': False},
                                                              max_length=bert_max_length)
        return UDDatasetReader({'elmo': elmo_indexer, 'ru_bert': bert_indexer},
                               skip_labels=skip_labels,
                               max_length=reader_max_length,
                               read_first=read_first)
    else:
        assert False, 'Unknown embedder {}'.format(config.embedder.name)

    return UDDatasetReader({config.embedder.name: indexer},
                           skip_labels=skip_labels,
                           max_length=reader_max_length,
                           read_first=read_first)

def setup_reader(d_id: int, file_name: str, binary_class: str) -> DatasetReader:
    bio_dataset = BIODataset(
        dataset_id=d_id,
        file_name=file_name,
        binary_class=binary_class,
    )
    bio_dataset.parse_file()

    return BIODatasetReader(
        bio_dataset=bio_dataset,
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

def get_token_indexer(self, token_indexers):
    # the token indexer is responsible for mapping tokens to integers
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if self.embeddings == 'elmo':
        self.token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    elif self.embeddings == 'bert':
        self.ber_embedder = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=128,
            do_lowercase=True,
        )
        self.token_indexers = {"bert": self.ber_embedder}

def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer, "tokens": indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]

def __init__(self, options_file, weight_file, cuda_device, embedding_dim, dropout):
    self.indexer = ELMoTokenCharactersIndexer()

    logger.info("Initializing ELMo.")
    self.elmo = ElmoTokenEmbedder2(options_file, weight_file,
                                   dropout=dropout,
                                   projection_dim=embedding_dim)
    if cuda_device >= 0:
        self.elmo = self.elmo.cuda(device=cuda_device)
    self.cuda_device = cuda_device
    self.embedding_dim = embedding_dim

def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    iterator.index_with(vocab)
    for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"])

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                ))