Example #1
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.fixtures_path, 'vocab_test.txt'), 'r') as fin:
            tokens = fin.read().strip().split('\n')

        indexer = ELMoTokenCharactersIndexer()
        indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            sentences.append(
                    indexer.pad_token_sequence(
                            indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
                    )
            )
        batch = Variable(torch.from_numpy(numpy.array(sentences)))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
                elmo_token_embedder_output['token_embedding'],
                elmo_token_embedder_output['mask']
        )[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.fixtures_path, 'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
Example #2
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.tokens_to_indices([Token(token)], Vocabulary(), "correct")
            indices = torch.from_numpy(numpy.array(indices["correct"])).view(1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(), embeddings[0, 1, :].data.numpy())
Example #3
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.tokens_to_indices([Token(token)], Vocabulary(),
                                                "correct")
            indices = torch.from_numpy(numpy.array(indices["correct"])).view(
                1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                                  embeddings[0, 1, :].data.numpy())
Example #4
def batch_to_ids(batch):
    u"""
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {u'character_ids': indexer})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
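A quick usage sketch for the helper above (illustrative only; it assumes the imports used throughout these examples are available). batch_to_ids pads every word to the 50-character limit used by ELMoCharacterMapper:

# Sketch: convert two tokenized sentences into ELMo character ids.
sentences = [['The', 'sentence', '.'],
             ['Another', 'longer', 'example', 'sentence', '.']]
character_ids = batch_to_ids(sentences)
# Shape: (batch_size, max_sentence_length, 50) -> here (2, 5, 50).
print(character_ids.shape)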
Example #5
    def __init__(self,
                 bias: torch.Tensor = None,
                 num_bias: int = 1,
                 contraction: Tuple[torch.Tensor, torch.Tensor] = None,
                 options_file: str = DEFAULT_OPTIONS_FILE,
                 weight_file: str = DEFAULT_WEIGHT_FILE,
                 cuda_device: int = -1) -> None:
        """
        Parameters
        ----------
        options_file : ``str``, optional
            A path or URL to an ELMo options file.
        weight_file : ``str``, optional
            A path or URL to an ELMo weights file.
        cuda_device : ``int``, optional, (default=-1)
            The GPU device to run on.
        """
        self.indexer = ELMoTokenCharactersIndexer()

        logger.info("Initializing ELMo.")
        self.elmo_bilm = ElmoBilmDebias(options_file, weight_file)
        if cuda_device >= 0:
            self.elmo_bilm = self.elmo_bilm.cuda(device=cuda_device)

        self.cuda_device = cuda_device

        self.num_bias = num_bias
        self.bias = bias
        self.contraction = contraction
        if cuda_device >= 0:
            if self.bias is not None:
                self.bias = self.bias.cuda(device=cuda_device)
            if self.contraction is not None:
                self.contraction = (self.contraction[0].cuda(device=cuda_device), self.contraction[1].cuda(device=cuda_device))
Example #6
def get_token_utils(name: str = config.embedder):
    if name == 'elmo':
        from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
        from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

        # the token indexer is responsible for mapping tokens to integers
        token_indexer = ELMoTokenCharactersIndexer()

        def tokenizer(x: str):
            return [
                w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                                  pos_tags=False).split_words(
                                                      x)[:config.max_seq_len]
            ]

        return token_indexer, tokenizer
    elif name == 'bert':
        from allennlp.data.token_indexers import PretrainedBertIndexer

        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=config.max_seq_len,
            do_lowercase=True,
        )

        def tokenizer(s: str):
            return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len -
                                                        2]

        return token_indexer, tokenizer
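A short, hypothetical usage sketch for get_token_utils above (Token and TextField are the AllenNLP classes used elsewhere in these examples; the 'elmo' branch is assumed):

# Sketch: tokenize a sentence and wrap it in a TextField keyed by the indexer.
token_indexer, tokenizer = get_token_utils('elmo')
tokens = [Token(t) for t in tokenizer("ELMo maps each token to character ids.")]
field = TextField(tokens, {'character_ids': token_indexer})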
Example #7
def _get_reader(config,
                skip_labels=False,
                bert_max_length=None,
                reader_max_length=150,
                read_first=None):
    indexers = {}
    for embedder_config in config.embedder.models:
        if embedder_config.name == 'elmo':
            indexers[embedder_config.name] = ELMoTokenCharactersIndexer()
        elif embedder_config.name.endswith('bert'):
            bert_path = os.path.join(config.data.pretrained_models_dir,
                                     embedder_config.name)
            indexers[
                embedder_config.name] = PretrainedTransformerMismatchedIndexer(
                    model_name=bert_path,
                    tokenizer_kwargs={'do_lower_case': False},
                    max_length=bert_max_length)
        elif embedder_config.name == 'char_bilstm':
            indexers[embedder_config.name] = TokenCharactersIndexer()
        else:
            assert False, 'Unknown embedder {}'.format(embedder_config.name)

    return UDDatasetReader(indexers,
                           skip_labels=skip_labels,
                           max_length=reader_max_length,
                           read_first=read_first)
Example #8
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')

        elmo_token_embedder = _ElmoCharacterEncoder(options_file, weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.token_to_indices(Token(token), Vocabulary())
            indices = Variable(torch.from_numpy(numpy.array(indices))).view(
                1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                                  embeddings[0, 1, :].data.numpy())
Example #9
def elmo(ll):
    # `w` (sentences per key), `re` (results per key), `options_file` and
    # `weight_file` are assumed to be defined elsewhere in the original script.
    count = 0
    for k in ll:
        sen_list = w[k]
        count += 1
        sen_s = []
        for s in sen_list:
            sen_s.append(s.split())
        elmo_model = Elmo(options_file, weight_file, 1)
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        for sen in sen_s:
            tokens = [Token(token) for token in sen]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)
        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)

        dic = {'elmo': {'num_tokens': 15}}
        character_ids = dataset.as_tensor_dict(dic)['elmo']['character_ids']
        # First (and only) output representation: (num_sentences, num_tokens, dim).
        result = elmo_model(character_ids)['elmo_representations'][0]
        # Split into one tensor per sentence.
        re[k] = list(torch.chunk(result, result.shape[0], 0))
Example #10
 def __init__(self, text_name, label_name, sep):
     super().__init__(lazy=False)
     self.sep = sep
     self.text_name = text_name
     self.label_name = label_name
     self.tokeniser = WordTokenizer()
     self.token_indexers = {"character_ids": ELMoTokenCharactersIndexer()}
Example #11
    def test_elmo(self):
        # load the test model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        elmo = Elmo(options_file, weight_file, 2)

        # Correctness checks are in ElmoBiLm and ScalarMix, here we just add a shallow test
        # to ensure things execute.
        indexer = ELMoTokenCharactersIndexer()
        sentences = [['The', 'sentence', '.'],
                     ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.']]

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        character_ids = dataset.as_array_dict()['elmo']['character_ids']

        output = elmo(Variable(torch.from_numpy(character_ids)))
        elmo_representations = output['elmo_representations']
        mask = output['mask']

        assert len(elmo_representations) == 2
        assert list(elmo_representations[0].size()) == [2, 7, 32]
        assert list(elmo_representations[1].size()) == [2, 7, 32]
        assert list(mask.size()) == [2, 7]
Example #12
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["tokens"]
Example #13
def train_model(parameters, name):
    token_indexer = {
        "tokens": ELMoTokenCharactersIndexer()
    } if parameters['use_elmo'] else None
    reader = SSJ500KReader(
        token_indexer) if parameters["dataset"] == "ssj" else SentiCorefReader(
            token_indexer)
    train_dataset = reader.read("train")
    validation_dataset = reader.read("test")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    # vocab = Vocabulary() if parameters['use_elmo'] else Vocabulary.from_instances(train_dataset + validation_dataset)
    model = get_model(vocab, parameters)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = optim.Adam(model.parameters(),
                           lr=parameters['lr'],
                           weight_decay=parameters['weight_decay'])
    iterator = BucketIterator(batch_size=parameters['batch_size'],
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=parameters['patience'],
                      num_epochs=parameters['num_epochs'],
                      cuda_device=cuda_device)
    trainer.train()
    metrics = evaluate(model, validation_dataset, iterator, cuda_device, None)
    save_model_and_vocab(model, vocab, metrics, parameters, fname=name)
Example #14
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    cuda_device = -1

    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics
Example #15
 def __init__(self, options_file: str, weight_file: str, cuda_device: int):
     from allennlp.modules.elmo import _ElmoBiLm
     from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
     self.indexer = ELMoTokenCharactersIndexer()
     self.elmo_bilm = _ElmoBiLm(options_file, weight_file)
     if cuda_device >= 0:
         self.elmo_bilm = self.elmo_bilm.cuda(device=cuda_device)
     self.cuda_device = cuda_device
     self.tokenizer = pyonmttok.Tokenizer("conservative",
                                          joiner_annotate=False)
Example #16
def elmo_evaluate(args, loader, train_frac=0.0):
    args.metadata = None
    train_batcher, test_batcher, train_df, test_df, used_sf_lf_map = loader(
        args, batch_size=args.batch_size, train_frac=train_frac)

    # Create model experiments directory or clear if it already exists
    weights_dir = os.path.join(home_dir, 'weights', 'acronyms',
                               args.experiment)
    if os.path.exists(weights_dir):
        print('Clearing out previous weights in {}'.format(weights_dir))
        rmtree(weights_dir)
    os.mkdir(weights_dir)
    results_dir = os.path.join(weights_dir, 'results')
    os.mkdir(results_dir)
    os.mkdir(os.path.join(results_dir, 'confusion'))

    elmo_model_path = '~/allennlp/{}/model.tar.gz'.format(args.lm_experiment)
    elmo = get_pretrained_elmo(lm_model_file=elmo_model_path)
    device_str = 'cuda' if torch.cuda.is_available() else 'cpu'

    if args.ckpt is not None:
        ckpt_str = 'best' if args.ckpt == 'best' else 'model_state_epoch_{}'.format(
            args.ckpt)
        ckpt_fp = os.path.join(
            os.path.expanduser('~'),
            'allennlp/{}/{}.th'.format(args.lm_experiment, ckpt_str))

        state_dict = torch.load(ckpt_fp)
        model_dict = elmo.state_dict()
        updated_state_dict = {('_lm.' + k): v
                              for k, v in state_dict.items()
                              if '_lm.' + k in model_dict}
        # 2. overwrite entries in the existing state dict
        model_dict.update(updated_state_dict)
        # 3. load the new state dict
        elmo.load_state_dict(model_dict)

    model = ELMoAcronymExpander(elmo).to(device_str)
    indexer = ELMoTokenCharactersIndexer()
    vocab = elmo._lm.vocab

    sf_tokenized_lf_map = defaultdict(list)
    for sf, lf_list in used_sf_lf_map.items():
        for lf in lf_list:
            tokens = lf_tokenizer(lf)
            sf_tokenized_lf_map[sf].append(tokens)

    return elmo_analyze(test_batcher,
                        model,
                        used_sf_lf_map,
                        vocab,
                        sf_tokenized_lf_map,
                        indexer,
                        results_dir=results_dir)
Example #17
 def __init__(self, models_dir='models/allen/sentiment-regression'):
     Service.__init__(self, 'sentiment', 'allen-regression', ['parse'])
     self.models = {}
     self.descriptions = {}
     self.indexer = ELMoTokenCharactersIndexer()
     for lang in os.listdir(models_dir):
         if len(lang) == 2:
             self.models[lang] = self._load_model(
                 os.path.join(models_dir, lang))
             self.descriptions[lang] = _load_model_description(
                 os.path.join(models_dir, lang))
Example #18
def run_text_input(model_dir, text):
    model, params, _ = load_model_and_vocab(model_dir)
    token_indexer = {
        "tokens": ELMoTokenCharactersIndexer()
    } if params['use_elmo'] else None
    reader = SSJ500KReader(token_indexer)
    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict(text)['tag_logits']
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([(w, model.vocab.get_token_from_index(i, 'labels'))
           for w, i in zip(text.split(" "), tag_ids)])
Example #19
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, u'vocab_test.txt'),
                  u'r') as fin:
            words = fin.read().strip().split(u'\n')

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token(word) for word in words]

        indices = indexer.tokens_to_indices(tokens, vocab, u"elmo")
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            char_indices = indices[u"elmo"][(k * 50):((k + 1) * 50)]
            sentences.append(
                indexer.pad_token_sequence({u'key': char_indices},
                                           desired_num_tokens={u'key': 50},
                                           padding_lengths={})[u'key'])
        batch = torch.from_numpy(numpy.array(sentences))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
            elmo_token_embedder_output[u'token_embedding'],
            elmo_token_embedder_output[u'mask'])[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(
            -1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path,
                                      u'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, u'r') as fin:
            expected_embeddings = fin[u'embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)],
                              expected_embeddings,
                              atol=1e-6)
Example #20
    def __init__(self,
                 chunker_path: str,
                 segmental_vocabulary: Vocabulary,
                 preprocessed_chunk_file: str = None,
                 max_span_width: int = 89,
                 update_chunker_params: bool = False,
                 remove_dropout: bool = False,
                 bos_token: str = '<S>',
                 eos_token: str = '</S>',
                 namespace: str = 'chunky_elmo') -> None:
        self._namespace = namespace
        self._max_span_width = max_span_width

        # First initialize the chunker.
        if preprocessed_chunk_file is not None:
            self.chunks_dict: Dict[str, List[str]] = {}
            self.read_predicted_chunks(preprocessed_chunk_file)
        else:
            self.chunks_dict = None
            logger.info("Reading Chunker from %s", chunker_path)
            from allennlp.models.archival import load_archive
            chunker_archive = load_archive(chunker_path)
            self.chunker = chunker_archive.model

            if not update_chunker_params:
                for param in self.chunker.parameters():
                    param.requires_grad_(False)

            if remove_dropout:
                # Setting dropout to 0.0 for all parameters in chunker.
                self.chunker.dropout.p = 0.0
                self.chunker.encoder._module.dropout = 0.0
                self.chunker.text_field_embedder.token_embedder_elmo._elmo._dropout.p = 0.0

        self.elmo_indexer = ELMoTokenCharactersIndexer(
            namespace='elmo_characters')
        self.token_indexer = SingleIdTokenIndexer()

        self.seglm_vocab = segmental_vocabulary  #load_archive(segmental_path).model.vocab
        self.bos_token = bos_token
        self.eos_token = eos_token
Example #21
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"),
                  "r") as fin:
            words = fin.read().strip().split("\n")

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token(word) for word in words]

        indices = indexer.tokens_to_indices(tokens, vocab)
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            char_indices = indices["elmo_tokens"][(k * 50):((k + 1) * 50)]
            sentences.append(
                indexer.as_padded_tensor_dict(
                    {"elmo_tokens": char_indices},
                    padding_lengths={"elmo_tokens": 50})["elmo_tokens"])
        batch = torch.stack(sentences)

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
            elmo_token_embedder_output["token_embedding"],
            elmo_token_embedder_output["mask"])[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(
            -1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path,
                                      "elmo_token_embeddings.hdf5")
        with h5py.File(embedding_file, "r") as fin:
            expected_embeddings = fin["embedding"][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)],
                              expected_embeddings,
                              atol=1e-6)
Example #22
 def __init__(self, fold, mode):
     self.mode = mode
     self.fold = fold
     self.instances, self.vocab = load_lm_data(fold=self.fold,
                                               mode=self.mode)
     self.dataloader = DataLoader(dataset=self,
                                  batch_size=32,
                                  shuffle=self.mode == 'train',
                                  num_workers=0,
                                  collate_fn=self.collate,
                                  drop_last=self.mode == 'train')
     self.indexer = ELMoTokenCharactersIndexer()
Example #23
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        elmo_bilm = _ElmoBiLm(options_file, weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
            batch_tensor = Variable(torch.from_numpy(batch['elmo']['character_ids']))
            lm_embeddings = elmo_bilm(batch_tensor)
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example #24
def manually_test_reader():
    token_indexer = ELMoTokenCharactersIndexer()

    def tokenizer(x: str):
        return [
            w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                              pos_tags=False).split_words(x)
        ]

    reader = TextExpDataSetReader(token_indexers=token_indexer,
                                  tokenizer=tokenizer)
    instances = reader.read(os.path.join(data_directory, 'test_code_data.csv'))
Example #25
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(FIXTURES, 'vocab_test.txt'), 'r') as fin:
            tokens = fin.read().strip().split('\n')

        indexer = ELMoTokenCharactersIndexer()
        indices = [
            indexer.token_to_indices(Token(token), Vocabulary())
            for token in tokens
        ]
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            sentences.append(
                indexer.pad_token_sequence(indices[(k * 50):((k + 1) * 50)],
                                           desired_num_tokens=50,
                                           padding_lengths={}))
        batch = Variable(torch.from_numpy(numpy.array(sentences)))

        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')

        elmo_token_embedder = _ElmoCharacterEncoder(options_file, weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
            elmo_token_embedder_output['token_embedding'],
            elmo_token_embedder_output['mask'])[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(
            -1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(FIXTURES, 'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)],
                              expected_embeddings,
                              atol=1e-6)
Example #26
def multiprocess_training_loader(process_number: int, _config,
                                 _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping,
                                 _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spaCy pipeline in a preprocessing step before training
    # (and concatenate the tokens with single whitespaces), so here we only split on whitespace
    _tokenizer = None
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {
            "tokens": SingleIdTokenIndexer(lowercase_tokens=True)
        }
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {
            "tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])
        }
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                               _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])

    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(
        lazy=True,
        tokenizer=_tokenizer,
        token_indexers=_token_indexers,
        max_doc_length=_config["max_doc_length"],
        max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                             ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file),
                                    num_epochs=1):

        _queue.put(
            training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait(
    )  # keep this process alive until all the shared memory is used and not needed anymore
Example #27
def _get_reader(config,
                skip_labels=False,
                bert_max_length=None,
                reader_max_length=150,
                read_first=None):
    indexer = None
    if config.embedder.name == 'elmo':
        indexer = ELMoTokenCharactersIndexer()
    elif config.embedder.name.endswith('bert'):
        bert_path = os.path.join(config.data.pretrained_models_dir,
                                 config.embedder.name)
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=bert_path,
            tokenizer_kwargs={'do_lower_case': False},
            max_length=bert_max_length)
    elif config.embedder.name == 'both':
        elmo_indexer = ELMoTokenCharactersIndexer()

        bert_path = os.path.join(config.data.pretrained_models_dir, 'ru_bert')
        bert_indexer = PretrainedTransformerMismatchedIndexer(
            model_name=bert_path,
            tokenizer_kwargs={'do_lower_case': False},
            max_length=bert_max_length)

        return UDDatasetReader({
            'elmo': elmo_indexer,
            'ru_bert': bert_indexer
        },
                               skip_labels=skip_labels,
                               max_length=reader_max_length,
                               read_first=read_first)
    else:
        assert False, 'Unknown embedder {}'.format(config.embedder.name)

    return UDDatasetReader({config.embedder.name: indexer},
                           skip_labels=skip_labels,
                           max_length=reader_max_length,
                           read_first=read_first)
Example #28
def setup_reader(d_id: int, file_name: str,
                 binary_class: str) -> DatasetReader:
    bio_dataset = BIODataset(
        dataset_id=d_id,
        file_name=file_name,
        binary_class=binary_class,
    )

    bio_dataset.parse_file()

    return BIODatasetReader(bio_dataset=bio_dataset,
                            token_indexers={
                                'tokens': ELMoTokenCharactersIndexer(),
                            })
Example #29
 def get_token_indexer(self, token_indexers):
     self.token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     # the token indexer is responsible for mapping tokens to integers
     if self.embeddings == 'elmo':
         self.token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
     elif self.embeddings == 'bert':
         self.bert_embedder = PretrainedBertIndexer(
             pretrained_model="bert-base-uncased",
             max_pieces=128,
             do_lowercase=True,
         )
         self.token_indexers = {"bert": self.bert_embedder}
Example #30
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {"character_ids": indexer, "tokens": indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #31
    def __init__(self, options_file, weight_file, cuda_device, embedding_dim,
                 dropout):

        self.indexer = ELMoTokenCharactersIndexer()
        logger.info("Initializing ELMo.")
        self.elmo = ElmoTokenEmbedder2(options_file,
                                       weight_file,
                                       dropout=dropout,
                                       projection_dim=embedding_dim)
        if cuda_device >= 0:
            self.elmo = self.elmo.cuda(device=cuda_device)

        self.cuda_device = cuda_device
        self.embedding_dim = embedding_dim
Example #32
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(
                iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6,
                    ))