Example 1
    def _fit_label_encoder(self, entities_lexicon):
        self.label_encoder = PyramidLabelEncoder()
        self.label_encoder.set_tokenizer(self.tokenizer)
        if entities_lexicon is not None:
            self.label_encoder.fit(entities_lexicon)
        else:
            # no explicit lexicon: derive the entity names from the data itself
            self.label_encoder.fit(
                [e.name for x in self.data for e in x.entities])
Example 2
    def __init__(self,
                 word_lexicon,
                 word_embeddings,
                 entities_lexicon,
                 language_model=None,
                 language_model_casing=True,
                 char_embeddings_dim=60,
                 encoder_hidden_size=100,
                 encoder_output_size=200,
                 decoder_hidden_size=100,
                 pyramid_max_depth=None,
                 inverse_pyramid=False,
                 custom_tokenizer=None,
                 decoder_dropout=0.4,
                 encoder_dropout=0.4,
                 device='cpu'):
        if isinstance(word_embeddings, str):
            word_embeddings = [word_embeddings]
        if isinstance(word_embeddings, list) and isinstance(
                word_embeddings[0], str):
            word_embeddings = FlairWordEmbeddings(word_embeddings,
                                                  lexicon=word_lexicon,
                                                  padding_idx=0)

        self._model_args = {
            'word_embeddings': word_embeddings.to(device),
            'language_model': language_model,
            'char_embeddings_dim': char_embeddings_dim,
            'encoder_hidden_size': encoder_hidden_size,
            'encoder_output_size': encoder_output_size,
            'decoder_hidden_size': decoder_hidden_size,
            'pyramid_max_depth': pyramid_max_depth,
            'batch_first': True,
            'inverse_pyramid': inverse_pyramid,
            'decoder_dropout': decoder_dropout,
            'encoder_dropout': encoder_dropout,
            'device': device
        }

        self.tokenizer = custom_tokenizer or (lambda t: t.split())
        self.label_encoder = PyramidLabelEncoder()
        self.word_vectorizer = WordVectorizer()
        self.char_vectorizer = CharVectorizer()
        self.label_encoder.fit(entities_lexicon)
        self.word_vectorizer.fit(word_lexicon)
        self.char_vectorizer.fit()
        for component in [
                self.word_vectorizer, self.char_vectorizer, self.label_encoder
        ]:
            component.set_tokenizer(self.tokenizer)
        if language_model is not None:
            self._model_args['language_model'] = TransformerWordEmbeddings(
                language_model,
                word_lexicon,
                padding_idx=0,
                device=device,
                casing=language_model_casing)

        self.nnet = self._init_nnet()
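
A minimal construction sketch for the constructor above; the class name is taken from Example 4 (which contains the same constructor), and the lexicon, entity names and the "glove" embedding name are illustrative assumptions (the string handling at the top of __init__ suggests flair-style embedding names are accepted):

    # Illustrative values only -- not taken from the original example.
    ner = PyramidNer(word_lexicon=["alice", "visited", "paris"],
                     word_embeddings="glove",
                     entities_lexicon=["PER", "LOC"],
                     pyramid_max_depth=8,
                     device="cpu")
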
Example 3
    def __init__(self,
                 data_reader,
                 token_lexicon=None,
                 custom_tokenizer=None,
                 char_vectorizer=False,
                 pyramid_max_depth=None):
        """

        :param data_reader: generator of DataPoint objects representing the samples in the dataset.
        :param token_lexicon: iterable of strings containing the lexicon. If it's None, this will
                              be automatically generated from the data.
        :param custom_tokenizer: callable that performs tokenization given a single text input. If
                                 left to None, uses utils.text.default_tokenizer.
        :param char_vectorizer: if True, also build character-level encodings.
        :param pyramid_max_depth: maximum pyramid depth; None means unlimited depth.
        """

        self.data = [data_point for data_point in data_reader]
        self.tokenizer = custom_tokenizer or default_tokenizer

        if not token_lexicon:
            token_lexicon = {
                token
                for x in self.data for token in self.tokenizer(x.text)
            }

        self.word_vectorizer = WordVectorizer()
        self.word_vectorizer.set_tokenizer(self.tokenizer)
        self.word_vectorizer.fit(token_lexicon)
        if char_vectorizer:
            self.char_vectorizer = CharVectorizer()
            self.char_vectorizer.set_tokenizer(self.tokenizer)
            self.char_vectorizer.fit()
        else:
            self.char_vectorizer = None

        self.pyramid_max_depth = pyramid_max_depth
        self.label_encoder = PyramidLabelEncoder()
        self.label_encoder.set_tokenizer(self.tokenizer)
        self.label_encoder.fit([e.name for x in self.data for e in x.entities])
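
A short usage sketch for this constructor; the reader function, the sample text and the Entity fields are illustrative assumptions (the Entity argument order follows the Entity(...) call in Example 4 below):

    # Hypothetical data; DataPoint(text, entities) mirrors how the class is used elsewhere.
    def read_points():
        yield DataPoint("Alice visited Paris",
                        [Entity("LOC", "Paris", 14, 19)])

    dataset = PyramidNerDataset(read_points(),
                                token_lexicon=None,     # lexicon derived from the data
                                char_vectorizer=True,   # also build char encodings
                                pyramid_max_depth=8)
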
Example 4
class PyramidNer(object):
    class _Model(nn.Module):
        def __init__(self, encoder, pyramid, classifier):
            super(PyramidNer._Model, self).__init__()
            self.encoder = encoder
            self.pyramid = pyramid
            self.classifier = classifier

        def forward(self, *args, **kwargs):
            x, mask = self.encoder(*args, **kwargs)
            h, h_remedy = self.pyramid(x, mask)
            return self.classifier(h, h_remedy)

    def __init__(self,
                 word_lexicon,
                 word_embeddings,
                 entities_lexicon,
                 language_model=None,
                 language_model_casing=True,
                 char_embeddings_dim=60,
                 encoder_hidden_size=100,
                 encoder_output_size=200,
                 decoder_hidden_size=100,
                 pyramid_max_depth=None,
                 inverse_pyramid=False,
                 custom_tokenizer=None,
                 decoder_dropout=0.4,
                 encoder_dropout=0.4,
                 device='cpu'):
        if isinstance(word_embeddings, str):
            word_embeddings = [word_embeddings]
        if isinstance(word_embeddings, list) and isinstance(
                word_embeddings[0], str):
            word_embeddings = FlairWordEmbeddings(word_embeddings,
                                                  lexicon=word_lexicon,
                                                  padding_idx=0)

        self._model_args = {
            'word_embeddings': word_embeddings.to(device),
            'language_model': language_model,
            'char_embeddings_dim': char_embeddings_dim,
            'encoder_hidden_size': encoder_hidden_size,
            'encoder_output_size': encoder_output_size,
            'decoder_hidden_size': decoder_hidden_size,
            'pyramid_max_depth': pyramid_max_depth,
            'batch_first': True,
            'inverse_pyramid': inverse_pyramid,
            'decoder_dropout': decoder_dropout,
            'encoder_dropout': encoder_dropout,
            'device': device
        }

        self.tokenizer = custom_tokenizer or (lambda t: t.split())
        self.label_encoder = PyramidLabelEncoder()
        self.word_vectorizer = WordVectorizer()
        self.char_vectorizer = CharVectorizer()
        self.label_encoder.fit(entities_lexicon)
        self.word_vectorizer.fit(word_lexicon)
        self.char_vectorizer.fit()
        for component in [
                self.word_vectorizer, self.char_vectorizer, self.label_encoder
        ]:
            component.set_tokenizer(self.tokenizer)
        if language_model is not None:
            self._model_args['language_model'] = TransformerWordEmbeddings(
                language_model,
                word_lexicon,
                padding_idx=0,
                device=device,
                casing=language_model_casing)

        self.nnet = self._init_nnet()

    @property
    def device(self):
        return self._model_args['device']

    def reset_weights(self):
        self.nnet = self._init_nnet()
        print('New model created!')

    def logits_to_classes(self, logits):
        return [
            torch.argmax(nn.functional.softmax(logit, dim=-1), dim=-1)
            for logit in logits
        ]

    def remedy_to_classes(self, logits):
        if logits is None:
            return
        return torch.round(torch.sigmoid(logits))

    def classes_to_iob2(self, classes, remedy=None):
        labels = self.label_encoder.inverse_transform(classes)
        if remedy is not None:
            labels += self.label_encoder.inverse_remedy_transform(remedy)
        return labels

    def parse(self, x):
        if isinstance(x, list):
            return [self._parse_text(text) for text in x]
        return self._parse_text(x)

    def _parse_text(self, text):
        assert isinstance(text, str), f'Cannot parse {text} (not a string).'
        x = " ".join(self.tokenizer(text))  # tokenization
        device = self.device
        x_word, word_mask = self.word_vectorizer.pad_sequences(
            self.word_vectorizer.transform([DataPoint(x)]))
        x_char, char_mask = self.char_vectorizer.pad_sequences(
            self.char_vectorizer.transform([DataPoint(x)]))

        self.nnet.eval()
        with torch.no_grad():
            layers, remedy = self.nnet(x_word.to(device), word_mask.to(device),
                                       x_char.to(device), char_mask.to(device))
        self.nnet.train(mode=True)

        layers_classes = self.logits_to_classes(layers)
        remedy_classes = self.remedy_to_classes(remedy)
        labels = self.classes_to_iob2(layers_classes, remedy=remedy_classes)

        entities = list()
        tokens = x.split()
        for l, layer in enumerate(labels):
            assert len(layer) == 1
            sequence = layer[0]
            for token, tag in enumerate(sequence):
                if tag == 'O':
                    continue
                entity = tag[2:]  # strip the IOB2 prefix ("B-"/"I-") to get the entity type
                value = " ".join(tokens[token:token + l + 1])
                stop = len(" ".join(tokens[:token + l + 1]))
                start = stop - len(value)
                entities.append(Entity(entity, value, start, stop))

        return DataPoint(x, entities)

    def _build_char_embeddings(self, char_embeddings_dim):
        if not char_embeddings_dim:
            return None
        if char_embeddings_dim % 2:
            raise ValueError(f'Dimension of character embeddings must be '
                             f'an even number (got {char_embeddings_dim})')
        return CharEmbedding(self.char_vectorizer.X,
                             rnn=nn.LSTM,
                             bidirectional=True,
                             embedding_dim=int(char_embeddings_dim / 2))

    def _init_nnet(self):
        sentence_encoder = SentenceEncoder(
            self._model_args['word_embeddings'],
            char_embeddings=self._build_char_embeddings(
                self._model_args['char_embeddings_dim']),
            hidden_size=self._model_args['encoder_hidden_size'],
            output_size=self._model_args['encoder_output_size'],
            rnn_class=nn.LSTM,
            language_model=self._model_args['language_model'],
            dropout=self._model_args['encoder_dropout'],
        )

        if self._model_args['inverse_pyramid']:
            pyramid_cls = BidirectionalPyramidDecoder
        else:
            pyramid_cls = PyramidDecoder
        pyramid_decoder = pyramid_cls(
            input_size=self._model_args['encoder_output_size'],
            hidden_size=self._model_args['decoder_hidden_size'],
            dropout=self._model_args['decoder_dropout'],
            max_depth=self._model_args['pyramid_max_depth'])
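        # classifier input width: 2 * hidden (presumably the two LSTM directions),
        # doubled again when the inverse (bidirectional) pyramid is enabled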
        decoder_output_size = self._model_args['decoder_hidden_size'] * 2 * (
            1 + int(self._model_args['inverse_pyramid']))

        classifier = LinearDecoder(decoder_output_size,
                                   classes=len(self.label_encoder.entities))
        return self._Model(sentence_encoder, pyramid_decoder,
                           classifier).to(self.device)

    def save(self, path, name='pyramid_ner'):
        """
        Saves the model weights and metadata to the specified path,
        so it can be loaded later using the `load` class method.
        :param path: existing directory in which the model folder will be created.
        :param name: name of the subfolder holding the serialized model files.
        :return: None
        """
        if not os.path.isdir(path):
            raise ValueError(f"{path} is not a directory.")

        folder = os.path.join(path, name)
        os.makedirs(folder, exist_ok=True)

        # shallow copy: saving should not clobber the live model args
        model_metadata = dict(self._model_args)
        if isinstance(self._model_args['word_embeddings'],
                      FlairWordEmbeddings):
            model_metadata['word_embeddings'] = self._model_args[
                'word_embeddings'].word_embeddings_names
        else:
            embedding_layer = model_metadata['word_embeddings']
            model_metadata['word_embeddings'] = {
                'num_embedding': embedding_layer.num_embeddings,
                'embedding_dim': embedding_layer.embedding_dim,
                'padding_idx': embedding_layer.padding_idx,
                'freeze': not embedding_layer.requires_grad
            }
        if isinstance(self._model_args['language_model'],
                      TransformerWordEmbeddings):
            model_metadata['language_model'] = self._model_args[
                'language_model'].transformer
        # persist metadata
        with open(os.path.join(folder, 'metadata.yml'), 'w') as meta:
            yaml.safe_dump(model_metadata, meta)

        # persist token lexicon
        with open(os.path.join(folder, 'lexicon.txt'), 'w') as lex:
            for token in self.word_vectorizer.lexicon:
                lex.write(f"{token}\n")
        # persist label lexicon
        with open(os.path.join(folder, 'entities.txt'), 'w') as en:
            for entity in self.label_encoder.entities:
                if entity is not None:
                    en.write(f"{entity}\n")
        state_dict = self.nnet.state_dict()
        # persist state_dict (model weight)
        torch.save(state_dict, os.path.join(folder, 'weights.bin'))

    @classmethod
    def load(cls,
             path,
             custom_tokenizer=None,
             force_device=None,
             force_language_model=None,
             force_embeddings=None):
        if not os.path.isdir(path):
            raise ValueError(f"{path} is not a directory.")
        try:
            # the metadata lives inside the model folder written by `save`
            with open(os.path.join(path, 'metadata.yml'), 'r') as meta:
                model_metadata = yaml.safe_load(meta)
        except Exception as e:
            raise ValueError(
                f"Could not load 'metadata.yml' file at {path}: {e}")

        model_metadata['device'] = force_device or model_metadata['device']
        model_metadata[
            'language_model'] = force_language_model or model_metadata[
                'language_model']
        model_metadata['word_embeddings'] = force_embeddings or model_metadata[
            'word_embeddings']

        if isinstance(model_metadata['word_embeddings'], dict):
            # rebuild the word embeddings matrix (the weights will be loaded from the state_dict)
            freeze = model_metadata['word_embeddings'].pop('freeze')
            place_holder = nn.Embedding(**model_metadata['word_embeddings'])
            place_holder.weight.requires_grad = not freeze
            model_metadata['word_embeddings'] = place_holder

        with open(os.path.join(path, 'lexicon.txt'), 'r') as lex:
            lexicon = [token for token in lex.read().split('\n') if token]
        with open(os.path.join(path, 'entities.txt'), 'r') as en:
            entities = [entity for entity in en.read().split('\n') if entity]

        kwargs = model_metadata
        kwargs['word_lexicon'] = lexicon
        kwargs['entities_lexicon'] = entities
        kwargs['custom_tokenizer'] = custom_tokenizer
        obj = cls(**kwargs)

        state_dict = torch.load(os.path.join(path, 'weights.bin'))
        obj.nnet.load_state_dict(state_dict)
        return obj
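
A hedged round-trip sketch for the persistence and inference methods above, reusing the `ner` instance sketched after Example 2; directory names are illustrative, and note that `load` expects the folder that `save` creates (i.e. path/name):

    ner.save("models", name="pyramid_ner")            # writes metadata.yml, lexicon.txt, entities.txt, weights.bin
    restored = PyramidNer.load("models/pyramid_ner")  # point load at the folder created by save
    parsed = restored.parse("Alice visited Paris")    # returns a DataPoint with the predicted entities
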
Example 5
class PyramidNerDataset(Dataset):
    """
    Dataset class. Use its `get_dataloader` method to obtain a DataLoader
    that can properly collate batches of samples from this Dataset, since
    this Dataset performs dynamic padding of tensors and collation is not
    straightforward.
    """
    class _Collator(Dataset):
        def __init__(self, dataset, device='cpu'):
            self._device = device
            self._wrapped_dataset = dataset
            self._indices = torch.arange(len(dataset))

        def collate_fn(self, batch):
            # `batch` is a list of index tensors; stacking them lets us fetch
            # the whole (already padded) slice from the wrapped dataset at once
            batch = torch.stack(batch)
            actual_batch = dict()
            for name, tensors in self._wrapped_dataset[batch].items():
                if name == 'y':
                    actual_batch[name] = [
                        tensor.to(self._device) for tensor in tensors
                    ]
                elif isinstance(tensors, torch.Tensor):
                    actual_batch[name] = tensors.to(self._device)
                elif isinstance(tensors, dict):
                    actual_batch[name] = {
                        key: value.to(self._device)
                        for key, value in tensors.items()
                    }
            return actual_batch

        def __len__(self):
            return len(self._wrapped_dataset)

        def __getitem__(self, i):
            return self._indices[i]

    def __init__(self,
                 data_reader,
                 token_lexicon=None,
                 custom_tokenizer=None,
                 char_vectorizer=False,
                 pyramid_max_depth=None):
        """

        :param data_reader: generator of DataPoint objects representing the samples in the dataset.
        :param token_lexicon: iterable of strings containing the lexicon. If it's None, this will
                              be automatically generated from the data.
        :param custom_tokenizer: callable that performs tokenization given a single text input. If
                                 left to None, uses utils.text.default_tokenizer.
        :param char_vectorizer: if True, also build character-level encodings.
        :param pyramid_max_depth: maximum pyramid depth; None means unlimited depth.
        """

        self.data = [data_point for data_point in data_reader]
        self.tokenizer = custom_tokenizer or default_tokenizer

        if not token_lexicon:
            token_lexicon = {
                token
                for x in self.data for token in self.tokenizer(x.text)
            }

        self.word_vectorizer = WordVectorizer()
        self.word_vectorizer.set_tokenizer(self.tokenizer)
        self.word_vectorizer.fit(token_lexicon)
        if char_vectorizer:
            self.char_vectorizer = CharVectorizer()
            self.char_vectorizer.set_tokenizer(self.tokenizer)
            self.char_vectorizer.fit()
        else:
            self.char_vectorizer = None

        self.pyramid_max_depth = pyramid_max_depth
        self.label_encoder = PyramidLabelEncoder()
        self.label_encoder.set_tokenizer(self.tokenizer)
        self.label_encoder.fit([e.name for x in self.data for e in x.entities])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        if isinstance(i, int):
            ids = torch.tensor([i])
            sample = [self.data[i]]
        else:
            indices = torch.arange(len(self.data)).long()
            sample = [self.data[index] for index in indices[i]]
            ids = torch.tensor([index for index in indices[i]])

        data = self._transform_x(sample)
        max_depth = self.pyramid_max_depth
        data['y'], data['y_remedy'] = self.label_encoder.transform(
            sample, max_depth=max_depth)
        data['id'] = ids.long()

        return data

    def _transform_x(self, sample):
        x = dict()
        for vect, role in [(self.word_vectorizer, 'word'),
                           (self.char_vectorizer, 'char')]:
            if vect is not None:
                vectors, mask = vect.pad_sequences(vect.transform(sample))
                x[f'{role}_vectors'], x[f'{role}_mask'] = vectors, mask

        return x

    def get_dataloader(self,
                       batch_size=32,
                       shuffle=True,
                       device='cpu',
                       bucketing=False):
        def _collate_fn(batch, device=device):
            batch = batch[0]
            for name in batch.keys():
                if name == 'y':
                    batch[name] = [tensor.to(device) for tensor in batch[name]]
                elif isinstance(batch[name], torch.Tensor):
                    batch[name] = batch[name].to(device)
                elif isinstance(batch[name], dict):
                    batch[name] = _collate_fn([batch[name]], device)
            return batch

        if bucketing:  # use sequence bucketing
            sequence_lengths = torch.tensor(
                [len(self.tokenizer(sample.text)) for sample in self.data])
            dataloader = SequenceBucketing.as_dataloader(
                self, sequence_lengths, batch_size, shuffle)
        else:
            collator = self._Collator(self, device)
            dataloader = DataLoader(collator,
                                    batch_size=batch_size,
                                    shuffle=shuffle)
            _collate_fn = collator.collate_fn
        dataloader.collate_fn = _collate_fn

        return dataloader
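
A short sketch of consuming the DataLoader returned by `get_dataloader`; the key names follow `__getitem__` above, and the dataset is the hypothetical one sketched after Example 3:

    loader = dataset.get_dataloader(batch_size=16, shuffle=True, device="cpu")
    for batch in loader:
        # each batch carries 'word_vectors', 'word_mask', 'id', the per-layer targets 'y'
        # and, when the label encoder produces them, the 'y_remedy' targets
        # ('char_vectors'/'char_mask' are present only when char_vectorizer=True)
        word_vectors, word_mask = batch['word_vectors'], batch['word_mask']
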
Example 6
    def _initialize_label_encoder(self, entities_lexicon):
        self.label_encoder = PyramidLabelEncoder()
        self.label_encoder.fit(entities_lexicon)