Example 1
    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.ndarray]:
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(
                text, max_seq_length, is_tokenized)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        albert_inputs = dict(input_ids=np.array(input_ids),
                             input_mask=np.array(input_masks),
                             segment_ids=np.array(segment_ids))

        bert_outputs = self.albert_module(albert_inputs,
                                          signature="tokens",
                                          as_dict=True)
        sequence_output = bert_outputs["sequence_output"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
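
Every example on this page routes the pooled branch through POOL_FUNC_MAP, which is defined elsewhere in the library. A minimal sketch of how such a map could look, assuming plain NumPy reductions keyed by name (the real keys and functions may differ):

import numpy as np

# Assumed shape of the pooling map: name -> NumPy reduction applied over
# the token axis. The library's actual keys may differ.
POOL_FUNC_MAP = {
    "reduce_mean": np.mean,
    "reduce_max": np.max,
    "reduce_min": np.min,
}

token_embeddings = np.random.rand(2, 128, 768)  # (batch, seq_len, hidden)
pooled = POOL_FUNC_MAP["reduce_mean"](token_embeddings, axis=1)
print(pooled.shape)  # (2, 768): one fixed-size vector per text
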
Example 2
    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.ndarray]:
        tokenized_texts = texts
        if not is_tokenized:
            tokenized_texts = [Embeddings.tokenize(text) for text in texts]
        tokenized_text_words = [[
            self.word2idx.get(w, self.word2idx['unk']) for w in text
        ] for text in tokenized_texts]
        embeddings = []

        for x in tokenized_text_words:
            x = np.reshape(x, (1, len(x)))  # a batch holding one sequence
            embeddings.append(self.ulmfit_model.predict(x)[1][0])
        if not pooling:
            return embeddings
        else:
            if pooling not in POOL_FUNC_MAP:
                raise NotImplementedError(
                    f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(embeddings, axis=1)
            return pooled
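
This ULMFiT variant first maps tokens to vocabulary indices, with unknown words falling back to the 'unk' index, before calling the Keras model one sequence at a time. A runnable sketch of that lookup with a toy vocabulary (in the library, word2idx is loaded alongside the model weights):

import numpy as np

word2idx = {"unk": 0, "hello": 1, "world": 2}  # toy vocabulary

tokens = ["hello", "there", "world"]  # "there" is out-of-vocabulary
ids = [word2idx.get(w, word2idx["unk"]) for w in tokens]
x = np.reshape(ids, (1, len(ids)))    # a batch holding one sequence
print(x)  # [[1 0 2]]
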
Example 3
    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.ndarray]:

        text_tokens = texts
        if not is_tokenized:
            text_tokens = [Embeddings.tokenize(text) for text in texts]
        text_tokens = [Embeddings.padded_tokens(tokens, self.max_seq_length) for tokens in text_tokens]
        seq_length = [self.max_seq_length] * len(texts)

        elmo_inputs = {
            self.tokens: np.array(text_tokens),
            self.sequence_len: np.array(seq_length)
        }

        token_embeddings = self.sess.run(self.elmo_outputs, feed_dict=elmo_inputs)["elmo"]

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
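
This version pads or truncates every text to self.max_seq_length before feeding the graph. Embeddings.padded_tokens is not shown on this page; a plausible sketch of such a helper, assuming the empty string serves as the pad token:

def padded_tokens(tokens, max_seq_length, pad=""):
    # Truncate to the fixed length, then right-pad with the pad token.
    tokens = tokens[:max_seq_length]
    return tokens + [pad] * (max_seq_length - len(tokens))

print(padded_tokens(["the", "quick", "fox"], 5))
# ['the', 'quick', 'fox', '', '']
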
Example 4
    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.ndarray]:

        text_tokens = texts
        if not is_tokenized:
            text_tokens = [Embeddings.tokenize(text) for text in texts]
        if max_seq_length:
            text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length) for tokens in text_tokens]
            seq_length = [max_seq_length] * len(texts)
        else:
            seq_length = [len(tokens) for tokens in text_tokens]

        sequence_output = self.elmo_module(inputs={"tokens": text_tokens, "sequence_len": seq_length},
                                           signature="tokens", as_dict=True)["elmo"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
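
Unlike Example 3, this variant calls the TF Hub module directly and only pads when max_seq_length is truthy; otherwise each text keeps its natural length. A self-contained sketch of the same TF1 graph-mode call pattern against the public ELMo module (initializer calls added, since the module carries variables and lookup tables):

import tensorflow as tf
import tensorflow_hub as hub

# Requires tensorflow<2 and tensorflow_hub; runs in TF1 graph mode.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
tokens = [["the", "quick", "fox"], ["hi", "", ""]]  # "" pads the short text
seq_len = [3, 1]
out = elmo(inputs={"tokens": tokens, "sequence_len": seq_len},
           signature="tokens", as_dict=True)["elmo"]

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    emb = sess.run(out)
print(emb.shape)  # (2, 3, 1024)
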
Example 5
    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.ndarray]:
        oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
        token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
                                     for text in texts])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP:
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
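
Here the per-text work lives in self._single_encode_text, which is not shown. A hypothetical sketch of what such a helper might do, assuming a plain per-token lookup that substitutes the zero OOV vector for unknown words (names and behavior are assumptions, not the library's code):

import numpy as np

vectors = {"hello": np.ones(3, dtype="float32")}  # toy lookup table
oov_vector = np.zeros(3, dtype="float32")

def single_encode_text(tokens, oov_vector):
    # One row per token; unknown words get the zero OOV vector.
    return np.array([vectors.get(t, oov_vector) for t in tokens])

print(single_encode_text(["hello", "xyz"], oov_vector))
# [[1. 1. 1.]
#  [0. 0. 0.]]
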
Example 6
    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.ndarray]:
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(
                text, max_seq_length, is_tokenized)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        # Construct an XLNet model
        xlnet_model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                                       run_config=self.run_config,
                                       input_ids=np.array(input_ids,
                                                          dtype=np.int32),
                                       seg_ids=np.array(segment_ids,
                                                        dtype=np.int32),
                                       input_mask=np.array(input_masks,
                                                           dtype=np.float32))
        self.sess.run(tf.initializers.global_variables())

        # Get a sequence output
        sequence_output = xlnet_model.get_sequence_output()
        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP:
                raise NotImplementedError(
                    f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
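
Note the dtypes this variant feeds to xlnet.XLNetModel: int32 token and segment ids, but a float32 mask, where, in XLNet's convention (the opposite of BERT's), 1.0 marks a padding position. A small sketch of the expected arrays:

import numpy as np

# One text of three real tokens plus one pad slot, max_seq_length = 4.
input_ids   = np.array([[35, 4, 3, 0]], dtype=np.int32)
segment_ids = np.array([[0, 0, 0, 0]], dtype=np.int32)
input_masks = np.array([[0.0, 0.0, 0.0, 1.0]], dtype=np.float32)  # 1.0 = pad

print(input_ids.shape)  # (1, 4): (batch, max_seq_length)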