class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
                        Embedding(name=u'use_dan',
                                  dimensions=512,
                                  corpus_size='na',
                                  vocabulary_size='230k',
                                  download_url='https://storage.googleapis.com/tfhub-modules/'
                                               'google/universal-sentence-encoder/2.tar.gz',
                                  format='tar.gz',
                                  architecture='DAN',
                                  trained_data='wikipedia and other sources',
                                  language='en'),
                        Embedding(name=u'use_transformer_large',
                                  dimensions=512,
                                  corpus_size='na',
                                  vocabulary_size='230k',
                                  download_url='https://storage.googleapis.com/tfhub-modules/'
                                               'google/universal-sentence-encoder-large/3.tar.gz',
                                  format='tar.gz',
                                  architecture='Transformer',
                                  trained_data='wikipedia and other sources',
                                  language='en'),
                        Embedding(name=u'use_transformer_lite',
                                  dimensions=512,
                                  corpus_size='na',
                                  vocabulary_size='na',
                                  download_url='https://storage.googleapis.com/tfhub-modules/'
                                               'google/universal-sentence-encoder-lite/2.tar.gz',
                                  format='tar.gz',
                                  architecture='Transformer',
                                  trained_data='wikipedia and other sources',
                                  language='en')
                        ]
    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        self.use_module = None
        self.model_name = None

    def load_model(self, model: str, model_path: str):
        self.use_module = hub.Module(model_path)
        # Initialise the module's variables and lookup tables after loading it.
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        self.model_name = model

    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        return self.sess.run(self.use_module(texts))
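
A minimal usage sketch for the class above, assuming the TF Hub archive has already been downloaded and extracted to a local directory (the path and sentences below are placeholders):

# Hypothetical usage sketch; 'use_dan' is one of the names listed in EMBEDDING_MODELS.
embeddings = Embeddings()
embeddings.load_model(model='use_dan', model_path='/tmp/use_dan')
vectors = embeddings.encode(texts=['the quick brown fox', 'hello world'],
                            pooling=None, max_seq_length=128)  # both arguments are accepted but ignored by this variant
print(vectors.shape)  # expected: (2, 512) -- one 512-d sentence vector per input text
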
Example #2
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'ulmfit_forward',
                  dimensions=300,
                  corpus_size='570k human-generated English sentence pairs',
                  vocabulary_size='230k',
                  download_url='https://www.dropbox.com/s/4k5zwvmqdhwql8t/ulmfit_forward.zip?dl=1',
                  format='zip',
                  architecture='Transformer',
                  trained_data='Stephen Merity’s Wikitext 103 dataset',
                  language='en'),

        Embedding(name=u'ulmfit_backward',
                  dimensions=300,
                  corpus_size='570k human-generated English sentence pairs',
                  vocabulary_size='230k',
                  download_url='https://www.dropbox.com/s/152w8wtv3hxmazp/ulmfit_backword.zip?dl=1',
                  format='zip',
                  architecture='Transformer',
                  trained_data='Stephen Merity’s Wikitext 103 dataset',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.ulmfit_model = None
        self.model_name = None
        self.word2idx = None
        self.idx2word = None

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    def load_model(self, model: str, model_path: str):
        """
            Loads architecture and weights from saved model.
            Args:
                model: Name of the model
                model_path: directory path of saved model and architecture file.
        """

        weights_path = os.path.join(model_path, 'model.h5')
        id2word_path = os.path.join(model_path, 'itos_wt103.pkl')

        with open(id2word_path, 'rb') as f:
            idx2word = pickle.load(f)

        self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
        self.idx2word = {i: w for w, i in self.word2idx.items()}

        self.ulmfit_model = build_language_model()
        self.ulmfit_model.load_weights(weights_path)
        self.model_name = model

    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        tokenized_texts = texts
        if not is_tokenized:
            tokenized_texts = [Embeddings.tokenize(text) for text in texts]
        # Fall back to index 0 for out-of-vocabulary words (assumed to be the `_unk_` token in itos_wt103).
        tokenized_text_words = [[self.word2idx.get(w, 0) for w in text] for text in tokenized_texts]
        embeddings = []

        for x in tokenized_text_words:
            x = np.reshape(x, (1, len(x)))
            embeddings.append(self.ulmfit_model.predict(x)[1][0])
        if not pooling:
            return embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(embeddings, axis=1)
            return pooled
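
A minimal usage sketch, assuming model.h5 and itos_wt103.pkl sit together in a local directory (the path is a placeholder) and that 'mean' is one of the keys in POOL_FUNC_MAP:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='ulmfit_forward', model_path='/tmp/ulmfit_forward')
pooled = embeddings.encode(texts=['language models are useful'],
                           pooling='mean',      # assumed POOL_FUNC_MAP key
                           max_seq_length=128)  # accepted but not used for padding here
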
Example #3
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'wiki_news_300',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'wiki-news-300d-1M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'wiki_news_300_sub',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'wiki-news-300d-1M-subword.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'common_crawl_300',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'crawl-300d-2M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
        Embedding(name=u'common_crawl_300_sub',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'crawl-300d-2M-subword.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model_name = None
        self.max_seq_length = None

    @classmethod
    def tokenize(cls, text):
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        try:
            model_file = [
                f for f in os.listdir(model_path)
                if os.path.isfile(os.path.join(model_path, f))
            ]
            # The first line of a fastText .vec file is a header (vocab size, dimensions); skip it.
            with open(os.path.join(model_path, model_file[0]), 'r', encoding='utf-8') as f:
                next(f)
                for line in tqdm(f):
                    split_line = line.split()
                    word = split_line[0]
                    self.word_vectors[word] = np.array(
                        [float(val) for val in split_line[1:]])
            print("Model loaded successfully!")
            self.model_name = model
            self.max_seq_length = max_seq_length
            return self
        except Exception as e:
            print('Error loading model:', str(e))
        return self

    def _single_encode_text(self, text: Union[str, List[str]],
                            oov_vector: np.array, is_tokenized: bool):
        max_seq_length = self.max_seq_length
        tokens = text
        if not is_tokenized:
            tokens = Embeddings.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[0:max_seq_length]
        while len(tokens) < max_seq_length:
            tokens.append('<pad>')
        return np.array(
            [self.word_vectors.get(token, oov_vector) for token in tokens])

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.array]:
        oov_vector = np.zeros(
            Embeddings.EMBEDDING_MODELS[self.model_name].dimensions,
            dtype="float32")
        token_embeddings = np.array([
            self._single_encode_text(text, oov_vector, is_tokenized)
            for text in texts
        ])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(
                    f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
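
A minimal usage sketch, assuming the .vec file from the download URL has been unzipped into a local directory (the path is a placeholder) and that 'mean' is one of the keys in POOL_FUNC_MAP:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='wiki_news_300', model_path='/tmp/wiki_news_300',
                      max_seq_length=64)
sentence_vectors = embeddings.encode(texts=['fasttext vectors for english'],
                                     pooling='mean')  # (1, 300) after mean pooling over tokens
token_embeddings = embeddings.encode(texts=['fasttext vectors for english'],
                                     pooling=None)    # (1, 64, 300); padded positions use the zero OOV vector
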
Example #4
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(
            name='albert_base',
            dimensions=768,
            corpus_size='3300M',
            vocabulary_size='30522(sub-word)',
            download_url=
            'https://tfhub.dev/google/albert_base/1?tf-hub-format=compressed',
            format='tar.gz',
            architecture='Transformer, Layers=12, Hidden = 768, heads = 12',
            trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
            language='en'),
        Embedding(
            name='albert_large',
            dimensions=1024,
            corpus_size='3300M',
            vocabulary_size='30522(sub-word)',
            download_url=
            'https://tfhub.dev/google/albert_large/1?tf-hub-format=compressed',
            format='tar.gz',
            architecture='Transformer Layers=24, Hidden = 1024, heads = 12',
            trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
            language='en'),
        Embedding(
            name='albert_xlarge',
            dimensions=2048,
            corpus_size='3300M',
            vocabulary_size='30522 (sub-word)',
            download_url=
            'https://tfhub.dev/google/albert_xlarge/1?tf-hub-format=compressed',
            format='tar.gz',
            architecture='Transformer Layers=24, Hidden = 2048, heads = 12',
            trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
            language='en'),
        Embedding(
            name='albert_xxlarge',
            dimensions=4096,
            corpus_size='3300M',
            vocabulary_size='30522 (sub-word)',
            download_url=
            'https://tfhub.dev/google/albert_xxlarge/1?tf-hub-format=compressed',
            format='tar.gz',
            architecture='Transformer Layers=12, Hidden = 4096, heads = 16',
            trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
            language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    tokenizer: FullTokenizer = None

    def __init__(self):
        self.sess = tf.Session()
        self.albert_module = None
        self.model_name = None

    def create_tokenizer_from_hub_module(self):
        """Get the vocab file and casing info from the Hub module."""
        tokenization_info = self.albert_module(signature="tokenization_info",
                                               as_dict=True)

        sentence_piece_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"], tokenization_info["do_lower_case"]
        ])

        Embeddings.tokenizer = FullTokenizer(
            vocab_file=None,
            do_lower_case=do_lower_case,
            spm_model_file=sentence_piece_file)

    @classmethod
    def tokenize(cls, text):
        return cls.tokenizer.tokenize(text)

    @staticmethod
    def _model_single_input(
            text: Union[str, List[str]],
            max_seq_length: int,
            is_tokenized: bool = False
    ) -> Tuple[List[int], List[int], List[int]]:
        tokens_a = text
        if not is_tokenized:
            tokens_a = Embeddings.tokenize(text)
        # Truncate regardless of input form, leaving room for [CLS] and [SEP].
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = Embeddings.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str):
        self.albert_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.create_tokenizer_from_hub_module()
        self.model_name = model
        print("Model loaded Successfully !")

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.array]:
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(
                text, max_seq_length, is_tokenized)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        albert_inputs = dict(input_ids=np.array(input_ids),
                             input_mask=np.array(input_masks),
                             segment_ids=np.array(segment_ids))

        bert_outputs = self.albert_module(albert_inputs,
                                          signature="tokens",
                                          as_dict=True)
        sequence_output = bert_outputs["sequence_output"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled

    @staticmethod
    def remove_prefix(inp):
        PREFIX_CHAR = '▁'
        if len(inp.split(PREFIX_CHAR)[0]) < 1:
            return PREFIX_CHAR.join(inp.split(PREFIX_CHAR)[1:])
        return inp

    def get_full_word_list(self):
        return [
            self.remove_prefix(x)
            for x in list(Embeddings.tokenizer.vocab.keys())
        ]

    def get_all_embeddings(self):
        return dict(
            zip([
                self.remove_prefix(x)
                for x in list(Embeddings.tokenizer.vocab.keys())
            ], list(Embeddings.tokenizer.vocab.values())))
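
A minimal usage sketch, assuming the compressed TF Hub module has been downloaded and extracted to a local directory (the path is a placeholder):

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='albert_base', model_path='/tmp/albert_base')
token_embeddings = embeddings.encode(texts=['albert shares parameters across layers'],
                                     pooling=None, max_seq_length=128)
print(token_embeddings.shape)  # expected: (1, 128, 768) for albert_base

Example #5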
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(
            name=u'xlnet_large_cased',
            dimensions=1024,
            corpus_size='32.89B',
            vocabulary_size='32000',
            download_url='https://storage.googleapis.com/xlnet/released_models/'
            'cased_L-24_H-1024_A-16.zip',
            format='zip',
            architecture='Transformer, 24-layer, 1024-hidden, 16-heads',
            trained_data=
            'BooksCorpus(800M) English Wikipedia (2500M) words, Giga5 (16gb), '
            'ClueWeb 2012-B(19gb),  Common Crawl(78gb)',
            language='en'),
        Embedding(
            name=u'xlnet_base_cased',
            dimensions=768,
            corpus_size='3.86B',
            vocabulary_size='32000',
            download_url='https://storage.googleapis.com/xlnet/released_models/'
            'cased_L-12_H-768_A-12.zip',
            format='zip',
            architecture='Transformer 12-layer, 768-hidden, 12-heads.',
            trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
            language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    tokenizer: spm.SentencePieceProcessor = None
    model_config_path: str = 'xlnet_config.json'
    sentence_piece_model_path: str = 'spiece.model'

    def __init__(self):
        self.xlnet_config = None
        self.run_config = None
        self.model_name = None
        self.max_seq_length = None
        self.sess = tf.Session()

    @staticmethod
    def load_tokenizer(model_path: str):
        """Get the vocab file and casing info from the Hub module."""
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(
            os.path.join(model_path, Embeddings.sentence_piece_model_path))
        Embeddings.tokenizer = sp_model

    @classmethod
    def tokenize(cls, text):
        text = preprocess_text(text, lower=False)
        return encode_pieces(cls.tokenizer, text)

    def _model_single_input(
            self, text: Union[str, List[str]],
            is_tokenized: bool) -> Tuple[List[int], List[int], List[int]]:
        max_seq_length = self.max_seq_length
        tokens_a = text
        if not is_tokenized:
            tokens_a = Embeddings.tokenize(text)

        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []

        # Map the sentence pieces to their vocabulary ids.
        tokens_a = [Embeddings.tokenizer.PieceToId(token) for token in tokens_a]
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(SEG_ID_A)
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_A)

        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)

        input_ids = tokens

        # The mask has 0 for real tokens and 1 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [0] * len(input_ids)

        # Zero-pad up to the sequence length.
        if len(input_ids) < max_seq_length:
            delta_len = max_seq_length - len(input_ids)
            input_ids = [0] * delta_len + input_ids
            input_mask = [1] * delta_len + input_mask
            segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        model_path = os.path.join(model_path, next(os.walk(model_path))[1][0])
        self.xlnet_config = xlnet.XLNetConfig(
            json_path=os.path.join(model_path, Embeddings.model_config_path))
        self.run_config = xlnet.create_run_config(is_training=True,
                                                  is_finetune=True,
                                                  FLAGS=Flags)
        self.load_tokenizer(model_path)
        self.max_seq_length = max_seq_length
        self.model_name = model
        print("Model loaded Successfully !")

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.array]:
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(
                text, is_tokenized)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        # Construct an XLNet model
        xlnet_model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                                       run_config=self.run_config,
                                       input_ids=np.array(input_ids,
                                                          dtype=np.int32),
                                       seg_ids=np.array(segment_ids,
                                                        dtype=np.int32),
                                       input_mask=np.array(input_masks,
                                                           dtype=np.float32))
        self.sess.run(tf.initializers.global_variables())

        # Get a sequence output
        sequence_output = xlnet_model.get_sequence_output()
        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(
                    f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
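
A minimal usage sketch, assuming the released zip has been extracted so that model_path contains the checkpoint folder, and that Flags, SEP_ID, CLS_ID and the SEG_ID_* constants are defined elsewhere in the module:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='xlnet_base_cased', model_path='/tmp/xlnet_base_cased',
                      max_seq_length=128)
pooled = embeddings.encode(texts=['xlnet uses permutation language modelling'],
                           pooling='mean')  # assumed POOL_FUNC_MAP key

Example #6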
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'google_news_300',
                  dimensions=300,
                  corpus_size='100B',
                  vocabulary_size='3M',
                  download_url='https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz',
                  format='gz',
                  architecture='skip-gram',
                  trained_data='Google News',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model_name = None
        self.max_seq_length = None

    @classmethod
    def tokenize(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        try:
            encoding = 'utf-8'
            unicode_errors = 'strict'

            model_file = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))]
            with open(os.path.join(model_path, model_file[0]), 'rb') as f:
                header = to_unicode(f.readline(), encoding=encoding)
                vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format

                binary_len = dtype(real).itemsize * vector_size
                for _ in tqdm(range(vocab_size)):
                    word = []
                    while True:
                        ch = f.read(1)
                        if ch == b' ':
                            break
                        if ch == b'':
                            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                            word.append(ch)
                    word = to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)

                    weights = fromstring(f.read(binary_len), dtype=real).astype(real)

                    self.word_vectors[word] = weights
            self.model_name = model
            self.max_seq_length = max_seq_length
            print("Model loaded successfully!")
            return self
        except Exception as e:
            print('Error loading model:', str(e))
        return self

    def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
                            is_tokenized: bool):
        max_seq_length = self.max_seq_length
        tokens = text
        if not is_tokenized:
            tokens = Embeddings.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[0: max_seq_length]
        while len(tokens) < max_seq_length:
            tokens.append('<pad>')
        return np.array([self.word_vectors.get(token, oov_vector) for token in tokens])

    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
        token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
                                     for text in texts])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
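
A minimal usage sketch, assuming the GoogleNews .bin file has been decompressed into a local directory (the path is a placeholder) and that to_unicode, dtype, fromstring and real are imported at module level:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='google_news_300', model_path='/tmp/google_news_300',
                      max_seq_length=32)
pooled = embeddings.encode(texts=['word2vec skip-gram vectors'],
                           pooling='mean')  # assumed POOL_FUNC_MAP key
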
Example #7
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'use_dan',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                  'google/universal-sentence-encoder/2.tar.gz',
                  format='tar.gz',
                  architecture='DAN',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_large',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                  'google/universal-sentence-encoder-large/3.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_lite',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='na',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                  'google/universal-sentence-encoder-lite/2.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en')
    ]
    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.sess = tf.Session()
        self.sess.run(
            [tf.global_variables_initializer(),
             tf.tables_initializer()])
        self.use_outputs = None
        self.model_name = None
        self.max_seq_length = None

        # placeholder for the DAN and large models
        self.sentences = None

        # SentencePiece model and placeholder for the lite version
        self.sp_model = spm.SentencePieceProcessor()
        self.input_placeholder = None

    def process_to_ids_in_sparse_format(self, sentences):
        # A utility method that processes sentences with the SentencePiece processor
        # 'sp_model' and returns the results in tf.SparseTensor-like format:
        # (values, indices, dense_shape)
        ids = [self.sp_model.EncodeAsIds(x) for x in sentences]
        max_len = max(len(x) for x in ids)
        dense_shape = (len(ids), max_len)
        values = [item for sublist in ids for item in sublist]
        indices = [[row, col] for row in range(len(ids))
                   for col in range(len(ids[row]))]
        return values, indices, dense_shape

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        spm_path_info = None
        g = tf.Graph()
        with g.as_default():
            hub_module = hub.Module(model_path)
            if model == 'use_transformer_lite':
                self.input_placeholder = tf.sparse_placeholder(
                    tf.int64, shape=[None, None])
                self.use_outputs = hub_module(inputs=dict(
                    values=self.input_placeholder.values,
                    indices=self.input_placeholder.indices,
                    dense_shape=self.input_placeholder.dense_shape))
                spm_path_info = hub_module(signature="spm_path")
            else:
                self.sentences = tf.placeholder(tf.string, shape=[None])
                self.use_outputs = hub_module(self.sentences, as_dict=True)
            init_op = tf.group(
                [tf.global_variables_initializer(),
                 tf.tables_initializer()])

        g.finalize()
        self.sess = tf.Session(graph=g)
        self.sess.run(init_op)

        if model == 'use_transformer_lite':
            spm_path = self.sess.run(spm_path_info)
            self.sp_model.Load(spm_path)

        self.model_name = model
        self.max_seq_length = max_seq_length

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.array]:
        if self.model_name == 'use_transformer_lite':
            values, indices, dense_shape = self.process_to_ids_in_sparse_format(
                texts)
            embeddings = self.sess.run(self.use_outputs,
                                       feed_dict={
                                           self.input_placeholder.values:
                                           values,
                                           self.input_placeholder.indices:
                                           indices,
                                           self.input_placeholder.dense_shape:
                                           dense_shape
                                       })
        else:
            embeddings = self.sess.run(self.use_outputs,
                                       feed_dict={self.sentences:
                                                  texts})["default"]
        return embeddings
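
A minimal usage sketch for the lite variant, assuming the module archive has been extracted to a local directory (the path is a placeholder); load_model retrieves the module's own SentencePiece model through the spm_path signature, so no separate vocabulary file is needed:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='use_transformer_lite', model_path='/tmp/use_lite',
                      max_seq_length=128)
vectors = embeddings.encode(texts=['universal sentence encoder lite'], pooling=None)

Example #8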
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(
            name=u'twitter_100',
            dimensions=100,
            corpus_size='27B',
            vocabulary_size='1.2M',
            download_url=
            'https://www.dropbox.com/s/q2wof83a0yq7q74/glove.twitter.27B.100d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Twitter 2B Tweets',
            language='en'),
        Embedding(
            name=u'twitter_200',
            dimensions=200,
            corpus_size='27B',
            vocabulary_size='1.2M',
            download_url=
            'https://www.dropbox.com/s/hfw00m77ibz24y5/glove.twitter.27B.200d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Twitter 2B Tweets',
            language='en'),
        Embedding(
            name=u'twitter_25',
            dimensions=25,
            corpus_size='27B',
            vocabulary_size='1.2M',
            download_url=
            'https://www.dropbox.com/s/jx97sz8skdp276k/glove.twitter.27B.25d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Twitter 2B Tweets',
            language='en'),
        Embedding(
            name=u'twitter_50',
            dimensions=50,
            corpus_size='27B',
            vocabulary_size='1.2M',
            download_url=
            'https://www.dropbox.com/s/9mutj8syz3q20e3/glove.twitter.27B.50d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Twitter 2B Tweets',
            language='en'),
        Embedding(
            name=u'wiki_100',
            dimensions=100,
            corpus_size='6B',
            vocabulary_size='0.4M',
            download_url=
            'https://www.dropbox.com/s/g0inzrsy1ds3u63/glove.6B.100d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Wikipedia+Gigaword',
            language='en'),
        Embedding(
            name=u'wiki_200',
            dimensions=200,
            corpus_size='6B',
            vocabulary_size='0.4M',
            download_url=
            'https://www.dropbox.com/s/pmj2ycd882qkae5/glove.6B.200d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Wikipedia+Gigaword',
            language='en'),
        Embedding(
            name=u'wiki_300',
            dimensions=300,
            corpus_size='6B',
            vocabulary_size='0.4M',
            download_url=
            'https://www.dropbox.com/s/9jbbk99p0d0n1bw/glove.6B.300d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Wikipedia+Gigaword',
            language='en'),
        Embedding(
            name=u'wiki_50',
            dimensions=50,
            corpus_size='6B',
            vocabulary_size='0.4M',
            download_url=
            'https://www.dropbox.com/s/o3axsz1j47043si/glove.6B.50d.txt.zip?dl=1',
            format='zip',
            architecture='glove',
            trained_data='Wikipedia+Gigaword',
            language='en'),
        Embedding(
            name=u'crawl_42B_300',
            dimensions=300,
            corpus_size='42B',
            vocabulary_size='1.9M',
            download_url='http://nlp.stanford.edu/data/glove.42B.300d.zip',
            format='zip',
            architecture='glove',
            trained_data='Common Crawl (42B tokens)',
            language='en'),
        Embedding(
            name=u'crawl_840B_300',
            dimensions=300,
            corpus_size='840B',
            vocabulary_size='2.2M',
            download_url='http://nlp.stanford.edu/data/glove.840B.300d.zip',
            format='zip',
            architecture='glove',
            trained_data='Common Crawl (840B tokens)',
            language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model_name = None
        self.max_seq_length = None

    @classmethod
    def tokenize(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        try:
            model_file = [
                f for f in os.listdir(model_path)
                if os.path.isfile(os.path.join(model_path, f))
            ]
            with open(os.path.join(model_path, model_file[0]),
                      'r',
                      encoding="utf-8") as f:
                for line in tqdm(f):
                    split_line = line.split()
                    word = split_line[0]
                    self.word_vectors[word] = np.array(
                        [float(val) for val in split_line[1:]])
            print("Model loaded successfully!")
            self.model_name = model
            self.max_seq_length = max_seq_length
            return self
        except Exception as e:
            print('Error loading model:', str(e))
        return self

    def _single_encode_text(self, text: Union[str, List[str]],
                            oov_vector: np.array, is_tokenized: bool):
        max_seq_length = self.max_seq_length
        tokens = text
        if not is_tokenized:
            tokens = Embeddings.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[0:max_seq_length]
        while len(tokens) < max_seq_length:
            tokens.append('<pad>')
        return np.array(
            [self.word_vectors.get(token, oov_vector) for token in tokens])

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs) -> Optional[np.array]:
        oov_vector = np.zeros(
            Embeddings.EMBEDDING_MODELS[self.model_name].dimensions,
            dtype="float32")
        token_embeddings = np.array([
            self._single_encode_text(text, oov_vector, is_tokenized)
            for text in texts
        ])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(
                    f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
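
A minimal usage sketch, assuming one of the GloVe .txt files has been unzipped into a local directory (the path is a placeholder) and that 'mean' is one of the keys in POOL_FUNC_MAP:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='wiki_300', model_path='/tmp/glove_wiki_300',
                      max_seq_length=64)
pooled = embeddings.encode(texts=['glove vectors from co-occurrence statistics'],
                           pooling='mean')  # (1, 300) after mean pooling over tokens
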
Example #9
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'elmo_bi_lm',
                  dimensions=512,
                  corpus_size='1B',
                  vocabulary_size='5.5B',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/elmo/2.tar.gz',
                  format='tar.gz',
                  architecture='Embedding layer,cnn_layer_with_maxpool,2 lstm layers',
                  trained_data='One Billion Word Benchmark',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.elmo_outputs = None
        self.model_name = None
        self.max_seq_length = None
        self.sess = tf.Session()

        # placeholder
        self.tokens = None
        self.sequence_len = None

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    @classmethod
    def padded_tokens(cls, tokens: List[str], max_seq_length: int):
        padded_token = ""
        len_tokens = len(tokens)
        if len_tokens >= max_seq_length:
            return tokens[:max_seq_length]
        else:
            padded_len = max_seq_length - len_tokens
            return tokens + [padded_token] * padded_len

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        g = tf.Graph()
        with g.as_default():
            hub_module = hub.Module(model_path)
            self.tokens = tf.placeholder(dtype=tf.string, shape=[None, max_seq_length])
            self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None])

            elmo_inputs = dict(
                tokens=self.tokens,
                sequence_len=self.sequence_len
            )
            self.elmo_outputs = hub_module(elmo_inputs, signature="tokens", as_dict=True)
            init_op = tf.group([tf.global_variables_initializer()])
        g.finalize()
        self.sess = tf.Session(graph=g)
        self.sess.run(init_op)

        self.model_name = model
        self.max_seq_length = max_seq_length

    def encode(self, texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:

        text_tokens = texts
        if not is_tokenized:
            text_tokens = [Embeddings.tokenize(text) for text in texts]
        text_tokens = [Embeddings.padded_tokens(tokens, self.max_seq_length) for tokens in text_tokens]
        seq_length = [self.max_seq_length] * len(texts)

        elmo_inputs = {
            self.tokens: np.array(text_tokens),
            self.sequence_len: np.array(seq_length)
        }

        token_embeddings = self.sess.run(self.elmo_outputs, feed_dict=elmo_inputs)["elmo"]

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
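
A minimal usage sketch, assuming the ELMo TF Hub archive has been extracted to a local directory (the path is a placeholder); every input is padded to max_seq_length before being fed to the module's "tokens" signature:

# Hypothetical usage sketch.
embeddings = Embeddings()
embeddings.load_model(model='elmo_bi_lm', model_path='/tmp/elmo', max_seq_length=32)
token_embeddings = embeddings.encode(texts=['deep contextualised word representations'],
                                     pooling=None)
print(token_embeddings.shape)  # (1, 32, output_dim) for the module's "elmo" output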