class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'use_dan',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder/2.tar.gz',
                  format='tar.gz',
                  architecture='DAN',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_large',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder-large/3.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_lite',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='na',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder-lite/2.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        self.use_module = None
        self.model_name = None

    def load_model(self, model: str, model_path: str):
        self.use_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.model_name = model

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        return self.sess.run(self.use_module(texts))
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'ulmfit_forward',
                  dimensions=300,
                  corpus_size='570k human-generated English sentence pairs',
                  vocabulary_size='230k',
                  download_url='https://www.dropbox.com/s/4k5zwvmqdhwql8t/ulmfit_forward.zip?dl=1',
                  format='zip',
                  architecture='AWD-LSTM',
                  trained_data='Stephen Merity’s Wikitext 103 dataset',
                  language='en'),
        Embedding(name=u'ulmfit_backward',
                  dimensions=300,
                  corpus_size='570k human-generated English sentence pairs',
                  vocabulary_size='230k',
                  download_url='https://www.dropbox.com/s/152w8wtv3hxmazp/ulmfit_backword.zip?dl=1',
                  format='zip',
                  architecture='AWD-LSTM',
                  trained_data='Stephen Merity’s Wikitext 103 dataset',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.ulmfit_model = None
        self.model_name = None
        self.word2idx = None
        self.idx2word = None

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    def load_model(self, model: str, model_path: str):
        """
        Loads architecture and weights from a saved model.

        Args:
            model: Name of the model.
            model_path: Directory path of the saved model and architecture file.
        """
        weights_path = os.path.join(model_path, 'model.h5')
        id2word_path = os.path.join(model_path, 'itos_wt103.pkl')

        with open(id2word_path, 'rb') as f:
            idx2word = pickle.load(f)
        self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
        self.idx2word = {i: w for w, i in self.word2idx.items()}

        self.ulmfit_model = build_language_model()
        self.ulmfit_model.load_weights(weights_path)
        self.model_name = model

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        tokenized_texts = texts
        if not is_tokenized:
            tokenized_texts = [Embeddings.tokenize(text) for text in texts]
        tokenized_text_words = [[self.word2idx[w] for w in text] for text in tokenized_texts]

        embeddings = []
        for x in tokenized_text_words:
            x = np.reshape(x, (1, len(x)))
            embeddings.append(self.ulmfit_model.predict(x)[1][0])

        if not pooling:
            return embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(embeddings, axis=1)
            return pooled
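
# Usage sketch (not part of the original file): loading the forward ULMFiT language model
# and pooling its token embeddings into sentence vectors. The local directory and the
# 'mean' pooling key are assumptions; 'mean' must exist in POOL_FUNC_MAP, and the directory
# must contain model.h5 and itos_wt103.pkl from the downloaded archive.
ulmfit = Embeddings()
ulmfit.load_model(model='ulmfit_forward', model_path='models/ulmfit_forward')  # hypothetical path
sentence_vectors = ulmfit.encode(texts=["natural language processing"],
                                 pooling='mean',
                                 max_seq_length=128)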
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'wiki_news_300',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'wiki-news-300d-1M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'wiki_news_300_sub',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'wiki-news-300d-1M-subword.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'common_crawl_300',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'crawl-300d-2M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
        Embedding(name=u'common_crawl_300_sub',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'crawl-300d-2M-subword.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model_name = None
        self.max_seq_length = None

    @classmethod
    def tokenize(cls, text):
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        try:
            model_file = [f for f in os.listdir(model_path)
                          if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'r')
            next(f)
            for line in tqdm(f):
                split_line = line.split()
                word = split_line[0]
                self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
            print("Model loaded Successfully !")
            self.model_name = model
            self.max_seq_length = max_seq_length
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))
        return self

    def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, is_tokenized: bool):
        max_seq_length = self.max_seq_length
        tokens = text
        if not is_tokenized:
            tokens = Embeddings.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[0: max_seq_length]
        while len(tokens) < max_seq_length:
            tokens.append('<pad>')
        return np.array([self.word_vectors.get(token, oov_vector) for token in tokens])

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
        token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
                                     for text in texts])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
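
# Usage sketch (not part of the original file): calling the fastText loader once the
# 'wiki_news_300' vectors have been downloaded and extracted. The directory path and the
# 'mean' pooling key are hypothetical; 'mean' is only valid if POOL_FUNC_MAP defines it.
fasttext = Embeddings()
fasttext.load_model(model='wiki_news_300',
                    model_path='models/wiki_news_300',  # hypothetical path to the extracted .vec file
                    max_seq_length=64)
sentence_vectors = fasttext.encode(texts=["the quick brown fox"], pooling='mean')  # -> shape (1, 300)
token_vectors = fasttext.encode(texts=["the quick brown fox"], pooling=None)       # -> shape (1, 64, 300)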
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name='albert_base',
                  dimensions=768,
                  corpus_size='3300M',
                  vocabulary_size='30522(sub-word)',
                  download_url='https://tfhub.dev/google/albert_base/1?tf-hub-format=compressed',
                  format='tar.gz',
                  architecture='Transformer, Layers=12, Hidden = 768, heads = 12',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en'),
        Embedding(name='albert_large',
                  dimensions=1024,
                  corpus_size='3300M',
                  vocabulary_size='30522(sub-word)',
                  download_url='https://tfhub.dev/google/albert_large/1?tf-hub-format=compressed',
                  format='tar.gz',
                  architecture='Transformer Layers=24, Hidden = 1024, heads = 12',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en'),
        Embedding(name='albert_xlarge',
                  dimensions=2048,
                  corpus_size='3300M',
                  vocabulary_size='30522 (sub-word)',
                  download_url='https://tfhub.dev/google/albert_xlarge/1?tf-hub-format=compressed',
                  format='tar.gz',
                  architecture='Transformer Layers=24, Hidden = 2048, heads = 12',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en'),
        Embedding(name='albert_xxlarge',
                  dimensions=4096,
                  corpus_size='3300M',
                  vocabulary_size='30522 (sub-word)',
                  download_url='https://tfhub.dev/google/albert_xxlarge/1?tf-hub-format=compressed',
                  format='tar.gz',
                  architecture='Transformer Layers=12, Hidden = 4096, heads = 16',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    tokenizer: FullTokenizer = None

    def __init__(self):
        self.sess = tf.Session()
        self.albert_module = None
        self.model_name = None

    def create_tokenizer_from_hub_module(self):
        """Get the vocab file and casing info from the Hub module."""
        tokenization_info = self.albert_module(signature="tokenization_info", as_dict=True)
        sentence_piece_file, do_lower_case = self.sess.run(
            [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])
        Embeddings.tokenizer = FullTokenizer(vocab_file=None,
                                             do_lower_case=do_lower_case,
                                             spm_model_file=sentence_piece_file)

    @classmethod
    def tokenize(cls, text):
        return cls.tokenizer.tokenize(text)

    @staticmethod
    def _model_single_input(text: Union[str, List[str]],
                            max_seq_length: int,
                            is_tokenized: bool = False
                            ) -> Tuple[List[int], List[int], List[int]]:
        tokens_a = text
        if not is_tokenized:
            tokens_a = Embeddings.tokenize(text)
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0: (max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = Embeddings.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str):
        self.albert_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.create_tokenizer_from_hub_module()
        self.model_name = model
        print("Model loaded Successfully !")

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               max_seq_length: int,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        albert_inputs = dict(input_ids=np.array(input_ids),
                             input_mask=np.array(input_masks),
                             segment_ids=np.array(segment_ids))

        bert_outputs = self.albert_module(albert_inputs, signature="tokens", as_dict=True)
        sequence_output = bert_outputs["sequence_output"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled

    @staticmethod
    def remove_prefix(inp):
        PREFIX_CHAR = '▁'
        if len(inp.split(PREFIX_CHAR)[0]) < 1:
            return PREFIX_CHAR.join(inp.split(PREFIX_CHAR)[1:])
        return inp

    def get_full_word_list(self):
        return [self.remove_prefix(x) for x in list(Embeddings.tokenizer.vocab.keys())]

    def get_all_embeddings(self):
        return dict(zip([self.remove_prefix(x) for x in list(Embeddings.tokenizer.vocab.keys())],
                        list(Embeddings.tokenizer.vocab.values())))
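
# Usage sketch (not part of the original file): encoding a batch with the albert_base
# TF-Hub module. The module path is hypothetical (wherever the tar.gz above was
# extracted), and the 'mean' pooling key assumes POOL_FUNC_MAP defines it.
albert = Embeddings()
albert.load_model(model='albert_base', model_path='models/albert_base')  # hypothetical path
pooled = albert.encode(texts=["sentence embeddings via ALBERT"],
                       pooling='mean',
                       max_seq_length=64)      # -> shape (1, 768)
token_level = albert.encode(texts=["sentence embeddings via ALBERT"],
                            pooling=None,
                            max_seq_length=64)  # -> shape (1, 64, 768)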
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'xlnet_large_cased',
                  dimensions=1024,
                  corpus_size='32.89B',
                  vocabulary_size='32000',
                  download_url='https://storage.googleapis.com/xlnet/released_models/'
                               'cased_L-24_H-1024_A-16.zip',
                  format='zip',
                  architecture='Transformer, 24-layer, 1024-hidden, 16-heads',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words, Giga5 (16gb), '
                               'ClueWeb 2012-B(19gb), Common Crawl(78gb)',
                  language='en'),
        Embedding(name=u'xlnet_base_cased',
                  dimensions=768,
                  corpus_size='3.86B',
                  vocabulary_size='32000',
                  download_url='https://storage.googleapis.com/xlnet/released_models/'
                               'cased_L-12_H-768_A-12.zip',
                  format='zip',
                  architecture='Transformer 12-layer, 768-hidden, 12-heads.',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    tokenizer: spm.SentencePieceProcessor = None
    mode_config_path: str = 'xlnet_config.json'
    sentence_piece_model_path: str = 'spiece.model'

    def __init__(self):
        self.xlnet_config = None
        self.run_config = None
        self.model_name = None
        self.max_seq_length = None
        self.sess = tf.Session()

    @staticmethod
    def load_tokenizer(model_path: str):
        """Load the SentencePiece model shipped with the checkpoint."""
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(os.path.join(model_path, Embeddings.sentence_piece_model_path))
        Embeddings.tokenizer = sp_model

    @classmethod
    def tokenize(cls, text):
        text = preprocess_text(text, lower=False)
        return encode_pieces(cls.tokenizer, text)

    def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool
                            ) -> Tuple[List[int], List[int], List[int]]:
        max_seq_length = self.max_seq_length
        tokens_a = text
        if not is_tokenized:
            tokens_a = Embeddings.tokenize(text)
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0: (max_seq_length - 2)]

        tokens = []
        segment_ids = []
        # Convert the sentence pieces to ids before appending the SEP/CLS ids.
        tokens_a = [Embeddings.tokenizer.PieceToId(token) for token in tokens_a]
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(SEG_ID_A)
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_A)
        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)

        input_ids = tokens

        # The mask has 0 for real tokens and 1 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [0] * len(input_ids)

        # Zero-pad up to the sequence length.
        if len(input_ids) < max_seq_length:
            delta_len = max_seq_length - len(input_ids)
            input_ids = [0] * delta_len + input_ids
            input_mask = [1] * delta_len + input_mask
            segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        model_path = os.path.join(model_path, next(os.walk(model_path))[1][0])
        self.xlnet_config = xlnet.XLNetConfig(json_path=os.path.join(model_path, Embeddings.mode_config_path))
        self.run_config = xlnet.create_run_config(is_training=True, is_finetune=True, FLAGS=Flags)
        self.load_tokenizer(model_path)
        self.max_seq_length = max_seq_length
        self.model_name = model
        print("Model loaded Successfully !")

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(text, is_tokenized)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        # Construct an XLNet model
        xlnet_model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                                       run_config=self.run_config,
                                       input_ids=np.array(input_ids, dtype=np.int32),
                                       seg_ids=np.array(segment_ids, dtype=np.int32),
                                       input_mask=np.array(input_masks, dtype=np.float32))

        self.sess.run(tf.initializers.global_variables())

        # Get a sequence output
        sequence_output = xlnet_model.get_sequence_output()
        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
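
# Usage sketch (not part of the original file): load_model expects model_path to contain a
# single extracted checkpoint directory from the zip above; the path and the 'mean' pooling
# key are hypothetical.
xlnet_embedder = Embeddings()
xlnet_embedder.load_model(model='xlnet_base_cased',
                          model_path='models/xlnet_base_cased',  # hypothetical path
                          max_seq_length=64)
vectors = xlnet_embedder.encode(texts=["permutation language modelling"], pooling='mean')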
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'google_news_300',
                  dimensions=300,
                  corpus_size='100B',
                  vocabulary_size='3M',
                  download_url='https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz',
                  format='gz',
                  architecture='skip-gram',
                  trained_data='Google News',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model_name = None
        self.max_seq_length = None

    @classmethod
    def tokenize(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        try:
            encoding = 'utf-8'
            unicode_errors = 'strict'

            model_file = [f for f in os.listdir(model_path)
                          if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'rb')
            header = to_unicode(f.readline(), encoding=encoding)
            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format

            binary_len = dtype(real).itemsize * vector_size
            for _ in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(f.read(binary_len), dtype=real).astype(real)
                self.word_vectors[word] = weights

            self.model_name = model
            self.max_seq_length = max_seq_length
            print("Model loaded Successfully !")
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))

    def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, is_tokenized: bool):
        max_seq_length = self.max_seq_length
        tokens = text
        if not is_tokenized:
            tokens = Embeddings.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[0: max_seq_length]
        while len(tokens) < max_seq_length:
            tokens.append('<pad>')
        return np.array([self.word_vectors.get(token, oov_vector) for token in tokens])

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
        token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
                                     for text in texts])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
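
# Usage sketch (not part of the original file): model_path should contain the decompressed
# GoogleNews-vectors-negative300.bin; the directory path and the 'mean' pooling key are
# hypothetical. Loading all 3M vectors into the dict needs several GB of RAM.
w2v = Embeddings()
w2v.load_model(model='google_news_300',
               model_path='models/google_news_300',  # hypothetical path
               max_seq_length=64)
vectors = w2v.encode(texts=["word2vec skip-gram vectors"], pooling='mean')  # -> shape (1, 300)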
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'use_dan',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder/2.tar.gz',
                  format='tar.gz',
                  architecture='DAN',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_large',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder-large/3.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_lite',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='na',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder-lite/2.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        self.use_outputs = None
        self.model_name = None
        self.max_seq_length = None

        # placeholder for the dan and large models
        self.sentences = None

        # sentencepiece model and placeholder for the lite version
        self.sp_model = spm.SentencePieceProcessor()
        self.input_placeholder = None

    def process_to_ids_in_sparse_format(self, sentences):
        # A utility method that processes sentences with the sentence piece processor
        # 'sp_model' and returns the results in tf.SparseTensor-similar format:
        # (values, indices, dense_shape)
        ids = [self.sp_model.EncodeAsIds(x) for x in sentences]
        max_len = max(len(x) for x in ids)
        dense_shape = (len(ids), max_len)
        values = [item for sublist in ids for item in sublist]
        indices = [[row, col] for row in range(len(ids)) for col in range(len(ids[row]))]
        return values, indices, dense_shape

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        spm_path_info = None
        g = tf.Graph()
        with g.as_default():
            hub_module = hub.Module(model_path)
            if model == 'use_transformer_lite':
                self.input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])
                self.use_outputs = hub_module(inputs=dict(
                    values=self.input_placeholder.values,
                    indices=self.input_placeholder.indices,
                    dense_shape=self.input_placeholder.dense_shape))
                spm_path_info = hub_module(signature="spm_path")
            else:
                self.sentences = tf.placeholder(tf.string, shape=[None])
                self.use_outputs = hub_module(self.sentences, as_dict=True)
            init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
        g.finalize()
        self.sess = tf.Session(graph=g)
        self.sess.run(init_op)

        if model == 'use_transformer_lite':
            spm_path = self.sess.run(spm_path_info)
            self.sp_model.Load(spm_path)

        self.model_name = model
        self.max_seq_length = max_seq_length

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        if self.model_name == 'use_transformer_lite':
            values, indices, dense_shape = self.process_to_ids_in_sparse_format(texts)
            embeddings = self.sess.run(self.use_outputs,
                                       feed_dict={self.input_placeholder.values: values,
                                                  self.input_placeholder.indices: indices,
                                                  self.input_placeholder.dense_shape: dense_shape})
        else:
            embeddings = self.sess.run(self.use_outputs,
                                       feed_dict={self.sentences: texts})["default"]
        return embeddings
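
# Usage sketch (not part of the original file): the DAN and large variants take raw strings,
# while use_transformer_lite goes through the SentencePiece path above. The module path is
# hypothetical; pooling is effectively ignored because USE already returns sentence vectors.
use = Embeddings()
use.load_model(model='use_dan',
               model_path='models/use_dan',  # hypothetical path to the extracted TF-Hub module
               max_seq_length=128)
sentence_vectors = use.encode(texts=["universal sentence encoder"], pooling=None)  # -> shape (1, 512)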
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'twitter_100',
                  dimensions=100,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/q2wof83a0yq7q74/glove.twitter.27B.100d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_200',
                  dimensions=200,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/hfw00m77ibz24y5/glove.twitter.27B.200d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_25',
                  dimensions=25,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/jx97sz8skdp276k/glove.twitter.27B.25d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_50',
                  dimensions=50,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/9mutj8syz3q20e3/glove.twitter.27B.50d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'wiki_100',
                  dimensions=100,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/g0inzrsy1ds3u63/glove.6B.100d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_200',
                  dimensions=200,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/pmj2ycd882qkae5/glove.6B.200d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_300',
                  dimensions=300,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/9jbbk99p0d0n1bw/glove.6B.300d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_50',
                  dimensions=50,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/o3axsz1j47043si/glove.6B.50d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'crawl_42B_300',
                  dimensions=300,
                  corpus_size='42B',
                  vocabulary_size='1.9M',
                  download_url='http://nlp.stanford.edu/data/glove.42B.300d.zip',
                  format='zip',
                  architecture='glove',
                  trained_data='Common Crawl (42B tokens)',
                  language='en'),
        Embedding(name=u'crawl_840B_300',
                  dimensions=300,
                  corpus_size='840B',
                  vocabulary_size='2.2M',
                  download_url='http://nlp.stanford.edu/data/glove.840B.300d.zip',
                  format='zip',
                  architecture='glove',
                  trained_data='Common Crawl (840B tokens)',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model_name = None
        self.max_seq_length = None

    @classmethod
    def tokenize(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        try:
            model_file = [f for f in os.listdir(model_path)
                          if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'r', encoding="utf-8")
            for line in tqdm(f):
                split_line = line.split()
                word = split_line[0]
                self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
            print("Model loaded Successfully !")
            self.model_name = model
            self.max_seq_length = max_seq_length
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))
        return self

    def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, is_tokenized: bool):
        max_seq_length = self.max_seq_length
        tokens = text
        if not is_tokenized:
            tokens = Embeddings.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[0: max_seq_length]
        while len(tokens) < max_seq_length:
            tokens.append('<pad>')
        return np.array([self.word_vectors.get(token, oov_vector) for token in tokens])

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
        token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
                                     for text in texts])

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                raise NotImplementedError(f"Pooling method \"{pooling}\" not implemented")
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
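
# Usage sketch (not part of the original file): same interface as the fastText and word2vec
# loaders above; the extracted .txt directory and the 'mean' pooling key are hypothetical.
glove = Embeddings()
glove.load_model(model='wiki_300',
                 model_path='models/glove_wiki_300',  # hypothetical path
                 max_seq_length=64)
vectors = glove.encode(texts=["global vectors for word representation"], pooling='mean')  # -> shape (1, 300)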
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'elmo_bi_lm',
                  dimensions=512,
                  corpus_size='1B',
                  vocabulary_size='5.5B',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/elmo/2.tar.gz',
                  format='tar.gz',
                  architecture='Embedding layer,cnn_layer_with_maxpool,2 lstm layers',
                  trained_data='One Billion Word Benchmark',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.elmo_outputs = None
        self.model_name = None
        self.max_seq_length = None
        self.sess = tf.Session()

        # placeholders
        self.tokens = None
        self.sequence_len = None

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    @classmethod
    def padded_tokens(cls, tokens: List[str], max_seq_length: int):
        padded_token = ""
        len_tokens = len(tokens)
        if len_tokens >= max_seq_length:
            return tokens[:max_seq_length]
        else:
            padded_len = max_seq_length - len_tokens
            return tokens + [padded_token] * padded_len

    def load_model(self, model: str, model_path: str, max_seq_length: int):
        g = tf.Graph()
        with g.as_default():
            hub_module = hub.Module(model_path)
            self.tokens = tf.placeholder(dtype=tf.string, shape=[None, max_seq_length])
            self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None])

            elmo_inputs = dict(
                tokens=self.tokens,
                sequence_len=self.sequence_len
            )
            self.elmo_outputs = hub_module(elmo_inputs, signature="tokens", as_dict=True)
            init_op = tf.group([tf.global_variables_initializer()])
        g.finalize()
        self.sess = tf.Session(graph=g)
        self.sess.run(init_op)

        self.model_name = model
        self.max_seq_length = max_seq_length

    def encode(self,
               texts: Union[List[str], List[List[str]]],
               pooling: str,
               is_tokenized: bool = False,
               **kwargs
               ) -> Optional[np.array]:
        text_tokens = texts
        if not is_tokenized:
            text_tokens = [Embeddings.tokenize(text) for text in texts]
        text_tokens = [Embeddings.padded_tokens(tokens, self.max_seq_length) for tokens in text_tokens]
        seq_length = [self.max_seq_length] * len(texts)

        elmo_inputs = {
            self.tokens: np.array(text_tokens),
            self.sequence_len: np.array(seq_length)
        }

        token_embeddings = self.sess.run(self.elmo_outputs, feed_dict=elmo_inputs)["elmo"]

        if not pooling:
            return token_embeddings
        else:
            if pooling not in POOL_FUNC_MAP.keys():
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
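
# Usage sketch (not part of the original file): the module path is hypothetical and the
# 'mean' pooling key assumes POOL_FUNC_MAP defines it. Token-level output has shape
# (batch, max_seq_length, dims); pooling reduces over the token axis.
elmo = Embeddings()
elmo.load_model(model='elmo_bi_lm',
                model_path='models/elmo_bi_lm',  # hypothetical path to the extracted TF-Hub module
                max_seq_length=32)
vectors = elmo.encode(texts=["deep contextualised word representations"], pooling='mean')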