def test_document_bidirectional_lstm_embeddings():
    """A bidirectional DocumentRNNEmbeddings with hidden_size=128 must yield a
    512-dim sentence vector that matches its reported embedding_length, and
    clearing embeddings must empty the sentence vector again."""
    sentence, glove, charlm = init_document_embeddings()

    document_embeddings = DocumentRNNEmbeddings(
        [glove, charlm],
        hidden_size=128,
        bidirectional=True,
    )
    document_embeddings.embed(sentence)

    # Bidirectional RNN doubles the output; stacked inputs bring it to 512.
    assert len(sentence.get_embedding()) == 512
    assert len(sentence.get_embedding()) == document_embeddings.embedding_length

    # After clearing, the sentence should carry no embedding at all.
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
def test_keep_batch_order():
    """Embedding two batches must not reorder the sentences inside them, and the
    same text must receive the same vector regardless of its batch position."""
    sentence, glove, charlm = init_document_embeddings()
    embeddings = DocumentRNNEmbeddings([glove])

    # Same two texts, opposite order in each batch.
    sentences_1 = [Sentence('First sentence'), Sentence('This is second sentence')]
    sentences_2 = [Sentence('This is second sentence'), Sentence('First sentence')]

    embeddings.embed(sentences_1)
    embeddings.embed(sentences_2)

    # Batch order must be preserved after embedding.
    assert sentences_1[0].to_original_text() == 'First sentence'
    assert sentences_1[1].to_original_text() == 'This is second sentence'

    # Identical texts embedded at different batch positions must match exactly.
    assert torch.norm(sentences_1[0].embedding - sentences_2[1].embedding) == 0.0
    # BUG FIX: the original repeated the previous assertion verbatim; the
    # second check must compare the OTHER matching pair of sentences.
    assert torch.norm(sentences_1[1].embedding - sentences_2[0].embedding) == 0.0
def test_keep_batch_order():
    """Embedding two batches must not reorder the sentences inside them, and the
    same text must receive the same vector regardless of its batch position.

    NOTE(review): `glove` is not defined in this function — presumably it is a
    pytest fixture or module-level embedding; confirm against the test module.
    """
    embeddings = DocumentRNNEmbeddings([glove])

    # Same two texts, opposite order in each batch.
    sentences_1 = [Sentence("First sentence"), Sentence("This is second sentence")]
    sentences_2 = [Sentence("This is second sentence"), Sentence("First sentence")]

    embeddings.embed(sentences_1)
    embeddings.embed(sentences_2)

    # Batch order must be preserved after embedding.
    assert sentences_1[0].to_original_text() == "First sentence"
    assert sentences_1[1].to_original_text() == "This is second sentence"

    # Identical texts embedded at different batch positions must match exactly.
    assert torch.norm(sentences_1[0].embedding - sentences_2[1].embedding) == 0.0
    # BUG FIX: the original repeated the previous assertion verbatim; the
    # second check must compare the OTHER matching pair of sentences.
    assert torch.norm(sentences_1[1].embedding - sentences_2[0].embedding) == 0.0

    del embeddings
def test_fine_tunable_flair_embedding():
    """Fine-tunable FlairEmbeddings must work both inside DocumentRNNEmbeddings
    (output = RNN hidden size) and DocumentLMEmbeddings (output = LM hidden size)."""
    language_model_forward = LanguageModel(
        Dictionary.load('chars'),
        is_forward_lm=True,
        hidden_size=32,
        nlayers=1,
    )
    flair_embedding = FlairEmbeddings(language_model_forward, fine_tune=True)

    # RNN document embedding on top of the fine-tunable flair embedding:
    # the sentence vector size equals the RNN hidden_size (128, unidirectional).
    embeddings = DocumentRNNEmbeddings(
        [flair_embedding], hidden_size=128, bidirectional=False)
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 128
    assert len(sentence.get_embedding()) == embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    # LM document embedding: the sentence vector size equals the language
    # model's hidden size (32).
    embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 32
    assert len(sentence.get_embedding()) == embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
import numpy as np
from pandas import read_csv
import pickle

from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings, Sentence

# Load the news-headline corpus, skipping malformed CSV rows.
data = read_csv('data/abcnews-date-text.csv', error_bad_lines=False)
documents = data[['headline_text']].values.reshape(-1).tolist()
# documents = list(pickle.load(open( "./corpus/df_proyectosFECYT.pkl", "rb" ) )['LEMAS_UC3M'])

# GloVe word embeddings pooled through a 512-unit document RNN.
glove_embedding = WordEmbeddings('glove')
document_embeddings = DocumentRNNEmbeddings([glove_embedding], hidden_size=512)

embeddings = []
count = 0
try:
    for count, document in enumerate(documents, start=1):
        sentence = Sentence(document)
        document_embeddings.embed(sentence)
        embeddings.append(sentence.get_embedding().tolist())
        # Lightweight progress indicator every 1000 documents.
        if count % 1000 == 0:
            print(count)
finally:
    # In case an error occurs before finish, we store previous results
    embedings_array = np.array(embeddings)
    np.save("embeds_abcnews_512_2.npy", embedings_array)
class EasyDocumentEmbeddings:
    """
    Document Embeddings generated by pool and rnn methods applied to the word embeddings of text

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyDocumentEmbeddings("bert-base-cased", "xlnet-base-cased", methods["rnn"])
    ```

    **Parameters:**

    * ***embeddings** - Non-keyword variable number of strings referring to model names or paths
    * **methods** - A list of strings to specify which document embeddings to use i.e. ["rnn", "pool"] (avoids unnecessary loading of models if only using one)
    * **configs** - A dictionary of configurations for flair's rnn and pool document embeddings

    ```python
    >>> example_configs = {"pool_configs": {"fine_tune_mode": "linear", "pooling": "mean", },
    ...                    "rnn_configs": {"hidden_size": 512,
    ...                                    "rnn_layers": 1,
    ...                                    "reproject_words": True,
    ...                                    "reproject_words_dimension": 256,
    ...                                    "bidirectional": False,
    ...                                    "dropout": 0.5,
    ...                                    "word_dropout": 0.0,
    ...                                    "locked_dropout": 0.0,
    ...                                    "rnn_type": "GRU",
    ...                                    "fine_tune": True, },
    ...                   }
    ```
    """

    __allowed_methods = ["rnn", "pool"]
    __allowed_configs = ("pool_configs", "rnn_configs")

    # Class-level defaults, copied per call in __init__ so that no two calls
    # ever share a mutable default-argument object.
    _DEFAULT_METHODS: List[str] = ["rnn", "pool"]
    _DEFAULT_CONFIGS: Dict = {
        "pool_configs": {
            "fine_tune_mode": "linear",
            "pooling": "mean",
        },
        "rnn_configs": {
            "hidden_size": 512,
            "rnn_layers": 1,
            "reproject_words": True,
            "reproject_words_dimension": 256,
            "bidirectional": False,
            "dropout": 0.5,
            "word_dropout": 0.0,
            "locked_dropout": 0.0,
            "rnn_type": "GRU",
            "fine_tune": True,
        },
    }

    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = None,
        configs: Dict = None,
    ):
        """Instantiate the requested document-embedding models on top of the
        stacked word embeddings named in *embeddings*.

        BUG FIX: the original signature used mutable default arguments
        (a list and a nested dict literal), which are created once and shared
        across every call; None sentinels with per-call copies avoid that.
        """
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        if methods is None:
            methods = list(self._DEFAULT_METHODS)
        if configs is None:
            configs = {key: dict(value) for key, value in self._DEFAULT_CONFIGS.items()}

        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            self.embedding_stack.append(self._load_embedding(model_name_or_path))

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")

    @staticmethod
    def _load_embedding(model_name_or_path: str):
        """Map a model name or path to the matching flair embedding instance.

        Dispatch is substring-based; "roberta" is excluded from the "bert"
        branch because "roberta" contains "bert" as a substring.
        """
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            return BertEmbeddings(model_name_or_path)
        if "roberta" in model_name_or_path:
            return RoBERTaEmbeddings(model_name_or_path)
        if "gpt2" in model_name_or_path:
            return OpenAIGPT2Embeddings(model_name_or_path)
        if "xlnet" in model_name_or_path:
            return XLNetEmbeddings(model_name_or_path)
        if "xlm" in model_name_or_path:
            return XLMEmbeddings(model_name_or_path)
        if ("flair" in model_name_or_path
                or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            return FlairEmbeddings(model_name_or_path)
        # Fall back to classic word embeddings (e.g. "glove").
        try:
            return WordEmbeddings(model_name_or_path)
        except ValueError:
            raise ValueError(
                f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
            )

    @staticmethod
    def _to_sentences(
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Normalize any accepted text input into a list of flair `Sentence`s.

        Shared by embed_pool and embed_rnn, which previously duplicated this
        normalization logic inline.
        """
        if isinstance(text, str):
            return [Sentence(text)]
        if isinstance(text, list) and all(isinstance(t, str) for t in text):
            return [Sentence(t) for t in text]
        if isinstance(text, Sentence):
            return [text]
        return text

    def embed_pool(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        **return** - A list of Flair's `Sentence`s
        """
        sentences = self._to_sentences(text)
        self.pool_embeddings.embed(sentences)
        return sentences

    def embed_rnn(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        **return** - A list of Flair's `Sentence`s
        """
        sentences = self._to_sentences(text)
        self.rnn_embeddings.embed(sentences)
        return sentences
class EasyDocumentEmbeddings:
    """Document Embeddings generated by pool and rnn methods applied to the word embeddings of text

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyDocumentEmbeddings("bert-base-cased", "xlnet-base-cased", methods["rnn"])
    ```

    **Parameters:**

    * `*embeddings` - Non-keyword variable number of strings referring to model names or paths
    * `methods` - A list of strings to specify which document embeddings to use i.e. ["rnn", "pool"] (avoids unnecessary loading of models if only using one)
    * `configs` - A dictionary of configurations for flair's rnn and pool document embeddings

    ```python
    >>> example_configs = {"pool_configs": {"fine_tune_mode": "linear", "pooling": "mean", },
    ...                    "rnn_configs": {"hidden_size": 512,
    ...                                    "rnn_layers": 1,
    ...                                    "reproject_words": True,
    ...                                    "reproject_words_dimension": 256,
    ...                                    "bidirectional": False,
    ...                                    "dropout": 0.5,
    ...                                    "word_dropout": 0.0,
    ...                                    "locked_dropout": 0.0,
    ...                                    "rnn_type": "GRU",
    ...                                    "fine_tune": True, },
    ...                   }
    ```
    """

    __allowed_methods = ["rnn", "pool"]
    __allowed_configs = ("pool_configs", "rnn_configs")

    # Class-level defaults, copied per call in __init__ so that no two calls
    # ever share a mutable default-argument object.
    _DEFAULT_METHODS: List[str] = ["rnn", "pool"]
    _DEFAULT_CONFIGS: Dict = {
        "pool_configs": {
            "fine_tune_mode": "linear",
            "pooling": "mean",
        },
        "rnn_configs": {
            "hidden_size": 512,
            "rnn_layers": 1,
            "reproject_words": True,
            "reproject_words_dimension": 256,
            "bidirectional": False,
            "dropout": 0.5,
            "word_dropout": 0.0,
            "locked_dropout": 0.0,
            "rnn_type": "GRU",
            "fine_tune": True,
        },
    }

    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = None,
        configs: Dict = None,
    ):
        """Instantiate the requested document-embedding models on top of the
        stacked word embeddings named in *embeddings*.

        BUG FIX: the original signature used mutable default arguments
        (a list and a nested dict literal), which are created once and shared
        across every call; None sentinels with per-call copies avoid that.
        """
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        if methods is None:
            methods = list(self._DEFAULT_METHODS)
        if configs is None:
            configs = {key: dict(value) for key, value in self._DEFAULT_CONFIGS.items()}

        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load correct Embeddings module (dispatch lives in the shared helper).
        for model_name_or_path in embeddings:
            self.embedding_stack.append(
                _get_embedding_model(model_name_or_path))

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")

    def embed_pool(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Generate stacked embeddings with `DocumentPoolEmbeddings`

        **Parameters**:

        * `text` - Text input, it can be a string or any of Flair's `Sentence` input formats

        **Return**:

        * A list of Flair's `Sentence`s
        """
        sentences = _make_sentences(text, as_list=True)
        self.pool_embeddings.embed(sentences)
        return sentences

    def embed_rnn(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Generate stacked embeddings with `DocumentRNNEmbeddings`

        **Parameters**:

        * `text` - Text input, it can be a string or any of Flair's `Sentence` input formats

        **Return**:

        * A list of Flair's `Sentence`s
        """
        sentences = _make_sentences(text, as_list=True)
        self.rnn_embeddings.embed(sentences)
        return sentences