def _run_test(self, requires_grad):
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    embedder = ElmoTokenEmbedder(options_file, weight_file, requires_grad=requires_grad)
    batch_size = 3
    seq_len = 4
    char_ids = Variable(
        torch.from_numpy(numpy.random.randint(0, 262, (batch_size, seq_len, 50))))
    embeddings = embedder(char_ids)
    loss = embeddings.sum()
    loss.backward()

    elmo_grads = [param.grad for name, param in embedder.named_parameters()
                  if '_elmo_lstm' in name]
    if requires_grad:
        # None of the elmo grads should be None.
        assert all([grad is not None for grad in elmo_grads])
    else:
        # All of the elmo grads should be None.
        assert all([grad is None for grad in elmo_grads])
class ElmoWordEmbedding(torch.nn.Module):
    """
    Compute a single layer of ELMo word representations.
    """
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 vocab_to_cache: List[str],
                 do_layer_norm: bool = False,
                 dropout: float = 0.5,
                 requires_grad: bool = False,
                 projection_dim: int = None) -> None:
        super(ElmoWordEmbedding, self).__init__()
        self._elmo = ElmoTokenEmbedder(options_file=options_file,
                                       weight_file=weight_file,
                                       do_layer_norm=do_layer_norm,
                                       dropout=dropout,
                                       requires_grad=requires_grad,
                                       projection_dim=projection_dim,
                                       vocab_to_cache=vocab_to_cache)
        self._projection = self._elmo._projection

    def get_output_dim(self):
        if self._projection is not None:
            return self._projection.out_features
        else:
            return self._elmo.get_output_dim()

    def forward(self, word_inputs: torch.Tensor) -> torch.Tensor:
        if len(word_inputs.shape) == 1:
            word_inputs = word_inputs.unsqueeze(dim=-1)
        return self._elmo.forward(word_inputs, word_inputs)

    @property
    def weight(self):
        embedding_weight = torch.cat(
            (self.word_embedding.weight, self.word_embedding.weight), dim=1)
        if self._projection:
            embedding_weight = self._projection(embedding_weight)
        return embedding_weight

    @property
    def num_embeddings(self):
        return self.word_embedding.num_embeddings

    @property
    def word_embedding(self):
        return self._elmo._elmo._elmo_lstm._word_embedding
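# A minimal usage sketch for the ElmoWordEmbedding wrapper above. The fixture
# paths and the cached vocabulary are hypothetical placeholders, not part of the
# original snippet; vocab_to_cache must list the words whose ELMo representations
# are pre-computed so they can be looked up by word id.
def _example_elmo_word_embedding():
    vocab_to_cache = ['<pad>', 'hello', 'world']
    embedding = ElmoWordEmbedding(options_file='fixtures/options.json',
                                  weight_file='fixtures/lm_weights.hdf5',
                                  vocab_to_cache=vocab_to_cache)
    word_ids = torch.LongTensor([[1, 2]])  # indices into vocab_to_cache
    vectors = embedding(word_ids)          # shape: (1, 2, embedding.get_output_dim())
    return vectors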
def load_elmo_embeddings(large=True):
    """
    Loads pre-trained ELMo embeddings ('large' model by default).

    Parameters
    ----------
    large : bool
        Set to True to load the large ELMo model; False for the small ELMo model.

    Returns
    -------
    TextFieldEmbedder
    """
    if large:
        # Use the large pre-trained model.
        print("Loading LARGE ELMo..")
        options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
        weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    else:
        # Use the small pre-trained model.
        print("Loading SMALL ELMo..")
        options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    print("Pre-trained ELMo loaded..")
    return word_embeddings
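# A short usage sketch for load_elmo_embeddings above. It only assumes that the
# text fields are indexed with ELMoTokenCharactersIndexer under the "tokens" key,
# as in the other snippets in this collection; everything else is illustrative.
def _example_load_elmo_embeddings():
    word_embeddings = load_elmo_embeddings(large=False)  # the small model downloads faster
    print("ELMo output dimension:", word_embeddings.get_output_dim())
    return word_embeddings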
def get_elmo_embedder():
    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    return word_embeddings
def get_embedder_info(
        embedder_type: str
) -> Tuple[TokenEmbedder, TokenIndexer, Dict[str, Any]]:
    embedder_type = embedder_type.lower()
    text_field_embedder_kwargs: Dict[str, Any] = {}
    if embedder_type == 'ner_elmo':
        return (NERElmoTokenEmbedder(),
                ELMoTokenCharactersIndexer(),
                text_field_embedder_kwargs)
    elif embedder_type == 'elmo':
        return (ElmoTokenEmbedder(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE),
                ELMoTokenCharactersIndexer(),
                text_field_embedder_kwargs)
    elif embedder_type == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,  # conserve memory
        )
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=512,  # max pieces allowed for positional embeddings
            do_lowercase=True,
            use_starting_offsets=True,
        )
        text_field_embedder_kwargs['allow_unmatched_keys'] = True
        text_field_embedder_kwargs['embedder_to_indexer_map'] = {
            "tokens": ["tokens", "tokens-offsets"]
        }
        return bert_embedder, token_indexer, text_field_embedder_kwargs
    else:
        raise Exception(f'Unknown embedder type: {embedder_type}')
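# A minimal sketch of how the tuple returned by get_embedder_info might be
# consumed. The extra kwargs matter for the 'bert' branch; for 'elmo' they are
# empty. All names introduced here are illustrative only.
def _example_get_embedder_info():
    token_embedder, token_indexer, embedder_kwargs = get_embedder_info('elmo')
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedder}, **embedder_kwargs)
    token_indexers = {"tokens": token_indexer}  # pass these to the dataset reader
    return word_embeddings, token_indexers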
def get_embeddings(embedder_type, vocab, embedding_dim=300, bert_trainable=True):
    if embedder_type not in valid_embedders:
        raise Exception(f'Unknown embedder type {embedder_type}')

    vocab_size = vocab.get_vocab_size('tokens')
    token_embedders = {}

    if embedder_type == 'random':
        token_embedding = Embedding(vocab_size, embedding_dim, trainable=True)
        token_embedders['tokens'] = token_embedding

    if embedder_type in ['glove', 'elmo_and_glove']:
        weights = load_glove_weights(vocab)
        token_embedding = Embedding(vocab_size, embedding_dim, weight=weights, trainable=True)
        token_embedders['tokens'] = token_embedding

    if embedder_type in ['elmo', 'elmo_and_glove']:
        elmo_token_embedder = ElmoTokenEmbedder(
            'embeddings/elmo_2x4096_512_2048cnn_2xhighway_options.json',
            'embeddings/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
            do_layer_norm=False,
            dropout=0.5)
        token_embedders['elmo'] = elmo_token_embedder

    if 'bert' in embedder_type:
        token_embedders['bert'] = BertEmbedder(bert_type=embedder_type,
                                               trainable=bert_trainable)

    word_embeddings = BasicTextFieldEmbedder(token_embedders)
    return word_embeddings
def test_forward_works_with_projection_layer(self):
    params = Params({
        'options_file': self.FIXTURES_ROOT / 'elmo' / 'options.json',
        'weight_file': self.FIXTURES_ROOT / 'elmo' / 'lm_weights.hdf5',
        'projection_dim': 20
    })
    word1 = [0] * 50
    word2 = [0] * 50
    word1[0] = 6
    word1[1] = 5
    word1[2] = 4
    word1[3] = 3
    word2[0] = 3
    word2[1] = 2
    word2[2] = 1
    word2[3] = 0
    embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)

    input_tensor = torch.LongTensor([[word1, word2]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 2, 20)

    input_tensor = torch.LongTensor([[[word1]]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 1, 20)
def _run_test(self, requires_grad):
    embedder = ElmoTokenEmbedder(self.options_file, self.weight_file, requires_grad=requires_grad)
    batch_size = 3
    seq_len = 4
    char_ids = torch.from_numpy(numpy.random.randint(0, 262, (batch_size, seq_len, 50)))
    embeddings = embedder(char_ids)
    loss = embeddings.sum()
    loss.backward()

    elmo_grads = [param.grad for name, param in embedder.named_parameters()
                  if '_elmo_lstm' in name]
    if requires_grad:
        # None of the elmo grads should be None.
        assert all([grad is not None for grad in elmo_grads])
    else:
        # All of the elmo grads should be None.
        assert all([grad is None for grad in elmo_grads])
def test_forward_works_with_projection_layer(self):
    params = Params({
        'options_file': self.FIXTURES_ROOT / 'elmo' / 'options.json',
        'weight_file': self.FIXTURES_ROOT / 'elmo' / 'lm_weights.hdf5',
        'projection_dim': 20
    })
    word1 = [0] * 50
    word2 = [0] * 50
    word1[0] = 6
    word1[1] = 5
    word1[2] = 4
    word1[3] = 3
    word2[0] = 3
    word2[1] = 2
    word2[2] = 1
    word2[3] = 0
    embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)
    assert embedding_layer.get_output_dim() == 20

    input_tensor = torch.LongTensor([[word1, word2]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 2, 20)

    input_tensor = torch.LongTensor([[[word1]]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 1, 20)
def test_context_sequence_encoding(self):
    elmo_credbank_model_path = load_abs_path(
        os.path.join(os.path.dirname(__file__), '..', "resource", "embedding", "elmo_model",
                     "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"))
    elmo_embedder = ElmoTokenEmbedder(
        options_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
        weight_file=elmo_credbank_model_path,
        do_layer_norm=False,
        dropout=0.5)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    EXPECTED_CONTEXT_INPUT_SIZE = 60
    rumor_classifier = RumorTweetsClassifer(
        word_embeddings, None, None, None,
        classifier_feedforward=None,
        cxt_content_encoder=None,
        cxt_metadata_encoder=None,
        social_context_self_attention_encoder=None,
        cuda_device=-1)

    tweet_id = "500327120770301952"
    single_source_tweet_tensor_1 = self.tweet_context_encoding_by_tweet_id(
        rumor_classifier, tweet_id)
    print(type(single_source_tweet_tensor_1))
    print(single_source_tweet_tensor_1.shape)
    assert type(single_source_tweet_tensor_1) == torch.Tensor
    assert single_source_tweet_tensor_1.shape == (97, EXPECTED_CONTEXT_INPUT_SIZE), \
        "expected shape is [97, %s]" % EXPECTED_CONTEXT_INPUT_SIZE

    tweet_id = "552806117328568321"  # with three replies
    single_source_tweet_tensor_2 = self.tweet_context_encoding_by_tweet_id(
        rumor_classifier, tweet_id)
    print(type(single_source_tweet_tensor_2))
    print(single_source_tweet_tensor_2.shape)
    assert type(single_source_tweet_tensor_2) == torch.Tensor
    assert single_source_tweet_tensor_2.shape == (94, EXPECTED_CONTEXT_INPUT_SIZE), \
        "expected shape is [94, %s]" % EXPECTED_CONTEXT_INPUT_SIZE

    tweet_id = "552806117328568321"  # with three replies
    print("social context encoding without numerical feature.")
    single_source_tweet_tensor_2 = self.tweet_context_encoding_by_tweet_id(
        rumor_classifier, tweet_id, disable_nf=True)
    print(type(single_source_tweet_tensor_2))
    print(single_source_tweet_tensor_2.shape)
    assert type(single_source_tweet_tensor_2) == torch.Tensor
    assert single_source_tweet_tensor_2.shape == (94, EXPECTED_CONTEXT_INPUT_SIZE), \
        "expected shape is [94, %s]" % EXPECTED_CONTEXT_INPUT_SIZE
def build_elmo_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedding = ElmoTokenEmbedder()
    embedder = BasicTextFieldEmbedder(token_embedders={'bert_tokens': embedding})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedder.get_output_dim(), averaged=True)
    return SimpleClassifier(vocab, embedder, encoder)
def _run_test(self, requires_grad):
    embedder = ElmoTokenEmbedder(self.options_file, self.weight_file, requires_grad=requires_grad)
    batch_size = 3
    seq_len = 4
    char_ids = torch.from_numpy(numpy.random.randint(0, 262, (batch_size, seq_len, 50)))
    for _ in range(2):
        embeddings = embedder(char_ids)
        loss = embeddings.sum()
        loss.backward()

    elmo_grads = [param.grad for name, param in embedder.named_parameters()
                  if '_elmo_lstm' in name]
    if requires_grad:
        # None of the elmo grads should be None.
        assert all([grad is not None for grad in elmo_grads])
    else:
        # All of the elmo grads should be None.
        assert all([grad is None for grad in elmo_grads])
def run_ELMo_RSA(stim_file, header=False, filter_file=None):
    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    # Get tokenizer
    tokenizer = WhitespaceTokenizer()

    # Load model
    ## ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    ## ELMo Small
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    ## ELMo Medium
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    ## ELMo OG (5.5B)
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        # GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens, {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(
            target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors([target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        # GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear less than
    # three times will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'small' pre-trained model.
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()
def build_model(options_file, weight_file):
    vocab = Vocabulary()
    iterator = BucketIterator(batch_size=config.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                config.hidden_size,
                bidirectional=True,
                batch_first=True))

    model = BaselineModel(word_embeddings, encoder, vocab)
    return model, iterator, vocab
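# A usage sketch for build_model above, assuming the small pre-trained ELMo files
# already referenced elsewhere in this collection; `config` must supply batch_size
# and hidden_size for the function to run.
def _example_build_model():
    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    model, iterator, vocab = build_model(options_file, weight_file)
    print(model)
    return model, iterator, vocab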
def test_cached_download(self):
    params = Params({
        "options_file": "hf://lysandre/test-elmo-tiny/options.json",
        "weight_file": "hf://lysandre/test-elmo-tiny/lm_weights.hdf5",
    })
    embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)
    assert isinstance(embedding_layer, ElmoTokenEmbedder), \
        "Embedding layer badly instantiated from HF Hub."
    assert embedding_layer.get_output_dim() == 32, \
        "Embedding layer badly instantiated from HF Hub."
def _run_test_with_vocab_to_cache(self, requires_grad):
    vocab_to_cache = ['<pad>', 'hello', 'world']
    embedder = ElmoTokenEmbedder(self.options_file,
                                 self.weight_file,
                                 requires_grad=requires_grad,
                                 vocab_to_cache=vocab_to_cache)
    word_tensor = torch.LongTensor([[[1, 2]]])
    for _ in range(2):
        embeddings = embedder(word_tensor, word_tensor)
        loss = embeddings.sum()
        loss.backward()

    elmo_grads = [param.grad for name, param in embedder.named_parameters()
                  if '_elmo_lstm' in name and '_token_embedder' not in name]
    if requires_grad:
        # None of the elmo grads should be None.
        assert all([grad is not None for grad in elmo_grads])
    else:
        # All of the elmo grads should be None.
        assert all([grad is None for grad in elmo_grads])
    assert all([param.grad is None for name, param in embedder.named_parameters()
                if '_token_embedder' in name])
def test_vocab_extension_attempt_does_not_give_error(self):
    # It shouldn't raise an error if the TokenEmbedder does not implement
    # the `extend_vocab` method.
    params = Params({
        'options_file': self.FIXTURES_ROOT / 'elmo' / 'options.json',
        'weight_file': self.FIXTURES_ROOT / 'elmo' / 'lm_weights.hdf5'
    })
    embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    # This should just pass and be a no-op.
    embedding_layer.extend_vocab(vocab)
def main():
    # Initializing the embeddings (ELMo)
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = AnalogyDatasetReader(token_indexers={'tokens': elmo_token_indexer})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    vocab = Vocabulary.from_instances(train_dataset + test_dataset + dev_dataset)
    word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})

    # Initializing the model: take the hidden state at the last time step of the
    # LSTM for every layer as one single output.
    lstm_encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True, bidirectional=True))

    model = LstmModel(word_embeddings, lstm_encoder, vocab)
    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=20)
    trainer.train()

    # Saving the model
    with open("model.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("vocabulary")
def load_elmo_model():
    elmo_embedders = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedders})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      HIDDEN_DIM,
                      bidirectional=True,
                      batch_first=True))
    vocabulary = Vocabulary()
    model = BaseModel(word_embeddings=word_embeddings,
                      encoder=encoder,
                      vocabulary=vocabulary)
    output_elmo_model_file = os.path.join(PRETRAINED_ELMO, "lstm_elmo_model.bin")
    model.load_state_dict(torch.load(output_elmo_model_file))
    return model
def sequence_labelling():
    # Index each token as a sequence of character ids (ELMo)
    token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

    # Read the data
    reader = SequenceLabellingDatasetReader(token_indexers)
    training_data = reader.read(path='data/sequence_labelling/train.txt')
    validation_data = reader.read(path='data/sequence_labelling/test.txt')
    test_data = reader.read(path='data/sequence_labelling/test.txt')

    # Create a vocabulary
    vocabulary = Vocabulary.from_instances(training_data + validation_data + test_data)

    # Use ELMo embeddings
    elmo = ElmoTokenEmbedder(options_file=ELMO_OPTIONS_FILE, weight_file=ELMO_WEIGHTS_FILE)
    embedder = BasicTextFieldEmbedder(token_embedders={"tokens": elmo})

    # Our sequence labeller will use a bidirectional LSTM encoder
    lstm_layer = LSTM(input_size=ELMO_EMBEDDING_DIM,
                      hidden_size=HIDDEN_SIZE,
                      bidirectional=True,
                      batch_first=True)
    lstm_encoder = PytorchSeq2SeqWrapper(module=lstm_layer)
    model = SequenceLabeller(vocabulary=vocabulary, embedder=embedder, encoder=lstm_encoder)
    print("\nModel :\n")
    print(model)

    # Training
    train_model(model, training_data, validation_data, vocabulary)

    # Evaluation
    evaluate_sequence_labelling_model(model, test_data)
def get_model(vocab, params):
    emb_d = params["embedding_dim"]
    hidden_d = params["hidden_dim"]
    use_elmo_embeddings = params['use_elmo']
    use_lstm = params['use_lstm']
    n_layers = params["num_layers"]
    bidirectional = params['bidirectional']

    if use_elmo_embeddings:
        token_embedder = ElmoTokenEmbedder(ELMO_OPTIONS_FILE, ELMO_WEIGHTS_FILE)
    else:
        token_embedder = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                   embedding_dim=emb_d)
    word_embedder = BasicTextFieldEmbedder({"tokens": token_embedder})
    emb_d = word_embedder.get_output_dim()

    if use_lstm:
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(emb_d, hidden_d, num_layers=n_layers,
                          batch_first=True, bidirectional=bidirectional))
    else:
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(emb_d, hidden_d, num_layers=n_layers,
                         batch_first=True, bidirectional=bidirectional))

    model = NerModel(word_embedder, encoder, vocab,
                     num_categories=(3 if params["dataset"] == "senti" else 4))
    return model
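# A sketch of the params dict that get_model above expects. The concrete values
# are illustrative placeholders, not taken from the original project.
def _example_get_model(vocab):
    params = {
        "embedding_dim": 300,
        "hidden_dim": 200,
        "use_elmo": True,
        "use_lstm": True,
        "num_layers": 2,
        "bidirectional": True,
        "dataset": "senti",
    }
    return get_model(vocab, params)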
def predict():
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = AnalogyDatasetReader(token_indexers={'tokens': elmo_token_indexer})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    lstm_encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True, bidirectional=True))

    vocab2 = Vocabulary.from_files("./vocabulary")
    model2 = LstmModel(word_embeddings, lstm_encoder, vocab2)
    if USE_GPU:
        model2.cuda()

    with open("./model.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))

    predictor2 = SentenceClassifierPredictor(model2, dataset_reader=reader)

    with open('test.txt', 'w+') as f:
        top_10_words_list = []
        for analogy_test in test_dataset:
            logits = predictor2.predict_instance(analogy_test)['logits']
            label_id = np.argmax(logits)
            label_predict = model2.vocab.get_token_from_index(label_id, 'labels')
            top_10_ids = np.argsort(logits)[-10:]
            top_10_words = [model2.vocab.get_token_from_index(id, 'labels')
                            for id in top_10_ids]
            top_10_words_list.append(top_10_words)
            f.write(label_predict + "\n")

    top_10_words_list = np.array(top_10_words_list)
    print(top_10_words_list.shape)
    np.save('elmo_top_10_words_list.npy', np.array(top_10_words_list))
def test_context_feature_encoder(self):
    elmo_credbank_model_path = load_abs_path(
        os.path.join(os.path.dirname(__file__), '..', "resource", "embedding", "elmo_model",
                     "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"))

    # Test context feature encoding with small sample data to make sure that
    # source tweet contexts are sorted in temporal order.
    elmo_embedder = ElmoTokenEmbedder(
        options_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
        weight_file=elmo_credbank_model_path,
        do_layer_norm=False,
        dropout=0.5)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    rumor_classifier = RumorTweetsClassifer(word_embeddings, None, None, None, None)
    propagation_embeddings_tensor = rumor_classifier.batch_compute_context_feature_encoding(
        ['500294803402137600', '500327120770301952'])
    print("propagation_embeddings_tensor: ", propagation_embeddings_tensor)
def get_predictor():
    EMBEDDING_DIM = 128
    HIDDEN_DIM = 60  # 128
    MAX_LEN = 70
    dropout = 0.25
    lstm_layers = 2

    # Pre-trained model
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_files(data_dir + "vocabulary_allennlp_imdb_twoclass")
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    elmo_embedding_dim = 256

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim,
                      HIDDEN_DIM,
                      bidirectional=True,
                      num_layers=lstm_layers,
                      dropout=dropout,
                      batch_first=True))

    model = LstmTwoClassifier(word_embeddings, lstm, vocab)
    net = torch.load("model_allen_imdb_twoclass.th", map_location=str(device))
    model.load_state_dict(net)

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    readerSentence = SentenceDatasetReader(token_indexers={'tokens': elmo_token_indexer})

    return SentimentPredictor(model, dataset_reader=readerSentence)
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size_tokens = vocab.get_vocab_size("tokens")
    vocab_size_chars = vocab.get_vocab_size("token_characters")

    embedder = BasicTextFieldEmbedder({
        "tokens": Embedding(embedding_dim=embedding_dim,
                            pretrained_file=f"{cur_dir}/glove/glove.6B.200d.txt",
                            trainable=False,
                            num_embeddings=vocab_size_tokens,
                            vocab=vocab),
        "elmo": ElmoTokenEmbedder(
            weight_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
            options_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
            do_layer_norm=False,
            dropout=0.0),
        "token_characters": TokenCharactersEncoder(
            embedding=Embedding(embedding_dim=16, num_embeddings=vocab_size_chars, vocab=vocab),
            encoder=CnnEncoder(embedding_dim=16, num_filters=128, ngram_filter_sizes=[3])),
    })
    encoder = PytorchTransformer(input_dim=1352, num_layers=6, positional_encoding="sinusoidal")

    # embedder = BasicTextFieldEmbedder({"tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size)})
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    # embedder = BasicTextFieldEmbedder({"tokens": PretrainedTransformerMismatchedEmbedder("bert-large-uncased")})
    # encoder = LstmSeq2SeqEncoder(input_size=1024, hidden_size=1024, num_layers=2, dropout=0.5, bidirectional=True)

    if args.pseudo:
        return PseudoCrfTagger(vocab, embedder, encoder,
                               label_encoding="BIOUL",
                               include_start_end_transitions=False,
                               num_virtual_models=num_virtual_models)
    else:
        return CrfTagger(vocab, embedder, encoder,
                         label_encoding="BIOUL",
                         include_start_end_transitions=False)
                                                     stratify=y_full, train_size=0.9, test_size=0.1)

vocab = Vocabulary.from_instances(train_ds + test_ds)

iterator = BucketIterator(batch_size=32,
                          biggest_batch_first=True,
                          sorting_keys=[("tokens", "num_tokens")],
                          padding_noise=.15)
iterator.index_with(vocab)
batch = next(iter(iterator(train_ds)))

EMBEDDING_DIM = 256
HIDDEN_DIM = 64

# These files were trained by us; for pre-trained ELMo just take the published files.
options_file = 'forELMO\\options.json'
weight_file = 'forELMO\\corp_trained.hdf5'
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_DIM,
                  batch_first=True, bidirectional=True))
model = BaselineModel(word_embeddings, lstm)

batch = nn_util.move_to_device(batch, 0)

train_dataset, val_dataset = train_test_split(train_ds, train_size=0.9, test_size=0.1,
                                              shuffle=True, stratify=y_train)
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
if torch.cuda.is_available():
def train_only_lee():
    # This is WORKING!
    # Load the dataset reader, save logging to a local file (multitasking setup).
    log.getLogger().addHandler(log.FileHandler(directory + "/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 100
    max_seq_len = 512
    max_span_width = 30

    #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    reader = ConllCorefBertReader(max_span_width=max_span_width,
                                  token_indexers={"tokens": token_indexer})

    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    processed_reader_dir = Path(directory + "processed/")

    train_ds, val_ds, test_ds = load_lee(reader, directory)
    # restore checkpoint here

    from allennlp.modules.token_embedders import ElmoTokenEmbedder
    #vocab = Vocabulary.from_instances(train_ds + val_ds)
    vocab = Vocabulary()
    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)
    val_iterator = BasicIterator(batch_size=batch_size)
    val_iterator.index_with(vocab)

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
    # Here, allow_unmatched_keys=True would be needed because we don't pass in offsets:
    # we allow word embeddings of the BERT-tokenized input, not necessarily the
    # original tokens. See the documentation for offsets for more info:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/bert_token_embedder.py
    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})  #, allow_unmatched_keys=True)
    #word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    #BERT_DIM = word_embedding.get_output_dim()
    ELMO_DIM = word_embedding.get_output_dim()

    # At each batch, sample from the two tasks and feed the shared LSTM.
    shared_layer = torch.nn.LSTM(ELMO_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True)
    seq2seq = PytorchSeq2SeqWrapper(shared_layer)
    mention_feedforward = FeedForward(input_dim=512, num_layers=2, hidden_dims=150,
                                      activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=2304, num_layers=2, hidden_dims=150,
                                         activations=torch.nn.ReLU())
    model = CoreferenceResolver(vocab=vocab,
                                text_field_embedder=word_embedding,
                                context_layer=seq2seq,
                                mention_feedforward=mention_feedforward,
                                antecedent_feedforward=antecedent_feedforward,
                                feature_size=768,
                                max_span_width=max_span_width,
                                spans_per_word=0.4,
                                max_antecedents=250,
                                lexical_dropout=0.2)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # and then we can do the shared loss
    USE_GPU = 1
    trainer = Trainer(
        model=model.cuda(),
        optimizer=optimizer,
        iterator=iterator,
        validation_iterator=val_iterator,
        train_dataset=train_ds,
        validation_dataset=val_ds,
        validation_metric="+coref_f1",
        cuda_device=0 if USE_GPU else -1,
        serialization_dir=directory + "saved_models/only_lee",
        num_epochs=epochs,
    )
    metrics = trainer.train()

    # save the model
    with open(directory + "saved_models/current_run_model_state", 'wb') as f:
        torch.save(model.state_dict(), f)
def multitask_learning():
    # Load the dataset readers, save logging to a local file (multitasking setup).
    log.getLogger().addHandler(log.FileHandler(directory + "/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 10
    max_seq_len = 512
    max_span_width = 30

    #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,)
    #token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer

    # The token indexer is responsible for mapping tokens to integers.
    token_indexer = ELMoTokenCharactersIndexer()

    def tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm',
                                  pos_tags=False).split_words(x)[:max_seq_len]]

    #conll_reader = ConllCorefBertReader(max_span_width=max_span_width, token_indexers={"tokens": token_indexer})
    conll_reader = ConllCorefReader(max_span_width=max_span_width,
                                    token_indexers={"tokens": token_indexer})
    swag_reader = SWAGDatasetReader(tokenizer=tokenizer, token_indexers=token_indexer)

    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200

    conll_datasets, swag_datasets = load_datasets(conll_reader, swag_reader, directory)
    conll_vocab = Vocabulary()
    conll_iterator = BasicIterator(batch_size=batch_size)
    conll_iterator.index_with(conll_vocab)

    swag_vocab = Vocabulary()
    swag_iterator = BasicIterator(batch_size=batch_size)
    swag_iterator.index_with(swag_vocab)

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
    from allennlp.modules.token_embedders import ElmoTokenEmbedder

    #bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-cased", top_layer_only=True, requires_grad=True)
    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})  #, allow_unmatched_keys=True)
    #word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    #BERT_DIM = word_embedding.get_output_dim()
    ELMO_DIM = word_embedding.get_output_dim()

    seq2seq = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ELMO_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    seq2vec = PytorchSeq2VecWrapper(
        torch.nn.LSTM(ELMO_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    mention_feedforward = FeedForward(input_dim=2336, num_layers=2, hidden_dims=150,
                                      activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=7776, num_layers=2, hidden_dims=150,
                                         activations=torch.nn.ReLU())

    model1 = CoreferenceResolver(vocab=conll_vocab,
                                 text_field_embedder=word_embedding,
                                 context_layer=seq2seq,
                                 mention_feedforward=mention_feedforward,
                                 antecedent_feedforward=antecedent_feedforward,
                                 feature_size=768,
                                 max_span_width=max_span_width,
                                 spans_per_word=0.4,
                                 max_antecedents=250,
                                 lexical_dropout=0.2)
    model2 = SWAGExampleModel(vocab=swag_vocab,
                              text_field_embedder=word_embedding,
                              phrase_encoder=seq2vec)
    optimizer1 = optim.Adam(model1.parameters(), lr=lr)
    optimizer2 = optim.Adam(model2.parameters(), lr=lr)

    swag_train_iterator = swag_iterator(swag_datasets[0], num_epochs=1, shuffle=True)
    conll_train_iterator = conll_iterator(conll_datasets[0], num_epochs=1, shuffle=True)
    swag_val_iterator = swag_iterator(swag_datasets[1], num_epochs=1, shuffle=True)
    conll_val_iterator = conll_iterator(conll_datasets[1], num_epochs=1, shuffle=True)

    task_infos = {
        "swag": {"model": model2, "optimizer": optimizer2, "loss": 0.0,
                 "iterator": swag_iterator, "train_data": swag_datasets[0],
                 "val_data": swag_datasets[1], "num_train": len(swag_datasets[0]),
                 "num_val": len(swag_datasets[1]), "lr": lr,
                 "score": {"accuracy": 0.0}},
        "conll": {"model": model1, "iterator": conll_iterator, "loss": 0.0,
                  "val_data": conll_datasets[1], "train_data": conll_datasets[0],
                  "optimizer": optimizer1, "num_train": len(conll_datasets[0]),
                  "num_val": len(conll_datasets[1]), "lr": lr,
                  "score": {"coref_prediction": 0.0, "coref_recall": 0.0,
                            "coref_f1": 0.0, "mention_recall": 0.0}},
    }
    USE_GPU = 1
    trainer = MultiTaskTrainer(
        task_infos=task_infos,
        num_epochs=epochs,
        serialization_dir=directory + "saved_models/multitask/"
    )
    metrics = trainer.train()
def embeddings_returner(self, vocab=None):
    '''
    Either the name of the pretrained model to use (e.g. bert-base-uncased),
    or the path to the .tar.gz file with the model weights.

    :param vocab: needed only when pretrained (GloVe) embeddings are used.
    :return: embedder, its output dimension, and the corresponding BasicTextFieldEmbedder

    Notes:
        "bert-base-uncased", do_lower_case=True
        "bert-base-cased",   do_lower_case=False
        https://github.com/huggingface/pytorch-transformers/issues/712
        https://qiita.com/uedake722/items/b7f4b75b4d77d9bd358b
    '''
    if self.embedding_strategy == 'bert':
        self.bertmodel_dir = ''
        if self.ifbert_use_whichmodel == 'general':
            self.bertmodel_dir += 'bert-base-uncased/'  # recommended version is uncased, per the original repository
            self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
            # included in pytorch_transformers, so we replace it with the model name itself
            self.bert_weight_filepath = copy.copy('bert-base-uncased')
        elif self.ifbert_use_whichmodel == 'scibert':
            self.bertmodel_dir += 'scibert_scivocab_uncased/'  # recommended version is uncased, per the original repository
            self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
            self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz'
        elif self.ifbert_use_whichmodel == 'biobert':
            self.bertmodel_dir += 'biobert_v1.1_pubmed/'  # currently only the cased version is supported
            self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
            self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz'  # includes bert_config.json and the .bin weights

        # Load embedder
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model=self.bert_weight_filepath,
            top_layer_only=self.bert_top_layer_only,
            requires_grad=self.emb_requires_grad)
        return (bert_embedder,
                bert_embedder.get_output_dim(),
                BasicTextFieldEmbedder({'tokens': bert_embedder}, allow_unmatched_keys=True))

    elif self.embedding_strategy == 'elmo':
        if self.ifelmo_use_whichmodel == 'general':
            options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
            weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
        elif self.ifelmo_use_whichmodel == 'pubmed':
            options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_options.json'
            weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_weights_PubMed_only.hdf5'
        elif self.ifelmo_use_whichmodel == 'bioelmo':
            options_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_options.json'
            weight_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_weights.hdf5'
        else:
            options_file = -1
            weight_file = -1
        assert options_file != -1

        elmo_embedder = ElmoTokenEmbedder(options_file=options_file,
                                          weight_file=weight_file,
                                          requires_grad=self.emb_requires_grad)
        return (elmo_embedder,
                elmo_embedder.get_output_dim(),
                BasicTextFieldEmbedder({'tokens': elmo_embedder}))

    elif self.embedding_strategy == 'pretrained':
        print('\nGloVe pretrained vocab loading\n')
        if 'glove' in self.args.ifpretrained_use_whichmodel:
            embedding_dim = 300
        else:
            embedding_dim = 200
        pretrain_emb_embedder = Embedding.from_params(
            vocab=vocab,
            params=Params({
                'pretrained_file': self.glove_embeddings_file,
                'embedding_dim': embedding_dim,
                'trainable': False,
                'padding_index': 0
            }))
        return (pretrain_emb_embedder,
                pretrain_emb_embedder.get_output_dim(),
                BasicTextFieldEmbedder({'tokens': pretrain_emb_embedder}))