def test_sliding_window_with_batch(self):
    """Smoke-test the windowed embedder on a batch whose instances differ in length.

    The indexer and the embedder are both limited to 8 word pieces with
    truncation disabled; the second instance repeats the sentence three
    times. The test only checks that the embedder returns something.
    """
    splitter = WordTokenizer(word_splitter=BertBasicWordSplitter())
    words = splitter.tokenize("the quickest quick brown fox jumped over the lazy dog")

    vocabulary = Vocabulary()
    indexer = PretrainedBertIndexer(
        str(self.FIXTURES_ROOT / 'bert' / 'vocab.txt'),
        truncate_long_sequences=False,
        max_pieces=8,
    )

    bert_config = BertConfig(str(self.FIXTURES_ROOT / 'bert' / 'config.json'))
    embedder = BertEmbedder(BertModel(bert_config), max_pieces=8)

    short_instance = Instance({"tokens": TextField(words, {"bert": indexer})})
    long_instance = Instance(
        {"tokens": TextField(words + words + words, {"bert": indexer})}
    )

    batch = Batch([short_instance, long_instance])
    batch.index_instances(vocabulary)
    bert_inputs = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    vectors = embedder(bert_inputs["bert"], offsets=bert_inputs["bert-offsets"])
    assert vectors is not None
def test_squad_with_unwordpieceable_passage(self):
    """Embed two question/passage pairs, one of whose passages contains non-ASCII text.

    The test has no assertions: it passes as long as embedding the batched
    passages and questions does not raise.
    """
    from allennlp.data.dataset_readers.reading_comprehension.util import (
        make_reading_comprehension_instance,
    )

    tokenizer = SpacyTokenizer()
    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    hdtv_passage = (
        "There were four major HDTV systems tested by SMPTE in the late 1970s, "
        "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
    )
    hdtv_question = "Who released A Study of High Definition Television Systems?"

    broca_passage = (
        "Broca, being what today would be called a neurosurgeon, "
        "had taken an interest in the pathology of speech. He wanted "
        "to localize the difference between man and the other animals, "
        "which appeared to reside in speech. He discovered the speech "
        "center of the human brain, today called Broca's area after him. "
        "His interest was mainly in Biological anthropology, but a German "
        "philosopher specializing in psychology, Theodor Waitz, took up the "
        "theme of general and social anthropology in his six-volume work, "
        "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
        """soon translated as "The Anthropology of Primitive Peoples". """
        "The last two volumes were published posthumously."
    )
    broca_question = "What did Broca discover in the human brain?"

    def build_instance(question, passage):
        # Question is tokenized before the passage, as in the direct calls.
        return make_reading_comprehension_instance(
            tokenizer.tokenize(question),
            tokenizer.tokenize(passage),
            {"bert": token_indexer},
            passage,
        )

    examples = [
        build_instance(hdtv_question, hdtv_passage),
        build_instance(broca_question, broca_passage),
    ]

    batch = Batch(examples)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())
    question_inputs = tensors["question"]
    passage_inputs = tensors["passage"]

    # Freshly initialised BERT sized to the indexer's vocabulary.
    embedder = BertEmbedder(BertModel(BertConfig(len(token_indexer.vocab))))

    _ = embedder(passage_inputs["bert"], offsets=passage_inputs["bert-offsets"])
    _ = embedder(question_inputs["bert"], offsets=question_inputs["bert-offsets"])
def setUp(self):
    """Create the shared indexer, BERT model and embedder from the ``bert`` fixtures."""
    super().setUp()

    vocab_file = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    self.token_indexer = PretrainedBertIndexer(str(vocab_file))

    config_file = self.FIXTURES_ROOT / 'bert' / 'config.json'
    # Here the config is built by handing the JSON path to the constructor.
    self.bert_model = BertModel(BertConfig(str(config_file)))
    self.token_embedder = BertEmbedder(self.bert_model)
def setUp(self):
    """Create the shared indexer, BERT model and embedder from the ``bert`` fixtures."""
    super().setUp()

    vocab_file = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    self.token_indexer = PretrainedBertIndexer(str(vocab_file))

    config_file = self.FIXTURES_ROOT / "bert" / "config.json"
    # Here the config is parsed from the JSON fixture via the classmethod.
    bert_config = BertConfig.from_json_file(str(config_file))
    self.bert_model = BertModel(bert_config)
    self.token_embedder = BertEmbedder(self.bert_model)
def _get_bert_word_embedder(self):
    """Build a text-field embedder around the BERT model at ``self.bert_file_path``.

    Each BERT parameter's ``requires_grad`` flag is set from
    ``self.configuration['train_bert']``, and only the top layer is used.

    Returns:
        A ``BasicTextFieldEmbedder`` exposing the BERT embedder under the
        ``"tokens"`` key.
    """
    model = PretrainedBertModel.load(self.bert_file_path, cache_model=False)
    for weight in model.parameters():
        weight.requires_grad = self.configuration['train_bert']

    top_layer_embedder = BertEmbedder(bert_model=model, top_layer_only=True)
    text_field_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": top_layer_embedder},
        # Masks are ignored downstream, so unmatched keys must be tolerated.
        allow_unmatched_keys=True,
    )
    return text_field_embedder
def test_end_to_end(self):
    """Check word-piece ids, offsets and embedder output shapes for a two-sentence batch."""
    pre_tokenizer = BertPreTokenizer()

    # expected word-piece ids: 2 3 4 3 5 6 8 9 2 14 12
    first_words = pre_tokenizer.tokenize(
        "the quickest quick brown fox jumped over the lazy dog")
    # expected word-piece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    second_words = pre_tokenizer.tokenize(
        "the quick brown fox jumped over the laziest lazy elmo")

    vocabulary = Vocabulary()
    first_instance = Instance(
        {"tokens": TextField(first_words, {"bert": self.token_indexer})})
    second_instance = Instance(
        {"tokens": TextField(second_words, {"bert": self.token_indexer})})

    batch = Batch([first_instance, second_instance])
    batch.index_instances(vocabulary)
    bert_inputs = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    # 16 = [CLS], 17 = [SEP]
    expected_ids = [
        [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
        [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
    ]
    assert bert_inputs["bert"].tolist() == expected_ids

    expected_offsets = [
        [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
    ]
    assert bert_inputs["bert-offsets"].tolist() == expected_offsets

    def check_shapes(embedder):
        # Without offsets: one vector per word piece, 14 per row.
        vectors = embedder(bert_inputs["bert"])
        assert list(vectors.shape) == [2, 14, 12]
        # With offsets: one vector per original token, 10 per row.
        vectors = embedder(bert_inputs["bert"],
                           offsets=bert_inputs["bert-offsets"])
        assert list(vectors.shape) == [2, 10, 12]

    check_shapes(self.token_embedder)
    # Same expectations when only the top layer is used.
    check_shapes(BertEmbedder(self.bert_model, top_layer_only=True))
def test_sliding_window(self):
    """Check the windowed word-piece ids, offsets and output shapes for one sentence.

    Indexer and embedder are both limited to 8 word pieces with truncation
    disabled.
    """
    words = BertPreTokenizer().tokenize(
        "the quickest quick brown fox jumped over the lazy dog")

    vocabulary = Vocabulary()
    indexer = PretrainedBertIndexer(
        str(self.FIXTURES_ROOT / "bert" / "vocab.txt"),
        truncate_long_sequences=False,
        max_pieces=8,
    )

    bert_config = BertConfig(str(self.FIXTURES_ROOT / "bert" / "config.json"))
    windowed_embedder = BertEmbedder(BertModel(bert_config), max_pieces=8)

    batch = Batch([Instance({"tokens": TextField(words, {"bert": indexer})})])
    batch.index_instances(vocabulary)
    bert_inputs = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    # 16 = [CLS], 17 = [SEP]; each window is wrapped in its own [CLS] ... [SEP].
    expected_ids = [[
        16, 2, 3, 4, 3, 5, 6, 17,
        16, 3, 5, 6, 8, 9, 2, 17,
        16, 8, 9, 2, 14, 12, 17
    ]]
    assert bert_inputs["bert"].tolist() == expected_ids
    assert bert_inputs["bert-offsets"].tolist() == [[
        1, 3, 4, 5, 6, 7, 8, 9, 10, 11
    ]]

    vectors = windowed_embedder(bert_inputs["bert"])
    assert list(vectors.shape) == [1, 13, 12]

    # Offsets only, no token_type_ids.
    vectors = windowed_embedder(bert_inputs["bert"],
                                offsets=bert_inputs["bert-offsets"])
    assert list(vectors.shape) == [1, 10, 12]

    # Offsets together with token_type_ids.
    vectors = windowed_embedder(bert_inputs["bert"],
                                offsets=bert_inputs["bert-offsets"],
                                token_type_ids=bert_inputs["bert-type-ids"])
    assert list(vectors.shape) == [1, 10, 12]
def test_end_to_end_with_higher_order_inputs(self):
    """Check embedder output shapes when text fields are nested inside ``ListField``s."""
    splitter = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # expected word-piece ids: 2 3 4 3 5 6 8 9 2 14 12
    first_field = TextField(
        splitter.tokenize("the quickest quick brown fox jumped over the lazy dog"),
        {"bert": self.token_indexer},
    )
    # expected word-piece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    second_field = TextField(
        splitter.tokenize("the quick brown fox jumped over the laziest lazy elmo"),
        {"bert": self.token_indexer},
    )
    # expected word-piece ids: 2 5 15 10 11 6
    third_field = TextField(
        splitter.tokenize("the brown laziest fox"),
        {"bert": self.token_indexer},
    )

    vocabulary = Vocabulary()
    single_sentence = Instance({"tokens": ListField([first_field])})
    two_sentences = Instance({"tokens": ListField([second_field, third_field])})

    batch = Batch([single_sentence, two_sentences])
    batch.index_instances(vocabulary)
    lengths = batch.get_padding_lengths()
    bert_inputs = batch.as_tensor_dict(lengths, verbose=True)["tokens"]

    def check_shapes(embedder):
        # Without offsets: 12 vectors per sentence slot.
        vectors = embedder(bert_inputs["bert"])
        assert list(vectors.shape) == [2, 2, 12, 12]
        # With offsets: 10 vectors per sentence slot.
        vectors = embedder(bert_inputs["bert"],
                           offsets=bert_inputs["bert-offsets"])
        assert list(vectors.shape) == [2, 2, 10, 12]

    check_shapes(self.token_embedder)
    # Same expectations when only the top layer is used.
    check_shapes(BertEmbedder(self.bert_model, top_layer_only=True))
def test_sliding_window_with_batch(self):
    """Smoke-test the windowed embedder on a mixed-length batch, with and without type ids.

    Indexer and embedder are both limited to 8 word pieces with truncation
    disabled; the second instance repeats the sentence three times.
    """
    words = BertPreTokenizer().tokenize(
        "the quickest quick brown fox jumped over the lazy dog")

    vocabulary = Vocabulary()
    indexer = PretrainedBertIndexer(
        str(self.FIXTURES_ROOT / "bert" / "vocab.txt"),
        truncate_long_sequences=False,
        max_pieces=8,
    )

    bert_config = BertConfig.from_json_file(
        str(self.FIXTURES_ROOT / "bert" / "config.json"))
    windowed_embedder = BertEmbedder(BertModel(bert_config), max_pieces=8)

    short_instance = Instance({"tokens": TextField(words, {"bert": indexer})})
    long_instance = Instance(
        {"tokens": TextField(words + words + words, {"bert": indexer})})

    batch = Batch([short_instance, long_instance])
    batch.index_instances(vocabulary)
    bert_inputs = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]["bert"]

    # Offsets only, no token_type_ids.
    vectors = windowed_embedder(bert_inputs["input_ids"],
                                offsets=bert_inputs["offsets"])
    assert vectors is not None

    # Offsets together with token_type_ids.
    vectors = windowed_embedder(bert_inputs["input_ids"],
                                offsets=bert_inputs["offsets"],
                                token_type_ids=bert_inputs["token_type_ids"])
    assert vectors is not None
def test_max_length(self):
    """Embed a 1000-word input; the test passes if the embedder call does not raise."""
    # Freshly initialised BERT sized to the shared indexer's vocabulary.
    bert_config = BertConfig(len(self.token_indexer.vocab))
    long_input_embedder = BertEmbedder(BertModel(bert_config))

    splitter = WordTokenizer(word_splitter=BertBasicWordSplitter())
    words = splitter.tokenize("the " * 1000)

    vocabulary = Vocabulary()
    batch = Batch(
        [Instance({"tokens": TextField(words, {"bert": self.token_indexer})})])
    batch.index_instances(vocabulary)
    bert_inputs = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    long_input_embedder(bert_inputs["bert"], bert_inputs["bert-offsets"])
def _get_bert_word_embedder(self):
    """Build a text-field embedder around the BERT model and move it to the configured device.

    Each BERT parameter's ``requires_grad`` flag is the negation of
    ``self.configuration['fixed']``, and only the top layer is used.

    Returns:
        A ``BasicTextFieldEmbedder`` exposing the BERT embedder under the
        ``"bert"`` key, after ``.to(self.configuration['device'])``.
    """
    # NOTE: the model is loaded and wrapped by hand here; an earlier,
    # commented-out version used PretrainedBertEmbedder with
    # top_layer_only=True and requires_grad=(not configuration['fixed']).
    model = PretrainedBertModel.load(self.bert_file_path, cache_model=False)
    for weight in model.parameters():
        weight.requires_grad = (not self.configuration['fixed'])

    top_layer_embedder = BertEmbedder(bert_model=model, top_layer_only=True)
    text_field_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"bert": top_layer_embedder},
        # Masks are ignored downstream, so unmatched keys must be tolerated.
        allow_unmatched_keys=True,
    )
    text_field_embedder.to(self.configuration['device'])
    return text_field_embedder
weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained_2": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_bert": print("Loading bert model") model = BertModel.from_pretrained('bert-base-uncased') token_embedding = BertEmbedder(model) PROJECT_DIM = 768 else: print("Error: Some weird Embedding type", EMBEDDING_TYPE) exit() word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) HIDDEN_DIM = 200 params = Params({ 'input_dim': PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) attend_feedforward = FeedForward.from_params(params) similarity_function = DotProductSimilarity()
def load_decomposable_attention_elmo_softmax_model():
    """Rebuild the decomposable-attention softmax model and load its saved weights.

    The loss flavour, embedding flavour and negative-sampling percentage are
    hard-coded below; together they select the checkpoint and vocabulary
    under ``saved_softmax_models``.

    The checkpoint is mapped onto the CPU when it is read. The model is never
    moved to a GPU in this function, so mapping to ``cuda:0`` only made the
    load fail on hosts without CUDA.

    Returns:
        A ``(model, predictor)`` tuple: the ``DecomposableAttentionSoftmax``
        model with its state dict loaded, and a
        ``DecomposableAttentionSoftmaxPredictor`` wrapping it.
    """
    NEGATIVE_PERCENTAGE = 100
    # Alternatives seen in earlier experiments: "" / "_nll" (NLL).
    LOSS_TYPE = "_mse"  # MSE
    # Alternatives: "", "_glove", "_bert", "_elmo_retrained", "_elmo_retrained_2".
    EMBEDDING_TYPE = "_elmo"

    # (options file, weights file) for every ELMo flavour.
    # Earlier note kept from the original: the small 2x1024_128 files were
    # used when a larger variant ran out of CUDA memory.
    elmo_files = {
        "_elmo": (
            os.path.join("data", "elmo",
                         "elmo_2x2048_256_2048cnn_1xhighway_options.json"),
            os.path.join("data", "elmo",
                         "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"),
        ),
        "_elmo_retrained": (
            os.path.join("data", "bilm-tf", "elmo_retrained", "options.json"),
            os.path.join("data", "bilm-tf", "elmo_retrained", "weights.hdf5"),
        ),
        "_elmo_retrained_2": (
            os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json"),
            os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5"),
        ),
    }

    # ELMo flavours need the character indexer; the others use the reader's default.
    token_indexers = None
    if EMBEDDING_TYPE in elmo_files:
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)

    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))
    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                   NEGATIVE_PERCENTAGE))

    print("LOADING VOCABULARY")
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2

    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_DIM,
            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        # glove_embeddings_file is not defined in this function; it is
        # expected to exist at module level (not visible from here).
        token_embedding = Embedding.from_params(
            vocab=vocab,
            params=Params({
                'pretrained_file': glove_embeddings_file,
                'embedding_dim': EMBEDDING_DIM,
                'projection_dim': PROJECT_DIM,
                'trainable': False
            }))
    elif EMBEDDING_TYPE in elmo_files:
        options_file, weights_file = elmo_files[EMBEDDING_TYPE]
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        # BERT output is not projected, so downstream layers see 768 dims.
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    HIDDEN_DIM = 200
    attend_feedforward = FeedForward.from_params(Params({
        'input_dim': PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    }))
    similarity_function = DotProductSimilarity()
    compare_feedforward = FeedForward.from_params(Params({
        'input_dim': 2 * PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    }))
    aggregate_feedforward = FeedForward.from_params(Params({
        'input_dim': 2 * HIDDEN_DIM,
        'hidden_dims': 1,
        'activations': 'linear',
        'num_layers': 1
    }))

    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")

    # Load model state onto the CPU so this also works without CUDA.
    with open(model_file, 'rb') as f:
        device = torch.device('cpu')
        model.load_state_dict(torch.load(f, map_location=device))
    print("MODEL LOADED!")

    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
def save_top_results(process_no, start_index, end_index):
    """Score candidate responses for a slice of questions and save the top three per question.

    Questions are read from a questions file; responses and their rules are
    read in lockstep from two further files. A line pair where both the
    response and the rule are blank closes the group of candidates for the
    current question. Groups whose running index is below ``start_index``
    are skipped, and reading stops once the index exceeds ``end_index``.

    For every kept question the predictor scores all candidate responses and
    the three highest-scoring ones are written to a results file named after
    ``start_index`` and ``end_index``.

    Args:
        process_no: Label used only in progress messages.
        start_index: Index of the first question group to keep.
        end_index: Index of the last question group to keep.

    Note:
        Malformed input (a response without a rule or vice versa, or a blank
        pair with no pending candidates) prints diagnostics and calls
        ``exit()``.
    """
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"  # not used further in this function
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE (overrides the assignment above)
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    # ELMo flavours get the character indexer; otherwise None is passed to the reader.
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo

    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")

    #NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove",
                                         "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    #NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))

    #NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))

    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            # Checkpoint and vocabulary are selected by loss type, embedding
            # type and negative percentage.
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)

            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using Small size as medium size gave CUDA out of memory error
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options_2.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                # Downstream feed-forward layers are sized from PROJECT_DIM.
                PROJECT_DIM = 768
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state
            with open(model_file, 'rb') as f:
                # Checkpoint tensors are mapped onto the CPU.
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            # Both branches set -1 and the value is not read again in this function.
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1
            else:
                cuda_device = -1

            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            # gold / predicted_labels / probs are initialised but not used below.
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # before working on anything just save all the questions and responses in a list
            all_data = list()
            examples_count = processed_examples_count = 0
            with open(q_file,
                      'r') as q_reader, open(r_file, "r") as r_reader, open(
                          rules_file, "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                # NOTE(review): `mt` is defined outside this function;
                # presumably a Moses tokenizer - confirm.
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                # current_qa is (question, answer); the answer is filled in
                # from the responses as they are read.
                current_qa = (q, "")
                current_rules_and_responses = list()
                for i, (response,
                        rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify if the a is same as the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                # On mismatch the most recent answer wins.
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Create a instance
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        # Exactly one of response/rule is blank: the two
                        # files are out of step.
                        if rule or response:
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()

                        if examples_count < start_index:
                            # Before the requested slice: discard this group
                            # and move on to the next question.
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            # Past the requested slice: stop reading.
                            break

                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if(examples_count%100 == 0):
                        # 	print(examples_count)
                    else:
                        # Serious Bug
                        # Reached when a line pair is not fully populated
                        # and no candidates are pending.
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            print("{}:\tFINISHED IO".format(process_no))
            # Restart the counter at start_index so progress messages carry
            # the position within the full data set.
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
                # Pair each (response, rule) with its score and rank by score, highest first.
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0  # not used below
                # Output record: question line, answer line, up to three
                # tab-separated candidate lines, then a blank line.
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    # Only the three best candidates are written.
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))