def get_embedder(type_, vocab, e_dim, rq_grad=False):
    """Build a ``BasicTextFieldEmbedder`` whose single ``"tokens"`` entry
    is selected by ``type_``.

    Args:
        type_: ``'elmo'``, ``'glove'`` or ``'bert'``; any other value
            selects a plain embedding sized from ``vocab``.
        vocab: Vocabulary; only consulted by the fallback branch
            (``get_vocab_size('tokens')``).
        e_dim: Embedding width; only used by the fallback branch.
        rq_grad: Forwarded as ``requires_grad`` (ELMo, BERT) or
            ``trainable`` (GloVe). Not used by the fallback branch.

    Returns:
        The text-field embedder wrapping the chosen token embedder.
    """
    # Only the BERT variant passes an extra keyword to the wrapper.
    wrapper_kwargs = {}

    if type_ == 'elmo':
        token_embedder = ElmoTokenEmbedder(
            "data/elmo_2x1024_128_2048cnn_1xhighway_options.json",
            "data/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5",
            requires_grad=rq_grad)
    elif type_ == 'glove':
        # Table size (400000 x 300) is hard-coded alongside the file name.
        token_embedder = Embedding(400000,
                                   300,
                                   pretrained_file="data/glove.6B.300d.txt",
                                   trainable=rq_grad)
    elif type_ == 'bert':
        token_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,
            requires_grad=rq_grad)
        wrapper_kwargs["allow_unmatched_keys"] = True
    else:
        token_embedder = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=e_dim)

    return BasicTextFieldEmbedder({"tokens": token_embedder},
                                  **wrapper_kwargs)
def main():
    """Train a bidirectional-LSTM model on top of ELMo embeddings and save
    its weights.

    The model's ``state_dict`` is written to ``lstm_elmo_model.bin`` inside
    the ``PRETRAINED_ELMO`` directory once training finishes. The first GPU
    is used when CUDA is available, otherwise the CPU.
    """
    # Seed first: every module built below draws its initial weights from
    # the torch RNG.
    torch.manual_seed(SEED)

    text_embedder = BasicTextFieldEmbedder(
        {"tokens": ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)})
    recurrent_layer = torch.nn.LSTM(text_embedder.get_output_dim(),
                                    HIDDEN_DIM,
                                    bidirectional=True,
                                    batch_first=True)
    sentence_encoder = PytorchSeq2VecWrapper(recurrent_layer)

    train_split, dev_split = dataset_reader(train=True, elmo=True)
    vocabulary = Vocabulary()
    net = BaseModel(word_embeddings=text_embedder,
                    encoder=sentence_encoder,
                    vocabulary=vocabulary)

    # -1 is the CPU marker handed to the trainer.
    device_index = 0 if torch.cuda.is_available() else -1
    if device_index != -1:
        net = net.cuda(device_index)

    batcher = data_iterator(vocabulary)
    adam = torch.optim.Adam(net.parameters(),
                            lr=LEARNING_RATE,
                            weight_decay=WEIGHT_DECAY)
    Trainer(model=net,
            optimizer=adam,
            iterator=batcher,
            train_dataset=train_split,
            validation_dataset=dev_split,
            cuda_device=device_index,
            num_epochs=EPOCHS,
            patience=5).train()

    print("*******Save Model*******\n")
    weights_path = os.path.join(PRETRAINED_ELMO, "lstm_elmo_model.bin")
    torch.save(net.state_dict(), weights_path)
def _load_embedder(config, bert_max_length):
    """Build the text-field embedder named by ``config.embedder.name``.

    Supported names:
        * ``'elmo'`` - ELMo loaded from ``<pretrained_models_dir>/elmo``.
        * any name ending in ``'bert'`` - a mismatched transformer embedder
          loaded from ``<pretrained_models_dir>/<name>``.
        * ``'both'`` - ELMo under key ``'elmo'`` plus the ``ru_bert``
          transformer under key ``'ru_bert'``.

    Args:
        config: Object exposing ``embedder.name`` and
            ``data.pretrained_models_dir``.
        bert_max_length: Forwarded as ``max_length`` to the transformer
            embedder.

    Returns:
        A ``BasicTextFieldEmbedder`` keyed by embedder name.

    Raises:
        AssertionError: If the embedder name is not recognised.
    """

    def _build_elmo():
        # ELMo is created with dropout disabled and switched to eval mode.
        elmo = ElmoTokenEmbedder(
            options_file=os.path.join(config.data.pretrained_models_dir,
                                      'elmo/options.json'),
            weight_file=os.path.join(config.data.pretrained_models_dir,
                                     'elmo/model.hdf5'),
            dropout=0.)
        elmo.eval()
        return elmo

    def _build_bert(model_dir_name):
        return PretrainedTransformerMismatchedEmbedder(
            model_name=os.path.join(config.data.pretrained_models_dir,
                                    model_dir_name),
            max_length=bert_max_length)

    name = config.embedder.name
    if name == 'elmo':
        return BasicTextFieldEmbedder({name: _build_elmo()})
    if name.endswith('bert'):
        return BasicTextFieldEmbedder({name: _build_bert(name)})
    if name == 'both':
        return BasicTextFieldEmbedder({
            'elmo': _build_elmo(),
            'ru_bert': _build_bert('ru_bert')
        })

    # Raised explicitly rather than via ``assert False``: assert statements
    # are stripped under ``python -O``, which previously let execution fall
    # through to an unbound local. The exception type is kept for callers.
    raise AssertionError('Unknown embedder {}'.format(name))
def _load_embedder(config, vocab, bert_max_length):
    """Build a text-field embedder from every entry of
    ``config.embedder.models``.

    Each entry contributes one token embedder, stored under the entry's
    ``name``:
        * ``'elmo'`` - ELMo loaded from ``<pretrained_models_dir>/elmo``,
          with dropout disabled and switched to eval mode.
        * any name ending in ``'bert'`` - a mismatched transformer embedder
          loaded from ``<pretrained_models_dir>/<name>``.
        * ``'char_bilstm'`` - character embeddings encoded by a
          bidirectional LSTM.

    Args:
        config: Object exposing ``embedder.models`` (each with ``name`` and
            a ``params`` mapping) and ``data.pretrained_models_dir``.
        vocab: Vocabulary; its ``'token_characters'`` namespace sizes the
            character embedding.
        bert_max_length: Forwarded as ``max_length`` to the transformer
            embedder.

    Returns:
        A ``BasicTextFieldEmbedder`` over all configured embedders.

    Raises:
        AssertionError: If an entry's name is not recognised.
    """
    embedders = {}
    for embedder_config in config.embedder.models:
        name = embedder_config.name

        if name == 'elmo':
            elmo = ElmoTokenEmbedder(
                options_file=os.path.join(config.data.pretrained_models_dir,
                                          'elmo/options.json'),
                weight_file=os.path.join(config.data.pretrained_models_dir,
                                         'elmo/model.hdf5'),
                requires_grad=embedder_config.params['requires_grad'],
                dropout=0.)
            elmo.eval()
            embedders[name] = elmo
        elif name.endswith('bert'):
            embedders[name] = PretrainedTransformerMismatchedEmbedder(
                model_name=os.path.join(config.data.pretrained_models_dir,
                                        name),
                max_length=bert_max_length,
                requires_grad=embedder_config.params['requires_grad'])
        elif name == 'char_bilstm':
            params = embedder_config.params
            # Embedding is built before the LSTM, as in the original
            # nested call, so weight initialisation order is unchanged.
            char_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size('token_characters'),
                embedding_dim=params['char_embedding_dim'])
            char_lstm = torch.nn.LSTM(
                params['char_embedding_dim'],
                params['lstm_dim'],
                num_layers=params['lstm_num_layers'],
                dropout=params['lstm_dropout'],
                bidirectional=True,
                batch_first=True)
            embedders[name] = TokenCharactersEncoder(
                embedding=char_embedding,
                encoder=PytorchSeq2VecWrapper(char_lstm),
                dropout=params['dropout'])
        else:
            # Raised explicitly rather than via ``assert False``: under
            # ``python -O`` the assert vanished and unknown embedders were
            # silently skipped. The exception type is kept for callers.
            raise AssertionError('Unknown embedder {}'.format(name))

    return BasicTextFieldEmbedder(embedders)
def __init__(self,
             hidden_sizes,
             output_size,
             vocab_size=None,
             embedding_size=None,
             pretrained_vecs=None,
             elmo_config=None):
    """Set up the embedding layer and the feed-forward stack.

    Args:
        hidden_sizes: Widths of the hidden layers. The first is the output
            width of the input layer, the last is the input width of the
            output layer.
        output_size: Width of the output layer.
        vocab_size: Number of rows of the lookup embedding (unused when
            ``elmo_config`` is given).
        embedding_size: Width of the token embeddings. With ELMo it may be
            omitted, in which case the embedder's ``get_output_dim()`` is
            used.
        pretrained_vecs: Optional tensor copied into the lookup embedding's
            weights (unused when ``elmo_config`` is given).
        elmo_config: Optional positional arguments for
            ``ElmoTokenEmbedder``; when truthy, ELMo replaces the lookup
            embedding.
    """
    super(DAN, self).__init__()

    if elmo_config:
        # Imported lazily so allennlp is only needed when ELMo is used.
        from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder
        self.emb_layer = ElmoTokenEmbedder(*elmo_config)
        if embedding_size is None:
            # Previously the default ``None`` reached nn.Linear and failed.
            embedding_size = self.emb_layer.get_output_dim()
    else:
        self.emb_layer = nn.Embedding(vocab_size, embedding_size)
        # ``pretrained_vecs`` defaults to None; only copy when supplied so
        # the default no longer crashes in ``copy_``.
        if pretrained_vecs is not None:
            self.emb_layer.weight.data.copy_(pretrained_vecs)

    self.inp_layer = nn.Linear(embedding_size, hidden_sizes[0])
    self.out_layer = nn.Linear(hidden_sizes[-1], output_size)

    # One linear layer per consecutive pair of hidden widths.
    self.hidden = nn.ModuleList()
    for fan_in, fan_out in zip(hidden_sizes, hidden_sizes[1:]):
        self.hidden.append(nn.Linear(fan_in, fan_out))
False })) elif EMBEDDING_TYPE == "_elmo": # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" options_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json") weights_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5") # NOTE: using Small size as medium size gave CUDA out of memory error # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json") # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained_2": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5")
def run(args):
    """Rebuild the trained multi-task tagger/classifier stored in
    ``args.model_dir`` and prepare it for inference.

    The vocabulary is read from ``<model_dir>/vocabulary`` and the weights
    from ``<model_dir>/best.th``. The model is moved to the GPU only when
    ``args.cuda`` is truthy.

    Returns:
        A tuple ``(tasks, vocab, model, readers, test_iterator)``.
    """
    # Every option is read up front and in the original order, although
    # several of them are not needed after this point.
    all_dataset_paths = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    task_names = args.task
    projection_dim = args.proj_dim
    hidden_dim = args.hidden_dim
    dropout = args.dropout
    learning_rate = args.lr
    weight_decay = args.weight_decay
    batch_size = args.batch_size
    num_epochs = args.epochs
    patience = args.patience
    model_dir = args.model_dir
    clean_model_dir = args.clean_model_dir
    device_id = cuda_device(args.cuda)
    test_mode = args.test_mode

    tasks = [TASK_CONFIGS[name] for name in task_names]
    dataset_paths = {name: all_dataset_paths[name] for name in task_names}

    # Maps a tag namespace to its position in TASK_CONFIGS (None if absent).
    namespace_index = {}
    for position, namespace in enumerate(TASK_CONFIGS.keys()):
        namespace_index[namespace] = position
    tag_namespace_hashing_fn = namespace_index.get

    # A single indexer dict is shared by all readers.
    token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    readers = {}
    for task in tasks:
        readers[task.tag_namespace] = JSONDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )

    # options_file / weight_file are not defined in this function; they are
    # resolved from the enclosing scope.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=dropout,
        projection_dim=projection_dim,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    elmo_dim = elmo_embedder.get_output_dim()

    # Task names end in "_<suffix>"; the distinct suffixes select encoders.
    task_suffixes = {name.rsplit("_", 1)[-1] for name in task_names}
    encoders = get_task_encoder_dict(args, task_suffixes, elmo_dim)

    vocab = Vocabulary.from_files(os.path.join(model_dir, "vocabulary"))
    model = MultiTaskCRFTaggerAndClassifier(word_embeddings, encoders, vocab,
                                            tasks)

    if args.cuda:
        map_location = None
    else:
        map_location = "cpu"
    weights_path = os.path.join(model_dir, "best.th")
    model.load_state_dict(torch.load(weights_path, map_location=map_location))
    if args.cuda:
        model = model.cuda(device=device_id)
    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    logger.info("Evaluating on test data")
    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=batch_size * 2)
    test_iterator.index_with(vocab)

    model = model.eval()
    model.set_inference_mode(True)
    return tasks, vocab, model, readers, test_iterator
def load_decomposable_attention_elmo_softmax_model():
    """Rebuild the saved decomposable-attention softmax model and wrap it
    in a predictor.

    The configuration is fixed inside the function (MSE loss, ELMo
    embeddings, negative percentage 100). The vocabulary and checkpoint are
    read from ``saved_softmax_models``. The checkpoint is loaded onto the
    CPU and the model is left there.

    Returns:
        A tuple ``(model, predictor)``.
    """
    negative_percentage = 100
    loss_type = "_mse"  # alternatives used elsewhere: "" / "_nll"
    # One of: "", "_glove", "_bert", "_elmo", "_elmo_retrained",
    # "_elmo_retrained_2".
    embedding_type = "_elmo"

    # (options, weights) files for each ELMo flavour. The active "_elmo"
    # entry points at the 2x2048_256 files.
    retrained_dir = os.path.join("data", "bilm-tf", "elmo_retrained")
    elmo_files = {
        "_elmo": (
            os.path.join(
                "data", "elmo",
                "elmo_2x2048_256_2048cnn_1xhighway_options.json"),
            os.path.join(
                "data", "elmo",
                "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"),
        ),
        "_elmo_retrained": (
            os.path.join(retrained_dir, "options.json"),
            os.path.join(retrained_dir, "weights.hdf5"),
        ),
        "_elmo_retrained_2": (
            os.path.join(retrained_dir, "options_2.json"),
            os.path.join(retrained_dir, "weights_2.hdf5"),
        ),
    }

    # ELMo variants need the character indexer; everything else uses the
    # reader's default (None).
    token_indexers = None
    if embedding_type in elmo_files:
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

    max_batch_size = 0
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=max_batch_size)

    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            loss_type, embedding_type, negative_percentage))
    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(loss_type, embedding_type,
                                   negative_percentage))

    print("LOADING VOCABULARY")
    vocab = Vocabulary.from_files(vocabulary_filepath)

    embedding_dim = 300
    project_dim = 200
    dropout = 0.2
    num_layers = 2

    if embedding_type == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=embedding_dim,
            projection_dim=project_dim)
    elif embedding_type == "_glove":
        # NOTE(review): glove_embeddings_file is not defined in this
        # function; it must come from the enclosing module - confirm.
        token_embedding = Embedding.from_params(
            vocab=vocab,
            params=Params({
                'pretrained_file': glove_embeddings_file,
                'embedding_dim': embedding_dim,
                'projection_dim': project_dim,
                'trainable': False
            }))
    elif embedding_type in elmo_files:
        options_file, weights_file = elmo_files[embedding_type]
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=dropout,
                                            projection_dim=project_dim)
    elif embedding_type == "_bert":
        print("Loading bert model")
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(bert_model)
        # BERT output is not projected, so downstream layers see 768 dims.
        project_dim = 768
    else:
        print("Error: Some weird Embedding type", embedding_type)
        exit()

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    hidden_dim = 200
    attend_feedforward = FeedForward.from_params(
        Params({
            'input_dim': project_dim,
            'hidden_dims': hidden_dim,
            'activations': 'relu',
            'num_layers': num_layers,
            'dropout': dropout
        }))
    similarity_function = DotProductSimilarity()
    compare_feedforward = FeedForward.from_params(
        Params({
            'input_dim': 2 * project_dim,
            'hidden_dims': hidden_dim,
            'activations': 'relu',
            'num_layers': num_layers,
            'dropout': dropout
        }))
    aggregate_feedforward = FeedForward.from_params(
        Params({
            'input_dim': 2 * hidden_dim,
            'hidden_dims': 1,
            'activations': 'linear',
            'num_layers': 1
        }))

    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")

    # The model is never moved to a GPU here, so the checkpoint is mapped
    # to the CPU. Mapping it to 'cuda:0' (as before) only made loading fail
    # on hosts without a GPU; load_state_dict copies the values into the
    # model's own parameters either way.
    with open(model_file, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cpu'))
    print("MODEL LOADED!")

    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
def run(args):
    """Train a multi-task CRF tagger (or load a trained one in test mode)
    and evaluate it on the test sets.

    In training mode the vocabulary is built from the train/dev data, the
    run arguments and vocabulary are written to ``args.model_dir``, the
    model is trained, and ``training_stats.json`` is saved. In test mode
    the vocabulary and ``best.th`` weights are read from
    ``args.model_dir``. In both modes ``test_stats.json`` is written at the
    end.

    Args:
        args: Parsed command-line namespace. Reads ``dataset_paths_file``,
            ``dataset_path_prefix``, ``task``, ``proj_dim``, ``dropout``,
            ``lr``, ``weight_decay``, ``batch_size``, ``epochs``,
            ``patience``, ``model_dir``, ``clean_model_dir``, ``cuda`` and
            ``test_mode``.
    """
    all_dataset_paths = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    task_names = args.task
    projection_dim = args.proj_dim
    dropout = args.dropout
    learning_rate = args.lr
    weight_decay = args.weight_decay
    batch_size = args.batch_size
    num_epochs = args.epochs
    patience = args.patience
    serialization_dir = args.model_dir
    clean_model_dir = args.clean_model_dir
    device_id = cuda_device(args.cuda)
    test_mode = args.test_mode

    tasks = [TASK_CONFIGS[name] for name in task_names]
    dataset_paths = {name: all_dataset_paths[name] for name in task_names}
    # Maps a tag namespace to its position in TASK_CONFIGS (None if absent).
    tag_namespace_hashing_fn = {
        namespace: i
        for i, namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in tasks
    }

    # options_file / weight_file are not defined in this function; they are
    # resolved from the enclosing scope.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=dropout,
        projection_dim=projection_dim,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    elmo_embedding_dim = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = {name.rsplit("_", 1)[-1] for name in task_names}
    encoders = get_task_encoder_dict(args, task_suffixes, elmo_embedding_dim)

    vocabulary_dir = os.path.join(serialization_dir, "vocabulary")
    if not test_mode:
        train_dataset = read_datasets(dataset_paths,
                                      readers,
                                      data_split="train")
        validation_dataset = read_datasets(dataset_paths,
                                           readers,
                                           data_split="dev")
        vocab = create_vocab([train_dataset, validation_dataset])
        # Special case for CCG: register a few extra tags explicitly.
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in tasks:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
    else:
        vocab = Vocabulary.from_files(vocabulary_dir)

    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, tasks)
    # Only move to the GPU when requested, mirroring the inference loader;
    # the unconditional .cuda() failed on CPU-only hosts.
    # NOTE(review): assumes args.cuda is a boolean-like flag - confirm.
    if args.cuda:
        model = model.cuda(device=device_id)

    if not test_mode:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=batch_size,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if clean_model_dir and os.path.exists(serialization_dir):
            logger.info(f"Deleting {serialization_dir}")
            shutil.rmtree(serialization_dir)
        # Always make sure the output directory exists before writing to
        # it, whether or not a clean-up was requested; exist_ok keeps this
        # safe when the directory is already there.
        if not os.path.isdir(serialization_dir):
            logger.info(f"Creating {serialization_dir}")
        os.makedirs(serialization_dir, exist_ok=True)

        logger.info(
            f"Writing arguments to arguments.json in {serialization_dir}")
        with open(os.path.join(serialization_dir, "arguments.json"),
                  "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {serialization_dir}")
        vocab.save_to_files(vocabulary_dir)

        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        optimizer = optim.Adam(model.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=patience,
            num_epochs=num_epochs,
            cuda_device=device_id,
            serialization_dir=serialization_dir,
        )
        # Stats are stored as a one-element list, as before.
        training_stats = [trainer.train()]
        with open(os.path.join(serialization_dir, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        # Map GPU-saved weights to the CPU when CUDA was not requested.
        map_location = None if args.cuda else "cpu"
        model.load_state_dict(
            torch.load(os.path.join(serialization_dir, "best.th"),
                       map_location=map_location))

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in tasks
    }
    logger.info("Evaluating on test data")
    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=batch_size * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=device_id)
    with open(os.path.join(serialization_dir, "test_stats.json"),
              "w+") as fp:
        json.dump(test_stats, fp, indent=2)
def save_top_results(process_no, start_index, end_index):
    """Score rule-generated responses for a window of questions and save
    the three best responses per question.

    Questions, candidate responses and the rules that produced them are
    read from three parallel files. In the response/rule files, the
    candidates for one question form a run of non-empty lines and a line
    that is empty in both files closes the run. Examples whose running
    index lies in ``[start_index, end_index]`` are scored with the saved
    decomposable-attention softmax model. For each of them the question,
    the answer phrase and the top three responses (by ``label_probs``) are
    written to a results file in ``squad_seq2seq_dev_moses_tokenized``.

    Args:
        process_no: Identifier used only in progress messages.
        start_index: Index of the first example to process.
        end_index: Index of the last example to process (inclusive).
    """
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    # ELMo variants need the character indexer; otherwise the reader's
    # default (None) is used.
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")
    #NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove",
                                         "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    #NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))
    #NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))
    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))
            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)
            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
            # Build the token embedder matching EMBEDDING_TYPE.
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using Small size as medium size gave CUDA out of memory error
                # NOTE(review): the active paths above are the 2x2048_256
                # files; the 2x1024_128 alternatives below are commented
                # out - confirm which size is intended.
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained",
                                            "options_2.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained",
                                            "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                # The feed-forward input sizes below follow this value.
                PROJECT_DIM = 768
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state
            with open(model_file, 'rb') as f:
                # Checkpoint tensors are mapped to the CPU.
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            # Both branches select -1 and the value is not used below.
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1
            else:
                cuda_device = -1
            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # before working on anything just save all the questions and responses in a list
            all_data = list()
            examples_count = processed_examples_count = 0
            with open(q_file, 'r') as q_reader, open(r_file,
                                                     "r") as r_reader, open(
                                                         rules_file,
                                                         "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                # current_qa is (question, answer phrase); the answer is
                # filled in from the first response of the group.
                current_qa = (q, "")
                current_rules_and_responses = list()
                for i, (response,
                        rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify if the a is same as the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                # On a mismatch the newest answer phrase wins.
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Create a instance
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        # Exactly one of the two lines being empty means
                        # the files are out of step: abort.
                        if rule or response:
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()
                        if examples_count < start_index:
                            # Before the requested window: drop this group
                            # and move on to the next question.
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            break
                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if(examples_count%100 == 0):
                        # print(examples_count)
                    else:
                        # Serious Bug
                        # Reached when a line is not a complete
                        # response/rule pair and no responses have been
                        # collected for the current question.
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            # NOTE(review): a final group that is not followed by an empty
            # line is never appended to all_data - confirm the input files
            # always end with a separator.
            print("{}:\tFINISHED IO".format(process_no))
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
                # Pair each (response, rule) with its score and rank by
                # score, highest first.
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    # Only the three best-scoring responses are written.
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))