Code Example #1
def get_embedder(type_, vocab, e_dim, rq_grad=False):
    if type_ == 'elmo':
        opt_file = "data/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        wt_file = "data/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        elmo_embedder = ElmoTokenEmbedder(opt_file,
                                          wt_file,
                                          requires_grad=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
        return word_embeddings
    elif type_ == 'glove':
        wt_file = "data/glove.6B.300d.txt"
        glove_embedder = Embedding(400000,
                                   300,
                                   pretrained_file=wt_file,
                                   trainable=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": glove_embedder})
        return word_embeddings
    elif type_ == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,
            requires_grad=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                 allow_unmatched_keys=True)
        return word_embeddings
    else:
        token_embeddings = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=e_dim)
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embeddings})
        return word_embeddings
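
As a quick check, the factory above can be exercised directly; a minimal usage sketch, assuming the AllenNLP 0.8/0.9-era classes imported elsewhere in this file:

# Hypothetical usage of get_embedder; the vocabulary can stay empty for the
# character-based ELMo path.
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
embedder = get_embedder('elmo', vocab, e_dim=None, rq_grad=False)
print(embedder.get_output_dim())  # 256 for this small ELMo model
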
def main():
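    # NOTE: SEED, OPTION_FILE, WEIGHT_FILE, HIDDEN_DIM, LEARNING_RATE,
    # WEIGHT_DECAY, EPOCHS and PRETRAINED_ELMO, as well as the BaseModel,
    # dataset_reader and data_iterator helpers, are presumably defined at
    # module level and are not shown in this excerpt.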
    cuda_device = -1

    torch.manual_seed(SEED)

    elmo_embedder = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      HIDDEN_DIM,
                      bidirectional=True,
                      batch_first=True))

    train_dataset, dev_dataset = dataset_reader(train=True, elmo=True)
    vocab = Vocabulary()

    model = BaseModel(word_embeddings=word_embeddings,
                      encoder=lstm,
                      vocabulary=vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)

    iterator = data_iterator(vocab)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      cuda_device=cuda_device,
                      num_epochs=EPOCHS,
                      patience=5)

    trainer.train()

    print("*******Save Model*******\n")

    output_elmo_model_file = os.path.join(PRETRAINED_ELMO,
                                          "lstm_elmo_model.bin")
    torch.save(model.state_dict(), output_elmo_model_file)
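
To reuse the checkpoint written above, the state dict can be loaded back into an identically constructed model; a minimal sketch reusing the names from main():

# Sketch: restore the checkpoint saved by main() above.
model = BaseModel(word_embeddings=word_embeddings,
                  encoder=lstm,
                  vocabulary=vocab)
model.load_state_dict(
    torch.load(os.path.join(PRETRAINED_ELMO, "lstm_elmo_model.bin"),
               map_location='cpu'))
model.eval()  # disable dropout for inference
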
Code Example #3
File: main.py Project: rnekrasov-msk/GramEval2020
def _load_embedder(config, bert_max_length):
    if config.embedder.name == 'elmo':
        embedder = ElmoTokenEmbedder(
            options_file=os.path.join(config.data.pretrained_models_dir,
                                      'elmo/options.json'),
            weight_file=os.path.join(config.data.pretrained_models_dir,
                                     'elmo/model.hdf5'),
            dropout=0.)
        embedder.eval()
    elif config.embedder.name.endswith('bert'):
        embedder = PretrainedTransformerMismatchedEmbedder(
            model_name=os.path.join(config.data.pretrained_models_dir,
                                    config.embedder.name),
            max_length=bert_max_length)
    elif config.embedder.name == 'both':
        elmo_embedder = ElmoTokenEmbedder(
            options_file=os.path.join(config.data.pretrained_models_dir,
                                      'elmo/options.json'),
            weight_file=os.path.join(config.data.pretrained_models_dir,
                                     'elmo/model.hdf5'),
            dropout=0.)
        elmo_embedder.eval()

        bert_embedder = PretrainedTransformerMismatchedEmbedder(
            model_name=os.path.join(config.data.pretrained_models_dir,
                                    'ru_bert'),
            max_length=bert_max_length)

        return BasicTextFieldEmbedder({
            'elmo': elmo_embedder,
            'ru_bert': bert_embedder
        })
    else:
        raise ValueError('Unknown embedder {}'.format(config.embedder.name))

    return BasicTextFieldEmbedder({config.embedder.name: embedder})
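
Note that the keys of the returned BasicTextFieldEmbedder must match the token-indexer keys used when the data was read. A hypothetical indexer setup for the 'both' case; the 'elmo'/'ru_bert' names mirror _load_embedder above, while the wiring itself is an assumption, not shown in the project file:

from allennlp.data.token_indexers import (ELMoTokenCharactersIndexer,
                                          PretrainedTransformerMismatchedIndexer)

token_indexers = {
    'elmo': ELMoTokenCharactersIndexer(),
    'ru_bert': PretrainedTransformerMismatchedIndexer(
        model_name=os.path.join(config.data.pretrained_models_dir, 'ru_bert'),
        max_length=bert_max_length),
}
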
Code Example #4
File: main.py Project: smoke-b/GramEval2020
def _load_embedder(config, vocab, bert_max_length):
    embedders = {}
    for embedder_config in config.embedder.models:
        if embedder_config.name == 'elmo':
            embedders[embedder_config.name] = ElmoTokenEmbedder(
                options_file=os.path.join(config.data.pretrained_models_dir,
                                          'elmo/options.json'),
                weight_file=os.path.join(config.data.pretrained_models_dir,
                                         'elmo/model.hdf5'),
                requires_grad=embedder_config.params['requires_grad'],
                dropout=0.)
            embedders[embedder_config.name].eval()
        elif embedder_config.name.endswith('bert'):
            embedders[embedder_config.name] = \
                PretrainedTransformerMismatchedEmbedder(
                    model_name=os.path.join(config.data.pretrained_models_dir,
                                            embedder_config.name),
                    max_length=bert_max_length,
                    requires_grad=embedder_config.params['requires_grad'])
        elif embedder_config.name == 'char_bilstm':
            embedders[embedder_config.name] = TokenCharactersEncoder(
                embedding=Embedding(
                    num_embeddings=vocab.get_vocab_size('token_characters'),
                    embedding_dim=embedder_config.params['char_embedding_dim']
                ),
                encoder=PytorchSeq2VecWrapper(
                    torch.nn.LSTM(
                        embedder_config.params['char_embedding_dim'],
                        embedder_config.params['lstm_dim'],
                        num_layers=embedder_config.params['lstm_num_layers'],
                        dropout=embedder_config.params['lstm_dropout'],
                        bidirectional=True,
                        batch_first=True)),
                dropout=embedder_config.params['dropout'])
        else:
            raise ValueError('Unknown embedder {}'.format(embedder_config.name))

    return BasicTextFieldEmbedder(embedders)
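
For reference, a hypothetical config.embedder.models value that satisfies the attribute accesses above; the field names are inferred from the code, and the numbers are illustrative only:

from types import SimpleNamespace

embedder_models = [
    SimpleNamespace(name='elmo', params={'requires_grad': False}),
    SimpleNamespace(name='ru_bert', params={'requires_grad': True}),
    SimpleNamespace(name='char_bilstm',
                    params={'char_embedding_dim': 32,
                            'lstm_dim': 64,
                            'lstm_num_layers': 2,
                            'lstm_dropout': 0.1,
                            'dropout': 0.3}),
]
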
Code Example #5
    def __init__(self,
                 hidden_sizes,
                 output_size,
                 vocab_size=None,
                 embedding_size=None,
                 pretrained_vecs=None,
                 elmo_config=None):
        super(DAN, self).__init__()
        if elmo_config:
            from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder

            self.emb_layer = ElmoTokenEmbedder(*elmo_config)
            # ELMo determines its own output width; use it for the input layer.
            embedding_size = self.emb_layer.get_output_dim()
        else:
            self.emb_layer = nn.Embedding(vocab_size, embedding_size)
            if pretrained_vecs is not None:
                self.emb_layer.weight.data.copy_(pretrained_vecs)

        self.inp_layer = nn.Linear(embedding_size, hidden_sizes[0])
        self.out_layer = nn.Linear(hidden_sizes[-1], output_size)

        self.hidden = nn.ModuleList()
        for k in range(len(hidden_sizes) - 1):
            self.hidden.append(nn.Linear(hidden_sizes[k], hidden_sizes[k + 1]))
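
The excerpt ends at the constructor; for context, a deep averaging network's forward pass typically mean-pools the token embeddings before the MLP stack. A minimal sketch under that assumption, not the project's actual code:

    def forward(self, tokens, mask=None):
        # Sketch: embed, mask-average over the sequence, then apply the MLP
        # stack. Assumes `torch` is imported at module level alongside `nn`.
        emb = self.emb_layer(tokens)  # (batch, seq_len, emb_dim)
        if mask is not None:
            mask = mask.unsqueeze(-1).float()
            avg = (emb * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        else:
            avg = emb.mean(dim=1)
        h = torch.relu(self.inp_layer(avg))
        for layer in self.hidden:
            h = torch.relu(layer(h))
        return self.out_layer(h)
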
Code Example #6

    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file':
                                                    glove_embeddings_file,
                                                    'embedding_dim':
                                                    EMBEDDING_DIM,
                                                    'projection_dim':
                                                    PROJECT_DIM,
                                                    'trainable':
                                                    False
                                                }))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using Small size as medium size gave CUDA out of memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
Code Example #7
def run(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: JSONDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

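    # NOTE: options_file and weight_file are not defined in this snippet;
    # presumably module-level constants pointing at the ELMo options/weights.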
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES])
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR,
                                               "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTaggerAndClassifier(word_embeddings, encoders, vocab,
                                            TASKS)
    map_location = "cpu" if not args.cuda else None
    model.load_state_dict(
        torch.load(os.path.join(SERIALIZATION_DIR, "best.th"),
                   map_location=map_location))
    if args.cuda:
        model = model.cuda(device=CUDA_DEVICE)

        # Empty cache to ensure larger batch can be loaded for testing
        torch.cuda.empty_cache()
    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    model.set_inference_mode(True)
    return TASKS, vocab, model, readers, test_iterator
Code Example #8
def load_decomposable_attention_elmo_softmax_model():
    NEGATIVE_PERCENTAGE = 100
    # EMBEDDING_TYPE = ""
    # LOSS_TYPE = ""				# NLL
    # LOSS_TYPE = "_nll"				# NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE in ("_elmo", "_elmo_retrained", "_elmo_retrained_2"):
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                   NEGATIVE_PERCENTAGE))
    print("LOADING VOCABULARY")
    # Load vocabulary
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2
    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_DIM,
            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file':
                                                    glove_embeddings_file,
                                                    'embedding_dim':
                                                    EMBEDDING_DIM,
                                                    'projection_dim':
                                                    PROJECT_DIM,
                                                    'trainable':
                                                    False
                                                }))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using Small size as medium size gave CUDA out of memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 200
    params = Params({
        'input_dim': PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    attend_feedforward = FeedForward.from_params(params)
    similarity_function = DotProductSimilarity()
    params = Params({
        'input_dim': 2 * PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    compare_feedforward = FeedForward.from_params(params)
    params = Params({
        'input_dim': 2 * HIDDEN_DIM,
        'hidden_dims': 1,
        'activations': 'linear',
        'num_layers': 1
    })
    aggregate_feedforward = FeedForward.from_params(params)
    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")
    # Load model state (map to CPU when no GPU is available)
    with open(model_file, 'rb') as f:
        map_location = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        model.load_state_dict(torch.load(f, map_location=map_location))
    print("MODEL LOADED!")
    if torch.cuda.is_available():
        # cuda_device = 3
        # model = model.cuda(cuda_device)
        cuda_device = -1
    else:
        cuda_device = -1

    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
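
A hypothetical call pattern for the returned predictor; the predict signature is inferred from its use in Code Example #9:

model, predictor = load_decomposable_attention_elmo_softmax_model()
predictions = predictor.predict(
    "what is the capital of france ?",       # question
    ["paris is the capital of france ."])    # candidate responses
print(predictions["label_probs"])  # one probability per candidate response
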
Code Example #9
def run(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

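    # NOTE: options_file and weight_file are not defined in this snippet;
    # presumably module-level constants pointing at the ELMo options/weights.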
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES])
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths,
                                      readers,
                                      data_split="train")
        validation_dataset = read_datasets(dataset_paths,
                                           readers,
                                           data_split="dev")

        vocab = create_vocab([train_dataset, validation_dataset])

        # Special case for CCG
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)

    else:
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)

        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"),
                  "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1)
        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)

        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
            # model_save_interval=600
        )
        stats = trainer.train()
        training_stats.append(stats)

        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)

def save_top_results(process_no, start_index, end_index):
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE in ("_elmo", "_elmo_retrained", "_elmo_retrained_2"):
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")

    #NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove",
                                         "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    #NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))

    #NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))

    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)

            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using Small size as medium size gave CUDA out of memory error
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options_2.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                PROJECT_DIM = 768
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state
            with open(model_file, 'rb') as f:
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1
            else:
                cuda_device = -1

            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # before working on anything just save all the questions and responses in a list
            all_data = list()
            examples_count = processed_examples_count = 0
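            # NOTE: mt is not defined in this snippet; presumably a
            # module-level Moses tokenizer (e.g. sacremoses' MosesTokenizer,
            # whose tokenize() accepts return_str and escape keywords).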
            with open(q_file,
                      'r') as q_reader, open(r_file, "r") as r_reader, open(
                          rules_file, "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                current_qa = (q, "")
                current_rules_and_responses = list()
                for i, (response,
                        rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify if the a is same as the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Create a instance
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        if rule or response:
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()

                        if examples_count < start_index:
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            break

                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if(examples_count%100 == 0):
                        # 	print(examples_count)
                    else:
                        # Serious Bug
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            print("{}:\tFINISHED IO".format(process_no))
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))
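
The (process_no, start_index, end_index) signature suggests save_top_results is sharded across worker processes; a hypothetical launcher under that assumption (the example count of 10000 is illustrative only):

from multiprocessing import Process

# Hypothetical: shard the examples across 4 worker processes.
total_examples, workers = 10000, 4
chunk = total_examples // workers
procs = [Process(target=save_top_results,
                 args=(i, i * chunk, (i + 1) * chunk - 1))
         for i in range(workers)]
for p in procs:
    p.start()
for p in procs:
    p.join()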