Example #1
def get_baseline_model():
    # build the training and prediction models

    embedding_file, vocab_size = get_default_inputs_for_model()

    qa_model = QAModel()
    train_model, prediction_model = qa_model.get_lstm_cnn_model(
        embedding_file, vocab_size)
    logger.info('Model created: Baseline')
    logger.info('enc_timesteps = 30, dec_timesteps = 30, hidden_dim = 50, '
                'filters = 500, kernel_sizes = [2, 3, 5, 7]')
    return train_model, prediction_model
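A minimal usage sketch for the factory above (assuming get_default_inputs_for_model, QAModel, and logger are importable from the surrounding project; the sibling examples suggest the returned objects are Keras models):

train_model, prediction_model = get_baseline_model()
train_model.summary()  # Keras models expose summary(); hedged on the sibling examples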
Example #2
# Uses the standard-library re and datetime modules; QAModel comes from the
# surrounding project.
import re
from datetime import datetime as dt

class Bot:
    def __init__(self):
        self.qa_model = QAModel()

    def reply(self, context, log):
        assert isinstance(context, str)
        assert isinstance(log, list)
        if not log:
            return self.message(self.prolog())
        return self.message(self.conversate(log[-1], context))

    def prolog(self):
        return "Hello! I'm the buisness question answering bot. ask me any question in regard to the buisness."

    def ans(self, question, context):
        try:
            answer = self.qa_model.forward(question, context)
        except Exception:
            return "I'm struggling to find an answer."
        if answer in {"", " ", "[CLS]"}:
            return "I couldn't find an answer. Try asking differently."
        return self.normalize(answer)

    def conversate(self, question, context):
        if question['sender'] == 'bot':
            return "Ask me a question!"
        try:
            answer = self.qa_model.forward(question['text'], context)
        except Exception:
            return "I'm struggling to find an answer."
        if answer in {"", " ", "[CLS]"}:
            return "I couldn't find an answer. Try asking differently."
        return self.normalize(answer)

    def normalize(self, answer):
        answer = re.sub(' ,', ',', answer)
        answer = re.sub(' ’ ', '’', answer)
        answer = answer[0].upper() + answer[1:]
        return answer + '.'

    def message(self, text):
        return {"sender": "bot", "datetime": str(dt.now()), "text": text}
Example #3
def get_small_model():
    # small model
    embedding_file, vocab_size = get_default_inputs_for_model()
    enc_timesteps = 30
    dec_timesteps = 30
    hidden_dim = 10
    filters = 20
    qa_model = QAModel()
    small_train_model, small_prediction_model = qa_model.get_lstm_cnn_model(
        embedding_file,
        vocab_size,
        enc_timesteps=enc_timesteps,
        dec_timesteps=dec_timesteps,
        filters=filters,
        hidden_dim=hidden_dim)
    logger.info('Model created: Small')
    logger.info(f'enc_timesteps = {enc_timesteps}, dec_timesteps = {dec_timesteps}, '
                f'hidden_dim = {hidden_dim}, filters = {filters}, '
                f'kernel_sizes = [2, 3, 5, 7]')
    return small_train_model, small_prediction_model
Example #4
    def load_model(self, state_path):
        """
        Initialises the model and loads the saved state into it.

        Parameters
        ----------
        state_path (str) - path pointing to the saved state.

        Returns
        -------
        Model (torch.nn.Module)
        """

        logging.info(f"Loading trained state from {state_path}")
        dbm = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                              return_dict=True)
        device = torch.device(self.device)
        dbm.to(device)
        model = QAModel(transformer_model=dbm, device=device)

        model.load_state_dict(torch.load(state_path, map_location=device))
        model.eval()  # Switch to evaluation mode

        return model
Example #5
def get_larger_model():
    enc_timesteps = 30
    dec_timesteps = 30
    hidden_dim = 200
    filters = 500

    embedding_file, vocab_size = get_default_inputs_for_model()

    qa_model = QAModel()
    larger_train_model, larger_prediction_model = qa_model.get_lstm_cnn_model(
        embedding_file,
        vocab_size,
        enc_timesteps=enc_timesteps,
        dec_timesteps=dec_timesteps,
        filters=filters,
        hidden_dim=hidden_dim)
    logger.info('Model created: Larger')
    logger.info(f'enc_timesteps = {enc_timesteps}, dec_timesteps = {dec_timesteps}, '
                f'hidden_dim = {hidden_dim}, filters = {filters}, '
                f'kernel_sizes = [2, 3, 5, 7]')
    return larger_train_model, larger_prediction_model
Example #6
def load_model(state_path, device="cpu"):
    logging.info(f"Loading trained state from {state_path}")
    dbm = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                          return_dict=True)
    device = torch.device(device)
    dbm.to(device)
    model = QAModel(transformer_model=dbm, device=device)

    checkpoint = torch.load(state_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Switch to evaluation mode

    return model
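A minimal usage sketch for the loader above (hypothetical checkpoint path; the checkpoint must be a dict with a 'model_state_dict' key, which matches the format torch.save writes in Example #14):

# Hypothetical path; assumes torch and the QAModel definition are in scope.
model = load_model('model/6.pt', device='cuda' if torch.cuda.is_available() else 'cpu')
# model is on the requested device and already in eval mode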
Example #7
    def __init__(self, model_file, word_embeddings_cache_file, stopwords_file,
                 word2dfs_file):
        # init torch random seeds
        torch.manual_seed(1234)
        np.random.seed(1234)

        # load model
        self.model = QAModel.load('', model_file)
        # load vectors
        self.vec_dim = self._preload_cached_embeddings(
            word_embeddings_cache_file)
        self.unk_term_vec = np.random.uniform(-0.25, 0.25, self.vec_dim)

        # stopwords
        with open(stopwords_file) as f:
            self.stoplist = {line.strip() for line in f}

        # word dfs
        if os.path.isfile(word2dfs_file):
            with open(word2dfs_file, "rb") as w2dfin:
                self.word2dfs = pickle.load(w2dfin)
Example #8
			    help='Indexed test JSON file')
	parser.add_argument('-id2c', '--id2char', type=str, default=None,
			    help='id2char JSON file')

	args = parser.parse_args()

	if args.id2char is None:
		char_vocab_size = 44
	else:
		char_vocab_size = len(load_data(args.id2char)) + 2
	learning_rate = 0.001

	args.char_vocab_size = char_vocab_size
	args.embed_mat = numpy.load(args.embed_mat_path)

	G = QAModel(args)
	model = G.create_model_graph()
	print "Compiling model.."
	# print "Learning rate:", learning_rate
	opt = Adam(lr=learning_rate, clipnorm=5.0)
	model.compile(optimizer=opt,
		      loss='categorical_crossentropy', metrics=['accuracy'])

	model.load_weights(args.weightpath)
	print "Model loaded..."


	################ Evaluating on Validation data ################
	print "Validation data loading"
	textdatapath = args.dev_json
	processed_data = args.tok_dev_json
Example #9
def main(mode='test', question=None, answers=None):
    """
    This function is used to train, predict or test

    Args:
        mode (str): train/predict/test
        question (str): this contains the question
        answers (list): this contains list of answers in string format

    Returns:
        index (integer): index of the most likely answer
    """

    # build the training and prediction models
    vocabulary = Vocabulary("./data/vocab_all.txt")
    embedding_file = "./data/word2vec_100_dim.embeddings"
    qa_model = QAModel()
    train_model, predict_model = qa_model.get_bilstm_model(
        embedding_file, len(vocabulary))

    epoch = 1
    if mode == 'train':
        for i in range(epoch):
            print('Training epoch', i)

            # load training data
            qa_data = QAData()
            questions, good_answers, bad_answers = qa_data.get_training_data()

            # train the model
            Y = np.zeros(shape=(questions.shape[0], ))
            train_model.fit([questions, good_answers, bad_answers],
                            Y,
                            epochs=1,
                            batch_size=64,
                            validation_split=0.1,
                            verbose=1)

            # save the trained model
            train_model.save_weights('model/train_weights_epoch_' +
                                     str(epoch) + '.h5',
                                     overwrite=True)
            predict_model.save_weights('model/predict_weights_epoch_' +
                                       str(epoch) + '.h5',
                                       overwrite=True)
    elif mode == 'predict':
        # load the evaluation data
        data = pickle.load(open("./data/dev.pkl", 'rb'))
        random.shuffle(data)

        # load weights from trained model
        qa_data = QAData()
        predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5')

        c = 0
        c1 = 0
        for i, d in enumerate(data):
            print(i, len(data))

            # pad the data and get it in desired format
            indices, answers, question = qa_data.process_data(d)

            # get the similarity score
            sims = predict_model.predict([question, answers])

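            # The first n_good entries of `answers` are the ground-truth
            # answers: count a hit when the overall argmax falls among them,
            # and accumulate a reciprocal-rank term for the MRR.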
            n_good = len(d['good'])
            max_r = np.argmax(sims)
            max_n = np.argmax(sims[:n_good])
            r = rankdata(sims, method='max')
            c += 1 if max_r == max_n else 0
            c1 += 1 / float(r[max_r] - r[max_n] + 1)

        precision = c / float(len(data))
        mrr = c1 / float(len(data))
        print("Precision", precision)
        print("MRR", mrr)
    elif mode == 'test':
        # question and answers come from params
        qa_data = QAData()
        answers, question = qa_data.process_test_data(question, answers)

        # load weights from the trained model
        predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5')

        # get similarity score
        sims = predict_model.predict([question, answers])
        max_r = np.argmax(sims)
        return max_r
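A minimal usage sketch for main() in test mode (hypothetical question and answer strings; the vocabulary, embedding, and weight files referenced above must exist):

best = main(mode='test',
            question='What is the refund policy?',
            answers=['Refunds are issued within 30 days.',
                     'Our office opens at 9am.'])
print('Most likely answer index:', best)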
Example #10
	if os.path.isfile(baseexp + '/results.txt'):
		mode = 'a'
	else:
		mode = 'w'
	with open(baseexp + '/results.txt', mode) as fp:
		fp.write("######RESULTS######\n")

	# Initializations for tracking the best model
	prev_best_em = 0.0
	prev_best_f1 = 0.0
	prev_best_epoch = args.initial_epoch - 1

	logging.info('=' * 100)
	for epoch in range(args.initial_epoch, args.num_epoch):
		logging.info("Epoch: %s", str(epoch))
		G = QAModel(args)
		model = G.create_model_graph()

		logging.info("Compiling model..")
		# print "Learning rate:", args.learning_rate
		model = G.compile_model(model)
		logging.info("Model compiled..")

		exp = baseexp + "/epoch" + str(epoch)
		if not os.path.isdir(exp):
			os.makedirs(exp)

		if args.pretrained_weightpath is not None and epoch == args.initial_epoch:
			logging.info("Loading a pretrained weight")
			model.load_weights(args.pretrained_weightpath)
			logging.info("Evaluating the pretrained model")
Example #11
"""
@author: tarun
"""


from data import QAData, Vocabulary
from model import QAModel
import pickle
import numpy as np
import random
from keras.models import Model
import matplotlib.pyplot as plt

vocabulary = Vocabulary("./data/vocab_all.txt")
embedding_file = "./data/word2vec_100_dim.embeddings"
qa_model = QAModel()
train_model, predict_model = qa_model.get_lstm_cnn_model(embedding_file, len(vocabulary))

# Optionally, intermediate layer outputs can be collected for inspection, e.g.:
# layer_outputs = [predict_model.layers[0].output, predict_model.layers[1].output,
#                  predict_model.layers[2].layers[0].output, ...,
#                  predict_model.layers[2].layers[14].output]
Example #12
def main(mode='test'):
    # build the training and prediction models
    vocabulary = Vocabulary("./data/vocab_all.txt")
    embedding_file = "./data/word2vec_100_dim.embeddings"
    qa_model = QAModel()
    train_model, predict_model = qa_model.get_lstm_cnn_model(embedding_file, len(vocabulary))
    epo = 100
    if mode == 'train':
        # load training data
        qa_data = QAData()
        questions, good_answers, bad_answers = qa_data.get_training_data()

        callbacks = [EarlyStopping(monitor='val_loss', patience=20),
                     ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]
        # train the model
        Y = np.zeros(shape=(questions.shape[0],))
        train_model.fit([questions, good_answers, bad_answers], Y, epochs=epo, batch_size=64, validation_split=0.1,
                        verbose=1, callbacks=callbacks)

        # save the trained model
        # train_model.save_weights('model/train_weights_epoch_' + str(epo) + '.h5', overwrite=True)
        model = keras.models.load_model('best_model.h5')
        model.save_weights('model/best_weights_epoch_' + str(epo) + '.h5', overwrite=True)
        predict_model.save_weights('model/predict_weights_epoch_' + str(epo) + '.h5', overwrite=True)

    elif mode == 'predict':
        # load the evaluation data
        data = pickle.load(open("./data/dev.pkl",'rb'))
        random.shuffle(data)

        # load weights from trained model
        qa_data = QAData()
        model_filenames = ['model/best_model.h5', 'model/predict_weights_epoch_' + str(epo) + '.h5']

        for model_name in model_filenames:
            predict_model.load_weights(model_name)

            c = 0
            c1 = 0
            for i, d in enumerate(data):
                if i % 100 == 0:
                    print(i, len(data))

                # pad the data and get it in desired format
                indices, answers, question = qa_data.process_data(d)

                # get the similarity score
                sims = predict_model.predict([question, answers])

                n_good = len(d['good'])
                max_r = np.argmax(sims)
                max_n = np.argmax(sims[:n_good])
                r = rankdata(sims, method='max')
                c += 1 if max_r == max_n else 0
                c1 += 1 / float(r[max_r] - r[max_n] + 1)

            precision = c / float(len(data))
            mrr = c1 / float(len(data))
            print(f'Results for: model: {model_name}')
            print("Precision", precision)
            print("MRR", mrr)
Example #13
    def __init__(self):
        self.qa_model = QAModel()
Example #14
def main():
    torch.manual_seed(94)
    batch_size = 4
    train_data_list = load_data_list('./train_data_list')
    train_set = QADataset(train_data_list)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    dev_data_list = load_data_list('./dev_data_list')
    dev_set = QADataset(dev_data_list)
    dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=True)

    model = QAModel()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    EPOCHS = 6
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=5e-5)
    loss = 0
    for epoch in range(EPOCHS):
        cum_loss = 0
        print(f'epoch: {epoch}')
        ite = 0
        model.train()
        for i, (question_id, context, question, answerable, ans_start,
                ans_end) in enumerate(tqdm(train_loader)):
            ite = i + 1
            pt = tokenizer(context, question, return_tensors='pt')

            bs = len(context)
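            # Mask out positions 466+ of the 512-token window so they can
            # never be predicted as the answer start/end (presumably the
            # region holding the question and padding).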
            mask = torch.zeros(bs, 512).bool()
            mask[:, 466:] = True

            if torch.cuda.is_available():
                answerable = answerable.cuda()
                ans_start = ans_start.cuda()
                ans_end = ans_end.cuda()
                pt['input_ids'] = pt['input_ids'].cuda()
                pt['token_type_ids'] = pt['token_type_ids'].cuda()
                pt['attention_mask'] = pt['attention_mask'].cuda()
                mask = mask.cuda()
            target = torch.cat((ans_start.unsqueeze(1), ans_end.unsqueeze(1)),
                               dim=1).to(device)
            optimizer.zero_grad()
            output = model(pt)
            output[:, :, 0].masked_fill_(mask, float('-inf'))
            output[:, :, 1].masked_fill_(mask, float('-inf'))
            loss = loss_fn(output, target)
            cum_loss += float(loss)
            loss.backward()
            optimizer.step()
        print(cum_loss)

        model.eval()
        dev_loss = 0
        dev_ite = 0
        for i, (question_id, context, question, answerable, ans_start,
                ans_end) in enumerate(tqdm(dev_loader)):
            bs = len(context)
            mask = torch.zeros(bs, 512).bool()
            mask[:, 466:] = True

            with torch.no_grad():
                dev_ite = i + 1
                pt = tokenizer(context, question, return_tensors='pt')
                if torch.cuda.is_available():
                    answerable = answerable.cuda()
                    ans_start = ans_start.cuda()
                    ans_end = ans_end.cuda()
                    pt['input_ids'] = pt['input_ids'].cuda()
                    pt['token_type_ids'] = pt['token_type_ids'].cuda()
                    pt['attention_mask'] = pt['attention_mask'].cuda()
                    mask = mask.cuda()
                target = torch.cat(
                    (ans_start.unsqueeze(1), ans_end.unsqueeze(1)),
                    dim=1).to(device)
                output = model(pt)
                output[:, :, 0].masked_fill_(mask, float('-inf'))
                output[:, :, 1].masked_fill_(mask, float('-inf'))
                loss = loss_fn(output, target)
                dev_loss += float(loss)
        print('avg_train_loss: {}, avg_dev_loss: {}'.format(
            cum_loss / ite, dev_loss / dev_ite))
        SAVED_MDL_PATH = './model/' + str(epoch + 1) + '.pt'
        #torch.save(model.state_dict(), SAVED_MDL_PATH)
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss
            }, SAVED_MDL_PATH)
        print('model {} saved'.format(SAVED_MDL_PATH))
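A side note on the loss shapes used above: nn.CrossEntropyLoss accepts an input of shape (N, C, d) with a target of shape (N, d), so the (batch, 512, 2) output treats the 512 token positions as classes and scores them once for the start target and once for the end target. A self-contained sketch:

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
logits = torch.randn(4, 512, 2)         # (batch, positions, start/end head)
target = torch.randint(0, 512, (4, 2))  # (batch, start/end) position indices
print(loss_fn(logits, target))          # scalar loss averaged over both heads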
Example #15
def predict(MDL_PATH, DATA_PATH):
    batch_size = 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = QAModel()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=3e-5)

    checkpoint = torch.load(MDL_PATH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.eval()

    dev_data_list = load_data_list(DATA_PATH)
    dev_set = QADataset(dev_data_list)
    dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False)
    print('run prediction')
    dic = {}
    for i, (question_id, context, question, answerable, ans_start,
            ans_end) in enumerate(tqdm(dev_loader)):
        bs = len(context)
        mask = torch.zeros(bs, 512).bool()
        mask[:, 466:] = True

        with torch.no_grad():
            dev_ite = i + 1
            pt = tokenizer(context, question, return_tensors='pt')
            if torch.cuda.is_available():
                answerable = answerable.cuda()
                ans_start = ans_start.cuda()
                ans_end = ans_end.cuda()
                pt['input_ids'] = pt['input_ids'].cuda()
                pt['token_type_ids'] = pt['token_type_ids'].cuda()
                pt['attention_mask'] = pt['attention_mask'].cuda()
                mask = mask.cuda()
            target = torch.cat((ans_start.unsqueeze(1), ans_end.unsqueeze(1)),
                               dim=1).to(device)
            output = model(pt)  # shape (batch_size, 512, 2)
            output[:, :, 0].masked_fill_(mask, float('-inf'))
            output[:, :, 1].masked_fill_(mask, float('-inf'))

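            # Pair the top-30 start positions with the top-30 end positions,
            # keeping (0, 0) as the "no answer" candidate and spans of at
            # most 30 tokens; candidates are ranked by summed scores.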
            for batch_idx, sample in enumerate(
                    output):  # sample: shape (512, 2)
                start = sample[:, 0]  # start: shape (512)
                end = sample[:, 1]
                start_candidates = torch.topk(start, k=30)
                end_candidates = torch.topk(end, k=30)
                ans_candidates = []
                scores = []
                for si, s in enumerate(start_candidates[1]):
                    for ei, e in enumerate(end_candidates[1]):
                        if e == s and e == 0:
                            ans_candidates.append((s, e))
                            scores.append(start_candidates[0][si] +
                                          end_candidates[0][ei])
                        if s < e and e - s <= 30:
                            ans_candidates.append((s, e))
                            scores.append(start_candidates[0][si] +
                                          end_candidates[0][ei])
                results = list(zip(scores, ans_candidates))
                results.sort()
                results.reverse()

                if results[0][1][0] == 0:
                    dic[question_id[batch_idx]] = ""
                else:
                    s, e = results[0][1][0], results[0][1][1]
                    ids = pt['input_ids'][batch_idx][s:e]
                    dic[question_id[batch_idx]] = tokenizer.decode(
                        ids).replace(" ", "")

    with open('prediction.json', 'w') as fp:
        json.dump(dic, fp)
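The resulting prediction.json maps each question id to its decoded answer string (empty when the model predicts "no answer"). Reading it back:

import json

with open('prediction.json') as fp:
    predictions = json.load(fp)  # {question_id: answer_string}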
Example #16
    torch.set_num_threads(args.num_threads)

    train_set, dev_set, test_set = 'train-all', 'raw-dev', 'raw-test'
    if args.train:
        train_set, dev_set, test_set = 'train', 'clean-dev', 'clean-test'

    # cache word embeddings
    cache_file = os.path.splitext(args.word_vectors_file)[0] + '.cache'
    utils.cache_word_embeddings(args.word_vectors_file, cache_file)

    vocab_size, vec_dim = utils.load_embedding_dimensions(cache_file)

    # instantiate model
    net = QAModel(vec_dim,
                  args.filter_width,
                  args.num_conv_filters,
                  args.no_ext_feats,
                  cuda=args.cuda)

    # initialize the trainer
    trainer = Trainer(net, args.eta, args.mom, args.no_loss_reg, vec_dim,
                      args.cuda)
    logger.info("Loading input data...")
    # load input data
    trainer.load_input_data(args.dataset_folder, cache_file, train_set,
                            dev_set, test_set)
    logger.info("Setting up external features...")
    # setup external features
    # TODO: remember to update args.* in testing loop below
    if args.paper_ext_feats:
        logger.info("--paper-ext-feats")
Example #17
    torch.manual_seed(1234)
    np.random.seed(1234)

    train_set, dev_set, test_set = 'train', 'clean-dev', 'clean-test'
    if args.train_all:
        train_set, dev_set, test_set = 'train-all', 'raw-dev', 'raw-test'

    # cache word embeddings
    cache_file = os.path.splitext(args.word_vectors_file)[0] + '.cache'    
    utils.cache_word_embeddings(args.word_vectors_file, cache_file)

    vocab_size, vec_dim = utils.load_embedding_dimensions(cache_file)
    
    # instantiate model
    net = QAModel(vec_dim, args.filter_width, args.num_conv_filters, args.no_ext_feats)  # filter width is 5
    QAModel.save(net, args.dataset_folder, args.model_fname)

    torch.set_num_threads(args.num_threads)
    
    trainer = Trainer(net, args.eta, args.mom, args.no_loss_reg, vec_dim)
    logger.info("Loading input data...")
    trainer.load_input_data(args.dataset_folder, cache_file, train_set, dev_set, test_set)

    best_map = 0.0
    best_model = 0

    for i in range(args.epochs):
        logger.info('------------- Training epoch {} --------------'.format(i + 1))
        train_accuracy = trainer.train(train_set, args.batch_size, args.debugSingleBatch)
        if args.debugSingleBatch:
            sys.exit(0)