Example #1
def prepare_dev(prefix, dev_filename, vocab, download=False):
    # Don't check file size, since we could be using other datasets
    if download:
        print("Downloading {}".format(dev_filename))
        dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        # dev_filename is treated as the path to an already-downloaded json file
        dev_data = data_from_json(dev_filename)
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data
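A hypothetical call for this variant (the paths are illustrative; vocab would normally come from initialize_vocab):

context_data, question_data, question_uuid_data = prepare_dev(
    "download/squad", "dev-v1.1.json", vocab, download=True)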
Example #2
def get_json_data(data_filename):
    """
    Read the contexts and questions from a .json file (like dev-v1.1.json)

    Returns:
      qn_uuid_data: list (length equal to dev set size) of unicode strings like '56be4db0acb8001400a502ec'
      context_token_data, qn_token_data: lists (length equal to dev set size) of lists of strings (no UNKs, unpadded)
    """
    # Check the data file exists
    if not os.path.exists(data_filename):
        raise Exception("JSON input file does not exist: %s" % data_filename)

    # Read the json file
    print("Reading data from %s..." % data_filename)
    data = data_from_json(data_filename)

    # Get the tokenized contexts and questions, and unique question identifiers
    print("Preprocessing data from %s..." % data_filename)
    qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(data)

    data_size = len(qn_uuid_data)
    assert len(context_token_data) == data_size
    assert len(qn_token_data) == data_size
    print("Finished preprocessing. Got %i examples from %s" %
          (data_size, data_filename))

    return qn_uuid_data, context_token_data, qn_token_data
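All of these examples lean on a data_from_json helper that is not shown; a minimal sketch, assuming it simply parses the SQuAD-style JSON file:

import json

def data_from_json(filename):
    # Load a JSON file (e.g. dev-v1.1.json) and return the parsed object
    with open(filename) as data_file:
        return json.load(data_file)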
Example #3
def process_dev_json_to_files():
    # dev_path example data/squad/dev-v1.1.json
    download_prefix = os.path.dirname(os.path.abspath(
        FLAGS.dev_path))  # data/squad/
    dev_filename = os.path.basename(FLAGS.dev_path)  # "dev-v1.1.json"
    # relative path to save the data

    print("Downloading datasets into {}".format(download_prefix))
    print("Preprocessing datasets into {}".format(FLAGS.data_dir))

    if not os.path.exists(download_prefix):
        os.makedirs(download_prefix)
    if not os.path.exists(FLAGS.data_dir):
        os.makedirs(FLAGS.data_dir)

    maybe_download(squad_base_url, dev_filename, download_prefix, None)
    # Read data from dev json file
    dev_data = data_from_json(os.path.join(download_prefix, dev_filename))
    # write data out to FLAGS.data_dir location
    dev_num_questions, dev_num_answers = read_write_dataset(
        dev_data, 'dev', FLAGS.data_dir)

    dev_path = os.path.join(FLAGS.data_dir, "dev")
    # generate tokens
    x_dev_dis_path = dev_path + ".ids.context"
    y_dev_ids_path = dev_path + ".ids.question"
    qa_data.data_to_token_ids(dev_path + ".context", x_dev_dis_path,
                              FLAGS.vocab_path)
    qa_data.data_to_token_ids(dev_path + ".question", y_dev_ids_path,
                              FLAGS.vocab_path)
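qa_data.data_to_token_ids is not shown either; its core step is mapping each whitespace token of the .context/.question files to a vocabulary id. A minimal sketch of that mapping, assuming a vocab dict and an UNK_ID constant for out-of-vocabulary words:

UNK_ID = 2  # assumed id reserved for unknown tokens

def sentence_to_token_ids(sentence, vocab):
    # Replace each whitespace-separated token with its vocabulary id,
    # falling back to UNK_ID for words missing from the vocab
    return [vocab.get(w, UNK_ID) for w in sentence.split()]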
Example #4
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data, context_text = read_dataset(
        dev_data, 'dev', vocab)
    return context_data, question_data, question_uuid_data, context_text
Example #5
def get_json_data(data_filename):
    """
    Read the contexts and questions from a .json file (like dev-v1.1.json)

    Returns:
      qn_uuid_data: list (length equal to dev set size) of unicode strings like '56be4db0acb8001400a502ec'
      context_token_data, qn_token_data: lists (length equal to dev set size) of lists of strings (no UNKs, unpadded)
    """
    # Check the data file exists
    if not os.path.exists(data_filename):
        raise Exception("JSON input file does not exist: %s" % data_filename)

    # Read the json file
    print "Reading data from %s..." % data_filename
    data = data_from_json(data_filename)

    # Get the tokenized contexts and questions, and unique question identifiers
    print "Preprocessing data from %s..." % data_filename
    qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(data)

    data_size = len(qn_uuid_data)
    assert len(context_token_data) == data_size
    assert len(qn_token_data) == data_size
    print "Finished preprocessing. Got %i examples from %s" % (data_size, data_filename)

    return qn_uuid_data, context_token_data, qn_token_data
Example #6
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data
Example #7
def prepare_dev(prefix, dev_filename, vocab):
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(
        dev_data, 'dev', vocab)

    def normalize(dat):
        # map returns a list on Python 2 only; on Python 3 wrap it in
        # list(...) as in Example #9 below
        return map(lambda tok: map(int, tok.split()), dat)

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
Example #8
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))

    # remove answer
    #    if FLAGS.eval_on_train:
    #        context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers = read_dataset(dev_data, 'train', vocab)
    #        return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers

    context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data = read_dataset(
        dev_data, 'dev', vocab)

    return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data
Example #9
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(
        dev_data, 'dev', vocab)

    def normalize(dat):
        return list(map(lambda tok: list(map(int, tok.split())), dat))

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
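read_dataset returns each context/question as a space-separated string of token ids, so normalize simply parses those strings; for instance (illustrative data):

ids = list(map(lambda tok: list(map(int, tok.split())), ["12 7 401"]))
# ids == [[12, 7, 401]]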
Example #10
def get_question_context_data(question_string, context_json_file):

    context_string = data_from_json(context_json_file)['context']
    context = str(context_string)  # string

    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    context = context.replace("''", '" ')
    context = context.replace("``", '" ')

    context_tokens = tokenize(context)  # list of strings (lowercase)
    context = context.lower()

    question = str(question_string)  # string
    question_tokens = tokenize(question)  # list of strings

    # there is no real uuid for an ad-hoc question, so use a stand-in value
    question_uuid = len(question_tokens)

    return [question_uuid], [context_tokens], [question_tokens]
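The tokenize helper is also not shown; a minimal sketch, assuming NLTK word tokenization plus the quote normalization the BiDAF comment above refers to:

import nltk

def tokenize(sequence):
    # Split into word tokens, folding LaTeX-style quotes back to plain '"'
    return [token.replace("``", '"').replace("''", '"')
            for token in nltk.word_tokenize(sequence)]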
Example #11
def get_json_data(data_filename):
    """
    Read the contexts and questions from a .json file (like dev-v1.1.json)
    """
    if not os.path.exists(data_filename):
        raise Exception("JSON input file does not exist: %s" % data_filename)

    print("Reading data from %s..." % data_filename)
    data = data_from_json(data_filename)

    print("Preprocessing data from %s..." % data_filename)
    qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(data)

    data_size = len(qn_uuid_data)
    assert len(context_token_data) == data_size
    assert len(qn_token_data) == data_size
    print("Finished preprocessing. Got %i examples from %s" %
          (data_size, data_filename))

    return qn_uuid_data, context_token_data, qn_token_data
Example #12
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    #os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)


    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way

    #dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    #dev_filename = os.path.basename(FLAGS.dev_path)
    #_, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    # for codalab
    prefix = os.path.join("data", "squad")

    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    if FLAGS.download:
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        dev_data = data_from_json(dev_filename)
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix="")
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("", ""), "dev") #pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    #Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev") # for our purposes this is as test set.
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data_home(dev_deposit_path) # for our purposes this is as test set.

    question_uuid_data = []
    with open(dev_deposit_path + ".quid") as f:
        for line in f:
            question_uuid_data.append(line)

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
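    # e.g. pad_sequences([[5, 3]], maxlen=4, value=PAD_ID, padding='post')
    #      -> [[5, 3, PAD_ID, PAD_ID]]; rows longer than maxlen are truncated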
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = list(zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data))  # materialize: len(test_data) is used below

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    """models = [ 'MPCM', 'COATT', 'COATT_fixed', 'COATT_mix','COATT_fixed_mix', 'COATT_fixed_200_mix'] # 'COATT_fixed_200', leave out to save time
    predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            # define sizes etc. for different models.
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix' :
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']

                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))

                    initialize_model(sess, qa, train_dir)

                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []

                    f1 = exact_match = total = 0; answers = {}; prob_start = {}; prob_end = {}; p_raw_mapping= {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle = False)):
                        batch_test =  batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = (np.argmax(ys, axis=1))
                        a_e = (np.argmax(ye, axis=1))

                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)

                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]

                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            #answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw )
                        prog.update(i + 1, [("processed", i + 1)])

            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()



    # save
    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")"""
    predictions_start = loadPickle("preds_start.pkl")
    predictions_end = loadPickle("preds_end.pkl")
    p_raw_mapping = loadPickle("p_raw_mapping.pkl")


    models = ['COATT_fixed_200']
    #predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix' :
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']

                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))

                    initialize_model(sess, qa, train_dir)

                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []

                    f1 = exact_match = total = 0
                    answers = {}
                    prob_start = {}
                    prob_end = {}
                    p_raw_mapping = {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle=False)):
                        batch_test = batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = (np.argmax(ys, axis=1))
                        a_e = (np.argmax(ye, axis=1))

                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)

                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]
                            # print(s, e)  # debug output
                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            #answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw )
                        prog.update(i + 1, [("processed", i + 1)])

            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()

    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")

    # combine the predictions of the two models (while making independent start, end predictions)
    """answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey]*predictions_start['COATT'][qkey]*predictions_start['COATT_fixed'][qkey]
        ye = predictions_end['MPCM'][qkey]*predictions_end['COATT'][qkey]*predictions_end['COATT_fixed'][qkey]
        s = (np.argmax(ys))
        arr = ye.copy()
        arr[0:s] = 0
        e = (np.argmax(arr))
        #e = (np.argmax(ye))
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        answers[qkey] = pred_raw.strip("\n")"""
    # predict the span with max predicted probability (make a joint prediction rather than independently predicting start and end indices)
    answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey]*predictions_start['COATT'][qkey]*predictions_start['COATT_fixed'][qkey]\
             *predictions_start['COATT_mix'][qkey]*predictions_start['COATT_fixed_mix'][qkey]\
             *predictions_start['COATT_fixed_200_mix'][qkey]*predictions_start['COATT_fixed_200'][qkey] #to save time
        ye = predictions_end['MPCM'][qkey]*predictions_end['COATT'][qkey]*predictions_end['COATT_fixed'][qkey]\
             *predictions_end['COATT_mix'][qkey]*predictions_end['COATT_fixed_mix'][qkey]\
             *predictions_end['COATT_fixed_200_mix'][qkey]*predictions_end['COATT_fixed_200'][qkey] #to save time
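        # exhaustive search over all (start, end) pairs with end >= start,
        # keeping the span whose joint probability ys[si] * ye[ei] is largest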

        s = 0
        e = 0
        prodmax = 0
        for si in range(0, len(ys)):
            for ei in range(si, len(ye)):
                prod = ys[si]*ye[ei]
                if prod > prodmax:
                    s = si
                    e = ei
                    prodmax = prod
        print(s, e, prodmax)
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        print(pred_raw)
        answers[qkey] = pred_raw.strip("\n")

        # write to json file to root dir
    with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #13
def expand_vocab(prefix, dev_filename, vocab, embd, raw_glove,
                 raw_glove_vocab):

    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    #context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    dataset = dev_data
    context_data = []
    query_data = []
    question_uuid_data = []
    tier = 'dev'
    new_vocab = {}
    found = 0
    notfound = 0

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                #context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                #qustion_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]
                #print(context_ids)
                for w in context_tokens + question_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1

    print('found/not found: {}/{}, {}% not found'.format(
        found, notfound, 100 * notfound / float(found + notfound)))
    print('New vocabulary:', len(new_vocab))

    vocab_list = list(vocab.items())
    vn = len(vocab_list)
    for i, w in enumerate(new_vocab):
        vocab_list.append((w, vn + i))

    vocab = dict(vocab_list)
    rev_vocab = dict([(x, y) for (y, x) in vocab_list])
    #context_data.append(' '.join(context_ids))
    #query_data.append(' '.join(qustion_ids))
    #question_uuid_data.append(question_uuid)
    #return context_data, question_data, question_uuid_data
    _, dim = embd.shape
    new_glove = np.random.randn(len(vocab), dim)
    new_glove[:vn, :] = embd

    found = 0
    for i in range(vn, vn + (len(new_vocab))):
        word = vocab_list[i][0]
        # prefer the exact-case embedding; fall back to capitalized/upper variants
        if word in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word]
            new_glove[i, :] = raw_glove[idx, :]
        elif word.capitalize() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.capitalize()]
            new_glove[i, :] = raw_glove[idx, :]
        elif word.upper() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.upper()]
            new_glove[i, :] = raw_glove[idx, :]
    #from IPython import embed; embed()
    print("{} unseen words found embeddings".format(found))

    return vocab, rev_vocab, new_glove
Example #14
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

#    # Old version. Encoding issue
#    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
#    dev_filename = os.path.basename(FLAGS.dev_path)
#    context_data, question_data, question_uuid_data = prepare_dev(dev_dirname, dev_filename, vocab)
#    dataset = (context_data, question_data, question_uuid_data)
#
#    dataset = adjust_dataset(dataset, 'dev')

    # New version. To match the preprocessing provided to train.py
    
    # As in squad_preprocessing.py:
    tt = time.time()
    download_prefix = os.path.join("download", "squad")
    data_prefix = os.path.join("data", "squad")
    if not os.path.exists(download_prefix):
        os.makedirs(download_prefix)
    if not os.path.exists(data_prefix):
        os.makedirs(data_prefix)
    dev_filename = "dev-v1.1.json"
    dev_data = data_from_json(os.path.join(download_prefix, dev_filename))
    (dev_num_questions,
     dev_num_answers,
     uuid) = read_write_dataset_(dev_data, 'dev', data_prefix)
    print("Processed {} questions and {} answers in dev".format(
            dev_num_questions, dev_num_answers))
    print(time.time()-tt, 'part: squad_preprocessing.py')

    # As in qa_data.py:
    tt = time.time()
    args = qa_data.setup_args()
    vocab_path = pjoin(args.vocab_dir, "vocab.dat")
    dev_path = pjoin(args.source_dir, "dev")
    x_dev_dis_path = dev_path + ".ids.context"
    y_dev_ids_path = dev_path + ".ids.question"
    data_to_token_ids(dev_path + ".context", x_dev_dis_path, vocab_path)
    data_to_token_ids(dev_path + ".question", y_dev_ids_path, vocab_path)
    print(time.time()-tt, 'part: qa_data.py')

    # As in train.py
    tt = time.time()
    dataset = {}
    load_data_dq(dataset, 'dev', FLAGS.data_dir)
    indices = trim_empty(dataset['dev'])
    print(time.time()-tt, 'part: train.py')
    
    # uuid from read_write_dataset_
    uuid = [i for j, i in enumerate(uuid) if j not in indices]
    dataset['dev']['uuid'] = uuid

    # sanity check
    keys = dataset['dev'].keys()
    for i in xrange(len(keys)-1):
        print(i, keys[i], len(dataset['dev'][keys[i]]),
              i+1, keys[i+1], len(dataset['dev'][keys[i+1]]))
        assert len(dataset['dev'][keys[i]]) == len(dataset['dev'][keys[i+1]])

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    local_device_protos = device_lib.list_local_devices()  # 38559755
    for x in local_device_protos:
        if x.device_type == 'GPU':
            FLAGS.ifgpu = True
            break

    qa = QASystem(encoder, decoder, embed_path, rev_vocab)

    train_dir = get_normalized_train_dir(FLAGS.train_dir)
    # keep the session open while initializing the model and generating answers
    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

    # write to json file to root dir
    with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #15
def main(unused_argv):
    # Print an error message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" %
                        unused_argv)

    # Define train_dir
    if not FLAGS.name and not FLAGS.train_dir and FLAGS.mode != EVAL_MODE:
        raise Exception("You need to specify either --name or --train_dir")
    FLAGS.train_dir = FLAGS.train_dir or os.path.join(LOGS_DIR, FLAGS.name)

    # If not specified, set d_ff to match d_model
    if FLAGS.d_ff == 0:
        FLAGS.d_ff = FLAGS.d_model

    # Initialize best model directory
    best_model_dir = os.path.join(FLAGS.train_dir, "best_checkpoint")

    # Define path for glove vecs
    FLAGS.glove_path = FLAGS.glove_path or os.path.join(
        FLAGS.data_dir, "glove.840B.{}d.txt".format(FLAGS.word_emb_size))
    FLAGS.char_emb_path = os.path.join(FLAGS.data_dir, "char_emb_file.txt")
    FLAGS.word_emb_path = os.path.join(FLAGS.data_dir, "word_emb_file.txt")

    # Get file paths to train/dev/test datafiles for tokenized queries, contexts and answers
    FLAGS.train_rec_path = os.path.join(FLAGS.data_dir, "train.tfrecord")
    FLAGS.train_ans_path = os.path.join(FLAGS.data_dir, "train_ans.json")
    FLAGS.train_info_path = os.path.join(FLAGS.data_dir, "train_info.json")
    FLAGS.dev_rec_path = os.path.join(FLAGS.data_dir, "dev.tfrecord")
    FLAGS.dev_ans_path = os.path.join(FLAGS.data_dir, "dev_ans.json")
    FLAGS.dev_info_path = os.path.join(FLAGS.data_dir, "dev_info.json")

    # Load word embedding matrix and char embedding matrix.
    word_emb_matrix, word2id = get_word_embs(FLAGS.word_emb_path,
                                             FLAGS.word_emb_size)
    char_emb_matrix, char2id = get_char_embs(FLAGS.char_emb_path,
                                             FLAGS.char_emb_size)

    # Some GPU settings
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Split by mode
    if FLAGS.mode == TRAIN_MODE:

        # Load dataset info and answer files
        print("Loading train and dev datasets...")
        train_answers = data_from_json(FLAGS.train_ans_path)
        train_info = data_from_json(FLAGS.train_info_path)
        dev_answers = data_from_json(FLAGS.dev_ans_path)
        dev_info = data_from_json(FLAGS.dev_info_path)

        # Initialize data pipeline
        loader = get_data_loader(FLAGS, is_training=True)
        train_dataset = load_dataset(FLAGS,
                                     FLAGS.train_rec_path,
                                     loader,
                                     shuffle=True)
        train_iterator = train_dataset.make_one_shot_iterator()
        dev_dataset = load_dataset(FLAGS,
                                   FLAGS.dev_rec_path,
                                   loader,
                                   shuffle=True)
        dev_iterator = dev_dataset.make_one_shot_iterator()

        # Initialize the model
        input_handle = tf.placeholder(tf.string, shape=())
        input_iterator = tf.data.Iterator.from_string_handle(
            input_handle, train_dataset.output_types,
            train_dataset.output_shapes)
        model = SQuADTransformer(FLAGS, input_iterator, input_handle,
                                 word_emb_matrix, char_emb_matrix)

        # Setup train dir and logfile
        if not os.path.exists(FLAGS.train_dir):
            os.makedirs(FLAGS.train_dir)
        file_handler = logging.FileHandler(
            os.path.join(FLAGS.train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Make best model dir if necessary
        if not os.path.exists(best_model_dir):
            os.makedirs(best_model_dir)

        with tf.Session(config=config) as sess:

            # Load most recent model
            initialize_model(sess, model, FLAGS.train_dir, expect_exists=False)

            # Train
            model.train(sess, train_iterator, train_answers, train_info,
                        dev_iterator, dev_answers, dev_info)

    elif FLAGS.mode == EVAL_MODE:
        if FLAGS.json_in_path == "":
            raise Exception(
                "For {} mode, you need to specify --json_in_path".format(
                    EVAL_MODE))
        if FLAGS.checkpoint_dir == "" and FLAGS.ensemble_path == "":
            raise Exception(
                "For {} mode, you need to specify --checkpoint_dir or --ensemble_path"
                .format(EVAL_MODE))
        FLAGS.is_training = False

        # Read the JSON data from file
        print("Loading test dataset from {}...".format(FLAGS.json_in_path))
        test_data = data_from_json(FLAGS.json_in_path)
        test_examples, test_answers, test_info, _, _ = preprocess(test_data)

        # Get formatted examples in memory for creating a TF Dataset
        formatted_examples, output_types, output_shapes = get_formatted_examples(
            FLAGS, test_examples, word2id, char2id)

        # Construct a generator function for building TF dataset
        def gen():
            infinite_idx = 0
            while True:
                yield formatted_examples[infinite_idx]
                infinite_idx = (infinite_idx + 1) % len(formatted_examples)

        # Initialize data pipeline (repeat so we can use this multiple times in an ensemble).
        test_dataset = tf.data.Dataset.from_generator(
            gen, output_types, output_shapes).repeat().batch(FLAGS.batch_size)
        test_iterator = test_dataset.make_one_shot_iterator()
        input_handle = tf.placeholder(tf.string, shape=())
        input_iterator = tf.data.Iterator.from_string_handle(
            input_handle, test_dataset.output_types,
            test_dataset.output_shapes)

        # Ensemble or single eval.
        is_ensemble = FLAGS.ensemble_path != ""
        if is_ensemble:  # Path to file with a list of directories for ensemble
            with open(FLAGS.ensemble_path, 'r') as fh:
                checkpoint_paths = [
                    line.strip() for line in fh.readlines() if line
                ]
                if len(checkpoint_paths) == 0:
                    raise Exception(
                        "Ensemble path {} did not contain any checkpoint paths."
                        .format(FLAGS.ensemble_path))
        else:
            checkpoint_paths = [FLAGS.checkpoint_dir]

        # Make predictions using all checkpoints specified in checkpoint_paths
        model = SQuADTransformer(FLAGS, input_iterator, input_handle,
                                 word_emb_matrix, char_emb_matrix)
        all_answers = defaultdict(
            list)  # Maps from UUID to list of (answer text, prob) pairs.
        for i in range(len(checkpoint_paths)):
            if is_ensemble:
                print("Ensemble model {} / {}...".format(
                    i + 1, len(checkpoint_paths)))
            with tf.Session(config=config) as sess:
                # Load model from checkpoint_dir
                initialize_model(sess,
                                 model,
                                 checkpoint_paths[i],
                                 expect_exists=True,
                                 is_training=False)

                # Get a predicted answer for each example in the data
                num_batches = test_info['num_examples'] // FLAGS.batch_size + 1
                answers_dict = model.get_answers(sess, test_iterator,
                                                 test_answers, num_batches)

                # Add it to the combined answers
                for k, v in answers_dict.items():
                    all_answers[k].append(v)

        # Combine the results into a final prediction
        if is_ensemble:
            print("Combining answers with max-vote...")
        answers_dict = {}
        for k, v in tqdm(all_answers.items()):
            answers_dict[k] = ensemble_max_vote(all_answers[k])

        # Write the uuid->answer mapping to a json file in the root dir
        print("Writing predictions to %s..." % FLAGS.json_out_path)
        with io.open(FLAGS.json_out_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(answers_dict, ensure_ascii=False))
            print("Wrote predictions to %s" % FLAGS.json_out_path)

    else:
        raise Exception("Unsupported mode: %s" % FLAGS.mode)
Example #16
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)


    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    _, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    prefix = os.path.join("data", "squad")

    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix)
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev") # for our purposes this is as test set.
    question_uuid_data = []
    with open(os.path.join("data", "squad") + "/dev.quid") as f:
        for line in f:
            question_uuid_data.append(line)

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = list(zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data))  # materialize: len(test_data) is used below

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    with tf.Graph().as_default():
        with tf.Session() as sess:
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']

            qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))

            initialize_model(sess, qa, train_dir)

            # get predicted start-end indices
            a_s = [] # store all start index preds
            a_e = [] # store all end index preds
            a_s_l = []
            a_e_l = []

            f1 = exact_match = total = 0
            answers = {}
            prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
            for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle=False)):
                batch_test = batch[:4]
                (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                a_s = (np.argmax(ys, axis=1))
                a_e = (np.argmax(ye, axis=1))
                a_s_l = a_s_l + list(a_s)
                a_e_l = a_e_l + list(a_e)

                for j in range(len(a_s)):
                    p_raw = batch[7][j]
                    a_raw = batch[8][j]
                    s = a_s[j]
                    e = a_e[j]
                    pred_raw = ' '.join(p_raw.split()[s:e + 1])
                    f1 += f1_score(pred_raw, a_raw)
                    exact_match += exact_match_score(pred_raw, a_raw)
                    total += 1
                    answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                prog.update(i + 1, [("processed", i + 1)])
            exact_match = 100.0 * exact_match / total
            f1 = 100.0 * f1 / total
            print(("First Answer Entity level F1/EM: %.2f/%.2f", f1, exact_match))

        #answers = generate_answers(question_uuid_data, a_s_l, a_e_l, context_data, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))