def prepare_dev(prefix, dev_filename, vocab, download=False):
    # Don't check file size, since we could be using other datasets
    if download:
        print("Downloading {}".format(dev_filename))
        dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        dev_data = data_from_json(dev_filename)
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    return context_data, question_data, question_uuid_data
def get_json_data(data_filename):
    """
    Read the contexts and questions from a .json file (like dev-v1.1.json)

    Returns:
      qn_uuid_data: list (length equal to dev set size) of unicode strings like '56be4db0acb8001400a502ec'
      context_token_data, qn_token_data: lists (length equal to dev set size) of lists of strings (no UNKs, unpadded)
    """
    # Check the data file exists
    if not os.path.exists(data_filename):
        raise Exception("JSON input file does not exist: %s" % data_filename)

    # Read the json file
    print("Reading data from %s..." % data_filename)
    data = data_from_json(data_filename)

    # Get the tokenized contexts and questions, and unique question identifiers
    print("Preprocessing data from %s..." % data_filename)
    qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(data)

    data_size = len(qn_uuid_data)
    assert len(context_token_data) == data_size
    assert len(qn_token_data) == data_size
    print("Finished preprocessing. Got %i examples from %s" % (data_size, data_filename))

    return qn_uuid_data, context_token_data, qn_token_data
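# data_from_json is used by nearly every snippet here but never defined. A minimal sketch of
# the helper these snippets appear to assume (a plain json.load wrapper):
import json

def data_from_json(filename):
    """Load and parse a JSON file such as dev-v1.1.json (sketch of the assumed helper)."""
    with open(filename) as data_file:
        data = json.load(data_file)
    return data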
def process_dev_json_to_files():
    # dev_path example: data/squad/dev-v1.1.json
    download_prefix = os.path.dirname(os.path.abspath(FLAGS.dev_path))  # data/squad/
    dev_filename = os.path.basename(FLAGS.dev_path)  # "dev-v1.1.json"

    # relative paths to save the data
    print("Downloading datasets into {}".format(download_prefix))
    print("Preprocessing datasets into {}".format(FLAGS.data_dir))

    if not os.path.exists(download_prefix):
        os.makedirs(download_prefix)
    if not os.path.exists(FLAGS.data_dir):
        os.makedirs(FLAGS.data_dir)

    maybe_download(squad_base_url, dev_filename, download_prefix, None)

    # Read data from the dev json file
    dev_data = data_from_json(os.path.join(download_prefix, dev_filename))

    # Write data out to the FLAGS.data_dir location
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', FLAGS.data_dir)
    dev_path = os.path.join(FLAGS.data_dir, "dev")

    # Generate token ids
    x_dev_dis_path = dev_path + ".ids.context"
    y_dev_ids_path = dev_path + ".ids.question"
    qa_data.data_to_token_ids(dev_path + ".context", x_dev_dis_path, FLAGS.vocab_path)
    qa_data.data_to_token_ids(dev_path + ".question", y_dev_ids_path, FLAGS.vocab_path)
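# data_to_token_ids (and qa_data.data_to_token_ids) maps each whitespace-tokenized line of a
# .context/.question file to a line of space-separated vocabulary ids. A minimal sketch under
# assumed names: vocab is a token -> id dict loaded from the vocab file, and unk_id is the
# out-of-vocabulary fallback (the default value here is a placeholder):
def data_to_token_ids_sketch(data_path, target_path, vocab, unk_id=2):
    # One line of ids out per line of tokens in.
    with open(data_path) as data_file, open(target_path, "w") as tokens_file:
        for line in data_file:
            token_ids = [str(vocab.get(w, unk_id)) for w in line.strip().split()]
            tokens_file.write(" ".join(token_ids) + "\n")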
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data, context_text = read_dataset(dev_data, 'dev', vocab)
    return context_data, question_data, question_uuid_data, context_text
def get_json_data(data_filename):
    """
    Read the contexts and questions from a .json file (like dev-v1.1.json)

    Returns:
      qn_uuid_data: list (length equal to dev set size) of unicode strings like '56be4db0acb8001400a502ec'
      context_token_data, qn_token_data: lists (length equal to dev set size) of lists of strings (no UNKs, unpadded)
    """
    # Check the data file exists
    if not os.path.exists(data_filename):
        raise Exception("JSON input file does not exist: %s" % data_filename)

    # Read the json file (this variant is Python 2 code)
    print "Reading data from %s..." % data_filename
    data = data_from_json(data_filename)

    # Get the tokenized contexts and questions, and unique question identifiers
    print "Preprocessing data from %s..." % data_filename
    qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(data)

    data_size = len(qn_uuid_data)
    assert len(context_token_data) == data_size
    assert len(qn_token_data) == data_size
    print "Finished preprocessing. Got %i examples from %s" % (data_size, data_filename)

    return qn_uuid_data, context_token_data, qn_token_data
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    return context_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab):
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    def normalize(dat):
        return map(lambda tok: map(int, tok.split()), dat)

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))

    # remove answer
    # if FLAGS.eval_on_train:
    #     context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers = read_dataset(dev_data, 'train', vocab)
    #     return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers

    context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    def normalize(dat):
        return list(map(lambda tok: list(map(int, tok.split())), dat))

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
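# For illustration: normalize turns the space-separated id strings produced by read_dataset
# into lists of ints, e.g. normalize(["12 7 2048", "4 5"]) -> [[12, 7, 2048], [4, 5]].
# The list(map(...)) wrapping matters on Python 3, where a bare map returns a lazy iterator;
# the earlier prepare_dev variant with bare map is Python 2 code.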
def get_question_context_data(question_string, context_json_file):
    context_string = data_from_json(context_json_file)['context']
    context = str(context_string)  # string

    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    context = context.replace("''", '" ')
    context = context.replace("``", '" ')
    context_tokens = tokenize(context)  # list of strings (lowercase)
    context = context.lower()

    question = str(question_string)  # string
    question_tokens = tokenize(question)  # list of strings

    # also get the question_uuid (here just a stand-in value, since a single
    # ad-hoc question has no SQuAD uuid)
    question_uuid = len(question_tokens)

    return [question_uuid], [context_tokens], [question_tokens]
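# tokenize is another helper these snippets rely on but do not define. A plausible sketch,
# assuming nltk is available and matching the "list of strings (lowercase)" comment above
# (hypothetical implementation, not the original):
import nltk

def tokenize(sequence):
    # Treebank-style tokens, with nltk's quote substitutions undone and lowercased
    # so they line up with a lowercase GloVe vocabulary.
    tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)]
    return [token.lower() for token in tokens]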
def get_json_data(data_filename):
    """
    Read the contexts and questions from a .json file (like dev-v1.1.json)
    """
    if not os.path.exists(data_filename):
        raise Exception("JSON input file does not exist: %s" % data_filename)

    print("Reading data from %s..." % data_filename)
    data = data_from_json(data_filename)

    print("Preprocessing data from %s..." % data_filename)
    qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(data)

    data_size = len(qn_uuid_data)
    assert len(context_token_data) == data_size
    assert len(qn_token_data) == data_size
    print("Finished preprocessing. Got %i examples from %s" % (data_size, data_filename))

    return qn_uuid_data, context_token_data, qn_token_data
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    # os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way
    # dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    # dev_filename = os.path.basename(FLAGS.dev_path)
    # _, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    # for codalab
    prefix = os.path.join("data", "squad")

    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    if FLAGS.download:
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        dev_data = data_from_json(dev_filename)
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix="")
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("", ""), "dev")  # pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    # Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev")  # for our purposes this is the test set
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data_home(dev_deposit_path)  # for our purposes this is the test set

    question_uuid_data = []
    with open(dev_deposit_path + ".quid") as f:
        for line in f:
            question_uuid_data.append(line)

    # pad the data at load-time, so we don't need to do any masking later
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with the specified value
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    """models = ['MPCM', 'COATT', 'COATT_fixed', 'COATT_mix', 'COATT_fixed_mix', 'COATT_fixed_200_mix']  # 'COATT_fixed_200', leave out to save time
    predictions_start = {}
    predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            # define sizes etc. for the different models
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix':
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']
                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
                    initialize_model(sess, qa, train_dir)
                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []
                    f1 = exact_match = total = 0
                    answers = {}
                    prob_start = {}
                    prob_end = {}
                    p_raw_mapping = {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle=False)):
                        batch_test = batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = np.argmax(ys, axis=1)
                        a_e = np.argmax(ye, axis=1)
                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)
                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]
                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            # answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw)
                        prog.update(i + 1, [("processed", i + 1)])
            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()
    # save
    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")"""

    predictions_start = loadPickle("preds_start.pkl")
    predictions_end = loadPickle("preds_end.pkl")
    p_raw_mapping = loadPickle("p_raw_mapping.pkl")

    models = ['COATT_fixed_200']
    # predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix':
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']
                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
                    initialize_model(sess, qa, train_dir)
                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []
                    f1 = exact_match = total = 0
                    answers = {}
                    prob_start = {}
                    prob_end = {}
                    p_raw_mapping = {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle=False)):
                        batch_test = batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = np.argmax(ys, axis=1)
                        a_e = np.argmax(ye, axis=1)
                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)
                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]
                            print(s, e)  # comment this out
                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            # answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw)
                        prog.update(i + 1, [("processed", i + 1)])
            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()

    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")

    # combine the predictions of the models (while making independent start, end predictions)
    """answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey] * predictions_start['COATT'][qkey] * predictions_start['COATT_fixed'][qkey]
        ye = predictions_end['MPCM'][qkey] * predictions_end['COATT'][qkey] * predictions_end['COATT_fixed'][qkey]
        s = np.argmax(ys)
        arr = ye.copy()
        arr[0:s] = 0
        e = np.argmax(arr)
        # e = np.argmax(ye)
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        answers[qkey] = pred_raw.strip("\n")"""

    # predict the span with max predicted probability (make a joint prediction rather than
    # independently predicting start and end indices)
    answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey] * predictions_start['COATT'][qkey] * predictions_start['COATT_fixed'][qkey] \
            * predictions_start['COATT_mix'][qkey] * predictions_start['COATT_fixed_mix'][qkey] \
            * predictions_start['COATT_fixed_200_mix'][qkey] * predictions_start['COATT_fixed_200'][qkey]  # to save time
        ye = predictions_end['MPCM'][qkey] * predictions_end['COATT'][qkey] * predictions_end['COATT_fixed'][qkey] \
            * predictions_end['COATT_mix'][qkey] * predictions_end['COATT_fixed_mix'][qkey] \
            * predictions_end['COATT_fixed_200_mix'][qkey] * predictions_end['COATT_fixed_200'][qkey]  # to save time
        s = 0
        e = 0
        prodmax = 0
        for si in range(0, len(ys)):
            for ei in range(si, len(ye)):
                prod = ys[si] * ye[ei]
                if prod > prodmax:
                    s = si
                    e = ei
                    prodmax = prod
        print(s, e, prodmax)
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        print(pred_raw)
        answers[qkey] = pred_raw.strip("\n")

    # write to json file in the root dir
    with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(answers, ensure_ascii=False)))
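# The nested si/ei loop above scores every start <= end pair in O(PMAXLEN^2) Python
# iterations. The same joint argmax can be done in numpy: the pairwise scores are the outer
# product of the start and end probability vectors, masked to the upper triangle. A sketch:
import numpy as np

def best_joint_span(ys, ye):
    # scores[s, e] = ys[s] * ye[e], zeroed out where e < s.
    scores = np.triu(np.outer(ys, ye))
    s, e = np.unravel_index(np.argmax(scores), scores.shape)
    return s, e, scores[s, e]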
def expand_vocab(prefix, dev_filename, vocab, embd, raw_glove, raw_glove_vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    # context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    dataset = dev_data
    context_data = []
    query_data = []
    question_uuid_data = []
    tier = 'dev'
    new_vocab = {}
    found = 0
    notfound = 0
    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']
                # context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                # qustion_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]
                # print(context_ids)
                for w in context_tokens:
                    if w not in vocab:
                        if w not in new_vocab:
                            new_vocab[w] = 1
                        else:
                            new_vocab[w] += 1
                        notfound += 1
                    else:
                        found += 1
                for w in question_tokens:
                    if w not in vocab:
                        if w not in new_vocab:
                            new_vocab[w] = 1
                        else:
                            new_vocab[w] += 1
                        notfound += 1
                    else:
                        found += 1

    print('found/not found: {}/{}, {}% not found'.format(found, notfound, 100 * notfound / float(found + notfound)))
    print('New vocabulary:', len(new_vocab))

    vocab_list = list(vocab.items())
    vn = len(vocab_list)
    for i in range(len(new_vocab)):
        vocab_list.append((new_vocab.keys()[i], vn + i))
    vocab = dict(vocab_list)
    rev_vocab = dict([(x, y) for (y, x) in vocab_list])

    # context_data.append(' '.join(context_ids))
    # query_data.append(' '.join(qustion_ids))
    # question_uuid_data.append(question_uuid)
    # return context_data, question_data, question_uuid_data

    _, dim = embd.shape
    new_glove = np.random.randn(len(vocab), dim)
    new_glove[:vn, :] = embd
    found = 0
    for i in range(vn, vn + len(new_vocab)):
        word = vocab_list[i][0]
        if word in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word]
            new_glove[i, :] = raw_glove[idx, :]
        if word.capitalize() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.capitalize()]
            new_glove[i, :] = raw_glove[idx, :]
        if word.upper() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.upper()]
            new_glove[i, :] = raw_glove[idx, :]
    # from IPython import embed; embed()
    print("{} unseen words found embeddings".format(found))
    return vocab, rev_vocab, new_glove
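# Note on the embedding lookup above: the three `if` tests are not chained with elif, so a
# word whose capitalized and upper-cased forms both occur in GloVe is counted in `found`
# more than once and keeps the last (upper-cased) embedding. A first-match-wins variant of
# the same idea:
#
#     for cand in (word, word.capitalize(), word.upper()):
#         if cand in raw_glove_vocab:
#             new_glove[i, :] = raw_glove[raw_glove_vocab[cand], :]
#             found += 1
#             break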
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    # # Old version. Encoding issue
    # dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    # dev_filename = os.path.basename(FLAGS.dev_path)
    # context_data, question_data, question_uuid_data = prepare_dev(dev_dirname, dev_filename, vocab)
    # dataset = (context_data, question_data, question_uuid_data)
    # # dataset = adjust_dataset(dataset, 'dev')

    # New version. To match the preprocessing provided to train.py
    # As in squad_preprocessing.py:
    tt = time.time()
    download_prefix = os.path.join("download", "squad")
    data_prefix = os.path.join("data", "squad")
    if not os.path.exists(download_prefix):
        os.makedirs(download_prefix)
    if not os.path.exists(data_prefix):
        os.makedirs(data_prefix)
    dev_filename = "dev-v1.1.json"
    dev_data = data_from_json(os.path.join(download_prefix, dev_filename))
    (dev_num_questions, dev_num_answers, uuid) = read_write_dataset_(dev_data, 'dev', data_prefix)
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))
    print(time.time() - tt, 'part: squad_preprocessing.py')

    # As in qa_data.py:
    tt = time.time()
    args = qa_data.setup_args()
    vocab_path = pjoin(args.vocab_dir, "vocab.dat")
    dev_path = pjoin(args.source_dir, "dev")
    x_dev_dis_path = dev_path + ".ids.context"
    y_dev_ids_path = dev_path + ".ids.question"
    data_to_token_ids(dev_path + ".context", x_dev_dis_path, vocab_path)
    data_to_token_ids(dev_path + ".question", y_dev_ids_path, vocab_path)
    print(time.time() - tt, 'part: qa_data.py')

    # As in train.py
    tt = time.time()
    dataset = {}
    load_data_dq(dataset, 'dev', FLAGS.data_dir)
    indices = trim_empty(dataset['dev'])
    print(time.time() - tt, 'part: train.py')

    # uuid from read_write_dataset_
    uuid = [i for j, i in enumerate(uuid) if j not in indices]
    dataset['dev']['uuid'] = uuid

    # sanity check
    keys = dataset['dev'].keys()
    for i in xrange(len(keys) - 1):
        print(i, keys[i], len(dataset['dev'][keys[i]]), i + 1, keys[i + 1], len(dataset['dev'][keys[i + 1]]))
        assert len(dataset['dev'][keys[i]]) == len(dataset['dev'][keys[i + 1]])

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    with tf.Session() as sess:
        local_device_protos = device_lib.list_local_devices()  # 38559755
        for x in local_device_protos:
            if x.device_type == 'GPU':
                FLAGS.ifgpu = True
                break

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file in the root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
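# get_normalized_train_dir is not defined in this snippet. In the CS224n starter code it is
# the other half of the /tmp symlink trick described in the comments of the main() variants
# above; a sketch to that effect:
def get_normalized_train_dir(train_dir):
    # Point a fixed path at the real train_dir so checkpoint paths stay valid
    # even when the directory moves (e.g. on CodaLab).
    global_train_dir = '/tmp/cs224n-squad-train'
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    os.symlink(os.path.abspath(train_dir), global_train_dir)
    return global_train_dir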
def main(unused_argv):
    # Print an error message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" % unused_argv)

    # Define train_dir
    if not FLAGS.name and not FLAGS.train_dir and FLAGS.mode != EVAL_MODE:
        raise Exception("You need to specify either --name or --train_dir")
    FLAGS.train_dir = FLAGS.train_dir or os.path.join(LOGS_DIR, FLAGS.name)

    # If not specified, set d_ff to match d_model
    if FLAGS.d_ff == 0:
        FLAGS.d_ff = FLAGS.d_model

    # Initialize best model directory
    best_model_dir = os.path.join(FLAGS.train_dir, "best_checkpoint")

    # Define path for glove vecs
    FLAGS.glove_path = FLAGS.glove_path or os.path.join(FLAGS.data_dir, "glove.840B.{}d.txt".format(FLAGS.word_emb_size))
    FLAGS.char_emb_path = os.path.join(FLAGS.data_dir, "char_emb_file.txt")
    FLAGS.word_emb_path = os.path.join(FLAGS.data_dir, "word_emb_file.txt")

    # Get file paths to train/dev/test datafiles for tokenized queries, contexts and answers
    FLAGS.train_rec_path = os.path.join(FLAGS.data_dir, "train.tfrecord")
    FLAGS.train_ans_path = os.path.join(FLAGS.data_dir, "train_ans.json")
    FLAGS.train_info_path = os.path.join(FLAGS.data_dir, "train_info.json")
    FLAGS.dev_rec_path = os.path.join(FLAGS.data_dir, "dev.tfrecord")
    FLAGS.dev_ans_path = os.path.join(FLAGS.data_dir, "dev_ans.json")
    FLAGS.dev_info_path = os.path.join(FLAGS.data_dir, "dev_info.json")

    # Load word embedding matrix and char embedding matrix
    word_emb_matrix, word2id = get_word_embs(FLAGS.word_emb_path, FLAGS.word_emb_size)
    char_emb_matrix, char2id = get_char_embs(FLAGS.char_emb_path, FLAGS.char_emb_size)

    # Some GPU settings
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Split by mode
    if FLAGS.mode == TRAIN_MODE:
        # Load dataset info and answer files
        print("Loading train and dev datasets...")
        train_answers = data_from_json(FLAGS.train_ans_path)
        train_info = data_from_json(FLAGS.train_info_path)
        dev_answers = data_from_json(FLAGS.dev_ans_path)
        dev_info = data_from_json(FLAGS.dev_info_path)

        # Initialize data pipeline
        loader = get_data_loader(FLAGS, is_training=True)
        train_dataset = load_dataset(FLAGS, FLAGS.train_rec_path, loader, shuffle=True)
        train_iterator = train_dataset.make_one_shot_iterator()
        dev_dataset = load_dataset(FLAGS, FLAGS.dev_rec_path, loader, shuffle=True)
        dev_iterator = dev_dataset.make_one_shot_iterator()

        # Initialize the model
        input_handle = tf.placeholder(tf.string, shape=())
        input_iterator = tf.data.Iterator.from_string_handle(input_handle, train_dataset.output_types, train_dataset.output_shapes)
        model = SQuADTransformer(FLAGS, input_iterator, input_handle, word_emb_matrix, char_emb_matrix)

        # Setup train dir and logfile
        if not os.path.exists(FLAGS.train_dir):
            os.makedirs(FLAGS.train_dir)
        file_handler = logging.FileHandler(os.path.join(FLAGS.train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Make best model dir if necessary
        if not os.path.exists(best_model_dir):
            os.makedirs(best_model_dir)

        with tf.Session(config=config) as sess:
            # Load most recent model
            initialize_model(sess, model, FLAGS.train_dir, expect_exists=False)

            # Train
            model.train(sess, train_iterator, train_answers, train_info, dev_iterator, dev_answers, dev_info)

    elif FLAGS.mode == EVAL_MODE:
        if FLAGS.json_in_path == "":
            raise Exception("For {} mode, you need to specify --json_in_path".format(EVAL_MODE))
        if FLAGS.checkpoint_dir == "" and FLAGS.ensemble_path == "":
            raise Exception("For {} mode, you need to specify --checkpoint_dir or --ensemble_path".format(EVAL_MODE))

        FLAGS.is_training = False

        # Read the JSON data from file
        print("Loading test dataset from {}...".format(FLAGS.json_in_path))
        test_data = data_from_json(FLAGS.json_in_path)
        test_examples, test_answers, test_info, _, _ = preprocess(test_data)

        # Get formatted examples in memory for creating a TF Dataset
        formatted_examples, output_types, output_shapes = get_formatted_examples(FLAGS, test_examples, word2id, char2id)

        # Construct a generator function for building TF dataset
        def gen():
            infinite_idx = 0
            while True:
                yield formatted_examples[infinite_idx]
                infinite_idx = (infinite_idx + 1) % len(formatted_examples)

        # Initialize data pipeline (repeat so we can use this multiple times in an ensemble)
        test_dataset = tf.data.Dataset.from_generator(gen, output_types, output_shapes).repeat().batch(FLAGS.batch_size)
        test_iterator = test_dataset.make_one_shot_iterator()
        input_handle = tf.placeholder(tf.string, shape=())
        input_iterator = tf.data.Iterator.from_string_handle(input_handle, test_dataset.output_types, test_dataset.output_shapes)

        # Ensemble or single eval
        is_ensemble = FLAGS.ensemble_path != ""
        if is_ensemble:
            # Path to file with a list of directories for ensemble
            with open(FLAGS.ensemble_path, 'r') as fh:
                checkpoint_paths = [line.strip() for line in fh.readlines() if line]
            if len(checkpoint_paths) == 0:
                raise Exception("Ensemble path {} did not contain any checkpoint paths.".format(FLAGS.ensemble_path))
        else:
            checkpoint_paths = [FLAGS.checkpoint_dir]

        # Make predictions using all checkpoints specified in checkpoint_paths
        model = SQuADTransformer(FLAGS, input_iterator, input_handle, word_emb_matrix, char_emb_matrix)
        all_answers = defaultdict(list)  # Maps from UUID to list of (answer text, prob) pairs.
        for i in range(len(checkpoint_paths)):
            if is_ensemble:
                print("Ensemble model {} / {}...".format(i + 1, len(checkpoint_paths)))
            with tf.Session(config=config) as sess:
                # Load model from checkpoint_dir
                initialize_model(sess, model, checkpoint_paths[i], expect_exists=True, is_training=False)

                # Get a predicted answer for each example in the data
                num_batches = test_info['num_examples'] // FLAGS.batch_size + 1
                answers_dict = model.get_answers(sess, test_iterator, test_answers, num_batches)

                # Add it to the combined answers
                for k, v in answers_dict.items():
                    all_answers[k].append(v)

        # Combine the results into a final prediction
        if is_ensemble:
            print("Combining answers with max-vote...")
            answers_dict = {}
            for k, v in tqdm(all_answers.items()):
                answers_dict[k] = ensemble_max_vote(all_answers[k])

        # Write the uuid->answer mapping to a json file in the root dir
        print("Writing predictions to %s..." % FLAGS.json_out_path)
        with io.open(FLAGS.json_out_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(answers_dict, ensure_ascii=False))
        print("Wrote predictions to %s" % FLAGS.json_out_path)

    else:
        raise Exception("Unsupported mode: %s" % FLAGS.mode)
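# ensemble_max_vote is not shown in this snippet. A plausible sketch consistent with the
# "Combining answers with max-vote" message and the (answer text, prob) pairs collected
# above: keep the answer text proposed by the most checkpoints, breaking ties by summed
# probability (function body is hypothetical, not the original):
from collections import Counter, defaultdict

def ensemble_max_vote(answer_prob_pairs):
    votes = Counter(text for text, _ in answer_prob_pairs)
    total_prob = defaultdict(float)
    for text, prob in answer_prob_pairs:
        total_prob[text] += prob
    # Most votes first, then highest total probability.
    return max(votes, key=lambda text: (votes[text], total_prob[text]))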
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    _, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    prefix = os.path.join("data", "squad")

    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix)
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev")  # for our purposes this is the test set

    question_uuid_data = []
    with open(os.path.join("data", "squad") + "/dev.quid") as f:
        for line in f:
            question_uuid_data.append(line)

    # pad the data at load-time, so we don't need to do any masking later
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with the specified value
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    with tf.Graph().as_default():
        with tf.Session() as sess:
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.'
                                 + str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
            initialize_model(sess, qa, train_dir)

            # get predicted start-end indices
            a_s = []  # store all start index preds
            a_e = []  # store all end index preds
            a_s_l = []
            a_e_l = []
            f1 = exact_match = total = 0
            answers = {}
            prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
            for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle=False)):
                batch_test = batch[:4]
                (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                a_s = np.argmax(ys, axis=1)
                a_e = np.argmax(ye, axis=1)
                a_s_l = a_s_l + list(a_s)
                a_e_l = a_e_l + list(a_e)
                for j in range(len(a_s)):
                    p_raw = batch[7][j]
                    a_raw = batch[8][j]
                    s = a_s[j]
                    e = a_e[j]
                    pred_raw = ' '.join(p_raw.split()[s:e + 1])
                    f1 += f1_score(pred_raw, a_raw)
                    exact_match += exact_match_score(pred_raw, a_raw)
                    total += 1
                    answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                prog.update(i + 1, [("processed", i + 1)])

            exact_match = 100.0 * exact_match / total
            f1 = 100.0 * f1 / total
            print("First Answer Entity level F1/EM: %.2f/%.2f" % (f1, exact_match))
            # answers = generate_answers(question_uuid_data, a_s_l, a_e_l, context_data, rev_vocab)

            # write to json file in the root dir
            with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
                f.write(unicode(json.dumps(answers, ensure_ascii=False)))
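# f1_score and exact_match_score follow the official SQuAD v1.1 evaluation: answers are
# normalized (lowercase, strip punctuation and articles, collapse whitespace) before
# comparing. A condensed sketch of that metric code for reference:
import re
import string
from collections import Counter

def normalize_answer(s):
    s = ''.join(ch for ch in s.lower() if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = float(num_same) / len(pred_tokens)
    recall = float(num_same) / len(gt_tokens)
    return (2 * precision * recall) / (precision + recall)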