def yieldRouge(CorpusFile):
    """yield ROUGE scores of all sentences in corpus
    >>> rouge = yieldRouge('BioASQ-trainingDataset5b.json')
    >>> target = (0, '15829955', 0, {'N-1': 0.1519, 'S4': 0.0, 'SU4': 0.04525, 'N-2': 0.0, 'L': 0.0}, 'The identification of common variants that contribute to the genesis of human inherited disorders remains a significant challenge.')
    >>> next(rouge) == target
    True
    >>> target2 = (0, '15829955', 1, {'N-1': 0.31915, 'S4': 0.02273, 'SU4': 0.09399, 'N-2': 0.13043, 'L': 0.04445}, 'Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes.')
    >>> next(rouge) == target2
    True
    """
    data = json.load(open(CorpusFile, encoding='utf-8'))['questions']
    for qi in range(len(data)):
        if 'snippets' not in data[qi].keys():
            print("Warning: No snippets in question %s" % data[qi]['body'])
            continue
        if type(data[qi]['ideal_answer']) == list:
            ideal_answers = data[qi]['ideal_answer']
        else:
            ideal_answers = [data[qi]['ideal_answer']]
        for (pubmedid, senti, sent) in yield_candidate_text(data[qi]):
            rouge_scores = [rouge_engine.get_scores(h, sent)[0]
                            for h in ideal_answers]
            rouge_l = max(r['rouge-l']['f'] for r in rouge_scores)
            yield (qi, pubmedid, senti, rouge_l, sent)
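# --- Example usage (sketch) ---
# NNbaseline() below reads per-sentence ROUGE scores from 'rl-rouge5b.csv'
# through the columns 'qid' and 'L'.  The helper below is one way that file
# could be materialised from yieldRouge; it is not part of the original
# module, and the extra columns (pubmedid, sentid, sentence) are an
# assumption about the file layout beyond the 'qid' and 'L' columns that the
# reader actually uses.
def save_rouge_csv(corpus_file='BioASQ-trainingDataset5b.json',
                   csv_file='rl-rouge5b.csv'):
    """Dump the output of yieldRouge to a CSV file (illustrative sketch)."""
    import csv  # csv is also imported at module level; local import keeps the sketch self-contained
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['qid', 'pubmedid', 'sentid', 'L', 'sentence'])
        for qid, pubmedid, sentid, rouge_l, sentence in yieldRouge(corpus_file):
            writer.writerow([qid, pubmedid, sentid, rouge_l, sentence])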
def NNbaseline(testfile=EVALFILE):
    """Evaluate a baseline that uses a supervised NN"""
    KEEP_PROB = 0.2  # Rate of cells to keep at the dropout layer
    nanswers = {"summary": 6, "factoid": 2, "yesno": 2, "list": 3}
    with open(NN_LOGFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")
    with open(NN_EVALFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")

    env = Environment(jsonfile='BioASQ-trainingDataset5b.json')
    all_data = env.data
    with open('rl-rouge5b.csv') as f:
        csvfile = csv.DictReader(f)
        all_rouge = [l for l in csvfile]

    if testfile is None:
        # No test file given: use a random 80/20 train/test split
        all_indices = list(range(len(all_data)))
        np.random.shuffle(all_indices)
        split_boundary = int(len(all_indices) * .8)
        train_indices = all_indices[:split_boundary]
        test_indices = all_indices[split_boundary:]
    else:
        # Reuse the question IDs listed in the test file as the test set
        with open(testfile) as f:
            reader = csv.DictReader(f)
            test_indices = list(set(int(l['QID']) for l in reader)
                                & set(range(len(all_data))))
        train_indices = [i for i in range(len(all_data))
                         if i not in test_indices]
    print("Train indices:", train_indices)
    print("Test indices:", test_indices)

    # Train tf.idf on the questions, candidate sentences and ideal answers
    tfidf_train_text = [all_data[x]['body'] for x in train_indices]
    tfidf_train_text += [c[2] for x in train_indices
                         for c in yield_candidate_text(all_data[x])]
    ideal_summaries_sentences = []
    for x in train_indices:
        ideal_summaries = all_data[x]['ideal_answer']
        if type(ideal_summaries) != list:
            ideal_summaries = [ideal_summaries]
        for ideal_sum in ideal_summaries:
            ideal_summaries_sentences += sent_tokenize(ideal_sum)
    tfidf_train_text += ideal_summaries_sentences
    #print(len(tfidf_train_text))
    #print(tfidf_train_text[:10])
    tfidf = TfidfVectorizer(tokenizer=my_tokenize)
    tfidf.fit(tfidf_train_text)
    vocabulary_size = len(tfidf.get_feature_names())

    # Build the regression network (TensorFlow 1.x graph)
    graph = tf.Graph()
    with graph.as_default():
        X_state = tf.placeholder(tf.float32, shape=[None, 2 * vocabulary_size])  # + 1])
        Q_state = tf.placeholder(tf.float32, shape=[None, vocabulary_size])
        Y_result = tf.placeholder(tf.float32, shape=[None, 1])
        keep_prob = tf.placeholder(tf.float32)
        dropout1 = tf.nn.dropout(tf.concat((X_state, Q_state), 1), keep_prob)
        hidden = tf.layers.dense(
            dropout1, N_HIDDEN, activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.variance_scaling_initializer())
        dropout2 = tf.nn.dropout(hidden, keep_prob)
        outputs = tf.layers.dense(dropout2, 1, activation=None)
        mse = tf.reduce_mean(tf.square(Y_result - outputs))
        optimizer = tf.train.AdamOptimizer()
        train = optimizer.minimize(mse)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

    if VERBOSE > 0:
        print("Training NN Baseline")
    with tf.Session(graph=graph) as sess:
        if RESTORE:
            saver.restore(sess, NN_CHECKPOINT_PATH)
        else:
            init.run()
        episode = 0
        while True:
            # 1. Train on a random question that has candidate sentences
            while True:
                train_x = np.random.choice(train_indices)
                observation = env.reset(train_x)
                if len(env.candidates) > 0:
                    break
            tfidf_all_candidates = tfidf.transform(env.candidates).todense()
            tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]
            Y = [[float(l['L'])] for l in all_rouge
                 if int(l['qid']) == env.qid][:len(env.candidates)]
            Q = np.tile(tfidf.transform([env.question]).todense()[0, :],
                        (len(env.candidates), 1))
            X = np.vstack([np.hstack([tfidf_all_text, c])
                           for c in tfidf_all_candidates])
            sess.run(train, feed_dict={X_state: X,
                                       Q_state: Q,
                                       Y_result: Y,
                                       keep_prob: KEEP_PROB})

            # 2. Evaluate on the training question
            predicted = sess.run(outputs, feed_dict={X_state: X,
                                                     Q_state: Q,
                                                     keep_prob: 1.0})
            n = nanswers[env.qtype]
            topn = sorted(predicted)[-n:]
            while not observation['done']:
                if predicted[observation['next_candidate']] >= topn[0]:
                    action = 1
                else:
                    action = 0
                observation = env.step(action)
            reward = observation['reward']
            print("Episode: %i, reward: %f" % (episode, reward))
            with open(NN_LOGFILE, 'a') as f:
                f.write('%i,%f,%i,"%s"\n' %
                        (episode, reward, env.qid,
                         " ".join([str(x) for x in observation['summary']])))
            episode += 1

            if episode % SAVE_EPISODES == 0:
                print("Saving checkpoint in %s" % (NN_CHECKPOINT_PATH))
                saver.save(sess, NN_CHECKPOINT_PATH)

                # 3. Evaluate on the test data
                print("Testing results")
                test_results = []
                for test_x in test_indices:
                    observation = env.reset(test_x)
                    if len(env.candidates) == 0:
                        continue
                    tfidf_all_candidates = tfidf.transform(env.candidates).todense()
                    tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]
                    Q = np.tile(tfidf.transform([env.question]).todense()[0, :],
                                (len(env.candidates), 1))
                    X = np.vstack([np.hstack([tfidf_all_text, c])
                                   for c in tfidf_all_candidates])
                    predicted = sess.run(outputs, feed_dict={X_state: X,
                                                             Q_state: Q,
                                                             keep_prob: 1.0})
                    n = nanswers[env.qtype]
                    topn = sorted(predicted)[-n:]
                    while not observation['done']:
                        if predicted[observation['next_candidate']] >= topn[0]:
                            action = 1
                        else:
                            action = 0
                        observation = env.step(action)
                    reward = observation['reward']
                    test_results.append(reward)
                    with open(NN_EVALFILE, 'a') as f:
                        f.write('%i,%f,%i,"%s"\n' %
                                (episode, reward, env.qid,
                                 " ".join([str(x) for x in observation['summary']])))
                print("Mean of evaluation results:", np.mean(test_results))
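# --- Assumed tokenizer (sketch) ---
# NNbaseline() and train() both build their TfidfVectorizer with
# tokenizer=my_tokenize, which is defined elsewhere in this module.  A
# minimal stand-in, assuming a lower-cased NLTK word tokenizer, could look
# like this; the name and behaviour are illustrative only and should not be
# taken as the original implementation.
def my_tokenize_sketch(text):
    """Lower-cased NLTK word tokenisation (illustrative stand-in for my_tokenize)."""
    from nltk.tokenize import word_tokenize
    return [token.lower() for token in word_tokenize(text)]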
def train():
    with open(LOGFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")
    with open(EVALFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")

    env = Environment(jsonfile='BioASQ-trainingDataset5b.json')
    alldata = list(range(len(env.data)))
    np.random.shuffle(alldata)
    split_boundary = int(len(alldata) * .8)
    train_indices = alldata[:split_boundary]
    test_indices = alldata[split_boundary:]

    # Train tf.idf
    if VERBOSE > 0:
        print("Training tf.idf")
    tfidf_train_text = [env.data[x]['body'] for x in train_indices]
    tfidf_train_text += [c[2] for x in train_indices
                         for c in yield_candidate_text(env.data[x])]
    ideal_summaries_sentences = []
    for x in train_indices:
        ideal_summaries = env.data[x]['ideal_answer']
        if type(ideal_summaries) != list:
            ideal_summaries = [ideal_summaries]
        for ideal_sum in ideal_summaries:
            ideal_summaries_sentences += sent_tokenize(ideal_sum)
    tfidf_train_text += ideal_summaries_sentences
    #print(len(tfidf_train_text))
    #print(tfidf_train_text[:10])
    tfidf = TfidfVectorizer(tokenizer=my_tokenize)
    tfidf.fit(tfidf_train_text)

    nnModel = NNModel(len(tfidf.get_feature_names()))

    if VERBOSE > 0:
        print("Training REINFORCE")
    with tf.Session(graph=nnModel.graph) as sess:
        if RESTORE:
            nnModel.saver.restore(sess, CHECKPOINT_PATH)
        else:
            nnModel.init.run()

        # Pick the first training question that has candidate sentences
        while True:
            train_x = np.random.choice(train_indices)
            observation = env.reset(train_x)  # Reset to a random question
            if len(env.candidates) > 0:
                break
        tfidf_all_candidates = tfidf.transform(env.candidates)
        tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]
        all_gradients = []
        episode = 0
        while True:
            # The following code is based on "Policy Gradients" at
            # https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb
            this_candidate = observation['next_candidate']
            tfidf_this_candidate = tfidf_all_candidates[this_candidate].todense()
            tfidf_remaining_candidates = tfidf.transform([
                " ".join(env.candidates[this_candidate + 1:])
            ]).todense()[0, :]
            tfidf_summary = tfidf.transform([
                " ".join([env.candidates[x] for x in observation['summary']])
            ]).todense()[0, :]
            tfidf_question = tfidf.transform([env.question]).todense()[0, :]
            #print(tfidf_question.shape)
            XState = np.hstack([tfidf_all_text,
                                tfidf_this_candidate,
                                tfidf_remaining_candidates,
                                tfidf_summary])  #, [[len(observation['summary'])]]])
            action_val, gradients_val = sess.run(
                [nnModel.action, nnModel.gradients],
                feed_dict={nnModel.X_state: XState,
                           nnModel.Q_state: tfidf_question,
                           nnModel.episode: episode})
            all_gradients.append(gradients_val)
            #action = 1 if np.random.uniform() < action_prob else 0
            observation = env.step(action_val)
            if observation['done']:
                # Reward all actions that led to the summary
                reward = observation['reward']
                print("Episode: %i, reward: %f" % (episode, reward))
                with open(LOGFILE, 'a') as f:
                    f.write('%i,%f,%i,"%s"\n' %
                            (episode, reward, env.qid,
                             " ".join([str(x) for x in observation['summary']])))
                feed_dict = {}
                #print(nnModel.gradient_placeholders[0].shape)
                for var_index, grad_placeholder in enumerate(nnModel.gradient_placeholders):
                    mean_gradients = np.mean(
                        [reward * one_gradient[var_index]
                         for one_gradient in all_gradients],
                        axis=0)
                    feed_dict[grad_placeholder] = mean_gradients
                sess.run(nnModel.training_op, feed_dict=feed_dict)
                episode += 1

                if episode % SAVE_EPISODES == 0:
                    print("Saving checkpoint in %s" % (CHECKPOINT_PATH))
                    nnModel.saver.save(sess, CHECKPOINT_PATH)

                    # Evaluate on the test data
                    print("Testing results")
                    test_results = []
                    for test_x in test_indices:
                        observation = env.reset(test_x)
                        if len(env.candidates) == 0:
                            continue
                        tfidf_all_candidates = tfidf.transform(env.candidates)
                        tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]
                        while not observation['done']:
                            this_candidate = observation['next_candidate']
                            tfidf_this_candidate = tfidf_all_candidates[this_candidate].todense()
                            tfidf_remaining_candidates = tfidf.transform([
                                " ".join(env.candidates[this_candidate + 1:])
                            ]).todense()[0, :]
                            tfidf_summary = tfidf.transform([
                                " ".join([env.candidates[x] for x in observation['summary']])
                            ]).todense()[0, :]
                            tfidf_question = tfidf.transform([env.question]).todense()[0, :]
                            #print(tfidf_question.shape)
                            XState = np.hstack([tfidf_all_text,
                                                tfidf_this_candidate,
                                                tfidf_remaining_candidates,
                                                tfidf_summary])  #, [[len(observation['summary'])]]])
                            output_val = sess.run(nnModel.outputs,
                                                  feed_dict={nnModel.X_state: XState,
                                                             nnModel.Q_state: tfidf_question})
                            action_val = 0
                            if output_val < 0.5:
                                action_val = 1
                            observation = env.step(action_val)
                        reward = observation['reward']
                        test_results.append(reward)
                        with open(EVALFILE, 'a') as f:
                            f.write('%i,%f,%i,"%s"\n' %
                                    (episode, reward, env.qid,
                                     " ".join([str(x) for x in observation['summary']])))
                    print("Mean of evaluation results:", np.mean(test_results))

                # Pick next training question
                while True:
                    train_x = np.random.choice(train_indices)
                    observation = env.reset(train_x)  # Reset to a random question
                    if len(env.candidates) > 0:
                        break
                all_gradients = []
                tfidf_all_candidates = tfidf.transform(env.candidates)
                tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]
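# --- Persisting the tf.idf model (sketch) ---
# bioasq_run() below restores a fitted TfidfVectorizer from
# BEST_TFIDF_FILENAME via pickle.  train() as written does not persist its
# vectorizer, so a dump along the lines of the helper below is assumed to
# have produced that file; the helper itself is illustrative and not part of
# the original module.
def save_tfidf_sketch(tfidf, filename=BEST_TFIDF_FILENAME):
    """Pickle a fitted TfidfVectorizer so that bioasq_run() can restore it."""
    import pickle  # pickle is also imported at module level; local import keeps the sketch self-contained
    with open(filename, 'wb') as f:
        pickle.dump(tfidf, f)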
def bioasq_run(test_data='phaseB_3b_01.json',
               output_filename='bioasq-out-rl.json'):
    """Run the model for BioASQ"""
    print("Running BioASQ")
    with open(BEST_TFIDF_FILENAME, 'rb') as f:
        tfidf = pickle.load(f)
    testset = json.load(open(test_data, encoding='utf-8'))['questions']
    if DEBUG:
        testset = testset[:10]
    result = []
    nnModel = NNModel(len(tfidf.get_feature_names()))
    with tf.Session(graph=nnModel.graph) as sess:
        nnModel.saver.restore(sess, BEST_CHECKPOINT_PATH)
        for r in testset:
            test_question = r['body']
            test_id = r['id']
            test_candidates = [sent for pubmedid, senti, sent
                               in yield_candidate_text(r)]
            test_candidates = test_candidates[:20]
            if len(test_candidates) == 0:
                print("Warning: no text to summarise")
                test_summary = ''
            else:
                if QTYPES:
                    q_types = [0.0] * len(QTYPES)
                    q_types[QTYPES.index(r['type'])] = 1.0
                else:
                    q_types = []
                tfidf_all_candidates = tfidf.transform(test_candidates)
                tfidf_all_text = tfidf.transform([" ".join(test_candidates)]).todense()[0, :]
                test_summary = ''
                len_summary = 0
                output_probs = []
                for this_candidate in range(len(test_candidates)):
                    tfidf_this_candidate = tfidf_all_candidates[this_candidate].todense()
                    tfidf_remaining_candidates = tfidf.transform([
                        " ".join(test_candidates[this_candidate + 1:])
                    ]).todense()[0, :]
                    tfidf_summary = tfidf.transform([test_summary]).todense()[0, :]
                    tfidf_question = tfidf.transform([test_question]).todense()[0, :]
                    XState = np.hstack([tfidf_all_text,
                                        tfidf_this_candidate,
                                        tfidf_remaining_candidates,
                                        tfidf_summary,
                                        # [[len(observation['summary']), this_candidate]]])
                                        [[len_summary]],
                                        [q_types]])
                    output_val = sess.run(nnModel.outputs,
                                          feed_dict={nnModel.X_state: XState,
                                                     nnModel.Q_state: tfidf_question})
                    output_probs.append(1 - output_val)
                    if output_val < 0.5:
                        len_summary += 1
                        if test_summary == '':
                            test_summary = test_candidates[this_candidate]
                        else:
                            test_summary += " " + test_candidates[this_candidate]
                if test_summary == '' and len(test_candidates) > 0:
                    print("Warning: no summary produced; returning top sentence %i"
                          % np.argmax(output_probs))
                    #print("Output probabilities are:")
                    #print(output_probs)
                    test_summary = test_candidates[np.argmax(output_probs)]
            if r['type'] == "yesno":
                exactanswer = "yes"
            else:
                exactanswer = ""
            result.append({"id": test_id,
                           "ideal_answer": test_summary,
                           "exact_answer": exactanswer})
    print("Saving results in file %s" % output_filename)
    with open(output_filename, 'w') as f:
        f.write(json.dumps({"questions": result}, indent=2))
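# --- Example entry point (sketch) ---
# The original module does not include a __main__ guard; the one below is an
# assumption showing one way the functions above could be invoked, provided
# the module-level constants (EVALFILE, LOGFILE, CHECKPOINT_PATH,
# BEST_TFIDF_FILENAME, BEST_CHECKPOINT_PATH, etc.) are defined.
if __name__ == '__main__':
    train()                    # train the REINFORCE model and log evaluation results
    # NNbaseline()             # or: evaluate the supervised NN baseline
    # bioasq_run('phaseB_3b_01.json', 'bioasq-out-rl.json')  # or: produce a BioASQ submission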