def get_data(sections_of_interest=None, mode='experiment', include_sentence_span_splits=False):
    """
    Build the train / validation / test splits together with the fitted vectorizer.

    The global `random` generator is re-seeded with 177 on every call, so the
    shuffle used by the 'experiment' split starts from the same state each time.

    @param sections_of_interest forwarded unchanged to the preprocessor calls.
    @param mode which split to build:
        'experiment' -- shuffle the training ids; first 90% train, last 10%
                        validation; the official validation ids act as "test".
        'paper'      -- official train / validation / test ids.
        'minimal'    -- first 5 training ids, first 5 validation ids as
                        validation and validation ids 5..9 as test.
    @param include_sentence_span_splits forwarded unchanged to the preprocessor calls.
    @return train_Xy, val_Xy, test_Xy, inference_vectorizer
    @raise ValueError if `mode` is not one of the three modes above.
    """
    random.seed(177)
    known_modes = ('experiment', 'paper', 'minimal')
    if mode not in known_modes:
        raise ValueError("unknown mode {!r}; expected one of {}".format(mode, known_modes))

    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")

    if mode == 'experiment':
        train_docs = list(preprocessor.train_document_ids())
        random.shuffle(train_docs)
        split_index = int(len(train_docs) * .9)
        train_ids = set(train_docs[:split_index])
        val_ids = set(train_docs[split_index:])
        # in development, our "test" set is our validation ids so we don't cheat.
        test_ids = preprocessor.validation_document_ids()
    elif mode == 'paper':
        train_ids = preprocessor.train_document_ids()
        val_ids = preprocessor.validation_document_ids()
        test_ids = preprocessor.test_document_ids()
    else:  # 'minimal'
        train_ids = list(preprocessor.train_document_ids())[:5]
        validation_docs = list(preprocessor.validation_document_ids())
        val_ids = validation_docs[:5]
        test_ids = validation_docs[5:10]

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        train_ids,
        sections_of_interest=sections_of_interest,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=include_sentence_span_splits)
    val_Xy = preprocessor.get_Xy(
        val_ids,
        inference_vectorizer,
        sections_of_interest=sections_of_interest,
        include_sentence_span_splits=include_sentence_span_splits)
    test_Xy = preprocessor.get_Xy(
        test_ids,
        inference_vectorizer,
        sections_of_interest=sections_of_interest,
        include_sentence_span_splits=include_sentence_span_splits)
    return train_Xy, val_Xy, test_Xy, inference_vectorizer
def load_data(use_test, model_loc):
    """
    Assemble train / validation / test examples and run each split through `reformat`.

    @param use_test when falsy, the last 10% of the shuffled training examples
           become the validation split and the official validation documents
           are used as the test split; otherwise the official validation and
           test documents are used directly.
    @param model_loc location handed to `load_model_scan`.
    @return x_train, y_train, x_val, y_val, x_test, y_test
    """
    train_ids = set(preprocessor.train_document_ids())
    test_ids = set(preprocessor.test_document_ids())
    validation_ids = set(preprocessor.validation_document_ids())

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        train_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=True)

    # the model is needed by `reformat` below
    model = load_model_scan(inference_vectorizer, model_loc)

    # the shuffle happens in both configurations, before any split is taken
    random.shuffle(train_Xy)
    if use_test:
        val_Xy = preprocessor.get_Xy(
            validation_ids,
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(
            test_ids,
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
    else:
        # internal validation set: 90% of the training data stays, 10% is held out
        cut = int(len(train_Xy) * .9)
        val_Xy = train_Xy[cut:]
        train_Xy = train_Xy[:cut]
        test_Xy = preprocessor.get_Xy(
            set(preprocessor.validation_document_ids()),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)

    reformatted = [
        reformat(split, inference_vectorizer, model)
        for split in (train_Xy, val_Xy, test_Xy)
    ]
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = reformatted
    return x_train, y_train, x_val, y_val, x_test, y_test
def main():
    """
    Score `eval_sentence` on the test documents using the annotators' reasoning text.

    Prompts are indexed by their first column; for each annotation row,
    column 7 is stored as the answer when column 3 is truthy and column 6 is
    appended to the reasoning when column 4 is truthy. Only prompts whose
    'xml' value is among the test document ids and whose parsed answer is
    non-empty are scored.

    @return accuracy, macro F1, macro precision, macro recall
    """
    prompt_rows = np.asarray(preprocessor.read_prompts())
    annotation_rows = np.asarray(preprocessor.read_annotations())
    preds, y_test = [], []

    # one record per prompt, keyed by the prompt's first column
    data = {
        row[0]: {
            'xml': row[1],
            'outcome': row[2],
            'intervention': row[3],
            'comparator': row[4],
            'answer': '',
            'reasoning': '',
        }
        for row in prompt_rows
    }

    for ann in annotation_rows:
        if ann[3]:
            data[ann[1]]['answer'] = ann[7]
        if ann[4]:
            data[ann[1]]['reasoning'] += str(ann[6])

    test_id = preprocessor.test_document_ids()

    for record in data.values():
        # only prompts that belong to a test file are evaluated
        if record['xml'] not in test_id:
            continue

        # try to parse text to remove weird things
        out = try_except_parse(record['outcome'])
        inter = try_except_parse(record['intervention'])
        cmp = try_except_parse(record['comparator'])
        ans = try_except_parse(record['answer'])
        res = try_except_parse(record['reasoning'])

        # no valid answer for this prompt
        if ans == '':
            continue
        y_test.append(ans)

        # the concatenated reasoning is used as the candidate sentence
        guess = eval_sentence(res, out, inter, cmp)
        if guess == "No significant difference":
            label = 0
        elif guess == "Significantly decreased":
            label = -1
        else:
            label = 1
        preds.append(label)

    return (
        accuracy_score(y_test, preds),
        f1_score(y_test, preds, average='macro'),
        precision_score(y_test, preds, average='macro'),
        recall_score(y_test, preds, average='macro'),
    )
def train():
    """
    Fit the punchline extractor and checkpoint its best weights.

    This assumes access to evidence_inference:
    https://github.com/jayded/evidence-inference/tree/master/evidence_inference
    which is not needed in general to load the trained model; that is why the
    import lives inside this function.

    Side effects: writes the model architecture to "punchline_model.json" and
    lets the checkpoint callback write "punchline.weights.best.hdf5".
    """
    from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, validation_document_ids, get_train_Xy

    extractor_model = PunchlineExtractor()

    tr_ids = list(train_document_ids())
    val_ids = validation_document_ids()

    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)

    # Create vectors and targets for extraction task
    X_k, y_k = make_Xy(train_Xy, extractor_model.bc)
    print("train data loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy(val_Xy, extractor_model.bc, neg_samples=1)
    print("val data loaded!")

    # Fit the model!
    filepath = "punchline.weights.best.hdf5"
    # NOTE(review): some Keras versions report this metric as 'val_accuracy'
    # rather than 'val_acc' -- confirm against the installed version.
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    # the architecture is saved before fitting
    with open("punchline_model.json", "w") as outf:
        outf.write(extractor_model.model.to_json())

    print("fitting punchline extractor!")
    extractor_model.model.fit(X_k,
                              y_k,
                              validation_data=(X_kv, y_kv),
                              callbacks=callbacks_list,
                              epochs=50)
def run_scan_net_regression(loc = './scan_net.pth'):
    """
    Train the scan model via `train_scan` and save its state dict to `loc`.

    @param loc path handed to `torch.save` for the trained model's state dict.
    """
    training_ids = set(preprocessor.train_document_ids())
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        training_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=True)

    if USE_TEST:
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
    else:
        # internal validation set carved from the training data (90% / 10%);
        # the official validation documents then serve as the test split.
        cut = int(len(train_Xy) * .9)
        val_Xy = train_Xy[cut:]
        train_Xy = train_Xy[:cut]
        test_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)

    # training data goes through `train_reformat`, the other two through `scan_reform`
    train_Xy = train_reformat(train_Xy)
    val_Xy = scan_reform(val_Xy)
    test_Xy = scan_reform(test_Xy)

    # NOTE(review): the three trailing numbers are positional; presumably
    # epochs, batch size and early-stopping patience -- confirm against
    # train_scan's signature.
    model = train_scan(inference_vectorizer, train_Xy, val_Xy, test_Xy, 100, 32, 5)
    torch.save(model.state_dict(), loc)
def run_scan_net_redux(loc='scan_net_redux.pth'):
    """
    Build a `ScanNet`, train it via `train_scan` and save its state dict to `loc`.

    @param loc path handed to `torch.save` for the trained model's state dict.
    """
    here = dirname(abspath(__file__))
    parent_path = abspath(os.path.join(here, '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()),
        sections_of_interest=None,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=True)

    if USE_TEST:
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
    else:
        # internal validation set carved from the training data (90% / 10%);
        # the official validation documents then serve as the test split.
        cut = int(len(train_Xy) * .9)
        val_Xy = train_Xy[cut:]
        train_Xy = train_Xy[:cut]
        test_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)

    se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
    if USE_CUDA:
        se_scn = se_scn.cuda()

    # NOTE(review): the three trailing numbers are positional; presumably
    # epochs, batch size and early-stopping patience -- confirm against
    # train_scan's signature.
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, test_Xy, 50, 32, 10)

    # save to specified path
    torch.save(se_scn.state_dict(), loc)
def run_scan_net_ico(loc = "scan_net_ICO_no_attn_test.pth"):
    """
    Build a `ScanNet`, train it, evaluate it with `test_model` and save its state dict.

    Progress messages are printed after each stage. The metrics returned by
    `test_model` are unpacked but not returned or printed here.

    @param loc path handed to `torch.save` for the trained model's state dict.
    """
    print("Modules loaded.")
    here = dirname(abspath(__file__))
    parent_path = abspath(os.path.join(here, '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()),
        sections_of_interest=None,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=True)
    print("Train Data Achieved")

    if USE_TEST:
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
    else:
        # internal validation set carved from the training data (90% / 10%);
        # the official validation documents then serve as the test split.
        cut = int(len(train_Xy) * .9)
        val_Xy = train_Xy[cut:]
        train_Xy = train_Xy[:cut]
        test_Xy = preprocessor.get_Xy(
            list(preprocessor.validation_document_ids()),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
    print("Test Data Achieved")

    se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
    if USE_CUDA:
        se_scn = se_scn.cuda()
    print("Model loaded")

    # NOTE(review): the three trailing numbers are positional; presumably
    # epochs, batch size and early-stopping patience -- confirm against
    # train_scan's signature.
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, 50, 32, 10)
    acc, f1, prc, rc, auc = test_model(se_scn, test_Xy, inference_vectorizer)

    # save to specified path
    torch.save(se_scn.state_dict(), loc)
def train_simple_inference_net(n_epochs=30):
    """
    Fit a `SimpleInferenceNet` and checkpoint its best weights.

    Side effects: writes the model architecture to "inference_model.json" and
    lets the checkpoint callback write "inference.weights.best.hdf5".

    @param n_epochs number of epochs passed to the model's `fit`.
    """
    inf_net = SimpleInferenceNet()

    tr_ids = list(train_document_ids())
    val_ids = validation_document_ids()

    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)

    X_k, y_k = make_Xy_inference(train_Xy, inf_net.bc)
    print("train data for inference task loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy_inference(val_Xy, inf_net.bc)
    print("val data loaded!")

    filepath = "inference.weights.best.hdf5"
    # NOTE(review): some Keras versions report this metric as 'val_accuracy'
    # rather than 'val_acc' -- confirm against the installed version.
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    # the architecture is saved before fitting
    with open("inference_model.json", "w") as outf:
        outf.write(inf_net.model.to_json())

    print("fitting inference model!")
    inf_net.model.fit(X_k,
                      y_k,
                      validation_data=(X_kv, y_kv),
                      callbacks=callbacks_list,
                      epochs=n_epochs)
def load_data(use_test, bow=True):
    """
    Load the data into a train/val/test set that allows for easy access.

    Every annotation of a prompt yields one example of the form
    [evidence, outcome, intervention, comparator] (the last three lowercased),
    labelled with the most common annotator label for that prompt.

    @param use_test when falsy, a random 10% of the training documents is held
           out as a dev set; the dev set is then returned as validation and
           the official validation documents as test.
    @param bow when truthy the splits are passed through `bag_of_words`;
           otherwise the raw lists are returned.
    @return bag-of-word representation of training, validation, test sets
            (with labels), or [x_train, y_train, x_val, y_val, x_test, y_test]
            when `bow` is falsy.
    @raise ValueError if a prompt's study id is in none of the id sets.
    """
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev = [], [], [], []
    x_val, y_val, x_test, y_test = [], [], [], []

    # column arrays are extracted once instead of once per evidence row
    pids = prompts[STUDY_ID_COL].values
    prompt_ids = prompts["PromptID"].values
    outcomes = prompts["Outcome"].values
    interventions = prompts["Intervention"].values
    comparators = prompts["Comparator"].values

    for i, id_ in enumerate(pids):
        annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] == prompt_ids[i]]
        labels = annotations_for_prompt[[LBL_COL_NAME, EVIDENCE_COL_NAME]].values
        if len(labels) == 0:
            # nothing to add for this prompt
            continue

        # extract i/c/o -- identical for every evidence row of this prompt
        out = outcomes[i].lower()
        inter = interventions[i].lower()
        cmp = comparators[i].lower()

        # Most common label for the prompt. np.atleast_1d makes this work both
        # when stats.mode returns a length-1 array and when it returns a
        # scalar for 1-D input.
        loss = np.atleast_1d(stats.mode([l1[0] for l1 in labels])[0])[0]

        # add to correct pile: train/val/test
        if id_ in dev_doc_ids and not use_test:
            xs, ys = x_dev, y_dev
        elif id_ in train_doc_ids:
            xs, ys = x_train, y_train
        elif id_ in val_doc_ids:
            xs, ys = x_val, y_val
        elif id_ in test_doc_ids:
            xs, ys = x_test, y_test
        else:
            raise ValueError("Unknown study id {}".format(id_))

        # one example per annotator reasoning (second column of `labels`)
        for label_row in labels:
            xs.append([label_row[1], out, inter, cmp])
            ys.append(loss)

    # transform to np.array
    y_test = np.asarray(y_test)

    # if we are removing the test set, use validation as test set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    print("Running bag of words...")
    if bow:
        ret = bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test)
    else:
        ret = [x_train, y_train, x_val, y_val, x_test, y_test]
    return ret
if USE_TEST:
    val_Xy = preprocessor.get_Xy(
        set(preprocessor.validation_document_ids()),
        inference_vectorizer,
        sections_of_interest=None,
        include_sentence_span_splits=True)
    test_Xy = preprocessor.get_Xy(
        set(preprocessor.test_document_ids()),
        inference_vectorizer,
        sections_of_interest=None,
        include_sentence_span_splits=True)
else:
    # last 10% of the training examples become validation; the official
    # validation documents are used as the test split.
    cut = int(len(train_Xy) * .9)
    val_Xy = train_Xy[cut:]
    train_Xy = train_Xy[:cut]
    test_Xy = preprocessor.get_Xy(
        set(preprocessor.validation_document_ids()),
        inference_vectorizer,
        sections_of_interest=None,
        include_sentence_span_splits=True)

print("Test data loaded.")

# training data goes through `train_reformat`, the other two through `scan_reform`
train_Xy = train_reformat(train_Xy)
val_Xy = scan_reform(val_Xy)
test_Xy = scan_reform(test_Xy)
bow = Bag_of_words(inference_vectorizer)
print("Data transformed.")

# load the model
model = load_model_scan(inference_vectorizer, './models/')
def load_data(use_test, bow=True):
    """
    Load the data into a train/val/test set that allows for easy access.

    For prompts outside the (train minus dev) documents, one extra row is
    added whose second field is taken from a neighbouring prompt of the same
    study (previous prompt preferred, otherwise the next one); the extra row
    reuses the prompt's last label.

    @param use_test when falsy, a random 10% of the training documents is held
           out as a dev set; the dev set is then returned as validation and
           the official validation documents as test.
    @param bow when truthy the splits are passed through `bag_of_words`;
           otherwise the raw lists are returned.
    @return bag-of-word representation of training, validation, test sets
            (with labels), or [x_train, y_train, x_val, y_val, x_test, y_test]
            when `bow` is falsy.
    @raise ValueError if a prompt's study id is in none of the id sets.
    """
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev = [], [], [], []
    x_val, y_val, x_test, y_test = [], [], [], []

    pids = prompts[STUDY_ID_COL].values
    n_prompts = len(pids)

    # Built once instead of on every iteration.
    # NOTE(review): with use_test truthy, dev documents are routed to the
    # training pile below but are not counted as "in training" here, so they
    # still receive a mismatched row -- confirm this is intended.
    training_only_ids = train_doc_ids - dev_doc_ids

    for i in range(n_prompts):
        id_, data, losses = parse_prompt_id_data(annotations, prompts, pids, i)
        in_training = id_ in training_only_ids

        # get a reasoning from previous/next prompt id
        neighbour = None
        if not in_training:
            if i > 0 and id_ == pids[i - 1]:
                neighbour = i - 1
            elif i + 1 < n_prompts and id_ == pids[i + 1]:
                # the bounds check must be on i + 1, otherwise the last prompt
                # indexes past the end of `pids`
                neighbour = i + 1

        if neighbour is not None:
            _, mismatched_data, _ = parse_prompt_id_data(
                annotations, prompts, pids, neighbour)
            # add the mismatched data here
            row = copy.deepcopy(data[0])
            row[1] = mismatched_data[0][1]
            data.append(row)
            losses.append(losses[-1])

        if not data:
            continue

        # find where to put this section
        if id_ in dev_doc_ids and not use_test:
            xs, ys = x_dev, y_dev
        elif id_ in train_doc_ids:
            xs, ys = x_train, y_train
        elif id_ in val_doc_ids:
            xs, ys = x_val, y_val
        elif id_ in test_doc_ids:
            xs, ys = x_test, y_test
        else:
            raise ValueError("Unknown study id {}".format(id_))

        for tmp, loss in zip(data, losses):
            xs.append(tmp)
            ys.append(loss)

    # if we are removing the test set, use validation as test set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    if bow:
        ret = bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test, 5)
    else:
        ret = [x_train, y_train, x_val, y_val, x_test, y_test]
    return ret
weighted by number of prompts for that document. """ tokens = {} # Map article ids to token prompts = {} # Map article ids to num prompts for d in Xy: n_tokens = len(d['article']) tokens[d['a_id']] = n_tokens if d['a_id'] in prompts: prompts[d['a_id']] += 1 else: prompts[d['a_id']] = 1 total_entropy = 0 for art in prompts.keys(): total_entropy += np.log(tokens[art]) * prompts[art] / len(Xy) return total_entropy tr_ids, val_ids, te_ids = preprocessor.train_document_ids( ), preprocessor.validation_document_ids(), preprocessor.test_document_ids() train_Xy, inference_vectorizer = preprocessor.get_train_Xy(tr_ids) val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer) test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer) print(calculate_entropy(train_Xy)) print(calculate_entropy(val_Xy)) print(calculate_entropy(test_Xy))
pred = preds[0].data.tolist()[0] return pred print("Loading data...") # get training data train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(list(preprocessor.train_document_ids())), sections_of_interest=None, vocabulary_file="..//..//.//annotations/vocab.txt", include_sentence_span_splits = True) print("Training data loaded...") if not(USE_TEST): split_index = int(len(train_Xy) * .9) val_Xy = train_Xy[split_index:] train_Xy = train_Xy[:split_index] test_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) else: val_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) test_Xy = preprocessor.get_Xy(set(list(preprocessor.test_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) print("Test data loaded...") # modify training data val_Xy, test_Xy = scan_reform(val_Xy), scan_reform(test_Xy) print("Reformatted data.") # load the model model = load_model_scan(inference_vectorizer, './models/scan_model_neural.pth') print("Model loaded...") # after loading the model, get all predictions. instances = val_Xy # validation set for now... y_preds = [] y_test = []