def parse_prompt_id_data(annotations, prompts, pids, i):
    """
    Parse a single prompt and get the information necessary.

    @param annotations is the annotation dictionary that contains all answers/reasoning data.
    @param prompts is the prompt dictionary that contains all i/c/o data.
    @param pids is the ordering of the PMC ids.
    @param i is which prompt to look at.
    @return id_ the prompt identification number.
    @return data a list of [text, reasoning, outcome, intervention, comparator] rows.
    @return losses a list of -1/0/1 values for sig. decrease / no diff. / sig. increase.
    """
    # information for this article
    annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] ==
                                         prompts["PromptID"].values[i]]
    labels = annotations_for_prompt[[LBL_COL_NAME, EVIDENCE_COL_NAME]].values
    id_ = pids[i]
    article = preprocessor.get_article(id_)
    article_text = preprocessor.extract_raw_text(article).lower()

    # extract i/c/o
    out = prompts["Outcome"].values[i].lower()
    inter = prompts["Intervention"].values[i].lower()
    cmp = prompts["Comparator"].values[i].lower()

    # this is all of the reasonings
    res = [a[1] for a in labels]
    losses = []
    data = []

    # the label for this prompt is the mode (majority) of the annotators' answers
    loss = stats.mode([l1[0] for l1 in labels])[0][0]

    # match up reasonings with text
    for r in res:
        tmp = [article_text, r, out, inter, cmp]
        data.append(tmp)
        losses.append(loss)

    return id_, data, losses
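
# Illustrative usage only (not from the original module): a minimal sketch of how
# parse_prompt_id_data could be driven over every prompt to build a flat dataset.
# It assumes the same `preprocessor` module and column constants used elsewhere in
# this repository (e.g. STUDY_ID_COL) are in scope.
def build_reasoning_dataset():
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()
    pids = prompts[STUDY_ID_COL].values
    all_ids, all_data, all_losses = [], [], []
    for i in range(len(pids)):
        id_, data, losses = parse_prompt_id_data(annotations, prompts, pids, i)
        all_ids.append(id_)
        all_data.extend(data)
        all_losses.extend(losses)
    return all_ids, all_data, all_losses
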
def main():
    # Load in the data
    prompts = np.asarray(preprocessor.read_prompts())
    annotations = np.asarray(preprocessor.read_annotations())
    data = {}
    preds, y_test = [], []
    sentences = []

    # store data in dictionary
    for p in prompts:
        data[p[0]] = {
            'xml': p[1],
            'outcome': p[2],
            'intervention': p[3],
            'comparator': p[4],
            'answer': '',
            'reasoning': ''
        }

    for a in annotations:
        if a[3]:
            data[a[1]]['answer'] = a[7]
        if a[4]:
            data[a[1]]['reasoning'] += str(a[6]) + "; "

    test_id = preprocessor.test_document_ids()

    # get predictions and add them to array
    for k in data.keys():
        # try to parse text to remove weird things
        id_ = data[k]['xml']

        # if the file is not a test file
        if id_ not in test_id:
            continue

        out = try_except_parse(data[k]['outcome'])
        inter = try_except_parse(data[k]['intervention'])
        cmp = try_except_parse(data[k]['comparator'])
        ans = try_except_parse(data[k]['answer'])
        if ans == '':
            continue  # we don't have a valid answer for this one...

        y_test.append(ans)

        # article text
        article = preprocessor.get_article(id_)
        text = preprocessor.extract_raw_text(article).lower()
        likely_sentence, pt_array = locate_probable_sentence(text, out, inter, cmp)
        guess = eval_sentence(likely_sentence, out, inter, cmp)
        sentences.append(pt_array)

        if guess == "No significant difference":
            preds.append(0)
        elif guess == "Significantly decreased":
            preds.append(-1)
        else:
            preds.append(1)

    # tm = calculate_token_mass(t_labels, sentences)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro')
    prec = precision_score(y_test, preds, average='macro')
    rec = recall_score(y_test, preds, average='macro')
    return acc, f1, prec, rec
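
# Illustrative usage only (not from the original script): run the heuristic baseline
# end-to-end and report the four metrics returned by main() above.
if __name__ == "__main__":
    acc, f1, prec, rec = main()
    print("Accuracy:  {:.3f}".format(acc))
    print("Macro F1:  {:.3f}".format(f1))
    print("Precision: {:.3f}".format(prec))
    print("Recall:    {:.3f}".format(rec))
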
annotations = pp.read_annotations()
counter = 0
almost = 0
exact = 0
total = len(annotations)

for _, annot in annotations.iterrows():
    article_file = path.join(DATA_DIR, "txt_files/PMC" + str(annot.PMCID) + ".txt")
    with open(article_file, encoding='utf-8') as f:
        text = f.read()

    start, end = annot["Evidence Start"], annot["Evidence End"]
    raw_text = text[start:end + 1]
    saved_text = pp.extract_raw_text(pp.get_article(annot.PMCID))[start:end + 1]

    # count spans where the TXT-file text disagrees with the XML-derived text
    if raw_text != saved_text:
        counter += 1

    if start == end:
        exact += 1
        almost += 1
    elif isinstance(annot.Annotations, str):
        valid = fix_offsets(annot.Annotations, start, end, text)
        if saved_text == annot.Annotations:
            exact += 1
        if valid:
            almost += 1

print(
    "Number of spans extracted from the XML different from those extracted from the TXT files: {} / {} = {:.2f}"
    .format(counter, total, counter / total))
print(
    "Number of spans extracted from the TXT/XML file that exactly match the ones in the CSV: {} / {} = {:.2f}"
    .format(exact, total, exact / total))
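
# Illustrative example only (not from the original script): how a single evidence span
# is recovered from the XML-derived article text using the inclusive character offsets
# stored in the annotations CSV. Uses the same `pp` preprocessor alias as above.
def show_first_span(annotations):
    row = annotations.iloc[0]
    article_text = pp.extract_raw_text(pp.get_article(row.PMCID))
    span = article_text[row["Evidence Start"]:row["Evidence End"] + 1]
    print("PMCID {}: {!r}".format(row.PMCID, span))
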
def load_data(use_test, bow=True):
    """
    Load the data into train/val/test sets that allow for easy access.

    @return bag-of-words representation of the training, validation, and test sets
            (with labels), or the raw splits if bow is False.
    """
    print("Loading data.")
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # carve a dev set out of the training ids at random (10%)
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    for i in range(len(pids)):
        annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] ==
                                             prompts["PromptID"].values[i]]
        labels = annotations_for_prompt[[LBL_COL_NAME, EVIDENCE_COL_NAME]].values
        id_ = pids[i]
        article = preprocessor.get_article(id_)
        article_text = preprocessor.extract_raw_text(article).lower()

        # extract i/c/o
        out = prompts["Outcome"].values[i].lower()
        inter = prompts["Intervention"].values[i].lower()
        cmp = prompts["Comparator"].values[i].lower()

        # add to correct pile: train/val/test
        tmp = [article_text, out, inter, cmp]
        loss = stats.mode([l1[0] for l1 in labels])[0][0]
        if id_ in dev_doc_ids and not use_test:
            x_dev.append(tmp)
            y_dev.append(loss)
        elif id_ in train_doc_ids:
            x_train.append(tmp)
            y_train.append(loss)
        elif id_ in val_doc_ids:
            x_val.append(tmp)
            y_val.append(loss)
        elif id_ in test_doc_ids:
            x_test.append(tmp)
            y_test.append(loss)
        else:
            raise ValueError("Unknown study id {}".format(id_))

    # if we are not using the test set, use validation as the test set
    # and the random dev split as the validation set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    print("Generating bag of words...")
    ret = bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test) if bow else [
        x_train, y_train, x_val, y_val, x_test, y_test
    ]
    return ret
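
# Illustrative usage only (not from the original module): a minimal sketch of fitting a
# simple classifier on the splits returned by load_data(). It assumes bag_of_words()
# returns the six splits in the same order as the raw lists; LogisticRegression is an
# arbitrary choice for illustration, not the model used elsewhere in this repository.
from sklearn.linear_model import LogisticRegression

def train_bow_baseline(use_test=False):
    x_train, y_train, x_val, y_val, x_test, y_test = load_data(use_test, bow=True)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(x_train, y_train)
    print("Validation accuracy: {:.3f}".format(clf.score(x_val, y_val)))
    print("Test accuracy:       {:.3f}".format(clf.score(x_test, y_test)))
    return clf
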