def get_contexts( dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.json", output_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.txt", downcase=False):
    """
    Gets passage text with no concept annotations and writes it to a file.

    For every datum, the title + context (concept marks removed) is emitted,
    followed by each question's query string with the placeholder replaced by
    its dataset-origin answer.

    @param dataset_file: path to the annotated dataset JSON.
    @param output_file: path the plain-text output is written to.
    @param downcase: if True, lowercase the whole output before writing.
    """
    dataset = load_json(dataset_file)
    data = dataset[DATA_KEY]
    n_all = 0
    # Accumulate pieces in a list and join once at the end: repeated `+=`
    # on an ever-growing string is quadratic over a large dataset.
    parts = []
    for datum in data:
        new_context = ("\n" + datum[DOC_KEY][TITLE_KEY] + "\n"
                       + datum[DOC_KEY][CONTEXT_KEY])
        parts.append(remove_concept_marks(new_context))
        # Dict used as an insertion-ordered set: a plain `set` made the
        # joined query order nondeterministic between runs.
        curr_queries = {}
        for qa in datum[DOC_KEY][QAS_KEY]:
            a = ""
            for ans in qa[ANS_KEY]:
                if ans[ORIG_KEY] == "dataset":
                    a = ans[TXT_KEY]
            if not a:
                # Explicit raise instead of `assert`: asserts are stripped
                # under `python -O`, silently passing bad data through.
                raise ValueError("question has no dataset-origin answer")
            curr_queries[remove_concept_marks(qa[QUERY_KEY]).replace(
                PLACEHOLDER_KEY, a)] = None
        parts.append("\n" + "\n".join(curr_queries))
        n_all += 1
    print(n_all)
    all_contexts = "".join(parts).replace("\n\n", "\n")
    with open(output_file, "w") as fh:
        fh.write(all_contexts.lower() if downcase else all_contexts)
def vocabulary_passage(self, lowercase=True):
    """
    Build a word-frequency Counter over every passage title and context.

    Concept marks are stripped before whitespace tokenisation.

    @param lowercase: if True, tokens are lowercased before counting.
    @return: Counter mapping token -> occurrence count.
    """
    counts = Counter()
    for datum in self.dataset[DATA_KEY]:
        doc = datum[DOC_KEY]
        # Title and context are tokenised identically, so handle both
        # fields in one pass.
        for field in (TITLE_KEY, CONTEXT_KEY):
            tokens = remove_concept_marks(doc[field]).split()
            counts.update(to_lower(tok, lowercase) for tok in tokens)
    return counts
def percentage_of_ans_in_docs(self, include_extended=False):
    """
    Find out what proportion of answers actually occur in documents.

    NB: this is based on pure word matching. This is not the same as the
    percentage of entity (concept) answers found in text.

    @param include_extended: whether to use expanded answers in counting.
    @return: percentage (float in [0, 100]) of questions whose answer text
        appears verbatim in the title + context.
    """
    n_all = 0
    n_found = 0
    for datum in self.dataset[DATA_KEY]:
        # Hoisted out of the per-question loop: the passage text depends
        # only on the datum, so there is no need to rebuild it per qa.
        text = remove_concept_marks(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                    datum[DOC_KEY][CONTEXT_KEY])
        for qa in datum[DOC_KEY][QAS_KEY]:
            n_all += 1
            for ans in qa[ANS_KEY]:
                # Without include_extended, only dataset-origin answers count.
                if not include_extended and ans[ORIG_KEY] != "dataset":
                    continue
                # re.search(re.escape(x), text) is a literal substring
                # test; `in` expresses the same thing directly.
                if ans[TXT_KEY] in text:
                    n_found += 1
                    # Count each question at most once. The original only
                    # broke out in the extended branch, so a question with
                    # several matching dataset answers was double-counted
                    # (which could trip the n_found <= n_all assertion).
                    break
    assert n_all > 0
    assert n_found <= n_all
    return 100 * n_found / n_all