Example #1
0
def get_contexts(
        dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.json",
        output_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.txt",
        downcase=False):
    """
    Gets passage text with no concept annotations.

    For every datum, writes the title and context (with concept marks
    stripped) followed by the datum's unique queries, each with the
    placeholder replaced by the dataset-original answer.

    @param dataset_file: path to the concept-annotated dataset JSON.
    @param output_file: path the plain-text output is written to.
    @param downcase: if True, lowercase the text before writing.
    """
    dataset = load_json(dataset_file)
    data = dataset[DATA_KEY]
    n_all = 0
    # Collect pieces and join once at the end: repeated `str +=` in a loop
    # is quadratic in the total output size.
    parts = []

    for datum in data:
        doc = datum[DOC_KEY]
        new_context = "\n" + doc[TITLE_KEY] + "\n" + doc[CONTEXT_KEY]
        parts.append(remove_concept_marks(new_context))
        # A set de-duplicates queries that become identical after answer
        # substitution (iteration order of the set is not guaranteed).
        curr_queries = set()
        for qa in doc[QAS_KEY]:
            a = ""
            for ans in qa[ANS_KEY]:
                if ans[ORIG_KEY] == "dataset":
                    a = ans[TXT_KEY]
            assert a  # every qa must carry a dataset-original answer
            curr_queries.add(
                remove_concept_marks(qa[QUERY_KEY]).replace(
                    PLACEHOLDER_KEY, a))
        parts.append("\n" + "\n".join(curr_queries))
        n_all += 1
    print(n_all)

    # Collapse the blank lines introduced by the leading "\n" separators.
    all_contexts = "".join(parts).replace("\n\n", "\n")
    with open(output_file, "w") as fh:
        fh.write(all_contexts.lower() if downcase else all_contexts)
Example #2
0
 def vocabulary_passage(self, lowercase=True):
     """
     Build a word-frequency Counter over all passage titles and contexts,
     with concept marks stripped before tokenizing on whitespace.

     @param lowercase: if True, normalize tokens via to_lower.
     """
     counts = Counter()
     for datum in self.dataset[DATA_KEY]:
         doc = datum[DOC_KEY]
         # Title and context are tokenized identically, so handle both
         # fields in a single loop over their keys.
         for field in (TITLE_KEY, CONTEXT_KEY):
             for token in remove_concept_marks(doc[field]).split():
                 counts[to_lower(token, lowercase)] += 1
     return counts
Example #3
0
    def percentage_of_ans_in_docs(self, include_extended=False):
        """
        Find out what proportion of answers actually occur in documents.
        NB: this is based on pure word matching. This is not the same as the
        percentage of entity (concept) answers found in text.

        @param include_extended: whether to use expanded answers in counting.
        @return: percentage (0-100) of qas with at least one answer found.
        """
        n_all = 0
        n_found = 0
        for datum in self.dataset[DATA_KEY]:
            # The searchable text only depends on the datum, so build it once
            # per datum instead of once per qa (hoisted loop invariant).
            text = remove_concept_marks(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                        datum[DOC_KEY][CONTEXT_KEY])
            for qa in datum[DOC_KEY][QAS_KEY]:
                n_all += 1
                for ans in qa[ANS_KEY]:
                    # Without include_extended, only dataset-original
                    # answers participate in the count.
                    if not include_extended and ans[ORIG_KEY] != "dataset":
                        continue
                    # re.search(re.escape(x), text) is an escaped-literal
                    # search, i.e. a plain substring test.
                    if ans[TXT_KEY] in text:
                        n_found += 1
                        # Count each qa at most once. The original omitted
                        # this break in the non-extended branch, so a qa with
                        # several matching dataset answers was over-counted
                        # (which could even violate n_found <= n_all).
                        break
        assert n_all > 0
        assert n_found <= n_all
        return 100 * n_found / n_all