Code example #1
def parse_prompt_id_data(annotations, prompts, pids, i):
    """ 
    Parse a single prompt and get the information necessary.
    
    @param annotations is the annotation dictionary that contains all answers/reasoning data.
    @param prompts     is the prompt dictionary that contains all i/c/o data.
    @param pids        is the ordering of the PMC id. 
    @param i           is which prompt to look at.
    
    @return id_        the prompt idenitification number.
    @return tmp        an arrary containing text, outcome, intervention, comparator.
    @return loss       a single value of -1/0/1 for sig. inc/ no diff. / sig. dec 
    """
    # information for this article
    annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] ==
                                         prompts["PromptID"].values[i]]
    labels = annotations_for_prompt[[LBL_COL_NAME, EVIDENCE_COL_NAME]].values
    id_ = pids[i]
    article = preprocessor.get_article(id_)
    article_text = preprocessor.extract_raw_text(article).lower()

    # extract i/c/o
    out = prompts["Outcome"].values[i].lower()
    inter = prompts["Intervention"].values[i].lower()
    cmp = prompts["Comparator"].values[i].lower()

    # all of the reasoning (evidence) spans for this prompt
    res = [a[1] for a in labels]

    losses = []
    data = []

    # pair each reasoning span with the article text and the i/c/o elements;
    # the label is the modal annotator label for this prompt, so it is the
    # same for every reasoning span
    for r in res:
        tmp = [article_text, r, out, inter, cmp]
        loss = stats.mode([l1[0] for l1 in labels])[0][0]

        data.append(tmp)
        losses.append(loss)

    return id_, data, losses
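
For context, here is a minimal sketch of how this helper could be driven over a whole prompt file. It reuses read_prompts, read_annotations, and the STUDY_ID_COL constant that appear in the other snippets in this section; treat it as an illustration rather than the project's actual caller.

# Usage sketch only; assumes the same preprocessor module and column constants
# as the snippet above.
prompts = preprocessor.read_prompts()
annotations = preprocessor.read_annotations()
pids = prompts[STUDY_ID_COL].values  # article ids, aligned with the prompt rows

all_data, all_losses = [], []
for i in range(len(pids)):
    id_, data, losses = parse_prompt_id_data(annotations, prompts, pids, i)
    all_data.extend(data)      # [text, reasoning, outcome, intervention, comparator]
    all_losses.extend(losses)  # one -1/0/1 label per reasoning span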
Code example #2
def main():
    # Load in the data
    prompts = np.asarray(preprocessor.read_prompts())
    annotations = np.asarray(preprocessor.read_annotations())
    data = {}
    preds, y_test = [], []

    # store data in dictionary
    sentences = []
    for p in prompts:
        data[p[0]] = {
            'xml': p[1],
            'outcome': p[2],
            'intervention': p[3],
            'comparator': p[4],
            'answer': '',
            'reasoning': ''
        }

    for a in annotations:
        if (a[3]):
            data[a[1]]['answer'] = a[7]
        if (a[4]):
            data[a[1]]['reasoning'] += str(a[6]) + "; "

    test_id = preprocessor.test_document_ids()
    # get predictions and add them to array
    for k in data.keys():
        # article id for this prompt (stored in its 'xml' field)
        id_ = data[k]['xml']

        # skip articles that are not in the test set
        if id_ not in test_id:
            continue

        out = try_except_parse(data[k]['outcome'])
        inter = try_except_parse(data[k]['intervention'])
        cmp = try_except_parse(data[k]['comparator'])
        ans = try_except_parse(data[k]['answer'])

        if (ans == ''):
            continue  # we don't have a valid answer for this one...

        y_test.append(ans)

        # article text
        article = preprocessor.get_article(id_)
        text = preprocessor.extract_raw_text(article).lower()

        likely_sentence, pt_array = locate_probable_sentence(
            text, out, inter, cmp)
        guess = eval_sentence(likely_sentence, out, inter, cmp)

        sentences.append(pt_array)

        if (guess == "No significant difference"):
            preds.append(0)
        elif (guess == "Significantly decreased"):
            preds.append(-1)
        else:
            preds.append(1)

    # tm = calculate_token_mass(t_labels, sentences)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro')
    prec = precision_score(y_test, preds, average='macro')
    rec = recall_score(y_test, preds, average='macro')

    return acc, f1, prec, rec
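
If this script is run directly, the returned metrics can be reported in the usual way; the __main__ guard below is a sketch, not necessarily how the project invokes it.

# Hypothetical driver for the function above.
if __name__ == "__main__":
    acc, f1, prec, rec = main()
    print("accuracy : {:.3f}".format(acc))
    print("macro F1 : {:.3f}".format(f1))
    print("precision: {:.3f}".format(prec))
    print("recall   : {:.3f}".format(rec))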
Code example #3
annotations = pp.read_annotations()

counter = 0
almost = 0
exact = 0
total = len(annotations)
for _, annot in annotations.iterrows():
    article_file = path.join(DATA_DIR,
                             "txt_files/PMC" + str(annot.PMCID) + ".txt")
    with open(article_file, encoding='utf-8') as f:
        text = f.read()

    start, end = annot["Evidence Start"], annot["Evidence End"]

    raw_text = text[start:end + 1]
    saved_text = pp.extract_raw_text(pp.get_article(annot.PMCID))[start:end + 1]
    counter = counter + 1 if raw_text == saved_text else counter
    if start == end:
        exact += 1
        almost += 1
    elif type(annot.Annotations) == str:
        valid = fix_offsets(annot.Annotations, start, end, text)
        exact = exact + 1 if saved_text == annot.Annotations else exact
        almost = almost + 1 if valid else almost

print(
    "Number of spans where the text extracted from the XML matches the text extracted from the TXT files: {} / {} = {:.2f}"
    .format(counter, total, counter / total))
print(
    "Number of spans extracted from the TXT/XML file that exactly match the ones in the CSV: {} / {} = {:.2f}"
    .format(exact, total, exact / total))
Code example #4
def load_data(use_test, bow=True):
    """
    Load the data into a train/val/test set that allows for easy access.

    @return bag-of-word representation of training, validation, test sets (with labels).
    """
    print("Loading data.")
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    for i in range(len(pids)):
        annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] ==
                                             prompts["PromptID"].values[i]]
        labels = annotations_for_prompt[[LBL_COL_NAME,
                                         EVIDENCE_COL_NAME]].values
        id_ = pids[i]
        article = preprocessor.get_article(id_)
        article_text = preprocessor.extract_raw_text(article).lower()

        # extract i/c/o
        out = prompts["Outcome"].values[i].lower()
        inter = prompts["Intervention"].values[i].lower()
        cmp = prompts["Comparator"].values[i].lower()

        # add to correct pile: train/val/test
        tmp = [article_text, out, inter, cmp]
        loss = stats.mode([l1[0] for l1 in labels])[0][0]

        if id_ in dev_doc_ids and not (use_test):
            x_dev.append(tmp)
            y_dev.append(loss)
        elif id_ in train_doc_ids:
            x_train.append(tmp)
            y_train.append(loss)
        elif id_ in val_doc_ids:
            x_val.append(tmp)
            y_val.append(loss)
        elif id_ in test_doc_ids:
            x_test.append(tmp)
            y_test.append(loss)
        else:
            raise ValueError("Unknown study id {}".format(id_))

    # if we are not evaluating on the true test set, use the validation set as
    # the test set and the held-out dev split for validation.
    if not (use_test):
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    print("Generating bag of words...")
    ret = bag_of_words(
        x_train, y_train, x_val, y_val, x_test,
        y_test) if bow else [x_train, y_train, x_val, y_val, x_test, y_test]
    return ret
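
A sketch of how load_data might be consumed with bow=False, fitting a simple baseline on the raw [article_text, outcome, intervention, comparator] rows. The TF-IDF/LogisticRegression pipeline here is an assumption for illustration, not part of the project, and with use_test=False the "test" split is actually the validation data.

# Sketch only: consumes the raw (non-BOW) splits and fits a simple baseline.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

x_train, y_train, x_val, y_val, x_test, y_test = load_data(use_test=False, bow=False)

def to_docs(rows):
    # collapse each [text, outcome, intervention, comparator] row into one string
    return [" ".join(r) for r in rows]

vec = TfidfVectorizer(max_features=50000)
X_train = vec.fit_transform(to_docs(x_train))
X_test = vec.transform(to_docs(x_test))

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("macro F1:", f1_score(y_test, clf.predict(X_test), average="macro"))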