Esempio n. 1
0
def ev_abst(doc, i, ev):
    """Return the document's sentences as one string, evidence sentences first.

    Sentences for which ``utils.s_overlap(sentence, ev)`` is truthy are placed
    at the front (in document order), followed by all remaining sentences
    (also in document order). Sentence texts are joined with single spaces.

    Args:
        doc: Object exposing an iterable ``sents`` whose items have ``text``.
        i: Not used by this function.
        ev: Evidence span handed to ``utils.s_overlap``.

    Returns:
        The reordered sentence texts joined by ``' '``.
    """
    overlapping = []
    for sent in doc.sents:
        if utils.s_overlap(sent, ev):
            overlapping.append(sent.text)

    # Second pass collects the complement so each group keeps document order.
    remaining = []
    for sent in doc.sents:
        if not utils.s_overlap(sent, ev):
            remaining.append(sent.text)

    return ' '.join(overlapping + remaining)
def full_context(doc, frame):
    """Select the sentences that form the context for ``frame``.

    A sentence is kept when any of the following holds (checked in order):
      * ``utils.s_overlap(sentence, frame.ev)`` is truthy,
      * the index of the first occurrence of 'group' in the lowercased
        document text lies within ``[sentence.i, sentence.f]``,
      * the sentence's position in ``doc.sents`` is 2, 3 or 4 (0-based).

    Args:
        doc: Object exposing ``text`` and an iterable ``sents`` whose items
            have ``i`` and ``f`` attributes.
        frame: Object exposing the evidence span as ``ev``.

    Returns:
        A list of the selected sentence objects, in document order.
    """
    # str.find yields -1 when 'group' is absent from the text.
    group_pos = doc.text.lower().find('group')

    selected = []
    for position, sent in enumerate(doc.sents):
        if utils.s_overlap(sent, frame.ev):
            selected.append(sent)
        elif sent.i <= group_pos <= sent.f:
            selected.append(sent)
        elif 2 <= position <= 4:
            selected.append(sent)
    return selected
Esempio n. 3
0
def write_sent_data(docs, fdir, balance_classes=False):
    """Write one sentence-classification TSV file per document group.

    Documents are bucketed by ``doc.group`` and each bucket is written to
    ``<fdir>/<group>.tsv``. Every output line has the form
    ``<label>\\t<doc.id>\\t<utils.clean_str(sentence text)>``, where the label
    is ``1`` if the sentence overlaps the evidence span of any frame of its
    document (per ``utils.s_overlap``) and ``0`` otherwise. For each document
    all positive lines are written before the negative ones.

    Args:
        docs: Iterable of documents exposing ``group``, ``id``, ``sents``
            (items with ``text``) and ``frames`` (items with ``ev``).
        fdir: Directory the ``.tsv`` files are written to.
        balance_classes: When true, the negatives of each document are
            replaced by a sample of up to two negatives per positive
            sentence, chosen as the not-yet-sampled negatives whose text
            length is closest to that positive's. A warning is printed when
            a document runs out of negatives.
    """
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)

    for group, doc_list in group_docs.items():
        # The context manager closes (and flushes) the file even when an
        # exception is raised while processing a document.
        with open('{}/{}.tsv'.format(fdir, group), 'w') as fout:
            for doc in doc_list:
                pos_sents = []
                neg_sents = []
                for sent in doc.sents:
                    # Generator form stops at the first overlapping frame.
                    is_evidence = any(
                        utils.s_overlap(sent, frame.ev)
                        for frame in doc.frames)
                    if is_evidence:
                        pos_sents.append(sent.text)
                    else:
                        neg_sents.append(sent.text)

                if balance_classes:
                    neg_samples = []
                    for pos_s in pos_sents:
                        # Re-rank the remaining negatives by how close their
                        # length is to this positive sentence.
                        neg_sents = sorted(
                            neg_sents,
                            key=lambda s: abs(len(s) - len(pos_s)))
                        try:
                            neg_samples.append(neg_sents.pop(0))
                            neg_samples.append(neg_sents.pop(0))
                        except IndexError:
                            print(
                                'Warning: unable to sample enough negatives from doc {}'
                                .format(doc.id))
                    neg_sents = neg_samples

                for s in pos_sents:
                    fout.write('1\t{}\t{}\n'.format(doc.id,
                                                    utils.clean_str(s)))
                for s in neg_sents:
                    fout.write('0\t{}\t{}\n'.format(doc.id,
                                                    utils.clean_str(s)))
Esempio n. 4
0
def first_and_ev(doc, i, ev):
    """Join the sentences relevant to the intervention, 'group' and evidence.

    A sentence is kept when any of the following holds (checked in order):
      * ``utils.s_overlap(sentence, ev)`` is truthy,
      * the index of the first case-insensitive occurrence of the
        intervention text (with leading/trailing '.', ' ', ',', ')' and '('
        stripped) lies within ``[sentence.i, sentence.f]``,
      * the index of the first occurrence of 'group' in the lowercased
        document text lies within ``[sentence.i, sentence.f]``.

    Args:
        doc: Object exposing ``text`` and an iterable ``sents`` whose items
            have ``text``, ``i`` and ``f`` attributes.
        i: Intervention span; only its ``text`` is read.
        ev: Evidence span handed to ``utils.s_overlap``.

    Returns:
        The texts of the kept sentences, in document order, joined by ``' '``.
    """
    lowered_text = doc.text.lower()
    intervention = i.text.lower().strip('. ,)(')
    # str.find yields -1 for either lookup when the substring is absent.
    i_idx = lowered_text.find(intervention)
    g_idx = lowered_text.find('group')

    def is_relevant(sent):
        if utils.s_overlap(sent, ev):
            return True
        if sent.i <= i_idx <= sent.f:
            return True
        return sent.i <= g_idx <= sent.f

    kept_texts = [sent.text for sent in doc.sents if is_relevant(sent)]
    return ' '.join(kept_texts)
Esempio n. 5
0
def intro_group_ev(doc, i, ev):
    """Join evidence sentences followed by 'group' and early-document context.

    The result starts with every sentence for which
    ``utils.s_overlap(sentence, ev)`` is truthy, followed by every sentence
    that either contains the index of the first 'group' occurrence in the
    lowercased document text (``sentence.i <= index <= sentence.f``) or sits
    at position 2, 3 or 4 (0-based) in ``doc.sents``. A sentence that meets
    both criteria appears twice.

    Args:
        doc: Object exposing ``text`` and an iterable ``sents`` whose items
            have ``text``, ``i`` and ``f`` attributes.
        i: Not used by this function.
        ev: Evidence span handed to ``utils.s_overlap``.

    Returns:
        The selected sentence texts joined by ``' '``.
    """
    evidence_texts = [
        sent.text for sent in doc.sents if utils.s_overlap(sent, ev)
    ]

    # str.find yields -1 when 'group' is absent from the text.
    group_pos = doc.text.lower().find('group')

    context_texts = []
    for position, sent in enumerate(doc.sents):
        holds_group = sent.i <= group_pos <= sent.f
        if holds_group or 2 <= position <= 4:
            context_texts.append(sent.text)

    return ' '.join(evidence_texts + context_texts)
Esempio n. 6
0
 def get_overlap_labels(self, span, attr):
     """Return a 0/1 label per span stored under ``attr`` on this object.

     Args:
         span: Span tested against each stored span.
         attr: Name of the attribute on ``self`` holding an iterable of spans.

     Returns:
         A list with ``int(utils.s_overlap(span, s))`` for every ``s`` in
         ``getattr(self, attr)``, in iteration order.
     """
     labels = []
     for candidate in getattr(self, attr):
         labels.append(int(utils.s_overlap(span, candidate)))
     return labels
def read_docs(abst_only=False):
    """Build documents with evidence frames from prompts and annotations.

    Every prompt record contributes, at most once per id, a document created
    by ``init_doc(pmcid, abst_only)`` and a ``Prompt(i, c, o)`` tuple. Each
    annotation is then attached to its document as a frame unless it is
    skipped (see the inline comments). Documents that end up without any
    frame are dropped. Progress and summary counts are printed along the way.

    Args:
        abst_only: Forwarded to ``init_doc``; when truthy, annotations whose
            'In Abstract' field is falsy are skipped as well.

    Returns:
        A list of the documents that have at least one frame.
    """
    Prompt = namedtuple('Prompt', 'i c o')
    docs = {}
    prompts = {}

    print('Reading prompts + articles')
    for record in preprocessor.read_prompts().to_dict('records'):
        pmcid = record['PMCID']
        if pmcid not in docs:
            docs[pmcid] = init_doc(pmcid, abst_only)

        pid = record['PromptID']
        if pid not in prompts:
            prompts[pid] = Prompt(i=record['Intervention'],
                                  c=record['Comparator'],
                                  o=record['Outcome'])

    print(len(docs))
    print(len(prompts))

    n_anns = 0
    n_bad_offsets = 0
    print('Processing annotations')
    anns = preprocessor.read_annotations().to_dict('records')
    for ann in anns:
        # Skip annotations outside the abstract (when requested) and
        # annotations with no evidence text.
        outside_abstract = abst_only and not ann['In Abstract']
        if outside_abstract or not ann['Annotations']:
            continue

        ev = classes.Span(ann['Evidence Start'], ann['Evidence End'],
                          ann['Annotations'])
        doc = docs[ann['PMCID']]

        # The annotated offsets must reproduce the annotated text exactly.
        if doc.text[ev.i:ev.f] != ev.text:
            n_bad_offsets += 1
            continue
        n_anns += 1

        prompt = prompts[ann['PromptID']]
        label = ann['Label']
        intervention = prompt.i.strip()
        comparator = prompt.c.strip()
        outcome = prompt.o.strip()

        # A frame is a duplicate when an existing frame has the same
        # intervention/comparator/outcome texts and overlapping evidence.
        # Every same-prompt frame is checked for a consistent label, so the
        # loop deliberately does not stop at the first duplicate.
        is_duplicate = False
        for existing in doc.frames:
            same_prompt = (existing.i.text == intervention
                           and existing.c.text == comparator
                           and existing.o.text == outcome)
            if not same_prompt:
                continue
            assert existing.label == classes.Frame.get_encoded_label(label)
            if utils.s_overlap(existing.ev, ev):
                is_duplicate = True

        if not is_duplicate:
            doc.frames.append(
                classes.Frame(classes.Span(-1, -1, intervention),
                              classes.Span(-1, -1, comparator),
                              classes.Span(-1, -1, outcome), ev, label))

    # Drop documents that received no frame; remember the pre-pruning count
    # for the summary below.
    n_docs_total = len(docs)
    frameless = [pmcid for pmcid, doc in docs.items() if not doc.frames]
    for pmcid in frameless:
        del docs[pmcid]

    print('Retained {}/{} valid annotations ({} w/ bad offsets)'.format(
        n_anns, len(anns), n_bad_offsets))
    print('Retained {}/{} docs with nonzero prompts'.format(
        len(docs), n_docs_total))

    return list(docs.values())