def ev_abst(doc, i, ev):
    """Return all sentence texts of ``doc`` as one string, evidence first.

    Sentences for which ``utils.s_overlap(sentence, ev)`` is truthy are placed
    at the front; every other sentence follows. Within each of the two groups
    the original document order is kept. Texts are joined by a single space.

    Args:
        doc: Object exposing an iterable ``sents`` whose items have ``.text``.
        i: Unused here; kept so the signature matches the sibling
            ``(doc, i, ev)`` context builders.
        ev: Evidence span handed to ``utils.s_overlap``.

    Returns:
        str: The space-joined sentence texts (``''`` when there are none).
    """
    evidence_texts = []
    remaining_texts = []
    for sent in doc.sents:
        # Route each sentence into exactly one of the two buckets.
        if utils.s_overlap(sent, ev):
            evidence_texts.append(sent.text)
        else:
            remaining_texts.append(sent.text)
    return ' '.join(evidence_texts + remaining_texts)
def full_context(doc, frame):
    """Select the sentences of ``doc`` that form the context for ``frame``.

    A sentence is kept when any of the following holds (checked in order):
      1. ``utils.s_overlap(sentence, frame.ev)`` is truthy;
      2. its ``[i, f]`` range contains the offset of the first
         case-insensitive occurrence of ``'group'`` in ``doc.text``;
      3. its position in ``doc.sents`` is 2, 3 or 4 (zero-based).

    Args:
        doc: Object with ``text`` (str) and an iterable ``sents`` whose items
            expose ``i`` and ``f`` offsets.
        frame: Object exposing the evidence span as ``frame.ev``.

    Returns:
        list: The selected sentence objects, in document order.
    """
    # str.find yields -1 when 'group' is absent; the range test below then
    # only matches a sentence whose [i, f] range happens to include -1.
    group_pos = doc.text.lower().find('group')

    def keep(position, sent):
        if utils.s_overlap(sent, frame.ev):
            return True
        if sent.i <= group_pos <= sent.f:
            return True
        return 2 <= position <= 4

    return [
        sent
        for position, sent in enumerate(doc.sents)
        if keep(position, sent)
    ]
def write_sent_data(docs, fdir, balance_classes=False):
    """Write per-group TSV files of evidence / non-evidence sentences.

    Documents are bucketed by ``doc.group`` and one file
    ``<fdir>/<group>.tsv`` is written per bucket. Each row has the form
    ``<label>\\t<doc.id>\\t<utils.clean_str(sentence text)>`` where the label
    is ``1`` if the sentence overlaps (per ``utils.s_overlap``) the evidence
    span of any frame in ``doc.frames`` and ``0`` otherwise. For each
    document all positive rows are written before the negative rows.

    Args:
        docs: Iterable of documents exposing ``group``, ``id``, ``frames``
            (items with ``.ev``) and ``sents`` (items with ``.text``).
        fdir: Existing directory in which the TSV files are created.
        balance_classes: When true, negatives are down-sampled: for every
            positive sentence, up to two not-yet-used negatives whose
            character length is closest to that positive's are kept. A
            warning is printed when a document runs out of negatives.

    Returns:
        None. The function's effect is the files it writes.
    """
    group_docs = defaultdict(list)
    for doc in docs:
        group_docs[doc.group].append(doc)

    for group, doc_list in group_docs.items():
        # The context manager guarantees the file is flushed and closed even
        # if writing fails part-way; previously the handle was never closed.
        with open('{}/{}.tsv'.format(fdir, group), 'w') as fout:
            for doc in doc_list:
                sent_labels = [
                    any(utils.s_overlap(s, f.ev) for f in doc.frames)
                    for s in doc.sents
                ]
                pos_sents = [
                    s.text for s, l in zip(doc.sents, sent_labels) if l
                ]
                neg_sents = [
                    s.text for s, l in zip(doc.sents, sent_labels) if not l
                ]

                if balance_classes:
                    neg_samples = []
                    for pos_s in pos_sents:
                        # Re-rank the remaining negatives by how close their
                        # length is to this positive sentence.
                        neg_sents = sorted(
                            neg_sents,
                            key=lambda s: abs(len(s) - len(pos_s)))
                        try:
                            # Two negatives per positive, without replacement.
                            neg_samples.append(neg_sents.pop(0))
                            neg_samples.append(neg_sents.pop(0))
                        except IndexError:
                            print(
                                'Warning: unable to sample enough negatives from doc {}'
                                .format(doc.id))
                    neg_sents = neg_samples

                for s in pos_sents:
                    fout.write('1\t{}\t{}\n'.format(
                        doc.id, utils.clean_str(s)))
                for s in neg_sents:
                    fout.write('0\t{}\t{}\n'.format(
                        doc.id, utils.clean_str(s)))
def first_and_ev(doc, i, ev):
    """Join the evidence sentences with the first-mention sentences.

    A sentence of ``doc`` is included when any of the following holds
    (checked in order):
      1. ``utils.s_overlap(sentence, ev)`` is truthy;
      2. its ``[i, f]`` range contains the offset of the first
         case-insensitive occurrence of ``i.text`` (after stripping the
         characters ``. ,)(`` from both ends) in ``doc.text``;
      3. its ``[i, f]`` range contains the offset of the first
         case-insensitive occurrence of ``'group'`` in ``doc.text``.

    Args:
        doc: Object with ``text`` (str) and an iterable ``sents`` whose items
            expose ``i``, ``f`` and ``text``.
        i: Object whose ``text`` is searched for in the document.
        ev: Evidence span handed to ``utils.s_overlap``.

    Returns:
        str: Texts of the selected sentences, in document order, joined by a
        single space.
    """
    lowered_text = doc.text.lower()
    # Either offset is -1 when the needle does not occur in the document.
    i_pos = lowered_text.find(i.text.lower().strip('. ,)('))
    g_pos = lowered_text.find('group')

    picked = []
    for sent in doc.sents:
        if (utils.s_overlap(sent, ev)
                or sent.i <= i_pos <= sent.f
                or sent.i <= g_pos <= sent.f):
            picked.append(sent.text)
    return ' '.join(picked)
def intro_group_ev(doc, i, ev):
    """Join the evidence sentences followed by intro/group context sentences.

    The result is built from two lists, concatenated in this order:
      * evidence sentences: those for which ``utils.s_overlap(sentence, ev)``
        is truthy;
      * context sentences: those whose ``[i, f]`` range contains the offset
        of the first case-insensitive ``'group'`` in ``doc.text``, or whose
        zero-based position in ``doc.sents`` is 2, 3 or 4.

    A sentence that satisfies both criteria appears twice, once per list.

    Args:
        doc: Object with ``text`` (str) and an iterable ``sents`` whose items
            expose ``i``, ``f`` and ``text``.
        i: Unused here; kept so the signature matches the sibling
            ``(doc, i, ev)`` context builders.
        ev: Evidence span handed to ``utils.s_overlap``.

    Returns:
        str: The selected sentence texts joined by a single space.
    """
    evidence_texts = []
    for sent in doc.sents:
        if utils.s_overlap(sent, ev):
            evidence_texts.append(sent.text)

    # -1 when 'group' never occurs in the document text.
    group_pos = doc.text.lower().find('group')

    context_texts = []
    for position, sent in enumerate(doc.sents):
        covers_group = sent.i <= group_pos <= sent.f
        if covers_group or 2 <= position <= 4:
            context_texts.append(sent.text)

    return ' '.join(evidence_texts + context_texts)
def get_overlap_labels(self, span, attr):
    """Return one 0/1 overlap label per span stored under ``self.<attr>``.

    Args:
        span: The span compared against each stored span.
        attr: Name of the attribute on ``self`` holding an iterable of spans.

    Returns:
        list[int]: ``int(utils.s_overlap(span, s))`` for every ``s`` in
        ``getattr(self, attr)``, in iteration order.

    Raises:
        AttributeError: If ``self`` has no attribute named ``attr``.
    """
    labels = []
    for candidate in getattr(self, attr):
        labels.append(int(utils.s_overlap(span, candidate)))
    return labels
def read_docs(abst_only=False):
    """Load prompts and annotations and assemble documents with frames.

    Steps, as performed below:
      1. For every prompt record from ``preprocessor.read_prompts()``, create
         the document for its ``PMCID`` via ``init_doc(pmcid, abst_only)``
         (once per PMCID) and remember the prompt's intervention, comparator
         and outcome under its ``PromptID`` (first occurrence wins).
      2. For every annotation record from
         ``preprocessor.read_annotations()``, skip it when ``abst_only`` is
         set and it is not flagged ``In Abstract``, when its ``Annotations``
         text is empty, or when the text at the stated offsets in the
         document differs from the annotation text (counted as a bad offset).
      3. Otherwise add a ``classes.Frame`` to the document unless an existing
         frame has the same i/c/o texts and an evidence span that overlaps
         (per ``utils.s_overlap``) the new one.
      4. Drop documents that ended up with no frames.

    Progress counts are printed along the way.

    Args:
        abst_only: Forwarded to ``init_doc``; also restricts annotations to
            those flagged ``In Abstract``.

    Returns:
        list: The documents that have at least one frame.

    Raises:
        AssertionError: If an annotation's label disagrees with the label of
            an existing frame that has the same i/c/o texts.
        KeyError: If an annotation refers to a PMCID or PromptID that no
            prompt record introduced.
    """
    Prompt = namedtuple('Prompt', 'i c o')
    docs = {}
    prompts = {}

    print('Reading prompts + articles')
    for record in preprocessor.read_prompts().to_dict('records'):
        pmcid = record['PMCID']
        if pmcid not in docs:
            docs[pmcid] = init_doc(pmcid, abst_only)
        prompt_id = record['PromptID']
        if prompt_id not in prompts:
            prompts[prompt_id] = Prompt(
                record['Intervention'],
                record['Comparator'],
                record['Outcome'],
            )
    print(len(docs))
    print(len(prompts))

    kept_count = 0
    bad_offset_count = 0
    print('Processing annotations')
    anns = preprocessor.read_annotations().to_dict('records')
    for ann in anns:
        if (abst_only and not ann['In Abstract']) or not ann['Annotations']:
            continue

        ev = classes.Span(
            ann['Evidence Start'], ann['Evidence End'], ann['Annotations'])
        doc = docs[ann['PMCID']]
        # Discard annotations whose offsets do not reproduce their own text.
        if doc.text[ev.i:ev.f] != ev.text:
            bad_offset_count += 1
            continue
        kept_count += 1

        prompt = prompts[ann['PromptID']]
        label = ann['Label']
        i_text = prompt.i.strip()
        c_text = prompt.c.strip()
        o_text = prompt.o.strip()

        is_duplicate = False
        # Deliberately no early exit: the label consistency assertion runs
        # against every existing frame that shares this prompt.
        for existing in doc.frames:
            same_prompt = (existing.i.text == i_text
                           and existing.c.text == c_text
                           and existing.o.text == o_text)
            if not same_prompt:
                continue
            assert existing.label == classes.Frame.get_encoded_label(label)
            if utils.s_overlap(existing.ev, ev):
                is_duplicate = True

        if not is_duplicate:
            # Prompt spans carry -1 offsets: only their text is known here.
            doc.frames.append(classes.Frame(
                classes.Span(-1, -1, i_text),
                classes.Span(-1, -1, c_text),
                classes.Span(-1, -1, o_text),
                ev,
                label,
            ))

    total_docs = len(docs)
    docs = {pmcid: doc for pmcid, doc in docs.items() if doc.frames}

    print('Retained {}/{} valid annotations ({} w/ bad offsets)'.format(
        kept_count, len(anns), bad_offset_count))
    print('Retained {}/{} docs with nonzero prompts'.format(
        len(docs), total_docs))
    return list(docs.values())