# Shared imports for the loaders below. Module-level names such as GROUPS,
# NER_LABEL_MAP, tokenizer, init_doc, extract_text_and_offsets, xml_to_text,
# and clean_html_str are assumed to be defined elsewhere in the repo.
import glob
import json
import os
import random
import re
from collections import defaultdict, namedtuple

import pandas as pd

import classes
import config
import preprocessor
import utils


def get_neg_i(d, f):
    """Sample a negative intervention span for frame f in doc d."""
    if random.random() <= 0.5:
        # 50%: use the frame's outcome span as the negative
        neg_i = f.o
    elif random.random() <= 0.75:
        # 37.5% overall (0.5 * 0.75, second independent draw): use an "I vs. C" string
        neg_i = classes.Span(-1, -1, '{} vs. {}'.format(f.i.text, f.c.text))
    else:
        # 12.5% overall: use a random 2-4 token window from the doc
        t_idx_i = random.randint(0, len(d.tokens) - 4)
        tokens = d.tokens[t_idx_i:t_idx_i + random.randint(2, 4)]
        neg_i = classes.Span(-1, -1, d.text[tokens[0].i:tokens[-1].f])
    return neg_i
def read_docs(phase='starting_spans'):
    """Read EBM-NLP docs and attach gold participant/intervention/outcome spans."""
    pmid_groups = {}
    for g in GROUPS:
        pmids = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(g)))
        for pmid in pmids:
            pmid_groups[pmid] = g

    def get_e_fname(pmid, e):
        # test docs have gold aggregated labels; train docs live under 'train'
        if pmid_groups[pmid] == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        f = '{}.AGGREGATED.ann'.format(pmid)
        return os.path.join(
            config.EBM_NLP_DIR, 'annotations', 'aggregated', phase, e, subdir, f)

    docs = []
    for pmid, group in pmid_groups.items():
        tokens = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'documents', '{}.tokens'.format(pmid)))
        text, token_offsets = utils.join_tokens(tokens)
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(get_e_fname(pmid, e))]
            # convert token-level label runs to character-level spans
            for token_i, token_f, l in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                doc.labels[label_name].append(
                    classes.Span(char_i, char_f, text[char_i:char_f]))
        docs.append(doc)
    return docs
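# Usage sketch (hypothetical; assumes config.EBM_NLP_DIR points at an unpacked
# copy of the EBM-NLP corpus):
#
#   docs = read_docs(phase='starting_spans')
#   for doc in docs[:3]:
#       print(doc.id, doc.group, len(doc.labels['GOLD_p']), 'participant spans')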
def process_json_data(data):
    """Build Docs from JSON records, locating P/I/O and evidence strings in the abstract."""
    docs = []
    for d in data:
        doc = classes.Doc(d['pmid'], d['abstract'])
        for e in 'pio':
            for span in d[e]:
                for m in re.finditer(re.escape(span), doc.text):
                    doc.labels['NER_' + e].append(
                        classes.Span(m.start(), m.end(), span))
        for span in d.get('ev', []):
            for m in re.finditer(re.escape(span), doc.text):
                doc.labels['BERT_ev'].append(
                    classes.Span(m.start(), m.end(), span))
        doc.group = 'test'
        doc.parse_text()
        docs.append(doc)
    return docs
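# Input sketch for process_json_data (shape inferred from the reads above; the
# example values are assumptions): each record should look roughly like
#
#   {'pmid': '12345', 'abstract': '...',
#    'p': ['patients with ...'], 'i': ['aspirin'], 'o': ['mortality'],
#    'ev': ['evidence sentence ...']}
#
# where each p/i/o/ev value is a list of literal strings to locate in the abstract.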
def read_docs(abst_only=False):
    """Read evidence-inference prompts + annotations into Docs with one Frame per valid annotation."""
    Prompt = namedtuple("Prompt", "i c o")
    docs = {}
    prompts = {}
    print("Reading prompts + articles")
    for prompt in preprocessor.read_prompts().to_dict("records"):
        pmcid = prompt["PMCID"]
        if pmcid not in docs:
            docs[pmcid] = init_doc(pmcid, abst_only)
        pid = prompt["PromptID"]
        if pid not in prompts:
            prompts[pid] = Prompt(
                prompt["Intervention"], prompt["Comparator"], prompt["Outcome"])
    print(len(docs))
    print(len(prompts))

    n_anns = 0
    n_bad_offsets = 0
    print("Processing annotations")
    anns = preprocessor.read_annotations().to_dict("records")
    for ann in anns:
        if abst_only and not ann["In Abstract"]:
            continue
        if not ann["Annotations"]:
            continue
        ev = classes.Span(
            ann["Evidence Start"], ann["Evidence End"], ann["Annotations"])
        label = ann["Label"]
        doc = docs[ann["PMCID"]]
        prompt = prompts[ann["PromptID"]]
        # drop annotations whose character offsets don't match the doc text
        if doc.text[ev.i:ev.f] != ev.text:
            n_bad_offsets += 1
            continue
        n_anns += 1
        frame = classes.Frame(
            prompt.i.strip(), prompt.c.strip(), prompt.o.strip(), ev, label)
        doc.frames.append(frame)

    # discard docs that ended up with no frames
    pmcids_docs = list(docs.items())
    for pmcid, doc in pmcids_docs:
        if not doc.frames:
            del docs[pmcid]
    print("Retained {}/{} valid annotations ({} w/ bad offsets)".format(
        n_anns, len(anns), n_bad_offsets))
    print("Retained {}/{} docs with nonzero prompts".format(
        len(docs), len(pmcids_docs)))
    return list(docs.values())
def get_gold_entities(doc, assign_mentions=False):
    """Collapse spans with identical coref groups into Entities.

    Gold labels are expected to look like GOLD_<e>_<group-name> (group names
    have their underscores replaced upstream, so the split yields three parts).
    """
    entities = []
    for l in doc.labels:
        if l.startswith('GOLD_'):
            _, e, g_name = l.split('_')
            entity = classes.Entity(classes.Span(-1, -1, g_name), e)
            if assign_mentions:
                entity.mentions = doc.labels[l]
            entities.append(entity)
    return entities
def get_wp_spans(text):
    """Align subword tokens back to character offsets in `text`."""
    wp_tokens = tokenizer.tokenize(text)
    cur_i = 0
    wp_spans = []
    for t in wp_tokens:
        # strip '@@' continuation markers (BPE-style); a '##'-prefixed
        # WordPiece vocabulary would need t.strip('#') here instead
        t_text = t.strip('@@')
        i = text.find(t_text, cur_i)
        f = i + len(t_text)
        wp_spans.append(classes.Span(i, f, t))
        cur_i = f
    return wp_spans
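# Sanity-check sketch (hypothetical): every returned span should point back
# into the original string, i.e. text[s.i:s.f] reproduces the token minus its
# subword markers:
#
#   text = 'Patients received aspirin'
#   for s in get_wp_spans(text):
#       assert text[s.i:s.f] == s.text.strip('@@')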
def add_ic_ev_output(docs, group, fdir='../models/sentence_classifier/data/i_c_intro'):
    """Attach predicted I and C spans from the sentence classifier to each evidence sentence."""
    model_input = '{}/{}.tsv'.format(fdir, group)
    model_output = '{}/results/{}_results.tsv'.format(fdir, group)
    inputs = [l.strip().split('\t') for l in open(model_input).readlines()]
    outputs = [list(map(float, l.strip().split('\t'))) for l in open(model_output).readlines()]
    assert len(inputs) == len(outputs)

    # group candidate results by pmid and evidence-span offsets
    pmid_ev_map = defaultdict(lambda: defaultdict(list))
    for (_, pmid, ev_i, ev_f, i_i, i_f, i_text, context), class_probs in zip(inputs, outputs):
        result = {
            'class_probs': list(map(float, class_probs)),
            'idx_i': int(i_i),
            'idx_f': int(i_f),
            'text': i_text,
        }
        pmid_ev_map[pmid][(int(ev_i), int(ev_f))].append(result)

    for doc in docs:
        for (ev_i, ev_f), results in pmid_ev_map[doc.id].items():
            sents = [s for s in doc.labels['BERT_ev'] if s.i == ev_i and s.f == ev_f]
            assert len(sents) == 1
            sent = sents[0]
            # class index 2 scores I, index 1 scores C
            best_i = max(results, key=lambda r: r['class_probs'][2])
            best_c = max(results, key=lambda r: r['class_probs'][1])
            sent.pred_i = classes.Span(best_i['idx_i'], best_i['idx_f'], best_i['text'])
            sent.pred_c = classes.Span(best_c['idx_i'], best_c['idx_f'], best_c['text'])
            try:
                assert sent.pred_i.text == utils.clean_str(doc.text[sent.pred_i.i:sent.pred_i.f])
            except AssertionError:
                print('Mismatch for I when loading IC results...')
                print(sent.pred_i.text)
                print(utils.clean_str(doc.text[sent.pred_i.i:sent.pred_i.f]))
            try:
                assert sent.pred_c.text == utils.clean_str(doc.text[sent.pred_c.i:sent.pred_c.f])
            except AssertionError:
                print('Mismatch for C when loading IC results...')
                print(sent.pred_c.text)
                print(utils.clean_str(doc.text[sent.pred_c.i:sent.pred_c.f]))
            sent.pred_os = utils.s_overlaps(sent, doc.labels['NER_o'])
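# Expected file shapes (inferred from the parsing above): each line of
# <fdir>/<group>.tsv carries 8 tab-separated fields,
#
#   <_>  <pmid>  <ev_i>  <ev_f>  <i_i>  <i_f>  <i_text>  <context>
#
# and the matching line of <fdir>/results/<group>_results.tsv carries the
# class probabilities, with index 2 scoring I and index 1 scoring C.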
def add_ner_output(docs, ner_fname, verbose=True):
    """Attach NER model predictions (JSONL) to docs as NER_* spans."""
    if not docs[0].has_sf_lf_map():
        print('Warning: apply replace_acronyms first or the offsets may be wrong!')
    doc_lookup = {d.id: d for d in docs}
    rows = [json.loads(l.strip()) for l in open(ner_fname).readlines()]
    for row in rows:
        if row['pmid'] not in doc_lookup:
            continue
        doc = doc_lookup[row['pmid']]
        e_label_ranges = utils.condense_labels(row['pred_labels'], '0')
        for i, f, l in e_label_ranges:
            if l not in NER_LABEL_MAP:
                if verbose:
                    print('skipping ner data with unknown label: {}'.format(l))
                continue
            # map token indices back to character offsets
            text_i = row['offsets'][i][0]
            text_f = row['offsets'][f - 1][1]
            span = classes.Span(text_i, text_f, doc.text[text_i:text_f])
            doc.labels['NER_' + NER_LABEL_MAP[l]].append(span)
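# Input sketch (shape inferred from the reads above; label strings must be keys
# of NER_LABEL_MAP): each line of ner_fname is a JSON object along the lines of
#
#   {'pmid': '12345',
#    'pred_labels': ['0', '0', 'p', 'p', '0'],
#    'offsets': [[0, 8], [9, 17], [18, 22], [23, 29], [30, 35]]}
#
# with one label and one (start, end) character offset per token.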
def assign_exact_mention(entities, doc):
    """Populate entity mentions with exact string matches in the doc text."""
    for e in entities:
        e.mentions = []
        # escape the entity text so regex metacharacters are matched literally
        for m in re.finditer(re.escape(e.text), doc.text):
            e.mentions.append(classes.Span(m.start(), m.end(), e.text))
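# Usage sketch: pair this with get_gold_entities above (assumes doc is a
# classes.Doc that already carries GOLD_* coref labels):
#
#   entities = get_gold_entities(doc)
#   assign_exact_mention(entities, doc)
#   for e in entities:
#       print(e.text, len(e.mentions), 'exact-string mentions')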
def read_docs(glob_str=None, abst_only=True, check_errors=True):
    """Read coref-annotated docs (JSON span groups) plus frames from the exhaustive ICO csv."""
    fnames = glob.glob(glob_str)

    frames = defaultdict(list)
    for idx, frame in pd.read_csv('../data/exhaustive_ico_fixed.csv').iterrows():
        frames[str(frame.RowID)].append(frame)

    n_missing_ico = 0
    n_missing_ev = 0
    n_total = 0

    docs = []
    for fname in fnames:
        # annotation files are named <worker>_<pmid>_<suffix>
        worker, pmid, extra = fname.split('/')[-1].split('_')
        text, offsets = extract_text_and_offsets(pmid, abst_only)
        ann = json.load(open(fname))
        doc = classes.Doc(pmid, text)
        doc.max_xml = offsets[-1][1]
        doc.group = 'test'
        docs.append(doc)

        entity_group_ids = {}
        coref_spans = defaultdict(set)
        for e in 'io':
            for html_id, group_data in ann[e].items():
                group_name = group_data['name']
                # drop a leading numeric token from the group name
                name_tokens = group_name.split(' ')
                if name_tokens[0].isdigit():
                    group_name = ' '.join(name_tokens[1:])
                group_id = '{}_{}'.format(e, group_name.replace('_', '-'))
                for s in group_data['spans']:
                    if s['i'] == '-1' and s['f'] == '-1':
                        # offset-less spans name the entity; each text should
                        # map to a single coref group
                        try:
                            assert entity_group_ids.get(s['txt'], group_id) == group_id
                        except AssertionError:
                            if check_errors:
                                print(fname)
                                print(s['txt'])
                                print(group_id)
                                print(entity_group_ids.get(s['txt'], group_id))
                                input()
                            continue
                        entity_group_ids[s['txt']] = group_id
                    else:
                        text_i, text_f = xml_to_text(offsets, s['i'], s['f'], s['txt'], text)
                        if text_i == -1 or text_f == -1:
                            continue
                        coref_spans[group_id].add(classes.Span(text_i, text_f, s['txt']))

        for group_id, spans in coref_spans.items():
            doc.labels['GOLD_' + group_id] = list(spans)

        for frame in frames[pmid]:
            xml_i, xml_f = frame.xml_offsets.split(':')
            if not (xml_i.isdigit() and xml_f.isdigit()):
                continue
            xml_i, xml_f = int(xml_i), int(xml_f)
            if xml_f > doc.max_xml:
                continue
            n_total += 1
            ev_text = clean_html_str(frame.Reasoning)
            ev_i = text.find(ev_text)
            if ev_i < 0:
                n_missing_ev += 1
                continue
            # attach each ICO span to its coref group id
            try:
                i_span = classes.Span(-1, -1, frame.Intervention, entity_group_ids[frame.Intervention])
                c_span = classes.Span(-1, -1, frame.Comparator, entity_group_ids[frame.Comparator])
                o_span = classes.Span(-1, -1, frame.Outcome, entity_group_ids[frame.Outcome])
            except KeyError:
                n_missing_ico += 1
                continue
            ev_f = ev_i + len(ev_text)
            ev_span = classes.Span(ev_i, ev_f, ev_text)
            doc.frames.append(classes.Frame(i_span, c_span, o_span, ev_span, frame.Answer))

    print('Read coref groups for {} docs'.format(len(docs)))
    print('\t{}/{} frames w/ ico missing'.format(n_missing_ico, n_total))
    print('\t{}/{} frames w/ ev missing'.format(n_missing_ev, n_total))
    return docs
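# Usage sketch (the glob path is hypothetical; the <worker>_<pmid>_<suffix>
# file-name convention is assumed by the split above):
#
#   docs = read_docs(glob_str='../data/coref_anns/*_*_*.json')
#   print(sum(len(d.frames) for d in docs), 'frames read')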
def read_docs(abst_only=False):
    """Read evidence-inference docs, deduplicating frames with identical I/C/O and overlapping evidence."""
    Prompt = namedtuple('Prompt', 'i c o')
    docs = {}
    prompts = {}
    print('Reading prompts + articles')
    for prompt in preprocessor.read_prompts().to_dict('records'):
        pmcid = prompt['PMCID']
        if pmcid not in docs:
            docs[pmcid] = init_doc(pmcid, abst_only)
        pid = prompt['PromptID']
        if pid not in prompts:
            prompts[pid] = Prompt(
                prompt['Intervention'], prompt['Comparator'], prompt['Outcome'])
    print(len(docs))
    print(len(prompts))

    n_anns = 0
    n_bad_offsets = 0
    print('Processing annotations')
    anns = preprocessor.read_annotations().to_dict('records')
    for ann in anns:
        if abst_only and not ann['In Abstract']:
            continue
        if not ann['Annotations']:
            continue
        ev = classes.Span(ann['Evidence Start'], ann['Evidence End'], ann['Annotations'])
        doc = docs[ann['PMCID']]
        # drop annotations whose character offsets don't match the doc text
        if doc.text[ev.i:ev.f] != ev.text:
            n_bad_offsets += 1
            continue
        n_anns += 1
        prompt = prompts[ann['PromptID']]
        label = ann['Label']
        i = prompt.i.strip()
        c = prompt.c.strip()
        o = prompt.o.strip()
        # skip frames with identical I/C/O whose evidence overlaps an existing frame
        add_new_frame = True
        for f in doc.frames:
            if f.i.text == i and f.c.text == c and f.o.text == o:
                assert f.label == classes.Frame.get_encoded_label(label)
                if utils.s_overlap(f.ev, ev):
                    add_new_frame = False
        if add_new_frame:
            frame = classes.Frame(
                classes.Span(-1, -1, i),
                classes.Span(-1, -1, c),
                classes.Span(-1, -1, o),
                ev, label)
            doc.frames.append(frame)

    # discard docs that ended up with no frames
    pmcids_docs = list(docs.items())
    for pmcid, doc in pmcids_docs:
        if not doc.frames:
            del docs[pmcid]
    print('Retained {}/{} valid annotations ({} w/ bad offsets)'.format(
        n_anns, len(anns), n_bad_offsets))
    print('Retained {}/{} docs with nonzero prompts'.format(
        len(docs), len(pmcids_docs)))
    return list(docs.values())
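# Usage sketch:
#
#   docs = read_docs(abst_only=True)
#   print('{} docs, {} deduplicated frames'.format(
#       len(docs), sum(len(d.frames) for d in docs)))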