def get_neg_i(d, f):
    """Sample a negative intervention span for frame ``f`` in doc ``d``.

    Three strategies are used: the frame's outcome span, a synthetic
    "<i> vs. <c>" string, or a random 2-4 token window from the document.
    Each branch draws a fresh random number, so the effective split is
    0.5 / 0.375 / 0.125.
    """
    if random.random() <= 0.5:
        return f.o
    if random.random() <= 0.75:
        combined = '{} vs. {}'.format(f.i.text, f.c.text)
        return classes.Span(-1, -1, combined)
    start = random.randint(0, len(d.tokens) - 4)
    window = d.tokens[start:start + random.randint(2, 4)]
    return classes.Span(-1, -1, d.text[window[0].i:window[-1].f])
def read_docs(phase='starting_spans'):
    """Load EBM-NLP documents with gold P/I/O span labels.

    Reads the per-group pmid lists, each document's token file, and the
    aggregated annotation files for *phase*; returns classes.Doc objects
    whose doc.labels['GOLD_<p|i|o>'] hold character-offset Spans.
    """
    pmid_groups = {}
    for group in GROUPS:
        group_file = os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(group))
        for pmid in utils.readlines(group_file):
            pmid_groups[pmid] = group

    def ann_fname(pmid, e):
        # test-set gold annotations live under test/gold; all else under train
        if pmid_groups[pmid] == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        return os.path.join(
            config.EBM_NLP_DIR, 'annotations', 'aggregated', phase, e,
            subdir, '{}.AGGREGATED.ann'.format(pmid))

    docs = []
    for pmid, group in pmid_groups.items():
        token_path = os.path.join(
            config.EBM_NLP_DIR, 'documents', '{}.tokens'.format(pmid))
        text, token_offsets = utils.join_tokens(utils.readlines(token_path))
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(ann_fname(pmid, e))]
            # condense per-token labels into [token_i, token_f) runs, then
            # convert the run boundaries to character offsets
            for token_i, token_f, _ in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                span = classes.Span(char_i, char_f, text[char_i:char_f])
                doc.labels[label_name].append(span)
        docs.append(doc)
    return docs
def process_json_data(data):
    """Convert raw JSON records into classes.Doc objects.

    Each record supplies a pmid, an abstract, p/i/o span strings, and an
    optional 'ev' list; every literal occurrence of a span string in the
    abstract is recorded as a labeled Span.
    """
    docs = []
    for record in data:
        doc = classes.Doc(record['pmid'], record['abstract'])
        for e in 'pio':
            label = 'NER_' + e
            for span_text in record[e]:
                # re.escape: span strings are matched literally
                for match in re.finditer(re.escape(span_text), doc.text):
                    doc.labels[label].append(
                        classes.Span(match.start(), match.end(), span_text))
        for span_text in record.get('ev', []):
            for match in re.finditer(re.escape(span_text), doc.text):
                doc.labels['BERT_ev'].append(
                    classes.Span(match.start(), match.end(), span_text))
        doc.group = 'test'
        doc.parse_text()
        docs.append(doc)
    return docs
def read_docs(abst_only=False):
    """Read evidence-inference prompts and annotations into Doc objects.

    Builds one Doc per PMCID and one Prompt per PromptID, then attaches a
    Frame for every annotation whose stored evidence text matches the doc
    text at the recorded offsets. Docs with no frames are dropped.
    """
    Prompt = namedtuple("Prompt", "i c o")
    docs = {}
    prompts = {}

    print("Reading prompts + articles")
    for record in preprocessor.read_prompts().to_dict("records"):
        pmcid = record["PMCID"]
        if pmcid not in docs:
            docs[pmcid] = init_doc(pmcid, abst_only)
        pid = record["PromptID"]
        if pid not in prompts:
            prompts[pid] = Prompt(
                record["Intervention"], record["Comparator"], record["Outcome"])

    print(len(docs))
    print(len(prompts))

    n_anns = 0
    n_bad_offsets = 0
    print("Processing annotations")
    anns = preprocessor.read_annotations().to_dict("records")
    for ann in anns:
        if abst_only and not ann["In Abstract"]:
            continue
        if not ann["Annotations"]:
            continue

        ev = classes.Span(
            ann["Evidence Start"], ann["Evidence End"], ann["Annotations"])
        label = ann["Label"]
        doc = docs[ann["PMCID"]]
        prompt = prompts[ann["PromptID"]]

        # skip annotations whose recorded offsets don't reproduce the text
        if doc.text[ev.i:ev.f] != ev.text:
            n_bad_offsets += 1
            continue

        n_anns += 1
        doc.frames.append(classes.Frame(
            prompt.i.strip(), prompt.c.strip(), prompt.o.strip(), ev, label))

    n_docs_before = len(docs)
    for pmcid in [p for p, d in docs.items() if not d.frames]:
        del docs[pmcid]

    print("Retained {}/{} valid annotations ({} w/ bad offsets)".format(
        n_anns, len(anns), n_bad_offsets))
    print("Retained {}/{} docs with nonzero prompts".format(
        len(docs), n_docs_before))

    return list(docs.values())
def get_gold_entities(doc, assign_mentions = False):
	"""Build one Entity per GOLD_* coref-group label on *doc*.

	Label names follow GOLD_<e>_<group-name>; identical coref groups thus
	collapse into a single Entity. When *assign_mentions* is set, the
	label's spans are attached as the entity's mentions.
	"""
	entities = []
	for label in doc.labels:
		if not label.startswith('GOLD_'):
			continue
		_, e, g_name = label.split('_')
		entity = classes.Entity(classes.Span(-1, -1, g_name), e)
		if assign_mentions:
			entity.mentions = doc.labels[label]
		entities.append(entity)
	return entities
# Example #6
def get_wp_spans(text):
    """Map each wordpiece token of *text* back to a character span.

    The '@@' continuation marker is stripped before locating the piece in
    the raw text; each Span keeps the original (marked) token string.
    NOTE(review): if a piece cannot be found, str.find returns -1 and the
    offsets go wrong — presumably the tokenizer guarantees this never
    happens; confirm.
    """
    spans = []
    search_from = 0
    for token in tokenizer.tokenize(text):
        visible = token.strip('@@')
        start = text.find(visible, search_from)
        end = start + len(visible)
        spans.append(classes.Span(start, end, token))
        search_from = end
    return spans
def add_ic_ev_output(docs, group, fdir = '../models/sentence_classifier/data/i_c_intro'):
	"""Attach I/C classifier predictions to each doc's BERT_ev sentences.

	Reads the classifier's input tsv ({fdir}/{group}.tsv) and the aligned
	per-row class probabilities, groups results by (pmid, evidence
	offsets), and for each matching evidence sentence stores the argmax
	spans as sent.pred_i (class index 2) and sent.pred_c (class index 1),
	plus overlapping NER outcome spans as sent.pred_os. Text mismatches
	between predicted spans and the doc text are printed but not fatal.
	"""
	model_input = '{}/{}.tsv'.format(fdir, group)
	model_output = '{}/results/{}_results.tsv'.format(fdir, group)
	inputs = [l.strip().split('\t') for l in open(model_input).readlines()]
	outputs = [list(map(float, l.strip().split('\t'))) for l in open(model_output).readlines()]
	# the two files must be row-aligned for the zip below to be meaningful
	assert len(inputs) == len(outputs)
	pmid_ev_map = defaultdict(lambda: defaultdict(list))
	for (_, pmid, ev_i, ev_f, i_i, i_f, i_text, context), class_probs in zip(inputs, outputs):
		result = { \
				'class_probs': list(map(float, class_probs)),
				'idx_i': int(i_i),
				'idx_f': int(i_f),
				'text': i_text
		}
		pmid_ev_map[pmid][(int(ev_i), int(ev_f))].append(result)
	for doc in docs:
		for (ev_i, ev_f), results in pmid_ev_map[doc.id].items():
			# exactly one BERT_ev sentence should share these offsets
			sents = [s for s in doc.labels['BERT_ev'] if s.i == ev_i and s.f == ev_f]
			assert len(sents) == 1
			sent = sents[0]
			# highest probability wins: index 2 feeds pred_i, index 1 feeds pred_c
			best_i = max(results, key = lambda r: r['class_probs'][2])
			best_c = max(results, key = lambda r: r['class_probs'][1])
			sent.pred_i = classes.Span(best_i['idx_i'], best_i['idx_f'], best_i['text'])
			sent.pred_c = classes.Span(best_c['idx_i'], best_c['idx_f'], best_c['text'])
			# sanity-check predicted span text against the doc text; warn only
			try:
				assert sent.pred_i.text == utils.clean_str(doc.text[sent.pred_i.i:sent.pred_i.f])
			except AssertionError:
				print('Mismatch for I when loading IC results...')
				print(sent.pred_i.text)
				print(utils.clean_str(doc.text[sent.pred_i.i:sent.pred_i.f]))
			try:
				assert sent.pred_c.text == utils.clean_str(doc.text[sent.pred_c.i:sent.pred_c.f])
			except AssertionError:
				print('Mismatch for C when loading IC results...')
				print(sent.pred_c.text)
				print(utils.clean_str(doc.text[sent.pred_c.i:sent.pred_c.f]))
			sent.pred_os = utils.s_overlaps(sent, doc.labels['NER_o'])
def add_ner_output(docs, ner_fname, verbose = True):
	"""Load NER model predictions from *ner_fname* (jsonl) into doc.labels.

	Each row carries per-token pred_labels plus character offsets;
	condensed label runs become Spans under 'NER_<entity>'. Rows for
	unknown pmids and runs with unknown labels are skipped.
	"""
	if not docs[0].has_sf_lf_map():
		print('Warning: apply replace_acronyms first or the offsets may be wrong!')
	by_pmid = { d.id: d for d in docs }
	rows = [json.loads(line.strip()) for line in open(ner_fname).readlines()]
	for row in rows:
		pmid = row['pmid']
		if pmid not in by_pmid:
			continue
		doc = by_pmid[pmid]
		for i, f, label in utils.condense_labels(row['pred_labels'], '0'):
			if label not in NER_LABEL_MAP:
				if verbose: print('skipping ner data with unknown label: {}'.format(label))
				continue
			# map token-run boundaries [i, f) to character offsets
			text_i = row['offsets'][i][0]
			text_f = row['offsets'][f-1][1]
			doc.labels['NER_'+NER_LABEL_MAP[label]].append(
				classes.Span(text_i, text_f, doc.text[text_i:text_f]))
def assign_exact_mention(entities, doc):
	"""Populate e.mentions with every literal occurrence of e.text in doc.text.

	Fix: the entity text is passed through re.escape before re.finditer,
	so regex metacharacters in an entity name (e.g. '(', '+', '.') are
	matched literally instead of being interpreted as a pattern or raising
	re.error. This matches the escaping already used for span matching
	elsewhere in this module.
	"""
	for e in entities:
		e.mentions = []
		for m in re.finditer(re.escape(e.text), doc.text):
			e.mentions.append(classes.Span(m.start(), m.end(), e.text))
# Example #10
def read_docs(glob_str = None, abst_only = True, check_errors = True):
	"""Read coref-annotation json files matching *glob_str* into Docs.

	Coref span groups become doc.labels['GOLD_<e>_<group>']. Evidence
	frames are joined in from ../data/exhaustive_ico_fixed.csv (keyed by
	RowID == pmid); frames whose I/C/O text has no known coref group or
	whose evidence text cannot be found in the document are counted and
	skipped, with totals printed at the end.
	"""
	fnames = glob.glob(glob_str)
	# RowID -> list of frame rows from the exhaustive ICO csv
	frames = defaultdict(list)
	for idx, frame in pd.read_csv('../data/exhaustive_ico_fixed.csv').iterrows():
		frames[str(frame.RowID)].append(frame)

	n_missing_ico = 0
	n_missing_ev = 0
	n_total = 0

	docs = []
	for fname in fnames:
		# file name convention: <worker>_<pmid>_<extra>
		worker, pmid, exta = fname.split('/')[-1].split('_')
		text, offsets = extract_text_and_offsets(pmid, abst_only)
		ann = json.load(open(fname))
		doc = classes.Doc(pmid, text)
		doc.max_xml = offsets[-1][1]
		doc.group = 'test'
		docs.append(doc)

		# span text -> coref group id, collected from offset-less (-1,-1) spans
		entity_group_ids = {}
		coref_spans = defaultdict(set)
		for e in 'io':
			for group_id, (html_id, group_data) in enumerate(ann[e].items()):
				group_name = group_data['name']
				name_tokens = group_name.split(' ')
				# drop a leading numeric prefix from the group name
				if name_tokens[0].isdigit():
					group_name = ' '.join(name_tokens[1:])
				group_id = '{}_{}'.format(e, group_name.replace('_', '-'))
				for s in group_data['spans']:
					if s['i'] == '-1' and s['f'] == '-1':
						# offset-less span names the group; the same text must
						# not map to two different groups
						try:
							assert entity_group_ids.get(s['txt'], group_id) == group_id
						except AssertionError:
							if check_errors:
								print(fname)
								print(s['txt'])
								print(group_id)
								print(entity_group_ids.get(s['txt'], group_id))
								input()
							continue
						entity_group_ids[s['txt']] = group_id
					else:
						text_i, text_f = xml_to_text(offsets, s['i'], s['f'], s['txt'], text)
						# (-1, -1) from xml_to_text signals an unmappable span
						if text_i == -1 or text_f == -1:
							continue
						coref_spans[group_id].add(classes.Span(text_i, text_f, s['txt']))
		for group_id, spans in coref_spans.items():
			doc.labels['GOLD_'+group_id] = list(spans)

		for frame in frames[pmid]:
			xml_i, xml_f = frame.xml_offsets.split(':')
			if not (xml_i.isdigit() and xml_f.isdigit()):
				continue
			xml_i, xml_f = int(xml_i), int(xml_f)
			if xml_f > doc.max_xml:
				continue
			n_total += 1
			ev_text = clean_html_str(frame.Reasoning)
			ev_i = text.find(ev_text)
			if ev_i < 0:
				n_missing_ev += 1
				continue
			try:
				# NOTE(review): i_span is built from frame.Comparator and c_span
				# from frame.Intervention — this looks swapped relative to the
				# other read_docs variants; confirm whether it is intentional
				# before relying on frame.i/frame.c downstream.
				i_span = classes.Span(-1, -1, frame.Comparator, entity_group_ids[frame.Comparator])
				c_span = classes.Span(-1, -1, frame.Intervention, entity_group_ids[frame.Intervention])
				o_span = classes.Span(-1, -1, frame.Outcome, entity_group_ids[frame.Outcome])
			except KeyError:
				n_missing_ico += 1
				continue
			ev_f = ev_i + len(ev_text)
			ev_span = classes.Span(ev_i, ev_f, ev_text)
			frame = classes.Frame(i_span, c_span, o_span, ev_span, frame.Answer)
			doc.frames.append(frame)

	print('Read coref groups for {} docs'.format(len(docs)))
	print('\t{}/{} frames w/ ico missing'.format(n_missing_ico, n_total))
	print('\t{}/{} frames w/ ev  missing'.format(n_missing_ev,  n_total))
	return docs
def read_docs(abst_only=False):
    """Read evidence-inference prompts and annotations into Doc objects.

    De-duplicating variant: an annotation whose stripped (i, c, o) texts
    match an existing frame on the same doc, and whose evidence span
    overlaps that frame's evidence, is not added again. Docs that end up
    with no frames are dropped; retention counts are printed.
    """
    Prompt = namedtuple('Prompt', 'i c o')
    docs = {}
    prompts = {}

    print('Reading prompts + articles')
    for prompt in preprocessor.read_prompts().to_dict('records'):
        pmcid = prompt['PMCID']
        if pmcid not in docs:
            docs[pmcid] = init_doc(pmcid, abst_only)

        pid = prompt['PromptID']
        if pid not in prompts:
            prompts[pid] = Prompt(prompt['Intervention'], prompt['Comparator'],
                                  prompt['Outcome'])

    print(len(docs))
    print(len(prompts))

    n_anns = 0
    n_bad_offsets = 0
    print('Processing annotations')
    anns = preprocessor.read_annotations().to_dict('records')
    for ann in anns:
        if abst_only and not ann['In Abstract']:
            continue
        if not ann['Annotations']:
            continue
        ev = classes.Span(ann['Evidence Start'], ann['Evidence End'],
                          ann['Annotations'])
        doc = docs[ann['PMCID']]
        # drop annotations whose recorded offsets don't reproduce the text
        if doc.text[ev.i:ev.f] != ev.text:
            n_bad_offsets += 1
            continue
        n_anns += 1
        prompt = prompts[ann['PromptID']]
        label = ann['Label']
        i = prompt.i.strip()
        c = prompt.c.strip()
        o = prompt.o.strip()
        add_new_frame = True
        for f in doc.frames:
            if f.i.text == i and f.c.text == c and f.o.text == o:
                # the same (i, c, o) prompt must always carry the same label
                assert f.label == classes.Frame.get_encoded_label(label)
                if utils.s_overlap(f.ev, ev):
                    add_new_frame = False
        if add_new_frame:
            frame = classes.Frame( \
              classes.Span(-1, -1, i),
              classes.Span(-1, -1, c),
              classes.Span(-1, -1, o), ev, label)
            doc.frames.append(frame)

    pmcids_docs = list(docs.items())
    for pmcid, doc in pmcids_docs:
        if not doc.frames:
            del docs[pmcid]

    print('Retained {}/{} valid annotations ({} w/ bad offsets)'.format(\
      n_anns, len(anns), n_bad_offsets))
    print('Retained {}/{} docs with nonzero prompts'.format(
        len(docs), len(pmcids_docs)))

    return list(docs.values())