# Imports these readers rely on. The local modules (classes, config, utils,
# preprocessor) and the module-level GROUPS constant are assumed to be
# available elsewhere in this package.
import glob
import json
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd

import classes
import config
import preprocessor
import utils


def read_docs(phase='starting_spans'):
    """Read EBM-NLP documents and attach gold PIO spans as character offsets."""
    pmid_groups = {}
    for g in GROUPS:
        pmids = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(g)))
        for pmid in pmids:
            pmid_groups[pmid] = g

    def get_e_fname(pmid, e):
        # Test-set annotations live under test/gold; everything else under train.
        if pmid_groups[pmid] == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        f = '{}.AGGREGATED.ann'.format(pmid)
        return os.path.join(config.EBM_NLP_DIR, 'annotations', 'aggregated',
                            phase, e, subdir, f)

    docs = []
    for pmid, group in pmid_groups.items():
        tokens = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'documents',
                         '{}.tokens'.format(pmid)))
        text, token_offsets = utils.join_tokens(tokens)
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(get_e_fname(pmid, e))]
            # Condense per-token labels into contiguous character-level spans.
            for token_i, token_f, l in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                doc.labels[label_name].append(
                    classes.Span(char_i, char_f, text[char_i:char_f]))
        docs.append(doc)
    return docs

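# Example usage (a sketch; assumes GROUPS and config.EBM_NLP_DIR point at a
# local EBM-NLP checkout). doc.group and doc.labels['GOLD_p'] are populated
# by the reader above:
#
#     ebm_docs = read_docs(phase='starting_spans')
#     for d in ebm_docs[:3]:
#         print(d.group, len(d.labels['GOLD_p']))
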
def init_doc(pmcid, abst_only):
    article = preprocessor.get_article(pmcid)
    if abst_only:
        # Prepend the same title header the preprocessor adds so that
        # character offsets line up with the full-text version.
        text = "TITLE:\n{}\n\n\n\n{}".format(article.get_title(),
                                             extract_raw_abstract(article))
    else:
        text = preprocessor.extract_raw_text(article)
    doc = classes.Doc(pmcid, text)
    return doc

def read_covid(data_dir='../data/cures_within_reach/covid'):
    df = pd.read_csv('{}/covid_docs.csv'.format(data_dir))
    docs = []
    for idx, r in df.iterrows():
        # Missing abstracts come through as NaN; fall back to an empty string.
        abst = r.Abstract if isinstance(r.Abstract, str) else ''
        doc = classes.Doc(r.EntrezUID, '{}\n\n{}'.format(r.Title, abst))
        doc.parse_text()
        doc.group = 'test'
        docs.append(doc)
    return docs

def get_entrez_docs(
        fname='../data/cures_within_reach/entrez_downloads/docs.csv'):
    df = pd.read_csv(fname)
    docs = []
    for idx, r in df.iterrows():
        doc = classes.Doc(r.pmid, r.abst)
        doc.title = r.title
        doc.group = 'test'
        doc.parse_text()
        docs.append(doc)
    return docs

def read_55(data_dir='../data/cures_within_reach/55_sample'):
    df = pd.read_csv('{}/55_sample.csv'.format(data_dir))
    # Some column names carry leading whitespace in the source CSV.
    df.rename(columns={c: c.lstrip() for c in df.columns}, inplace=True)
    docs = []
    for idx, r in df.iterrows():
        doc = classes.Doc(r.pmid, r.abstract)
        doc.qp = r.disease
        doc.qi = r.drugs
        doc.parse_text()
        doc.group = 'test'
        docs.append(doc)
    return docs

def read_eric_docs(data_dir='../data/cures_within_reach/eric_data'):
    fnames = glob.glob('{}/*.text'.format(data_dir))
    # splitext drops the '.text' suffix; str.strip('.text') would also eat
    # trailing 't'/'e'/'x' characters from the id itself.
    docs = [
        classes.Doc(os.path.splitext(os.path.basename(f))[0], open(f).read())
        for f in fnames
    ]
    docs = [d for d in docs if d.text]
    for d in docs:
        d.parse_text()
        d.group = 'test'
        d.sf_lf_map = {}  # acronyms already expanded upstream
    return docs

def read_shard_docs(data_dir):
    print('\t\tcreating Docs for {}'.format(data_dir))
    fnames = glob.glob('{}/*.text'.format(data_dir))
    docs = [
        classes.Doc(os.path.splitext(os.path.basename(f))[0], open(f).read())
        for f in fnames
    ]
    docs = [d for d in docs if d.text]
    for d in docs:
        d.parse_text()
        d.group = 'test'
        d.sf_lf_map = {}  # acronyms already expanded upstream
    return docs

def process_json_data(data):
    docs = []
    for d in data:
        doc = classes.Doc(d['pmid'], d['abstract'])
        # Project each annotated string back onto the abstract by exact match.
        for e in 'pio':
            for span in d[e]:
                for m in re.finditer(re.escape(span), doc.text):
                    doc.labels['NER_' + e].append(
                        classes.Span(m.start(), m.end(), span))
        for span in d.get('ev', []):
            for m in re.finditer(re.escape(span), doc.text):
                doc.labels['BERT_ev'].append(
                    classes.Span(m.start(), m.end(), span))
        doc.group = 'test'
        doc.parse_text()
        docs.append(doc)
    return docs

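# Example input for process_json_data (a sketch; the values are made up, but
# the keys match what the function reads):
#
#     data = [{
#         'pmid': '12345',
#         'abstract': 'Patients with X were given Y ...',
#         'p': ['Patients with X'],
#         'i': ['Y'],
#         'o': [],
#         'ev': [],
#     }]
#     docs = process_json_data(data)
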
def generate_shard_files():
    print('Reading trial_annotations.csv')
    df = pd.read_csv(
        '/home/ben/Desktop/forked_trialstreamer/trialstreamer/data/trial_annotations.csv'
    )
    start_idx = 550000
    shard_size = 10000
    for i, f in list(
            zip(range(start_idx, len(df), shard_size),
                range(start_idx + shard_size, len(df), shard_size))):
        print('parsing shard {}_{}'.format(i, f))
        os.makedirs('../data/trialstreamer/{}_{}'.format(i, f), exist_ok=True)
        # .ix has been removed from pandas; iloc slices rows [i, f), matching
        # the shard boundaries above.
        for idx, r in df.iloc[i:f].iterrows():
            if not isinstance(r.ab, str):
                continue
            d = classes.Doc(idx, r.ab)
            d.replace_acronyms()
            with open('../data/trialstreamer/{}_{}/{}.text'.format(i, f, idx), 'w') as fp:
                fp.write(d.text)
            with open('../data/trialstreamer/{}_{}/{}.title'.format(i, f, idx), 'w') as fp:
                fp.write(r.ti)

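# The shard directories written above can be read back with read_shard_docs,
# e.g. (hypothetical shard range):
#
#     docs = read_shard_docs('../data/trialstreamer/550000_560000')
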
def process_covid_data():
    top = '../data/covid/'
    fnames = glob.glob('{}/json/*/*.json'.format(top))
    print('Processing {} files...'.format(len(fnames)))
    docs = []
    for f in fnames:
        j = json.load(open(f))
        pmid = j['paper_id']
        title = j['metadata']['title']
        abst = '\n\n'.join([p['text'] for p in j['abstract']])
        body = '\n\n'.join([p['text'] for p in j['body_text']])
        text = '\n\n\n'.join([abst, body])
        doc = classes.Doc(pmid, text)
        doc.group = 'test'
        docs.append(doc)
        # Cache the raw pieces alongside the parsed docs.
        with open('{}/docs/{}.abst'.format(top, pmid), 'w') as fp:
            fp.write(abst)
        with open('{}/docs/{}.body'.format(top, pmid), 'w') as fp:
            fp.write(body)
        with open('{}/docs/{}.title'.format(top, pmid), 'w') as fp:
            fp.write(title)
    return docs

def process_eric_data():
    df = pd.read_csv('../data/cures_within_reach/cwr.csv')
    # Keep only rows with a relevance judgment and a string outcome match.
    df = df[~df.Relevant.apply(np.isnan)]
    df = df[df['Matched.Outcome..Word.Embeddings.'].apply(
        lambda o: isinstance(o, str))]
    docs = {}
    for idx, r in df.iterrows():
        if r.PMID in docs:
            print('Ignoring dupe id: {}'.format(r.PMID))
            continue
        if not isinstance(r.Abstract, str):
            continue
        text = r.Abstract.replace('\r', '')
        text = re.sub(r'\n+', '\n', text)
        doc = classes.Doc(r.PMID, text)
        doc.group = 'test'
        with open('../data/cures_within_reach/{}.text'.format(r.PMID), 'w') as fp:
            fp.write(doc.text)
        with open('../data/cures_within_reach/{}.title'.format(r.PMID), 'w') as fp:
            fp.write(r.Title)
        p_match = r['Article.Population..Word.Embeddings.']
        i_match = r['Article.Intervention..Word.Embeddings.']
        o_match = r['Article.Outcome..Word.Embeddings.']
        p_query = r['Matched.Population..Word.Embeddings.']
        i_query = r['Matched.Intervention..Word.Embeddings.']
        o_query = r['Matched.Outcome..Word.Embeddings.']
        doc.query = (p_query, i_query, o_query)
        doc.match = (p_match, i_match, o_match)
        doc.relevant = r.Relevant
        docs[r.PMID] = doc
    return list(docs.values())

def read_docs(glob_str=None, abst_only=True, check_errors=True):
    # Note: this shares a name with read_docs above; if both definitions live
    # in the same module, this one shadows the earlier one.
    fnames = glob.glob(glob_str)
    frames = defaultdict(list)
    for idx, frame in pd.read_csv('../data/exhaustive_ico_fixed.csv').iterrows():
        frames[str(frame.RowID)].append(frame)

    n_missing_ico = 0
    n_missing_ev = 0
    n_total = 0

    docs = []
    for fname in fnames:
        worker, pmid, extra = fname.split('/')[-1].split('_')
        text, offsets = extract_text_and_offsets(pmid, abst_only)
        ann = json.load(open(fname))
        doc = classes.Doc(pmid, text)
        doc.max_xml = offsets[-1][1]
        doc.group = 'test'
        docs.append(doc)

        entity_group_ids = {}
        coref_spans = defaultdict(set)
        for e in 'io':
            for group_id, (html_id, group_data) in enumerate(ann[e].items()):
                group_name = group_data['name']
                name_tokens = group_name.split(' ')
                if name_tokens[0].isdigit():
                    group_name = ' '.join(name_tokens[1:])
                group_id = '{}_{}'.format(e, group_name.replace('_', '-'))
                for s in group_data['spans']:
                    if s['i'] == '-1' and s['f'] == '-1':
                        # Offset-free spans only name an entity; make sure the
                        # same text never maps to two different groups.
                        try:
                            assert entity_group_ids.get(s['txt'], group_id) == group_id
                        except AssertionError:
                            if check_errors:
                                print(fname)
                                print(s['txt'])
                                print(group_id)
                                print(entity_group_ids.get(s['txt'], group_id))
                                input()
                            continue
                        entity_group_ids[s['txt']] = group_id
                    else:
                        text_i, text_f = xml_to_text(offsets, s['i'], s['f'], s['txt'], text)
                        if text_i == -1 or text_f == -1:
                            continue
                        coref_spans[group_id].add(classes.Span(text_i, text_f, s['txt']))

        for group_id, spans in coref_spans.items():
            doc.labels['GOLD_' + group_id] = list(spans)

        for frame in frames[pmid]:
            xml_i, xml_f = frame.xml_offsets.split(':')
            if not (xml_i.isdigit() and xml_f.isdigit()):
                continue
            xml_i, xml_f = int(xml_i), int(xml_f)
            if xml_f > doc.max_xml:
                continue
            n_total += 1
            ev_text = clean_html_str(frame.Reasoning)
            ev_i = text.find(ev_text)
            if ev_i < 0:
                n_missing_ev += 1
                continue
            try:
                i_span = classes.Span(-1, -1, frame.Comparator, entity_group_ids[frame.Comparator])
                c_span = classes.Span(-1, -1, frame.Intervention, entity_group_ids[frame.Intervention])
                o_span = classes.Span(-1, -1, frame.Outcome, entity_group_ids[frame.Outcome])
            except KeyError:
                n_missing_ico += 1
                continue
            ev_f = ev_i + len(ev_text)
            ev_span = classes.Span(ev_i, ev_f, ev_text)
            frame = classes.Frame(i_span, c_span, o_span, ev_span, frame.Answer)
            doc.frames.append(frame)

    print('Read coref groups for {} docs'.format(len(docs)))
    print('\t{}/{} frames w/ ico missing'.format(n_missing_ico, n_total))
    print('\t{}/{} frames w/ ev missing'.format(n_missing_ev, n_total))
    return docs

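# Example usage (a sketch; the glob pattern is hypothetical and should point
# at the worker_pmid_* annotation JSON files this reader expects):
#
#     docs = read_docs(glob_str='../data/annotations/*_*_*.json', abst_only=True)
#     print(len(docs), sum(len(d.frames) for d in docs))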