from spacy.pipeline import Sentencizer

from PyRuSH import RuSH

# cpredict, cset_annotations and csegment are PyRuSH's Cython sentence-segmentation
# helpers; their import is not shown in this snippet.


class PyRuSHSentencizer(Sentencizer):
    def __init__(self, rules_path: str = '', max_repeat: int = 50,
                 auto_fix_gaps: bool = True, token_compatible: bool = True) -> None:
        """
        @param rules_path: the path to the rule file, or the rule definitions themselves as a string.
        @param max_repeat: the maximum number of repetitions that the "+" wildcard is allowed to match.
        @param auto_fix_gaps: if gaps are caused by malformed rules, try to fix them.
        @param token_compatible: when True, this approach only works for spaCy >= 2.2.3.
            However, it has no control over sentence ends.
            TODO: need to see how the downstream spaCy components make use of doc.c
        """
        self.rules_path = rules_path
        self.token_compatible = token_compatible
        self.rush = RuSH(rule_str=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        return cls(**cfg)

    def __call__(self, doc):
        if self.token_compatible:
            tags = self.predict([doc])
            self.set_annotations([doc], tags)
            return doc
        else:
            doc = csegment(doc, self.rush.segToSentenceSpans(doc.text))
            return doc

    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without modifying them."""
        guesses = cpredict(docs, self.rush.segToSentenceSpans)
        return guesses

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """
        This function overrides spaCy's Sentencizer.
        @param batch_tag_ids: a list of each doc's tags (a list of boolean values)
        @param tensors: a placeholder for future extensions
        """
        cset_annotations(docs, batch_tag_ids, tensors)
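For orientation, a minimal pipeline sketch (not part of the original source): it assumes a spaCy 2.x environment, matching the spaCy >= 2.2.3 note in the docstring, and a hypothetical rules file at conf/rush_rules.tsv.

# Minimal usage sketch; assumes spaCy 2.x and an existing rules file.
# The path 'conf/rush_rules.tsv' is an illustrative assumption.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(PyRuSHSentencizer('conf/rush_rules.tsv'))

doc = nlp('S/p C6-7 ACDF. No urgent events overnight. Pain control ON.')
for sent in doc.sents:
    print('>' + sent.text + '<')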
import os
import unittest

from PyRuSH import RuSH


class TestRuSH(unittest.TestCase):
    def setUp(self):
        pwd = os.path.dirname(os.path.abspath(__file__))
        self.rush = RuSH(str(os.path.join(pwd, '../../conf/rush_rules.tsv')))

    def test1(self):
        input_str = 'Can Mr. K check it. Look\n good.\n'
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 19)
        assert (sentences[1].begin == 20 and sentences[1].end == 31)

    def test2(self):
        input_str = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 14)
        assert (sentences[1].begin == 15 and sentences[1].end == 42)
        assert (sentences[2].begin == 43 and sentences[2].end == 59)

    def test3(self):
        input_str = '''
•  Coagulopathy (HCC)
        •  Hepatic encephalopathy (HCC)
        •  Hepatorenal syndrome (HCC)
'''
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 1 and sentences[0].end == 22)
        assert (sentences[1].begin == 31 and sentences[1].end == 62)
        assert (sentences[2].begin == 71 and sentences[2].end == 100)

    def test4(self):
        input_str = 'Delirium - '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 10)

    def test5(self):
        input_str = "The patient complained about the TIA \n\n No memory issues. \"I \n\nOrdered the MRI scan.- "
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 36)
        assert (sentences[1].begin == 39 and sentences[1].end == 85)

    def printDetails(self, sentences, input_str):
        for i in range(0, len(sentences)):
            sentence = sentences[i]
            print('assert (sentences[' + str(i) + '].begin == ' + str(sentence.begin) +
                  ' and sentences[' + str(i) + '].end == ' + str(sentence.end) + ')')

    def test6(self):
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
def main(args):
    if len(args) < 3:
        sys.stderr.write("Required arguments: <input directory> <rest host> <output directory>\n")
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    # tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write('There were multiple valid xml files for file %s' % (text_name))
            sys.exit(-1)
        xml_name = xml_names[0]

        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            text = f.read()

        sentences = rush.segToSentenceSpans(text)
        sent_tokens = []
        for sentence in sentences:
            sent_txt = text[sentence.begin:sentence.end]
            sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url, json={'sent_tokens': sent_tokens})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0
        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]

            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write('In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                                 % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                event_start_offset = token_spans[begin_token_ind][0] + sentence.begin
                event_end_offset = token_spans[end_token_ind][1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]

                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((event_start_offset, event_end_offset), )
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)
                # print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind][0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind][1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                # create anafora entry
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((timex_start_offset, timex_end_offset), )
                annot.type = "TIMEX3"
                annot.properties['Class'] = time_class
                anafora_data.annotations.append(annot)
                # print("Found timex %s" % (timex_text))

        # break
        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
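The offset arithmetic above is the crux of this script: the token aligner returns sentence-relative character spans, and adding sentence.begin lifts them to document-level offsets. A small self-contained illustration, assuming align_tokens behaves like nltk.tokenize.util.align_tokens (the original's import is not shown):

# Illustration of the offset arithmetic above; assumes align_tokens behaves
# like nltk.tokenize.util.align_tokens (the original snippet's import is not shown).
from nltk.tokenize.util import align_tokens

sent_txt = 'Pain control ON.'           # a sentence extracted via sentence.begin/end
tokens = ['Pain', 'control', 'ON', '.']
spans = align_tokens(tokens, sent_txt)  # [(0, 4), (5, 12), (13, 15), (15, 16)]

# Lifting sentence-relative spans to document-level offsets:
sentence_begin = 43  # e.g. where this sentence starts in the full document
doc_spans = [(b + sentence_begin, e + sentence_begin) for b, e in spans]
print(doc_spans)     # [(43, 47), (48, 55), (56, 58), (58, 59)]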
def main(args):
    if len(args) < 3:
        sys.stderr.write("Required arguments: <input directory> <rest host> <output directory>\n")
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    combine_sentences = True
    token_threshold = 100

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write('There were multiple valid xml files for file %s\n' % (text_name))
            filtered_names = []
            for xml_name in xml_names:
                if 'Relation' in xml_name:
                    filtered_names.append(xml_name)
            if len(filtered_names) == 1:
                sys.stderr.write('Picking the file with "Relation" in the title: %s\n' % (filtered_names[0]))
                xml_names = filtered_names
            else:
                sys.exit(-1)
        xml_name = xml_names[0]

        section_texts = []
        sentences = []
        text = ''
        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            cur_section = []
            cur_ind = 0
            section_start = 0
            for line in f.readlines():
                text += line
                line_len = len(line)
                line = line.rstrip()
                if line.startswith('[meta') or line.startswith('[start section') or line.startswith('[end section'):
                    if len(cur_section) > 0:
                        section_texts.append('\n'.join(cur_section))
                        section_text = '\n'.join(cur_section)
                        section_sents = rush.segToSentenceSpans(section_text)
                        if len(section_sents) > 0:
                            section_sents[0].text = '<section>'
                            # section_sents[-1].text = '</section>'
                        for section_sent in section_sents:
                            section_sent.begin += section_start
                            section_sent.end += section_start
                        sentences.extend(section_sents)
                    cur_section = []
                    section_start = cur_ind + line_len
                else:
                    cur_section.append(line)
                cur_ind += line_len

        # sentences = rush.segToSentenceSpans(text)
        sent_tokens = []
        merged_sentences = []
        if combine_sentences:
            for sentence_ind, sentence in enumerate(sentences):
                sent_txt = text[sentence.begin:sentence.end]
                if tb_tokenize:
                    raw_tokens = tokenizer.tokenize(sent_txt)
                    # From https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer.span_tokenize
                    # Convert converted quotes back to original double quotes
                    # Do this only if original text contains double quote(s) or double
                    # single-quotes (because '' might be transformed to `` if it is
                    # treated as starting quotes).
                    if ('"' in sent_txt) or ("''" in sent_txt):
                        # Find double quotes and converted quotes
                        matched = [m.group() for m in re.finditer(r"``|'{2}|\"", sent_txt)]
                        # Replace converted quotes back to double quotes
                        tokens = [
                            matched.pop(0) if tok in ['"', "``", "''"] else tok
                            for tok in raw_tokens
                        ]
                    else:
                        tokens = raw_tokens
                else:
                    tokens = tokenize(sent_txt)

                # fix apostrophe s ('s) to be one token
                def fix_simple_tokenize(tokens):
                    new_tokens = []
                    ind = 0
                    while ind < len(tokens):
                        if tokens[ind] == "'" and ind + 1 < len(tokens) and tokens[ind + 1] == 's':
                            new_tokens.append("'s")
                            ind += 2
                        else:
                            new_tokens.append(tokens[ind])
                            ind += 1
                    return new_tokens

                tokens = fix_simple_tokenize(tokens)

                if text[sentence.end] == '\n':
                    tokens.append('<cr>')
                # print("Sentence number %d has %d tokens" % (sentence_ind, len(tokens)))

                if len(sent_tokens) > 0 and (len(sent_tokens[-1]) + len(tokens)) < token_threshold and sentence.text == '':
                    sent_tokens[-1].extend(tokens)
                    merged_sentences[-1].end = sentence.end
                else:
                    sent_tokens.append(tokens)
                    merged_sentences.append(sentence)

            for tokens in sent_tokens:
                while tokens[-1] == "<cr>":
                    tokens.pop()
            sentences = merged_sentences
        else:
            for sentence in sentences:
                sent_txt = text[sentence.begin:sentence.end]
                sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url, json={'sent_tokens': sent_tokens, 'metadata': text_name})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0
        rel_id = 0
        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            sent_rels = json['relations'][sent_ind]
            event_ids = []
            timex_ids = []

            meta_rev_loc = sent_txt.find('[meta rev_date')
            if meta_rev_loc >= 0:
                meta_rev_end = sent_txt.find(']', meta_rev_loc)
                meta_rev_loc += sentence.begin
                meta_rev_end += sentence.begin

            # Replace <cr> with empty string so that tokens align again,
            # then after alignment add them back in so token offsets from classifier are correct.
            cr_token_inds = []
            num_crs_at_position = []
            for ind in range(len(sent_tokens[sent_ind])):
                num_crs_at_position.append(len(cr_token_inds))
                if sent_tokens[sent_ind][ind] == '<cr>':
                    cr_token_inds.append(ind)
                    sent_tokens[sent_ind][ind] = ''

            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write('In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                                 % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                event_start_offset = token_spans[begin_token_ind + num_crs_at_position[begin_token_ind]][0] + sentence.begin
                event_end_offset = token_spans[end_token_ind + num_crs_at_position[end_token_ind]][1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]

                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                if event_text.endswith('_date'):
                    annot.properties['datesectiontime'] = 'True'
                    event_ids.append(-1)
                else:
                    event_ids.append(annot.id)
                annot.spans = ((event_start_offset, event_end_offset), )
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)
                cur_id += 1
                # print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind + num_crs_at_position[begin_token_ind]][0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind + num_crs_at_position[end_token_ind]][1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                if meta_rev_loc >= 0 and timex_start_offset > meta_rev_loc and timex_end_offset < meta_rev_end:
                    timex_ids.append(-1)
                elif time_class == 'SECTIONTIME':
                    timex_ids.append(-1)
                elif re.match(r'\d{5}', timex_text) is not None:
                    timex_ids.append(-1)
                else:
                    # create anafora entry
                    annot = AnaforaEntity()
                    annot.id = str(cur_id) + "@e@" + text_name
                    timex_ids.append(annot.id)
                    cur_id += 1
                    annot.spans = ((timex_start_offset, timex_end_offset), )
                    annot.type = "TIMEX3"
                    annot.properties['Class'] = time_class
                    anafora_data.annotations.append(annot)
                    # print("Found timex %s" % (timex_text))

            if 'path' not in text_name.lower():
                # no relations in pathology notes, so if we find any they are false positives.
                for rel in sent_rels:
                    arg1_type, arg1_ind = rel['arg1'].split('-')
                    arg2_type, arg2_ind = rel['arg2'].split('-')
                    if arg1_type == 'EVENT':
                        arg1 = event_ids[int(arg1_ind)]
                    elif arg1_type == 'TIMEX':
                        arg1 = timex_ids[int(arg1_ind)]
                    if arg1 == -1:
                        continue
                    if arg2_type == 'EVENT':
                        arg2 = event_ids[int(arg2_ind)]
                    elif arg2_type == 'TIMEX':
                        arg2 = timex_ids[int(arg2_ind)]
                    if arg2 == -1:
                        continue

                    reln = AnaforaRelation()
                    reln.id = str(rel_id) + '@r@' + text_name
                    rel_id += 1
                    reln.type = 'TLINK'
                    reln.properties['Type'] = rel['category']
                    reln.properties['Source'] = arg1
                    reln.properties['Target'] = arg2
                    anafora_data.annotations.append(reln)

        # break
        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
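The <cr> bookkeeping in the function above is easy to misread, so here is a tiny trace of just that loop with a hypothetical token list (not from the original):

# Trace of the <cr> bookkeeping loop above, with a hypothetical token list.
tokens = ['Advair', '<cr>', 'Xopenex', 'q.i.d.']
cr_token_inds = []
num_crs_at_position = []
for ind in range(len(tokens)):
    num_crs_at_position.append(len(cr_token_inds))
    if tokens[ind] == '<cr>':
        cr_token_inds.append(ind)
        tokens[ind] = ''

print(tokens)               # ['Advair', '', 'Xopenex', 'q.i.d.']
print(num_crs_at_position)  # [0, 0, 1, 1]
# Blanking '<cr>' lets the alignment succeed on text that contains no literal
# '<cr>' marker, and num_crs_at_position shifts the classifier's token indices
# past the removed markers when mapping back to character spans.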
import os
import unittest

from PyRuSH import RuSH


class TestRuSH(unittest.TestCase):
    def setUp(self):
        self.pwd = os.path.dirname(os.path.abspath(__file__))
        self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), enable_logger=True)

    def test1(self):
        input_str = 'Can Mr. K check it. Look\n good.\n'
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 19)
        assert (sentences[1].begin == 20 and sentences[1].end == 31)

    def test2(self):
        input_str = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 14)
        assert (sentences[1].begin == 15 and sentences[1].end == 42)
        assert (sentences[2].begin == 43 and sentences[2].end == 59)

    def test3(self):
        input_str = '''
•  Coagulopathy (HCC)
        •  Hepatic encephalopathy (HCC)
        •  Hepatorenal syndrome (HCC)
'''
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 1 and sentences[0].end == 22)
        assert (sentences[1].begin == 31 and sentences[1].end == 62)
        assert (sentences[2].begin == 71 and sentences[2].end == 100)

    def test4(self):
        input_str = 'Delirium - '
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 8)

    def test5(self):
        input_str = "The patient complained about the TIA \n\n No memory issues. \"I \n\nOrdered the MRI scan.- "
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 36)
        assert (sentences[1].begin == 39 and sentences[1].end == 57)
        assert (sentences[2].begin == 58 and sentences[2].end == 84)

    def printDetails(self, sentences, input_str):
        for i in range(0, len(sentences)):
            sentence = sentences[i]
            print('assert (sentences[' + str(i) + '].begin == ' + str(sentence.begin) +
                  ' and sentences[' + str(i) + '].end == ' + str(sentence.end) + ')')
        for i in range(0, len(sentences)):
            sentence = sentences[i]
            print(input_str[sentence.begin:sentence.end])

    def test6(self):
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)

    def test7(self):
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        rules = []
        rules.append(r'\b(\a 0 stbegin')
        rules.append(r'\a\e 2 stend')
        rules.append(r'. +(This 0 stbegin')
        rules.append(r'](. 2 stend')
        rush = RuSH(rules, enable_logger=True)
        sentences = rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)

    def test_doc2(self):
        input_str = '''
9. Advair b.i.d.
10. Xopenex q.i.d. and p.r.n.
I will see her in a month to six weeks.
She is to follow up with Dr. X before that.
'''
        self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')),
                         min_sent_chars=2, enable_logger=True)
        sentences = self.rush.segToSentenceSpans(input_str)
        for sent in sentences:
            print('>' + input_str[sent.begin:sent.end] + '<\n')
        assert (len(sentences) == 4)
        sent = sentences[1]
        assert (input_str[sent.begin:sent.end] == '10. Xopenex q.i.d. and p.r.n.')