Ejemplo n.º 1
 def test7(self):
     """Segment a two-sentence passage with rules supplied inline and print the resulting spans."""
     input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
     # The rule strings are handed to RuSH directly instead of a rule-file path.
     rules = [
         r'\b(\a	0	stbegin',
         r'\a\e	2	stend',
         r'. +(This	0	stbegin',
         r'](.	2	stend',
     ]
     segmenter = RuSH(rules, enable_logger=True)
     spans = segmenter.segToSentenceSpans(input_str)
     self.printDetails(spans, input_str)
Ejemplo n.º 2
    def __init__(self, rules_path: str = '', max_repeat: int = 50, auto_fix_gaps: bool = True,
                 token_compatible: bool = True) -> None:
        """
        Store the configuration and build the underlying RuSH segmenter.

        @param rules_path: The string of the rule file path or rules themselves.
        @param max_repeat: Total number of replicates that allows to be handled by "+" wildcard.
        @param auto_fix_gaps: If gaps are caused by malcrafted rules, try to fix them.
        @param token_compatible: when true, this approach only works for spacy >=2.2.3.
            However, this has no control of sentence end.
            TODO: need to see how the downstream spacy components make use of doc.c
        """
        self.rules_path = rules_path
        self.token_compatible = token_compatible
        # rules_path is forwarded as rule_str: RuSH receives either a path or the rules themselves.
        self.rush = RuSH(rule_str=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)
Ejemplo n.º 3
    def test_doc2(self):
        """Numbered medication lines followed by free text must split into four sentences."""
        # Whitespace is part of the fixture, so it is written with explicit
        # escapes rather than relying on a multi-line literal.
        input_str = ('  \n'
                     '9.  Advair b.i.d.\n'
                     '10.  Xopenex q.i.d. and p.r.n.\n'
                     'I will see her in a month to six weeks.  She is to follow up with Dr. X before that.\n')
        # NOTE(review): the original keyword arguments of this call are not
        # visible in this listing; enable_logger=True is an assumption — confirm.
        self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')),
                         enable_logger=True)
        sentences = self.rush.segToSentenceSpans(input_str)
        for sent in sentences:
            print('>' + input_str[sent.begin:sent.end] + '<\n')
        assert len(sentences) == 4
        sent = sentences[1]
        assert input_str[sent.begin:sent.end] == '10.  Xopenex q.i.d. and p.r.n.'
Ejemplo n.º 4
class PyRuSHSentencizer(Sentencizer):
    """Sentencizer subclass whose sentence boundaries come from a RuSH rule set."""

    def __init__(self, rules_path: str = '', max_repeat: int = 50, auto_fix_gaps: bool = True,
                 token_compatible: bool = True) -> None:
        """
        Store the configuration and build the underlying RuSH segmenter.

        @param rules_path: The string of the rule file path or rules themselves.
        @param max_repeat: Total number of replicates that allows to be handled by "+" wildcard.
        @param auto_fix_gaps: If gaps are caused by malcrafted rules, try to fix them.
        @param token_compatible: when true, this approach only works for spacy >=2.2.3.
            However, this has no control of sentence end.
            TODO: need to see how the downstream spacy components make use of doc.c
        """
        self.rules_path = rules_path
        self.token_compatible = token_compatible
        # rules_path is forwarded as rule_str: RuSH receives either a path or the rules themselves.
        self.rush = RuSH(rule_str=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        """
        Alternate constructor: build the component from keyword configuration.

        @param nlp: accepted for signature compatibility; not used here.
        @param cfg: keyword arguments forwarded unchanged to the constructor.
        """
        return cls(**cfg)

    def __call__(self, doc):
        """
        Set sentence boundaries on a single doc and return it.

        @param doc: the document to segment.
        """
        if self.token_compatible:
            tags = self.predict([doc])
            self.set_annotations([doc], tags)
            return doc
        # Fallback path: pass the RuSH spans for the raw text to csegment.
        return csegment(doc, self.rush.segToSentenceSpans(doc.text))

    def predict(self, docs):
        """
        Apply the pipeline's model to a batch of docs, without
        modifying them.

        @param docs: the batch of docs to segment.
        """
        guesses = cpredict(docs, self.rush.segToSentenceSpans)
        return guesses

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """
        This function overwrites spacy's Sentencizer.

        @param docs: the docs whose annotations are set.
        @param batch_tag_ids: a list of doc's tags (a list of boolean values)
        @param tensors: a place holder for future extensions
        """
        cset_annotations(docs, batch_tag_ids, tensors)
Ejemplo n.º 5
 def setUp(self):
     """Build the RuSH segmenter from the rule file addressed relative to this module."""
     here = os.path.abspath(__file__)
     rule_file = os.path.join(os.path.dirname(here), '../../conf/rush_rules.tsv')
     self.rush = RuSH(str(rule_file))
Ejemplo n.º 6
class TestRuSH(unittest.TestCase):
    """Span-offset regression tests for RuSH using the conf/rush_rules.tsv rule file."""

    def setUp(self):
        """Build a segmenter from the rule file addressed relative to this module."""
        pwd = os.path.dirname(os.path.abspath(__file__))
        self.rush = RuSH(str(os.path.join(pwd, '../../conf/rush_rules.tsv')))

    def test1(self):
        """An abbreviation period ('Mr.') must not end the sentence."""
        input_str = 'Can Mr. K check it. Look\n good.\n'
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 19
        assert sentences[1].begin == 20 and sentences[1].end == 31

    def test2(self):
        """Three short sentences separated by single spaces."""
        input_str = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 14
        assert sentences[1].begin == 15 and sentences[1].end == 42
        assert sentences[2].begin == 43 and sentences[2].end == 59

    def test3(self):
        """Bulleted list items are expected to come back as separate sentences."""
        # Trailing blanks and the whitespace-only separator lines are part of
        # the fixture: the asserted offsets (31 and 71) only hold when each
        # separator line contains two spaces, so the text is written with
        # explicit escapes instead of a multi-line literal.
        # NOTE(review): the text after the last bullet is not visible in the
        # original listing; it does not enter the asserted offsets — confirm.
        input_str = (' •  Coagulopathy (HCC)    \n'
                     '  \n'
                     ' •  Hepatic encephalopathy (HCC)    \n'
                     '  \n'
                     ' •  Hepatorenal syndrome (HCC)    \n'
                     '  \n')
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 1 and sentences[0].end == 22
        assert sentences[1].begin == 31 and sentences[1].end == 62
        assert sentences[2].begin == 71 and sentences[2].end == 100

    def test4(self):
        """A heading followed by a dangling dash."""
        input_str = 'Delirium - '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 10

    def test5(self):
        """Text broken up by blank lines and an unbalanced quote."""
        input_str = "The patient complained about the TIA \n\n No memory issues. \"I \n\nOrdered the MRI scan.- "
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 36
        assert sentences[1].begin == 39 and sentences[1].end == 85

    def printDetails(self, sentences, input_str):
        """
        Print one ready-to-paste assert statement per sentence span.

        @param sentences: spans exposing ``begin`` and ``end`` attributes.
        @param input_str: the segmented text; not used by this helper.
        """
        for i, sentence in enumerate(sentences):
            print('assert (sentences[' + str(i) + '].begin == ' +
                  str(sentence.begin) + ' and sentences[' + str(i) +
                  '].end == ' + str(sentence.end) + ')')

    def test6(self):
        """Print the spans found for a two-sentence passage (no assertions)."""
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
Ejemplo n.º 7
def main(args):
    """Send each note under an Anafora directory to the temporal REST service.

    ``args`` is indexed as ``<input directory> <rest host> <output directory>``:
    ``args[0]`` is walked with ``anafora.walk``, ``args[1]`` is the REST host
    name and ``args[2]`` is the root of the output tree.  For every note the
    text is split into sentences with RuSH, ``sent_tokens`` is posted to the
    service, AnaforaEntity objects are built from the returned events and
    timexes, and ``anafora_data`` is written below the output directory.

    NOTE(review): several statements appear to be missing from this listing
    (see the inline notes); restore them from the original script before
    running.
    """
    if len(args) < 3:
            # NOTE(review): as shown this usage text is a bare string expression;
            # the call that reports it (and any early exit) is not visible.
            "Required arguments: <input directory> <rest host> <output directory>\n"

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    # tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        # A failed initialisation is only reported; processing continues.
        sys.stderr.write('Error: rest init call was not successful\n')

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
                # NOTE(review): the statement that emits this message, and the
                # right-hand operand of '%', are not visible in this listing.
                'There were multiple valid xml files for file %s' %
        # The first matching xml name is reused as the output file name.
        xml_name = xml_names[0]

        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            text = f.read()

        sentences = rush.segToSentenceSpans(text)
        sent_tokens = []

        # NOTE(review): nothing is appended to sent_tokens in this loop as
        # shown, so the request below would carry an empty list — the
        # tokenization step appears to be missing.
        for sentence in sentences:
            sent_txt = text[sentence.begin:sentence.end]

        r = requests.post(process_url, json={'sent_tokens': sent_tokens})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')

        # NOTE: from here on the local name 'json' holds the decoded response.
        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            # NOTE(review): the 'try:' that pairs with the 'except' below is
            # not visible, and the error message is a bare expression.
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                # Token spans are relative to the sentence; adding
                # sentence.begin turns them into document offsets.
                event_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                event_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((event_start_offset, event_end_offset), )
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                # NOTE(review): annot is never attached to anafora_data in the
                # visible code — confirm against the original.

                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                # create anafora entry
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((timex_start_offset, timex_end_offset), )
                annot.type = "TIMEX3"
                annot.properties['Class'] = time_class
                # NOTE(review): as above, annot is not attached to
                # anafora_data in the visible code.

                #print("Found timex %s" % (timex_text))

        # Recreate the input sub-directory below the output root before writing.
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
def main(args):
    """Send each note under an Anafora directory to the temporal REST service.

    ``args`` is indexed as ``<input directory> <rest host> <output directory>``:
    ``args[0]`` is walked with ``anafora.walk``, ``args[1]`` is the REST host
    name and ``args[2]`` is the root of the output tree.  Each note is read
    line by line; lines starting with ``[meta``, ``[start section`` or
    ``[end section`` delimit sections, and each section is segmented with
    RuSH.  Sentences are tokenized (and merged when ``combine_sentences`` is
    set), ``sent_tokens`` is posted to the service, and AnaforaEntity /
    AnaforaRelation objects are built from the returned events, timexes and
    relations before ``anafora_data`` is written below the output directory.

    NOTE(review): many statements appear to be missing from this listing
    (bodies of several ``if``/``while`` headers, ``else`` branches, the
    ``try:`` paired with an ``except``, closing brackets).  The inline notes
    mark the most visible gaps; restore them from the original script before
    running.
    """
    if len(args) < 3:
            # NOTE(review): as shown this usage text is a bare string expression;
            # the call that reports it (and any early exit) is not visible.
            "Required arguments: <input directory> <rest host> <output directory>\n"

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        # A failed initialisation is only reported; processing continues.
        sys.stderr.write('Error: rest init call was not successful\n')

    # Merge neighbouring sentences while the combined token count stays
    # below token_threshold (see the merge condition further down).
    combine_sentences = True
    token_threshold = 100

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
                # NOTE(review): the statements that emit the two messages in
                # this branch, and the body of the "'Relation' in xml_name"
                # test, are not visible in this listing.
                'There were multiple valid xml files for file %s\n' %
            filtered_names = []
            for xml_name in xml_names:
                if 'Relation' in xml_name:
            if len(filtered_names) == 1:
                    'Picking the file with "Relation" in the title: %s\n' %
                xml_names = filtered_names
        # The first remaining xml name is reused as the output file name.
        xml_name = xml_names[0]

        section_texts = []
        sentences = []
        text = ''
        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            cur_section = []
            # cur_ind counts characters consumed so far; section_start is the
            # offset just past the most recent marker line.
            cur_ind = 0
            section_start = 0
            for line in f.readlines():
                text += line
                line_len = len(line)
                line = line.rstrip()
                if line.startswith('[meta') or line.startswith(
                        '[start section') or line.startswith('[end section'):
                    if len(cur_section) > 0:
                        section_text = '\n'.join(cur_section)
                        section_sents = rush.segToSentenceSpans(section_text)
                        if len(section_sents) > 0:
                            # Tag the first sentence of the section.
                            section_sents[0].text = '<section>'
                            #section_sents[-1].text = '</section>'
                        # Shift section-relative spans to document offsets.
                        for section_sent in section_sents:
                            section_sent.begin += section_start
                            section_sent.end += section_start
                        # NOTE(review): section_sents is not added to
                        # 'sentences' in the visible code, and no visible line
                        # appends non-marker lines to cur_section — confirm.
                        cur_section = []
                    section_start = cur_ind + line_len
                cur_ind += line_len

        #sentences = rush.segToSentenceSpans(text)
        sent_tokens = []
        merged_sentences = []

        if combine_sentences:
            for sentence_ind, sentence in enumerate(sentences):
                sent_txt = text[sentence.begin:sentence.end]

                # NOTE(review): tb_tokenize is not defined in the visible
                # code (presumably a module-level flag), and the 'else'
                # branches of the tokenization logic below are missing.
                if tb_tokenize:
                    raw_tokens = tokenizer.tokenize(sent_txt)

                    # From https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer.span_tokenize
                    # Convert converted quotes back to original double quotes
                    # Do this only if original text contains double quote(s) or double
                    # single-quotes (because '' might be transformed to `` if it is
                    # treated as starting quotes).
                    if ('"' in sent_txt) or ("''" in sent_txt):
                        # Find double quotes and converted quotes
                        matched = [
                            for m in re.finditer(r"``|'{2}|\"", sent_txt)

                        # Replace converted quotes back to double quotes
                        tokens = [
                            matched.pop(0) if tok in ['"', "``", "''"] else tok
                            for tok in raw_tokens
                        tokens = raw_tokens
                    tokens = tokenize(sent_txt)

                    # fix apostrophe s ('s) to be one token
                    # NOTE(review): no visible line adds anything to
                    # new_tokens, so as shown this helper returns [].
                    def fix_simple_tokenize(tokens):
                        new_tokens = []
                        ind = 0
                        while ind < len(tokens):
                            if tokens[ind] == "'" and ind + 1 < len(
                                    tokens) and tokens[ind + 1] == 's':
                                ind += 2
                                ind += 1

                        return new_tokens

                    tokens = fix_simple_tokenize(tokens)

                # NOTE(review): the body of this test is missing.
                if text[sentence.end] == '\n':

                # print("Sentence number %d has %d tokens" % (sentence_ind, len(tokens)))

                # Merge into the previous sentence when the combined length
                # stays under the threshold and this sentence carries no
                # section tag (sentence.text == '').
                if len(sent_tokens) > 0 and (
                        len(sent_tokens[-1]) +
                        len(tokens)) < token_threshold and sentence.text == '':
                    merged_sentences[-1].end = sentence.end
            # NOTE(review): the body of this 'while' is missing.
            for tokens in sent_tokens:
                while tokens[-1] == "<cr>":

            sentences = merged_sentences
            for sentence in sentences:
                sent_txt = text[sentence.begin:sentence.end]

        # NOTE(review): the opening of the payload argument and the closing
        # brackets of this call are not visible.
        r = requests.post(process_url,
                              'sent_tokens': sent_tokens,
                              'metadata': text_name
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')

        # NOTE: from here on the local name 'json' holds the decoded response.
        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0
        rel_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            sent_rels = json['relations'][sent_ind]
            event_ids = []
            timex_ids = []

            # Locate a '[meta rev_date ...]' marker inside the sentence and
            # convert its bounds to document offsets.  meta_rev_end is only
            # assigned when the marker is found.
            meta_rev_loc = sent_txt.find('[meta rev_date')
            if meta_rev_loc >= 0:
                meta_rev_end = sent_txt.find(']', meta_rev_loc)
                meta_rev_loc += sentence.begin
                meta_rev_end += sentence.begin

            # Replace <cr> with empty string so that tokens align again,
            # then after alignment add them back in so token offsets from classifier are correct.
            # NOTE(review): cr_token_inds and num_crs_at_position are never
            # filled in the visible code although num_crs_at_position is
            # indexed below.
            cr_token_inds = []
            num_crs_at_position = []
            for ind in range(len(sent_tokens[sent_ind])):
                if sent_tokens[sent_ind][ind] == '<cr>':
                    sent_tokens[sent_ind][ind] = ''

            # NOTE(review): the 'try:' that pairs with the 'except' below is
            # not visible, and the error message is a bare expression.
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                # The token index is offset by num_crs_at_position before
                # looking up the span; sentence.begin converts to document
                # offsets.
                event_start_offset = token_spans[
                    begin_token_ind +
                    num_crs_at_position[begin_token_ind]][0] + sentence.begin
                event_end_offset = token_spans[
                    end_token_ind +
                    num_crs_at_position[end_token_ind]][1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]

                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name

                # NOTE(review): an 'else' appears to be missing between the
                # first and second statements of this branch; nothing is
                # appended to event_ids or anafora_data in the visible code.
                if event_text.endswith('_date'):
                    annot.properties['datesectiontime'] = 'True'
                    annot.spans = ((event_start_offset, event_end_offset), )
                    annot.type = "EVENT"
                    annot.properties['DocTimeRel'] = dtr

                cur_id += 1

                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[
                    begin_token_ind +
                    num_crs_at_position[begin_token_ind]][0] + sentence.begin
                timex_end_offset = token_spans[
                    end_token_ind +
                    num_crs_at_position[end_token_ind]][1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                # NOTE(review): the bodies of the first two branches (and any
                # final 'else') are missing; as shown an entity is created
                # only when timex_text starts with five digits.
                if meta_rev_loc >= 0 and timex_start_offset > meta_rev_loc and timex_end_offset < meta_rev_end:
                elif time_class == 'SECTIONTIME':
                elif not re.match(r'\d{5}', timex_text) is None:
                    # create anafora entry
                    annot = AnaforaEntity()
                    annot.id = str(cur_id) + "@e@" + text_name
                    cur_id += 1
                    annot.spans = ((timex_start_offset, timex_end_offset), )
                    annot.type = "TIMEX3"
                    annot.properties['Class'] = time_class

                #print("Found timex %s" % (timex_text))

            if not 'path' in text_name.lower():
                # no relations in pathology notes, so if we find any they are false positives.
                for rel in sent_rels:
                    # Each argument is encoded as '<TYPE>-<index>'; the index
                    # selects from event_ids or timex_ids.
                    arg1_type, arg1_ind = rel['arg1'].split('-')
                    arg2_type, arg2_ind = rel['arg2'].split('-')
                    if arg1_type == 'EVENT':
                        arg1 = event_ids[int(arg1_ind)]
                    elif arg1_type == 'TIMEX':
                        arg1 = timex_ids[int(arg1_ind)]

                    # NOTE(review): the body of this test is missing.
                    if arg1 == -1:

                    if arg2_type == 'EVENT':
                        arg2 = event_ids[int(arg2_ind)]
                    elif arg2_type == 'TIMEX':
                        arg2 = timex_ids[int(arg2_ind)]

                    # NOTE(review): the body of this test is missing.
                    if arg2 == -1:

                    reln = AnaforaRelation()
                    reln.id = str(rel_id) + '@r@' + text_name
                    rel_id += 1
                    reln.type = 'TLINK'
                    reln.properties['Type'] = rel['category']
                    reln.properties['Source'] = arg1
                    reln.properties['Target'] = arg2
                    # NOTE(review): reln is not attached to anafora_data in
                    # the visible code — confirm.


        # Recreate the input sub-directory below the output root before writing.
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
Ejemplo n.º 9
 def setUp(self):
     """Remember this module's directory and build a logging-enabled RuSH from the rule file beside it."""
     module_path = os.path.abspath(__file__)
     self.pwd = os.path.dirname(module_path)
     rule_file = os.path.join(self.pwd, 'rush_rules.tsv')
     self.rush = RuSH(str(rule_file), enable_logger=True)
Ejemplo n.º 10
class TestRuSH(unittest.TestCase):
    """Span-offset regression tests for RuSH using the rush_rules.tsv file beside this module."""

    def setUp(self):
        """Remember this module's directory and build a logging-enabled segmenter."""
        self.pwd = os.path.dirname(os.path.abspath(__file__))
        self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')),
                         enable_logger=True)

    def test1(self):
        """An abbreviation period ('Mr.') must not end the sentence."""
        input_str = 'Can Mr. K check it. Look\n good.\n'
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 19
        assert sentences[1].begin == 20 and sentences[1].end == 31

    def test2(self):
        """Three short sentences separated by single spaces."""
        input_str = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 14
        assert sentences[1].begin == 15 and sentences[1].end == 42
        assert sentences[2].begin == 43 and sentences[2].end == 59

    def test3(self):
        """Bulleted list items are expected to come back as separate sentences."""
        # Trailing blanks and the whitespace-only separator lines are part of
        # the fixture: the asserted offsets (31 and 71) only hold when each
        # separator line contains two spaces, so the text is written with
        # explicit escapes instead of a multi-line literal.
        # NOTE(review): the text after the last bullet is not visible in the
        # original listing; it does not enter the asserted offsets — confirm.
        input_str = (' •  Coagulopathy (HCC)    \n'
                     '  \n'
                     ' •  Hepatic encephalopathy (HCC)    \n'
                     '  \n'
                     ' •  Hepatorenal syndrome (HCC)    \n'
                     '  \n')
        sentences = self.rush.segToSentenceSpans(input_str)
        assert sentences[0].begin == 1 and sentences[0].end == 22
        assert sentences[1].begin == 31 and sentences[1].end == 62
        assert sentences[2].begin == 71 and sentences[2].end == 100

    def test4(self):
        """A heading followed by a dangling dash."""
        input_str = 'Delirium - '
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 8

    def test5(self):
        """Text broken up by blank lines and an unbalanced quote."""
        input_str = "The patient complained about the TIA \n\n No memory issues. \"I \n\nOrdered the MRI scan.- "
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
        assert sentences[0].begin == 0 and sentences[0].end == 36
        assert sentences[1].begin == 39 and sentences[1].end == 57
        assert sentences[2].begin == 58 and sentences[2].end == 84

    def printDetails(self, sentences, input_str):
        """
        Print one ready-to-paste assert statement per sentence span.

        @param sentences: spans exposing ``begin`` and ``end`` attributes.
        @param input_str: the segmented text; not used by this helper.
        """
        for i, sentence in enumerate(sentences):
            print('assert (sentences[' + str(i) + '].begin == ' +
                  str(sentence.begin) + ' and sentences[' + str(i) +
                  '].end == ' + str(sentence.end) + ')')

    def test6(self):
        """Print the spans found for a two-sentence passage (no assertions)."""
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)

    def test7(self):
        """Segment the same passage with rules supplied inline and print the spans."""
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        # The rule strings are handed to RuSH directly instead of a rule-file path.
        rules = [
            r'\b(\a	0	stbegin',
            r'\a\e	2	stend',
            r'. +(This	0	stbegin',
            r'](.	2	stend',
        ]
        rush = RuSH(rules, enable_logger=True)
        sentences = rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)

    def test_doc2(self):
        """Numbered medication lines followed by free text must split into four sentences."""
        # Whitespace is part of the fixture, so it is written with explicit
        # escapes rather than relying on a multi-line literal.
        input_str = ('  \n'
                     '9.  Advair b.i.d.\n'
                     '10.  Xopenex q.i.d. and p.r.n.\n'
                     'I will see her in a month to six weeks.  She is to follow up with Dr. X before that.\n')
        # NOTE(review): the original keyword arguments of this call are not
        # visible in this listing; enable_logger=True mirrors setUp — confirm.
        self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')),
                         enable_logger=True)
        sentences = self.rush.segToSentenceSpans(input_str)
        for sent in sentences:
            print('>' + input_str[sent.begin:sent.end] + '<\n')
        assert len(sentences) == 4
        sent = sentences[1]
        assert input_str[sent.begin:sent.end] == '10.  Xopenex q.i.d. and p.r.n.'