def __init__(self, targets=None, modifiers=None, feature_inference_rule=None,
             document_inference_rule=None, rush_rule=None, expected_values=[],
             save_markups=True):
    self.document_inferencer = DocumentInferencer(document_inference_rule)
    self.feature_inferencer = FeatureInferencer(feature_inference_rule)
    self.conclusions = []
    self.modifiers = modifiers
    self.targets = targets
    self.save_markups = save_markups
    self.expected_values = [value.lower() for value in expected_values]
    self.saved_markups_map = dict()
    self.pyrush = None
    # fall back to the configured rule file if none (or a nonexistent path) was given
    if rush_rule is None or not os.path.isfile(rush_rule):
        rush_rule = ConfigReader.getValue('rush_rules_path')
    if rush_rule is not None and os.path.isfile(rush_rule):
        self.pyrush = RuSH(rush_rule)
    else:
        logMsg(("File not found", os.path.abspath(rush_rule)))
    self.last_doc_name = ''
    # rule inputs may be file paths/lexicon strings or already-loaded rule objects
    if modifiers is not None and targets is not None:
        if isinstance(modifiers, str) and isinstance(targets, str):
            if (modifiers.endswith('.csv') or modifiers.endswith('.tsv')
                    or modifiers.endswith('.txt') or modifiers.endswith('.yml')) \
                    and (targets.endswith('.csv') or targets.endswith('.tsv')
                         or targets.endswith('.txt') or targets.endswith('.yml')
                         or targets.startswith('Lex\t')):
                self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)
    RBDocumentClassifier.instance = self
def __init__(self):
    # initiate necessary components
    self.target_rules = self.getTargetRegexes()
    self.negation_rules = self.getNegRegexes()
    self.section_rules = self.getSectionRegexes()  # new
    self.target_scores = self.target_score()  # new
    self.sentence_rules = 'KB/rush_rules.tsv'
    self.sentence_segmenter = RuSH(self.sentence_rules)
def __init__(self, sentence_rules, target_rules, context_rules,
             feature_inference_rule, document_inference_rule):
    self.sentence_rules = sentence_rules
    self.target_rules = target_rules
    self.context_rules = context_rules
    self.feature_inference_rule = feature_inference_rule
    self.document_inference_rule = document_inference_rule
    self.sentence_segmenter = RuSH(self.sentence_rules)
    self.feature_inferencer = FeatureInferencer(self.feature_inference_rule)
    self.document_inferencer = DocumentInferencer(self.document_inference_rule)
    self.targets = get_item_data(self.target_rules)
    self.modifiers = get_item_data(self.context_rules)
def __init__(self, verbose, mode, sentence_tokenizer, target_rules, modifier_rules):
    # initiate MyPipe necessary components here
    self.verbose = verbose
    self.mode = mode
    if sentence_tokenizer.lower() == 'pyrush':
        self.sentence_tokenizer = RuSH(
            path.abspath(path.join('kb', 'rush_rules.tsv')))
    elif sentence_tokenizer.lower() == 'resplit':
        self.sentence_tokenizer = RESplit(
            self.verbose, path.abspath(path.join('kb', 'resplit_rules.yml')))
    elif sentence_tokenizer.lower() == 'helpers':
        self.sentence_tokenizer = helpers.sentenceSplitter()
    self.targets = get_item_data(target_rules)
    self.modifiers = get_item_data(modifier_rules)
'''

# define the feature inference rule
feature_inference_rule = '''
#Conclusion type, Evidence type, Modifier values associated with the evidence
NEGATED_CONCEPT,FEVER,DEFINITE_NEGATED_EXISTENCE
'''

# define the document inference rule
document_inference_rule = '''
#Conclusion Type at document level, Evidence type at mention level
FEVER_DOC,FEVER

#Default document type
NO_FEVER
'''

sentence_segmenter = RuSH(sentence_rules)
feature_inferencer = FeatureInferencer(feature_inference_rule)
document_inferencer = DocumentInferencer(document_inference_rule)
targets = get_item_data(target_rules)
modifiers = get_item_data(context_rules)

# Example sentences
# input = 'This is a sentence. It is just a test. I like this sentence.'
input = '''
No vomiting, chest pain, shortness of breath, nausea, dizziness, or chills on arrival.
On operative day three, the patient fever was detected with temperature 101.5 F.
After 3 days no fever was detected.
Patient came back for a follow up, denies fever.
'''
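A minimal continuation of this example (a sketch, assuming the rule strings above load cleanly) would segment the note with RuSH before any mention-level markup; segToSentenceSpans returns span objects whose begin/end offsets index into the original text:

# sketch: sentence segmentation step for the fever example above
sentence_spans = sentence_segmenter.segToSentenceSpans(input)
for span in sentence_spans:
    # each span carries character offsets into the original note
    print(input[span.begin:span.end].strip())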
def __init__(self, sentence_rules, target_rules, context_rules):
    self.sentence_segmenter = RuSH(sentence_rules)
    self.targets = get_item_data(target_rules)
    self.modifiers = get_item_data(context_rules)
def __init__(self, sentence_rules, target_rules, between_rules):
    self.sentence_segmenter = RuSH(sentence_rules)
    self.targets = get_item_data(target_rules)
    self.between_rules = between_rules
def setUp(self):
    self.rush = RuSH('../conf/rush_rules.tsv')
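A test method built on this fixture might look like the following sketch; the example text and the expected sentence count of two are illustrative assumptions rather than part of the original suite:

def test_seg_to_sentence_spans(self):
    # hypothetical input; expecting RuSH to find two sentences here is an assumption
    text = 'The patient denies fever. No chills were reported on arrival.'
    sentences = self.rush.segToSentenceSpans(text)
    self.assertEqual(2, len(sentences))
    for span in sentences:
        # each span should carry valid character offsets into the input text
        self.assertTrue(0 <= span.begin < span.end <= len(text))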
12. Vancomycin 750 mg intravenously twice per day (times 14 days).
13. Codeine/guaifenesin syrup 5 cc to 10 cc by mouth q.6h. as needed.
14. Klonopin 0.75 mg by mouth in the morning and 0.5 mg by mouth at hour of sleep.
15. Multivitamin one tablet by mouth once per day.
[**Name6 (MD) 2381**] [**Last Name (NamePattern4) 3424**], M.D. [**MD Number(1) 3425**]
Dictated By:[**Last Name (NamePattern1) 3426**]
MEDQUIST36
D: [**3399-4-10**] 14:55
T: [**3399-4-12**] 08:50
JOB#: [**Job Number 19798**]'''

rush = RuSH('conf/rush_rules.tsv')
sentences = rush.segToSentenceSpans(txt)
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        txt[sentence.begin:sentence.end]))
print('\n' + '-' * 100 + '\n')
def __init__(self, annotation_type='SOCIAL_SUPPORT', default_value='no mention',
             filter_file='conf/keywords_filter.txt',
             stopwords_file='conf/stop_words.txt',
             word2vec_file='models/glove.word2vec.txt.bin',
             rush_rules='conf/rush_rules.tsv',
             max_token_per_sentence=150):
    # each time we only train/predict a model for one annotation type
    # set an arbitrary max length of sentences, so that we can pad sentences
    # without knowing the max length of sentences in the testing set
    self.max_token_per_sentence = max_token_per_sentence
    self.annotation_type = annotation_type
    self.default_value = default_value
    self.real_max_length = 0
    self.rush = RuSH(rush_rules)
    self.html_tokens_p = re.compile(r'^\&[a-z]{2,4}\;$')
    self.punctuations = set(string.punctuation)
    # keep '?'
    self.punctuations.remove('?')
    self.spacy_nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
    self.matcher = None
    self.corpus = None
    keywords_filter = []
    print('load filter keywords')
    # load filter keywords
    if path.isfile(filter_file):
        f = open(filter_file, 'r')
        keywords_filter = [line for line in f.readlines()
                           if not line.startswith('#')]
        f.close()
    if len(keywords_filter) > 0:
        self.matcher = matcher.PhraseMatcher(
            self.spacy_nlp.tokenizer.vocab, max_length=6)
        for keyword in keywords_filter:
            self.matcher.add(keyword, None)
    print('load stopwords')
    # load stop words
    if path.isfile(stopwords_file):
        f = open(stopwords_file, 'r')
        self.my_stopwords = set(f.readlines())
        f.close()
    else:
        self.my_stopwords = set(nltk.corpus.stopwords.words('english'))
        f = open(stopwords_file, 'w')
        f.writelines('\n'.join(self.my_stopwords))
        f.close()
    print('load label dictionary')
    self.label_dict = None
    self.label_dict_file = 'models/' + self.annotation_type + '_labels.dict'
    # load dictionary
    if path.isfile(self.label_dict_file):
        self.label_dict = Dictionary.load(self.label_dict_file)
    print('load glove model')
    # self.glove_model = glove2word2vec.smart_open(word2vec_file)
    if path.isfile(word2vec_file):
        if word2vec_file.endswith('.bin'):
            self.glove_model = KeyedVectors.load_word2vec_format(
                word2vec_file, binary=True)
        else:
            self.glove_model = KeyedVectors.load_word2vec_format(
                word2vec_file, binary=False)
            print('convert txt model to binary model...')
            self.glove_model.save_word2vec_format(word2vec_file + '.bin',
                                                  binary=True)
    pass
def build_n2c2_tokenizer(keep_token_strings=False,
                         enable_pyrush_sentence_tokenizer=False,
                         disable_custom_preprocessing=True):
    print('Building n2c2 tokenizer...')
    cs_preprocess_split_re_strings = []
    # double newlines
    cs_preprocess_split_re_strings.append(r'[\r\n]{2,}')
    # newlines with only spaces
    cs_preprocess_split_re_strings.append(r'[\r\n]+\s+[\r\n]+')
    # numbered lists (e.g. "1.", "2)")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*\d+[.)-]')
    # bulleted lists (e.g. "*", "-")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*[*-]')
    # starting labels (e.g. "WEIGHT:")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*\w+[:]')
    # break up other lines separated by dates
    cs_preprocess_split_re_strings.append(
        r'(^|\r|\n)+\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')
    # MIMIC has many lines that start with this: [**YYYY-M-DD**]
    cs_preprocess_split_re_strings.append(r'^\[\*+\d{4}-\d{1,2}-\d{1,2}\*+\]')
    # TIU notes have long bars like this: '***********' or '===========' or '------'
    cs_preprocess_split_re_strings.append(r'[*=-]{3,}')
    # NOTE: This breaking rule was disabled 2-13-18 since the UMass MADE challenge
    # data often ended each line with 2 spaces and a newline, which caused this
    # aggressive rule to fire over and over again.
    # aggressively break anything with lots of spaces (tabular data)
    # cs_preprocess_split_re_strings.append(r'\s{3,}')

    custom_lang_vars = CustomSentenceBreakingLangVars()
    custom_lang_vars.sent_end_chars = ('.', '!')
    print(custom_lang_vars.sent_end_chars)

    punkt_tokenizer2 = PunktSentenceTokenizer(lang_vars=custom_lang_vars)
    treebank_tokenizer = TreebankWordTokenizer()
    # looks like "pt." and "D.R." and "P.R." are already being handled
    # punkt_tokenizer2._params.abbrev_types.update(extra_abbrev)

    sentence_tokenizer = None
    if enable_pyrush_sentence_tokenizer:
        print('Enabling PyRuSH for sentence tokenization...')
        pyrush_sentence_tokenizer = RuSH('resources/PyRuSH/conf/rush_rules.tsv')
        sentence_tokenizer = pyrush_sentence_tokenizer
    else:
        print('Enabling NLTK Punkt for sentence tokenization...')
        sentence_tokenizer = punkt_tokenizer2

    print('Type of sentence tokenizer : {}'.format(type(sentence_tokenizer)))

    enabled_preprocessing_expressions = []
    if not disable_custom_preprocessing:
        print('Enabling custom preprocessing expressions. Total : {}'.format(
            len(cs_preprocess_split_re_strings)))
        enabled_preprocessing_expressions = cs_preprocess_split_re_strings
    else:
        print('Not allowing custom preprocessing expressions...')

    cs_tokenizer = ClinicalSentenceTokenizer(
        default_sentence_tokenizer=sentence_tokenizer,
        preprocess_split_re_strs=enabled_preprocessing_expressions)

    index_tokenizer = IndexTokenizer(cs_tokenizer, treebank_tokenizer,
                                     keep_token_strings=keep_token_strings)

    return index_tokenizer
def __init__(self, rules='./rush_rules.tsv'):
    self.rules = rules
    self.rush = RuSH(self.rules)
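A wrapper like this would typically expose a convenience method that delegates to the underlying RuSH instance; the method below is a hypothetical sketch (its name is not taken from the original code):

def split_sentences(self, text):
    # hypothetical helper: delegate to PyRuSH and return the sentence strings
    spans = self.rush.segToSentenceSpans(text)
    return [text[span.begin:span.end] for span in spans]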
        return sent_spans


class DefaultSentenceTokenizer(object):
    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()

    def tokenize_sents(self, text):
        """Returns spans"""
        return self.tokenizer.span_tokenize(text)


if __name__ == '__main__':
    rush = RuSH('rush_rules.tsv')
    input_str = "The patient was admitted on 03/26/08\n and was started on IV antibiotics elevation" + \
                ", was also counseled to minimizing the cigarette smoking. The patient had edema\n\n" + \
                "\n of his bilateral lower extremities. The hospital consult was also obtained to " + \
                "address edema issue question was related to his liver hepatitis C. Hospital consult" + \
                " was obtained. This included an ultrasound of his abdomen, which showed just mild " + \
                "cirrhosis. "
    with open('failed.txt') as f:
        input_str = f.read()
    sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
    sent_tokenizer = DefaultSentenceTokenizer()
    print(sent_tokenizer.tokenize_sents(input_str))
    # print(sent_tokenizer.span_tokenize(input_str))
    exit()
    print(sent_tokenizer.tokenize_sents(input_str))
def pyRuSHSplitter(self, text):
    rush = RuSH(ConfigReader.getValue('rush_rules_path'))
    sentences = rush.segToSentenceSpans(text)
    return [
        text[sentence.begin:sentence.end].strip() for sentence in sentences
    ]
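Because this method constructs a new RuSH instance on every call, repeated use re-reads the rule file each time. A possible variant (a sketch, not the original implementation) caches the segmenter on the instance:

def pyRuSHSplitterCached(self, text):
    # sketch: lazily build and reuse a single RuSH instance across calls
    if getattr(self, '_rush', None) is None:
        self._rush = RuSH(ConfigReader.getValue('rush_rules_path'))
    sentences = self._rush.segToSentenceSpans(text)
    return [
        text[sentence.begin:sentence.end].strip() for sentence in sentences
    ]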