class ClassificationPipe(object):
    def __init__(self, sentence_rules, target_rules, context_rules):
        self.sentence_segmenter = RuSH(sentence_rules)
        self.targets = get_item_data(target_rules)
        self.modifiers = get_item_data(context_rules)

    def process(self, doc_text):
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)
        new_anns = []
        for sentence in sentences:
            start_offset = sentence.begin
            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            m = self.markup_sentence(sentence_text,
                                     modifiers=self.modifiers,
                                     targets=self.targets)
            annotations = self.convertMarkupsAnnotations(m, sentence_text, start_offset)
            new_anns.extend(annotations)
        return new_anns

    def markup_sentence(self, sentence, modifiers, targets):
        markup = pyConTextGraph.ConTextMarkup()
        txt = sentence.lower()
        markup.setRawText(txt)
        markup.graph["__txt"] = txt
        markup.graph["__scope"] = (0, len(txt))
        markup.markItems(targets, mode="target")
        markup.markItems(modifiers, mode="modifier")
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        markup.applyModifiers()
        markup.pruneSelfModifyingRelationships()
        markup.dropInactiveModifiers()
        return markup

    def convertMarkupsAnnotations(self, markups, sentence_text, offset=0):
        annotations = []
        nodes = markups.nodes()
        for n in nodes:
            new_ann = Annotation(
                start_index=offset + n.getSpan()[0],
                end_index=offset + n.getSpan()[1],
                type=n.getCategory()[0],
                spanned_text=sentence_text[n.getSpan()[0]:n.getSpan()[1]],
                ann_id=n.getTagID())
            mods = markups.getModifiers(n)
            if len(mods) > 0:
                for modifier in mods:
                    new_ann.attributes[modifier.getCategory()[0]] = modifier.getTagID()
            annotations.append(new_ann)
        return annotations
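
# Hedged usage sketch for ClassificationPipe above. The rule file paths and the
# sample text are illustrative placeholders (not taken from the original code);
# the RuSH rules file and the pyConText target/modifier files are assumed to
# exist in a local KB folder.
pipe = ClassificationPipe(sentence_rules='KB/rush_rules.tsv',
                          target_rules='KB/targets.yml',
                          context_rules='KB/modifiers.yml')
sample_text = 'No fever was detected. Patient denies chest pain.'
for ann in pipe.process(sample_text):
    # each Annotation carries its span text and any context modifiers found
    print(ann.spanned_text, ann.attributes)
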
class TestRuSH(unittest.TestCase):
    def setUp(self):
        self.rush = RuSH('../conf/rush_rules.tsv')

    def test1(self):
        input_str = 'Can Mr. K check it. Look\n good.\n'
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 19)
        assert (sentences[1].begin == 20 and sentences[1].end == 31)

    def test2(self):
        input_str = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 14)
        assert (sentences[1].begin == 15 and sentences[1].end == 42)
        assert (sentences[2].begin == 43 and sentences[2].end == 59)

    def test3(self):
        input_str = '''
•  Coagulopathy (HCC)       

•  Hepatic encephalopathy (HCC)       

•  Hepatorenal syndrome (HCC)       
'''
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 1 and sentences[0].end == 22)
        assert (sentences[1].begin == 31 and sentences[1].end == 62)
        assert (sentences[2].begin == 71 and sentences[2].end == 100)

    def test4(self):
        input_str = 'Delirium - '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 10)
        pass

    def test5(self):
        input_str = "The patient complained about the TIA \n\n No memory issues. \"I \n\nOrdered the MRI scan.- "
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 36)
        assert (sentences[1].begin == 39 and sentences[1].end == 85)
        pass

    def printDetails(self, sentences, input_str):
        for i in range(0, len(sentences)):
            sentence = sentences[i]
            print('assert (sentences[' + str(i) + '].begin == ' + str(sentence.begin) +
                  ' and sentences[' + str(i) + '].end == ' + str(sentence.end) + ')')
        # self.printDetails(sentences, input_str)
        pass

    def test6(self):
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
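
# Standard unittest entry point (not part of the original snippet), so the
# TestRuSH cases above can be run directly as a script.
if __name__ == '__main__':
    unittest.main()
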
class Mypipe:
    """PyContextNLP pipeline, sentence_rules, target_rules, context_rules,
    feature_inference_rule, document_inference_rule"""

    def __init__(self, sentence_rules, target_rules, context_rules,
                 feature_inference_rule, document_inference_rule):
        self.sentence_rules = sentence_rules
        self.target_rules = target_rules
        self.context_rules = context_rules
        self.feature_inference_rule = feature_inference_rule
        self.document_inference_rule = document_inference_rule
        self.sentence_segmenter = RuSH(self.sentence_rules)
        self.feature_inferencer = FeatureInferencer(self.feature_inference_rule)
        self.document_inferencer = DocumentInferencer(self.document_inference_rule)
        self.targets = get_item_data(self.target_rules)
        self.modifiers = get_item_data(self.context_rules)

    def process(self, doc_text):
        """PyContextNLP, return doc_class, context_doc, annotations, relations"""
        context_doc = pyConTextGraph.ConTextDocument()
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)

        for sentence in sentences:
            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            # Process every sentence by adding markup
            m = markup_sentence(sentence_text,
                                modifiers=self.modifiers,
                                targets=self.targets)
            context_doc.addMarkup(m)
            context_doc.getSectionMarkups()
            # print(m)
            # print(context_doc.getXML())

        # convert graphic markups into dataframe
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        # display(annotations)
        # display(relations)

        # apply inferences for document classification
        inferenced_types = self.feature_inferencer.process(annotations, relations)
        # print('After inferred from modifier values, we got these types:\n ' + str(inferenced_types))
        doc_class = self.document_inferencer.process(inferenced_types)
        # print('\nDocument classification: ' + doc_class)

        return doc_class, context_doc, annotations, relations
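
# Hedged usage sketch for Mypipe. All five rule file paths are placeholders;
# the feature/document inference rule formats are whatever FeatureInferencer
# and DocumentInferencer expect in this project.
pipe = Mypipe(sentence_rules='KB/rush_rules.tsv',
              target_rules='KB/fever_targets.yml',
              context_rules='KB/fever_modifiers.yml',
              feature_inference_rule='KB/feature_inferences.csv',
              document_inference_rule='KB/doc_inferences.csv')
doc_class, context_doc, annotations, relations = pipe.process(
    'On operative day three, a fever was detected. After 3 days no fever was detected.')
print(doc_class)
print(annotations)
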
class ClinicalRushSentenceTokenizer(object):
    def __init__(self, rules='./rush_rules.tsv'):
        self.rules = rules
        self.rush = RuSH(self.rules)

    def tokenize_sents(self, text):
        # try:
        #     sent_spans = self.rush.segToSentenceSpans(text)
        # except Exception as e:
        #     # Let's try to track down where this is happening in the text
        #     for i in range(int(len(text)/10)):
        #         start = i * 10
        #         end = start + 10
        #         try:
        #             self.rush.segToSentenceSpans(text[start:end])
        #         except Exception as e:
        #             with open('failed_snippet.txt', 'a') as f:
        #                 f.write(text[start:end] + '\n')
        #             print("Failed at {}".format(start))
        #             raise e
        # sent_spans = [(s.begin, s.end) for s in sent_spans]
        sent_spans = self.rush.segToSentenceSpans(text)
        return sent_spans
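
# Minimal sketch of how the tokenizer above might be used; the rules path and
# the sample text are placeholders.
tokenizer = ClinicalRushSentenceTokenizer(rules='./rush_rules.tsv')
for span in tokenizer.tokenize_sents('S/p C6-7 ACDF. No urgent events overnight.'):
    print(span.begin, span.end)
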
document_inferencer = DocumentInferencer(document_inference_rule)
targets = get_item_data(target_rules)
modifiers = get_item_data(context_rules)

# Example sentences
# input = 'This is a sentence. It is just a test. I like this sentence.'
input = '''
No vomiting, chest pain, shortness of breath, nausea, dizziness, or chills on arrival.
On operative day three, the patient fever was detected with temperature 101.5 F.
After 3 days no fever was detected.
Patient came back for a follow up, denies fever.
'''

sentences = sentence_segmenter.segToSentenceSpans(input)

# See what the document was split into
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        input[sentence.begin:sentence.end]))
print('\n' + '-' * 100 + '\n')

# initiate a pyConTextGraph to hold the pyConText output
context_doc = pyConTextGraph.ConTextDocument()

for sentence in sentences:
    sentence_text = input[sentence.begin:sentence.end].lower()
    # Process every sentence by adding markup
    m = markup_sentence(sentence_text, modifiers=modifiers, targets=targets)
    context_doc.addMarkup(m)
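
# Possible continuation of the demo above, mirroring what the pipeline classes
# in the other snippets do with the aggregated markups: flatten them into
# annotation and relation tables.
markups = get_document_markups(context_doc)
annotations, relations, doc_txt = convertMarkups2DF(markups)
print(annotations)
print(relations)
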
class ExtractionPipe(object):
    def __init__(self, sentence_rules, target_rules, between_rules):
        self.sentence_segmenter = RuSH(sentence_rules)
        self.targets = get_item_data(target_rules)
        self.between_rules = between_rules

    def process(self, doc_text):
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)
        new_anns = []
        for sentence in sentences:
            start_offset = sentence.begin
            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            m = self.markup_sentence_extract(sentence_text, targets=self.targets)
            annotations = self.classify_relationships(m, sentence_text, start_offset)
            new_anns.extend(annotations)
        return new_anns

    def markup_sentence_extract(self, sentence, targets):
        markup = pyConTextGraph.ConTextMarkup()
        txt = sentence.lower()
        markup.setRawText(txt)
        markup.graph["__txt"] = txt
        markup.graph["__scope"] = (0, len(txt))
        markup.markItems(targets, mode="target")
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        markup.pruneSelfModifyingRelationships()
        return markup

    def classify_relationships(self, markups, sentence_text, offset=0):
        all_targets = markups.getMarkedTargets()
        annotations = []
        if len(all_targets) > 1:
            for index, target in enumerate(all_targets):
                target_cat = target.getCategory()[0]
                try:
                    future_target = all_targets[index + 1]
                    future_cat = future_target.getCategory()[0]
                except IndexError:
                    # the last target has no following target to pair with
                    continue
                if target_cat == 'oxygen_saturation':
                    if future_cat == 'value':
                        start_text_index = target.getSpan()[1]
                        end_text_index = future_target.getSpan()[0]
                        between_text = sentence_text[start_text_index:end_text_index]
                        between_text = between_text.strip()
                        if between_text in self.between_rules:
                            new_ann = Annotation(
                                start_index=offset + future_target.getSpan()[0],
                                end_index=offset + future_target.getSpan()[1],
                                type=future_cat,
                                spanned_text=sentence_text[
                                    future_target.getSpan()[0]:future_target.getSpan()[1]],
                                ann_id=future_target.getTagID())
                            annotations.append(new_ann)
        return annotations
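
# Hedged usage sketch for ExtractionPipe. The rule paths are placeholders, the
# target file is assumed to define 'oxygen_saturation' and 'value' categories,
# and between_rules is an illustrative list of allowed connector strings.
pipe = ExtractionPipe(sentence_rules='KB/rush_rules.tsv',
                      target_rules='KB/o2_sat_targets.yml',
                      between_rules=['of', 'was', 'is', ':'])
for ann in pipe.process('Oxygen saturation was 92% on room air.'):
    print(ann.spanned_text)
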
class NLPClassificationSystem:
    def __init__(self):
        # initiate necessary components
        self.target_rules = self.getTargetRegexes()
        self.negation_rules = self.getNegRegexes()
        self.section_rules = self.getSectionRegexes()  # new
        self.target_scores = self.target_score()  # new
        self.sentence_rules = 'KB/rush_rules.tsv'
        self.sentence_segmenter = RuSH(self.sentence_rules)

    def process(self, document):
        # document.text = self.filterSection(document.text)  # new
        document_id = document.document_id
        ann_index = 0
        # ---------
        # all_sent = sent_tokenize(document.text)
        sentences = self.sentence_segmenter.segToSentenceSpans(document.text)
        # sent_begin = 0
        for sentence in sentences:
            sent = document.text[sentence.begin:sentence.end].lower()
            # ---------
            for reg in self.target_rules:
                for match in reg.finditer(sent):
                    ann_id = 'NLP_' + document_id + '_' + str(ann_index)  # str(document_id) if document_id is numeric use this
                    ann_index = ann_index + 1
                    new_annotation = Annotation(
                        start_index=int(match.start() + sentence.begin),
                        end_index=int(match.end() + sentence.begin),
                        type='psy_ann',
                        ann_id=ann_id)
                    new_annotation.spanned_text = match.group()
                    # new_annotation.spanned_text = sent[new_annotation.start_index:new_annotation.end_index]
                    for neg_regex in self.negation_rules:
                        if re.search(neg_regex, sent):
                            new_annotation.attributes["Negation"] = "Negated"
                    document.annotations.append(new_annotation)
            # sent_begin = sent_begin + len(sent)
        return document

    def getTargetRegexes(self):
        target_regexes = []
        with open('./KB/NIMH_target_1116_ax.csv', 'r') as f1:  # NIMH_target_1116.csv
            regexes = f1.read().splitlines()
            for reg in regexes:
                if reg.startswith('#'):  # == '#':
                    continue
                reg = reg.replace("\"", "")
                target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

    def getNegRegexes(self):
        neg_regexes = []
        with open('./KB/NIMH_negation_1116.csv', 'r') as f1:
            regexes = f1.read().splitlines()
            for reg in regexes:
                if reg.startswith('#'):  # == '#':
                    continue
                reg = reg.replace("\"", "")
                neg_regexes.append(re.compile(reg, re.IGNORECASE))
        return neg_regexes

    def getSectionRegexes(self):  # new
        section_regexes = []
        with open('./KB/section_1116_ax.csv', 'r') as f1:
            regexes = f1.read().splitlines()
            for reg in regexes:
                if reg.startswith('#'):  # == '#':
                    continue
                reg = reg.replace("\"", "")
                section_regexes.append(
                    re.compile(reg, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE))
        return section_regexes

    def filterSection(self, txt):  # new
        txt_list = []
        for reg in self.section_rules:
            for match in reg.finditer(txt):
                txt_list.append(match.group())
        txt_str = '...... '.join(txt_list)
        return txt_str

    def target_score(self):
        ann_target_score = pd.read_csv("./KB/NIMH_target_score_1116.csv", sep='$')
        score1 = dict()
        for index, row in ann_target_score.iterrows():
            row0 = str(row[0]).lower()
            score = {row0: row[1]}
            score1.update(score)
        return score1
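
# Hedged driver for NLPClassificationSystem. The Document class below is a
# hypothetical container inferred from how process() uses its argument
# (document_id, text, annotations); it is not defined in the original snippet,
# and the sample text is illustrative.
class Document:
    def __init__(self, document_id, text):
        self.document_id = document_id
        self.text = text
        self.annotations = []


system = NLPClassificationSystem()
result = system.process(Document('doc_001', 'Patient reports anxiety. Denies suicidal ideation.'))
for ann in result.annotations:
    print(ann.ann_id, ann.spanned_text, ann.attributes)
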
class RBDocumentClassifier(BaseClassifier):
    ready = True

    def __init__(self,
                 targets=None,
                 modifiers=None,
                 feature_inference_rule=None,
                 document_inference_rule=None,
                 rush_rule=None,
                 expected_values=[],
                 save_markups=True):
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        self.expected_values = [value.lower() for value in expected_values]
        self.saved_markups_map = dict()
        self.pyrush = None
        if rush_rule is None or not os.path.isfile(rush_rule):
            rush_rule = ConfigReader.getValue('rush_rules_path')
        if rush_rule is not None and os.path.isfile(rush_rule):
            self.pyrush = RuSH(rush_rule)
        else:
            logMsg(("File not found", os.path.abspath(rush_rule)))
        self.last_doc_name = ''
        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                if (modifiers.endswith('.csv') or modifiers.endswith('.tsv')
                        or modifiers.endswith('.txt') or modifiers.endswith('.yml')) \
                        and (targets.endswith('.csv') or targets.endswith('.tsv')
                             or targets.endswith('.txt') or targets.endswith('.yml')
                             or targets.startswith('Lex\t')):
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)
        RBDocumentClassifier.instance = self

    def setModifiersTargets(self, modifiers, targets):
        self.modifiers = modifiers
        self.targets = targets

    def setModifiersTargetsFromFiles(self, modifiers_file, targets_file):
        self.targets = get_item_data(targets_file)
        self.modifiers = get_item_data(modifiers_file)

    def reset_saved_predictions(self):
        self.saved_markups_map = {}
        self.save_markups = True
        self.expected_value = None

    def predict(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        doc_conclusion = self.classify(doc, doc_name)
        if doc_conclusion in self.expected_values:
            return 1
        return 0

    def eval(self, gold_docs):
        import sklearn.metrics
        import pandas as pd
        fn_docs = []
        fp_docs = []
        prediction_metrics = []
        gold_labels = [x.positive_label for x in gold_docs.values()]
        pred_labels = []
        logMsg('Start to evaluate against reference standards...')
        for doc_name, gold_doc in gold_docs.items():
            gold_label = gold_doc.positive_label
            pred_label = self.predict(gold_doc.text, doc_name)
            pred_labels.append(pred_label)
            # Differentiate false positive and false negative errors
            if gold_label == 0 and pred_label == 1:
                fp_docs.append(doc_name)
            elif gold_label == 1 and pred_label == 0:
                fn_docs.append(doc_name)
        precision = sklearn.metrics.precision_score(gold_labels, pred_labels)
        recall = sklearn.metrics.recall_score(gold_labels, pred_labels)
        f1 = sklearn.metrics.f1_score(gold_labels, pred_labels)
        # Let's use Pandas to make a confusion matrix for us
        confusion_matrix_df = pd.crosstab(pd.Series(gold_labels, name='Actual'),
                                          pd.Series(pred_labels, name='Predicted'))
        prediction_metrics.append('Precision : {0:.3f}'.format(precision))
        prediction_metrics.append('Recall : {0:.3f}'.format(recall))
        prediction_metrics.append('F1: {0:.3f}'.format(f1))
        return fn_docs, fp_docs, '\n'.join(prediction_metrics), \
            confusion_matrix_df[[1, 0]].reindex([1, 0])

    def predict_against(self, doc, expected_values, doc_name='t_m_p.txt'):
        doc_conclusion = self.classify(doc, doc_name)
        if doc_conclusion in expected_values:
            return 1
        return 0

    def classify(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            logMsg('DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n'
                   + 'Use function: setModifiersTargets(modifiers, targets) or '
                   + 'setModifiersTargetsFromFiles(modifiers_file, targets_file) to set them up.')
        try:
            context_doc = self.markup_context_document(doc, self.modifiers, self.targets)
            if self.save_markups and doc_name is not None \
                    and len(context_doc.getDocumentGraph().nodes()) > 0:
                self.saved_markups_map[doc_name] = context_doc
            markups = get_document_markups(context_doc)
            annotations, relations, doc_txt = convertMarkups2DF(markups)
            matched_conclusion_types = self.feature_inferencer.process(annotations, relations)
            doc_conclusion = self.document_inferencer.process(matched_conclusion_types)
        except:
            # pyConText might throw errors in some cases; will fix it later
            doc_conclusion = self.document_inferencer.default_conclusion
        return doc_conclusion

    def train(self, x, y):
        """just to implement the interface"""
        pass

    def get_last_context_doc(self):
        if self.last_doc_name in self.saved_markups_map:
            return self.saved_markups_map[self.last_doc_name]
        else:
            return None

    def markup_context_document(self, report_text, modifiers, targets):
        context = pyConTextGraph.ConTextDocument()
        # if no RuSH rules are available, we will use TextBlob for breaking up sentences
        if self.pyrush is None:
            from textblob import TextBlob
            sentences = [s.raw for s in TextBlob(report_text).sentences]
        else:
            sentences = [
                report_text[sentence.begin:sentence.end]
                for sentence in self.pyrush.segToSentenceSpans(report_text)
            ]
        for sentence in sentences:
            m = markup_sentence(sentence, modifiers=modifiers, targets=targets)
            context.addMarkup(m)
        context.getSectionMarkups()
        return context
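
# Hedged usage sketch for RBDocumentClassifier. The file paths and the expected
# document-level conclusion ('fever_doc') are placeholders for whatever the
# local knowledge base defines.
classifier = RBDocumentClassifier(targets='KB/fever_targets.yml',
                                  modifiers='KB/fever_modifiers.yml',
                                  feature_inference_rule='KB/feature_inferences.csv',
                                  document_inference_rule='KB/doc_inferences.csv',
                                  rush_rule='KB/rush_rules.tsv',
                                  expected_values=['fever_doc'])
print(classifier.predict('The patient spiked a fever of 101.5 F overnight.'))
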
12. Vancomycin 750 mg intravenously twice per day (times 14 days).
13. Codeine/guaifenesin syrup 5 cc to 10 cc by mouth q.6h. as needed.
14. Klonopin 0.75 mg by mouth in the morning and 0.5 mg by mouth at hour of sleep.
15. Multivitamin one tablet by mouth once per day.

[**Name6 (MD) 2381**] [**Last Name (NamePattern4) 3424**], M.D.  [**MD Number(1) 3425**]

Dictated By:[**Last Name (NamePattern1) 3426**]
MEDQUIST36

D:  [**3399-4-10**]  14:55
T:  [**3399-4-12**]  08:50
JOB#:  [**Job Number 19798**]'''

rush = RuSH('conf/rush_rules.tsv')
sentences = rush.segToSentenceSpans(txt)
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        txt[sentence.begin:sentence.end]))
print('\n' + '-' * 100 + '\n')
class PreProcessing:
    def __init__(self, annotation_type='SOCIAL_SUPPORT', default_value='no mention',
                 filter_file='conf/keywords_filter.txt',
                 stopwords_file='conf/stop_words.txt',
                 word2vec_file='models/glove.word2vec.txt.bin',
                 rush_rules='conf/rush_rules.tsv',
                 max_token_per_sentence=150):
        # each time we only train/predict a model for one annotation type
        # set an arbitrary max length of sentences, so that we can pad sentences
        # without knowing the max length of sentences in the testing set.
        self.max_token_per_sentence = max_token_per_sentence
        self.annotation_type = annotation_type
        self.default_value = default_value
        self.real_max_length = 0
        self.rush = RuSH(rush_rules)
        self.html_tokens_p = re.compile(r'^\&[a-z]{2,4}\;$')
        self.punctuations = set(string.punctuation)  # keep '?'
        self.punctuations.remove('?')
        self.spacy_nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
        self.matcher = None
        self.corpus = None
        keywords_filter = []

        print('load filter keywords')
        # load filter keywords
        if path.isfile(filter_file):
            f = open(filter_file, 'r')
            keywords_filter = [line for line in f.readlines() if not line.startswith('#')]
            f.close()
        if len(keywords_filter) > 0:
            self.matcher = matcher.PhraseMatcher(self.spacy_nlp.tokenizer.vocab, max_length=6)
            for keyword in keywords_filter:
                self.matcher.add(keyword, None)

        print('load stopwords')
        # load stop words
        if path.isfile(stopwords_file):
            f = open(stopwords_file, 'r')
            self.my_stopwords = set(f.readlines())
            f.close()
        else:
            self.my_stopwords = set(nltk.corpus.stopwords.words('english'))
            f = open(stopwords_file, 'w')
            f.writelines('\n'.join(self.my_stopwords))
            f.close()

        print('load label dictionary')
        self.label_dict = None
        self.label_dict_file = 'models/' + self.annotation_type + '_labels.dict'
        # load dictionary
        if path.isfile(self.label_dict_file):
            self.label_dict = Dictionary.load(self.label_dict_file)

        print('load glove model')
        # self.glove_model = glove2word2vec.smart_open(word2vec_file)
        if path.isfile(word2vec_file):
            if word2vec_file.endswith('.bin'):
                self.glove_model = KeyedVectors.load_word2vec_format(word2vec_file, binary=True)
            else:
                self.glove_model = KeyedVectors.load_word2vec_format(word2vec_file, binary=False)
                print('convert txt model to binary model...')
                self.glove_model.save_word2vec_format(word2vec_file + '.bin', binary=True)
        pass

    """ Given a plain text document, return a list of tokenized sentences
    that contain filter keywords"""

    def processDocument(self, doc_text, tokenized_sentences=[], labels=[],
                        annotations=None, doc_id=None):
        print(doc_id)
        sentences = self.rush.segToSentenceSpans(doc_text)
        sentences_txt = ([doc_text[sentence.begin:sentence.end] for sentence in sentences])
        anno_id = 0
        for i in range(0, len(sentences_txt)):
            sentence = sentences_txt[i]
            label = self.default_value
            # if annotations are available, read as labels
            if annotations is not None:
                if len(annotations) > 0:
                    if anno_id < len(annotations) \
                            and annotations[anno_id]['start'] >= sentences[i].begin \
                            and annotations[anno_id]['end'] <= sentences[i].end:
                        label = list(annotations[anno_id]['attributes'].values())[0]
                        anno_id += 1
                    elif anno_id < len(annotations) \
                            and annotations[anno_id]['end'] <= sentences[i].begin:
                        print(doc_id + str(annotations[anno_id]) + 'was skipped')
                        i -= 1
                        anno_id += 1
            words = [
                token for token in self.spacy_nlp.make_doc(sentence)
                if len(''.join(ch for ch in token.text if ch not in self.punctuations)) > 0
                and not self.html_tokens_p.search(token.text)
                and not token.text.replace('.', '', 1).isdigit()
                and not token.text.replace('-', '', 1).isdigit()
                and token.text not in self.my_stopwords
            ]
            if self.real_max_length < len(words):
                self.real_max_length = len(words)
            if self.get_matches(words):
                if len(words) < self.max_token_per_sentence:
                    tokenized_sentences.append(self.pad_sentence([word.text for word in words]))
                    labels.append(label)
                else:
                    begin = 0
                    words = [word.text for word in words]
                    while begin <= len(words) - self.max_token_per_sentence:
                        tokenized_sentences.append(words[begin:begin + self.max_token_per_sentence])
                        # overlap the sliced sub-sentences
                        begin += int(self.max_token_per_sentence / 2)
                    if begin < len(words):
                        tokenized_sentences.append(
                            self.pad_sentence(words[len(words) - self.max_token_per_sentence:]))
        return tokenized_sentences

    def get_matches(self, sentence_tokens):
        if self.matcher is None:
            return True
        matches = self.matcher(sentence_tokens)
        for ent_id, start, end in matches:
            yield (ent_id, start, end)

    # def processLabelledCorpus(self, corpus_dir):
    #     corpus_reader = EhostCorpusReader(corpus_dir)
    #     corpus = corpus_reader.parse()
    #     self.corpus = corpus
    #     tokenized_sentences = []
    #     labels = []
    #     for doc_id, doc in corpus.items():
    #         if self.annotation_type in doc['categorized']:
    #             annotations = [doc['annotations'][anno_id] for anno_id in doc['categorized'][self.annotation_type]]
    #         else:
    #             annotations = []
    #         self.processDocument(doc['text'], tokenized_sentences, labels, annotations, doc_id)
    #
    #     x, y = self.vectorize(tokenized_sentences, labels)
    #     return x, y

    def pad_sentence(self, sentence, padding_word="<PAD/>"):
        """ Revised from alexander-rakhlin's code
        Pads all sentences to the same length. The length is defined by the longest sentence.
        Returns padded sentences.
        """
        num_padding = self.max_token_per_sentence - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        return new_sentence

    def vectorize(self, sentences, labels=[]):
        """ Revised from alexander-rakhlin's code, use glove models instead.
        Maps sentences and labels to vectors based on a vocabulary.
        """
        print(labels)
        if self.label_dict is None:
            self.label_dict = gensim.corpora.Dictionary([set(labels)])
            self.label_dict.compactify()
            self.label_dict.save(self.label_dict_file)
            self.label_dict.save_as_text(self.label_dict_file + '.txt')
        print(set(labels))
        x = np.array([[
            self.glove_model.word_vec(word) if word in self.glove_model.vocab
            else np.random.uniform(-0.25, 0.25, self.glove_model.vector_size)
            for word in sentence
        ] for sentence in sentences])
        y = None
        if len(labels) > 0:
            y = np.zeros((len(labels), len(self.label_dict.keys())))
            for i in range(0, len(labels)):
                label = labels[i]
                y[i][self.label_dict.token2id[label]] = 1
            shuffle_indices = np.random.permutation(np.arange(len(y)))
            x = x[shuffle_indices]
            y = y[shuffle_indices]
        return x, y
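
# Hedged usage sketch for PreProcessing. It assumes the configuration files and
# the GloVe word2vec binary referenced by the default constructor arguments are
# present locally; the document text and doc_id are illustrative.
prep = PreProcessing(annotation_type='SOCIAL_SUPPORT')
sents, labels = [], []
prep.processDocument('Patient lives alone. Daughter visits her weekly.',
                     sents, labels, annotations=None, doc_id='doc_001')
x, y = prep.vectorize(sents, labels)
print(x.shape, None if y is None else y.shape)
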
with open('failed.txt') as f:
    input_str = f.read()

sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
sent_tokenizer = DefaultSentenceTokenizer()
print(sent_tokenizer.tokenize_sents(input_str))
# print(sent_tokenizer.span_tokenize(input_str))
exit()

print(sent_tokenizer.tokenize_sents(input_str))
word_tokenizer = TreebankWordTokenizer()
doc_tokenizer = DocumentTokenizer(rush, word_tokenizer)
print(doc_tokenizer.tokenize_doc(input_str))
exit()

sentences = rush.segToSentenceSpans(input_str)
# nlp = spacy.load('en_core_web_sm')
for sentence in sentences[:1]:
    print('Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end,
                                             input_str[sentence.begin:sentence.end]))
    text = input_str[sentence.begin:sentence.end]
    print(tokenizer.tokenize(text))
    print(tokenizer.span_tokenize(text))
    spans = tokenizer.span_tokenize(text)
    tokens = tokenizer.tokenize(text)
    for span, token in zip(spans, tokens):
        print(span, token)
        assert (text[span[0]:span[1]] == token)
def pyRuSHSplitter(self, text):
    rush = RuSH(ConfigReader.getValue('rush_rules_path'))
    sentences = rush.segToSentenceSpans(text)
    return [text[sentence.begin:sentence.end].strip() for sentence in sentences]
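
# Equivalent standalone sketch of the splitter above, with the rules path
# hard-coded instead of read from ConfigReader (the path is a placeholder).
rush = RuSH('conf/rush_rules.tsv')
text = 'No fever today. Pain is controlled.'
print([text[s.begin:s.end].strip() for s in rush.segToSentenceSpans(text)])
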