Code example #1
    def __init__(self,
                 targets=None,
                 modifiers=None,
                 feature_inference_rule=None,
                 document_inference_rule=None,
                 rush_rule=None,
                 expected_values=[],
                 save_markups=True):
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        self.expected_values = [value.lower() for value in expected_values]
        self.saved_markups_map = dict()
        self.pyrush = None
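        # Fall back to the configured rule path when no usable rush_rule file is given.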
        if rush_rule is None or not os.path.isfile(rush_rule):
            rush_rule = ConfigReader.getValue('rush_rules_path')
        if rush_rule is not None and os.path.isfile(rush_rule):
            self.pyrush = RuSH(rush_rule)
        else:
            logMsg(("File not found", os.path.abspath(rush_rule)))
        self.last_doc_name = ''

        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                rule_file_exts = ('.csv', '.tsv', '.txt', '.yml')
                if modifiers.endswith(rule_file_exts) and \
                        (targets.endswith(rule_file_exts) or targets.startswith('Lex\t')):
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)
        RBDocumentClassifier.instance = self
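
The lines below are not part of the original source; they are a minimal instantiation sketch for the constructor above. Only the RBDocumentClassifier class name and parameter names come from the excerpt; the rule file names and label values are placeholders.

# Placeholder rule files; real target/modifier rules would use the .csv/.tsv/.txt/.yml formats checked above.
classifier = RBDocumentClassifier(targets='targets.tsv',
                                  modifiers='modifiers.tsv',
                                  expected_values=['positive', 'negative'],  # placeholder labels
                                  save_markups=True)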
Code example #2
 def __init__(self):
      # initialize the necessary components
     self.target_rules = self.getTargetRegexes()
     self.negation_rules = self.getNegRegexes()
     self.section_rules = self.getSectionRegexes()  # new
     self.target_scores = self.target_score()  # new
     self.sentence_rules = 'KB/rush_rules.tsv'
     self.sentence_segmenter = RuSH(self.sentence_rules)
Code example #3
File: pynlp_pipe_pe.py  Project: asy1113/NLP_PE
    def __init__(self, sentence_rules, target_rules, context_rules,
                 feature_inference_rule, document_inference_rule):

        self.sentence_rules = sentence_rules
        self.target_rules = target_rules
        self.context_rules = context_rules
        self.feature_inference_rule = feature_inference_rule
        self.document_inference_rule = document_inference_rule

        self.sentence_segmenter = RuSH(self.sentence_rules)
        self.feature_inferencer = FeatureInferencer(
            self.feature_inference_rule)
        self.document_inferencer = DocumentInferencer(
            self.document_inference_rule)
        self.targets = get_item_data(self.target_rules)
        self.modifiers = get_item_data(self.context_rules)
Code example #4
 def __init__(self, verbose, mode, sentence_tokenizer, target_rules,
              modifier_rules):
      # initialize the components MyPipe needs here
     self.verbose = verbose
     self.mode = mode
     if sentence_tokenizer.lower() == 'pyrush':
         self.sentence_tokenizer = RuSH(
             path.abspath(path.join('kb', 'rush_rules.tsv')))
     elif sentence_tokenizer.lower() == 'resplit':
         self.sentence_tokenizer = RESplit(
             self.verbose, path.abspath(path.join('kb',
                                                  'resplit_rules.yml')))
     elif sentence_tokenizer.lower() == 'helpers':
         self.sentence_tokenizer = helpers.sentenceSplitter()
     self.targets = get_item_data(target_rules)
     self.modifiers = get_item_data(modifier_rules)
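
A hedged instantiation sketch for the pipe above, not part of the original source. The parameter names and the 'pyrush' tokenizer option come from the excerpt; the mode value and rule paths are placeholders.

# Placeholder arguments; choosing 'pyrush' makes the pipe load kb/rush_rules.tsv internally.
pipe = MyPipe(verbose=True,
              mode='default',                          # placeholder value
              sentence_tokenizer='pyrush',
              target_rules='kb/target_rules.yml',      # placeholder path
              modifier_rules='kb/modifier_rules.yml')  # placeholder path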
Code example #5
File: pipeline.py  Project: amberkiser/RSV-NLP
'''
# define the feature inference rule
feature_inference_rule = '''
#Conclusion type, Evidence type, Modifier values associated with the evidence
NEGATED_CONCEPT,FEVER,DEFINITE_NEGATED_EXISTENCE
'''
# define the document inference rule
document_inference_rule = '''
#Conclusion Type at document level, Evidence type at mention level
FEVER_DOC,FEVER

#Default document type
NO_FEVER
'''

sentence_segmenter = RuSH(sentence_rules)
feature_inferencer = FeatureInferencer(feature_inference_rule)
document_inferencer = DocumentInferencer(document_inference_rule)

targets = get_item_data(target_rules)
modifiers = get_item_data(context_rules)

# Example sentences
#input = 'This is a sentence. It is just a test. I like this sentence.'

input = '''
No vomiting, chest pain, shortness of breath, nausea, dizziness, or chills on arrival.
On operative day three, the patient fever was detected with temperature 101.5 F.
After 3 days no fever was detected.
Patient came back for a follow up, denies fever.
'''
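
The lines below are not part of the original file; they are a minimal sketch showing how the segmenter defined above could be run over the example note, using only the RuSH calls already demonstrated in code example #9.

# Sketch: print the sentence spans RuSH finds in the example note.
for span in sentence_segmenter.segToSentenceSpans(input):
    print(input[span.begin:span.end].strip())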
Code example #6
 def __init__(self, sentence_rules, target_rules, context_rules):
     self.sentence_segmenter = RuSH(sentence_rules)
     self.targets = get_item_data(target_rules)
     self.modifiers = get_item_data(context_rules)
Code example #7
 def __init__(self, sentence_rules, target_rules, between_rules):
     self.sentence_segmenter = RuSH(sentence_rules)
     self.targets = get_item_data(target_rules)
     self.between_rules = between_rules
Code example #8
 def setUp(self):
     self.rush = RuSH('../conf/rush_rules.tsv')
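
A companion test method is sketched below (not part of the original source). It assumes the setUp above lives in a unittest.TestCase and relies only on the segToSentenceSpans API shown in code example #9; the exact spans depend on the rule file, so only a minimal assertion is made.

 def test_segments_simple_text(self):
     spans = self.rush.segToSentenceSpans('No fever was detected. Patient denies chest pain.')
     # The exact segmentation depends on the rule file; only check that something was segmented.
     self.assertTrue(len(spans) > 0)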
Code example #9
12.  Vancomycin 750 mg intravenously twice per day (times 14
days).
13.  Codeine/guaifenesin syrup 5 cc to 10 cc by mouth q.6h.
as needed.
14.  Klonopin 0.75 mg by mouth in the morning and 0.5 mg by
mouth at hour of sleep.
15.  Multivitamin one tablet by mouth once per day.




                          [**Name6 (MD) 2381**] [**Last Name (NamePattern4) 3424**], M.D.  [**MD Number(1) 3425**]

Dictated By:[**Last Name (NamePattern1) 3426**]

MEDQUIST36

D:  [**3399-4-10**]  14:55
T:  [**3399-4-12**]  08:50
JOB#:  [**Job Number 19798**]
4_4788.txt
Open with
Displaying 4_4788.txt.'''

rush = RuSH('conf/rush_rules.tsv')
sentences = rush.segToSentenceSpans(txt)
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        txt[sentence.begin:sentence.end]))
    print('\n' + '-' * 100 + '\n')
Code example #10
    def __init__(self,
                 annotation_type='SOCIAL_SUPPORT',
                 default_value='no mention',
                 filter_file='conf/keywords_filter.txt',
                 stopwords_file='conf/stop_words.txt',
                 word2vec_file='models/glove.word2vec.txt.bin',
                 rush_rules='conf/rush_rules.tsv',
                 max_token_per_sentence=150):
        # each time we only train/predict a model for one annotation type
        # set an arbitrary max sentence length, so that we can pad sentences
        # without knowing the maximum sentence length in the test set

        self.max_token_per_sentence = max_token_per_sentence
        self.annotation_type = annotation_type
        self.default_value = default_value
        self.real_max_length = 0
        self.rush = RuSH(rush_rules)
        self.html_tokens_p = re.compile(r'^&[a-z]{2,4};$')
        self.punctuations = set(string.punctuation)
        # keep '?'
        self.punctuations.remove('?')
        self.spacy_nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
        self.matcher = None
        self.corpus = None
        keywords_filter = []
        print('load filter keywords')
        # load filter keywords
        if path.isfile(filter_file):
            f = open(filter_file, 'r')
            keywords_filter = [
                line for line in f.readlines() if not line.startswith('#')
            ]
            f.close()
        if len(keywords_filter) > 0:
            self.matcher = matcher.PhraseMatcher(
                self.spacy_nlp.tokenizer.vocab, max_length=6)
            for keyword in keywords_filter:
                self.matcher.add(keyword, None)

        print('load stopwords')
        # load stop words
        if path.isfile(stopwords_file):
            f = open(stopwords_file, 'r')
            self.my_stopwords = set(f.readlines())
            f.close()
        else:
            self.my_stopwords = set(nltk.corpus.stopwords.words('english'))
            f = open(stopwords_file, 'w')
            f.writelines('\n'.join(self.my_stopwords))
            f.close()

        print('load label dictionary')
        self.label_dict = None
        self.label_dict_file = 'models/' + self.annotation_type + '_labels.dict'
        # load dictionary
        if path.isfile(self.label_dict_file):
            self.label_dict = Dictionary.load(self.label_dict_file)

        print('load glove model')
        # self.glove_model = glove2word2vec.smart_open(word2vec_file)
        if path.isfile(word2vec_file):
            if word2vec_file.endswith('.bin'):
                self.glove_model = KeyedVectors.load_word2vec_format(
                    word2vec_file, binary=True)
            else:
                self.glove_model = KeyedVectors.load_word2vec_format(
                    word2vec_file, binary=False)
                print('convert txt model to binary model...')
                self.glove_model.save_word2vec_format(word2vec_file + '.bin',
                                                      binary=True)

        pass
Code example #11
def build_n2c2_tokenizer(keep_token_strings=False,
                         enable_pyrush_sentence_tokenizer=False,
                         disable_custom_preprocessing=True):
    print('Building n2c2 tokenizer...')
    cs_preprocess_split_re_strings = []
    # double newlines
    cs_preprocess_split_re_strings.append(r'[\r\n]{2,}')
    # newlines with only spaces
    cs_preprocess_split_re_strings.append(r'[\r\n]+\s+[\r\n]+')
    # numbered lists (e.g. "1.", "2)")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*\d+[.)-]')
    # bulleted lists (e.g. "*", "-")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*[*-]')
    # starting labels (e.g. "WEIGHT:")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*\w+[:]')
    # break up other lines separated by dates
    cs_preprocess_split_re_strings.append(
        r'(^|\r|\n)+\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')
    # MIMIC has many lines that start with this [**YYYY-M-DD**]
    cs_preprocess_split_re_strings.append(r'^\[\*+\d{4}-\d{1,2}-\d{1,2}\*+\]')
    # TIU notes have long bars like this : '***********' or '===========' or '------'
    cs_preprocess_split_re_strings.append(r'[*=-]{3,}')

    # NOTE : This breaking rule was disabled 2-13-18 since the UMass MADE challenge data often ended each line with 2 spaces and a
    # newline which caused this aggressive rule to fire over and over again.
    # aggressively break anything with lots of spaces (tabular data)
    #cs_preprocess_split_re_strings.append(r'\s{3,}')

    custom_lang_vars = CustomSentenceBreakingLangVars()
    custom_lang_vars.sent_end_chars = ('.', '!')
    print(custom_lang_vars.sent_end_chars)

    punkt_tokenizer2 = PunktSentenceTokenizer(lang_vars=custom_lang_vars)
    treebank_tokenizer = TreebankWordTokenizer()

    # looks like "pt." and "D.R." and "P.R." are already being handled
    #punkt_tokenizer2._params.abbrev_types.update(extra_abbrev)

    sentence_tokenizer = None
    if enable_pyrush_sentence_tokenizer:
        print('Enabling PyRuSH for sentence tokenization...')
        pyrush_sentence_tokenizer = RuSH(
            'resources/PyRuSH/conf/rush_rules.tsv')
        sentence_tokenizer = pyrush_sentence_tokenizer
    else:
        print('Enabling NLTK Punkt for sentence tokenization...')
        sentence_tokenizer = punkt_tokenizer2

    print('Type of sentence tokenizer : {}'.format(type(sentence_tokenizer)))

    enabled_preprocessing_expressions = []
    if not disable_custom_preprocessing:
        print('Enabling custom preprocessing expressions.  Total : {}'.format(
            len(cs_preprocess_split_re_strings)))
        enabled_preprocessing_expressions = cs_preprocess_split_re_strings
    else:
        print('Not allowing custom preprocessing expressions...')

    cs_tokenizer = ClinicalSentenceTokenizer(
        default_sentence_tokenizer=sentence_tokenizer,
        preprocess_split_re_strs=enabled_preprocessing_expressions)

    index_tokenizer = IndexTokenizer(cs_tokenizer,
                                     treebank_tokenizer,
                                     keep_token_strings=keep_token_strings)

    return index_tokenizer
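
Calling the builder is a one-liner; the sketch below (not part of the original source) assumes resources/PyRuSH/conf/rush_rules.tsv is present so the PyRuSH branch can load its rules.

# Build the tokenizer with PyRuSH sentence segmentation and kept token strings.
index_tokenizer = build_n2c2_tokenizer(keep_token_strings=True,
                                       enable_pyrush_sentence_tokenizer=True)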
Code example #12
 def __init__(self, rules='./rush_rules.tsv'):
     self.rules = rules
     self.rush = RuSH(self.rules)
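
The enclosing class name is not shown in this excerpt; assuming it is something like RushSegmenter (a hypothetical name), usage would follow the RuSH API from code example #9, as in the sketch below.

# Hypothetical class name; only the .rush attribute and segToSentenceSpans come from the excerpts.
segmenter = RushSegmenter(rules='./rush_rules.tsv')
for span in segmenter.rush.segToSentenceSpans('No fever today. Denies chest pain.'):
    print(span.begin, span.end)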
Code example #13
        return sent_spans


class DefaultSentenceTokenizer(object):
    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()

    def tokenize_sents(self, text):
        """
        Returns spans
        """
        return self.tokenizer.span_tokenize(text)


if __name__ == '__main__':
    rush = RuSH('rush_rules.tsv')
    input_str = "The             patient was admitted on 03/26/08\n and was started on IV antibiotics elevation" +\
             ", was also counseled to minimizing the cigarette smoking. The patient had edema\n\n" +\
             "\n of his bilateral lower extremities. The hospital consult was also obtained to " +\
             "address edema issue question was related to his liver hepatitis C. Hospital consult" +\
             " was obtained. This included an ultrasound of his abdomen, which showed just mild " +\
             "cirrhosis. "
    # Overrides the hard-coded example above with the contents of failed.txt.
    with open('failed.txt') as f:
        input_str = f.read()

    # NOTE: the second assignment overrides the first, so the Punkt-based default
    # tokenizer is the one actually exercised below.
    sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
    sent_tokenizer = DefaultSentenceTokenizer()
    print(sent_tokenizer.tokenize_sents(input_str))
    #print(sent_tokenizer.span_tokenize(input_str))
    exit()
    print(sent_tokenizer.tokenize_sents(input_str))  # unreachable after exit()
Code example #14
 def pyRuSHSplitter(self, text):
     rush = RuSH(ConfigReader.getValue('rush_rules_path'))
     sentences = rush.segToSentenceSpans(text)
     return [
         text[sentence.begin:sentence.end].strip() for sentence in sentences
     ]
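
A usage sketch for the splitter above, not part of the original source. The enclosing object is called nlp here purely for illustration, and the configured rush_rules_path must point to a valid rule file.

# Hypothetical instance name; pyRuSHSplitter returns a list of stripped sentence strings.
sentences = nlp.pyRuSHSplitter('No fever was detected. Patient denies chest pain.')
print(sentences)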