def __init__(self):
    """Load the validation labels and processed notes, then log statistics.

    Reads ``validation_set_file`` and ``input_note_file`` from this module's
    pipeline configuration, clears all cached frames and runs the loading
    and statistics stages in order.
    """
    super().__init__()

    def setting(key, default):
        # One config lookup per setting, scoped to this module's name.
        return config.get_pipeline_config_item(self.module_name(), key,
                                               default)

    self._validation_set = setting('validation_set_file', None)
    self._df_notes_labeled_path = setting('input_note_file', None)

    # Nothing has been loaded yet; the stages below fill these in.
    self._loaded_df = self._compare_df = self._orig_df = None
    self._loaded_validation = None
    self._loaded_validation_labels = None
    self._loaded_validation_label_map = None

    # Stage 1: validation labels.
    logger.log_info('Loading validation note labeling file')
    self._loading_validation_labeling_file()
    logger.log_info('DONE: Loading validation note labeling file')

    # Stage 2: notes processed by the NLP pipeline.
    logger.log_info('Loading NLP pipeline processed note files')
    self._loading_note_files()
    logger.log_info('DONE: NLP pipeline processed note files')

    # Stage 3: statistics.
    logger.log_info('Computing and outputting statistics')
    self._do_statistics()
def _load_note_input_file(self):
    """Read the note parquet file and keep only the configured categories.

    Column names are upper-cased, the ``CATEGORY`` values and
    ``self._keep_categories`` are lower-cased, and the frame is reduced to
    rows whose category is in ``self._keep_categories``. When
    ``self._debug_row_id`` is set, only that ``ROW_ID`` is kept.

    Raises:
        RuntimeError: if no input note file is configured.
        AssertionError: if the parquet file or a required column is missing.
    """
    if not self._df_notes_labeled_path:
        raise RuntimeError('Please specify a valid note input file.')

    parquet_path = utils.default_dataframe_name(self._df_notes_labeled_path)
    assert os.path.isfile(
        parquet_path), 'Could not find note parquet file: {}'.format(
            parquet_path)

    self._loaded_df = pd.read_parquet(parquet_path)
    self._loaded_df.columns = [
        name.upper() for name in self._loaded_df.columns
    ]

    required_columns = ('SUBJECT_ID', 'CHARTDATE', 'CATEGORY', 'TEXT',
                        'ROW_ID', 'HADM_ID')
    has_required_columns = all(name in self._loaded_df.columns
                               for name in required_columns)
    assert has_required_columns, 'Notes file needs to have columns: Row_id, Subject_id, Hadm_id, chartdate, category and text'

    logger.log_info('Notes before category removal: {}'.format(
        len(self._loaded_df)))

    # Compare categories case-insensitively.
    self._loaded_df['CATEGORY'] = self._loaded_df['CATEGORY'].str.lower()
    self._keep_categories = [
        category.lower() for category in self._keep_categories
    ]
    in_kept_category = self._loaded_df['CATEGORY'].isin(
        self._keep_categories)
    kept_notes = self._loaded_df[in_kept_category].copy()

    # Drop the unfiltered frame before storing the filtered copy.
    del self._loaded_df
    self._loaded_df = kept_notes

    logger.log_info('Notes after category removal: {}'.format(
        len(self._loaded_df)))

    if self._debug_row_id is not None:
        is_debug_row = self._loaded_df.ROW_ID == self._debug_row_id
        self._loaded_df = self._loaded_df[is_debug_row]
        if self._loaded_df.empty:
            logger.log_error('Could not find requested debugging row id.')
def _label_improve_cohort(self):
    """Derive category predictions from the per-category score columns.

    For every positive lexicon category ``<CAT>`` the column
    ``<CAT>_TOTAL_SCORE_SUM`` of ``self._pre_filtered_df`` is read and the
    following columns are written:

    * ``MAX_SCORE`` / ``MAX_SCORE_CAT``: highest score and its category
      (the first category wins on ties, because the comparison is strict).
    * ``PREDICTED_CATEGORIES``: ``|``-separated categories with score > 0.
    * ``FOUND_EVIDENCE``: True where the best score is positive; combined
      with an already existing ``FOUND_EVIDENCE`` column via logical AND.

    The result is stored in ``self._filtered_cohort_df`` (the same object
    as ``self._pre_filtered_df``).

    The updates are vectorized instead of using ``DataFrame.apply`` with
    ``axis=1``: that is much faster on large note tables and also works on
    an empty frame, where a row-wise ``apply`` may hand back a DataFrame
    that cannot be assigned to a single column.
    """
    notes = self._pre_filtered_df
    notes['FOUND_EVIDENCE_NEGATED'] = 0
    notes['PREDICTED_CATEGORIES'] = ''
    notes['MAX_SCORE_CAT'] = ''
    notes['MAX_SCORE'] = -np.inf

    for _k in self._lexicon_map['positive'].keys():
        _k = _k.upper()
        scores = notes[_k + '_TOTAL_SCORE_SUM']

        # Decide on the *old* maximum before either column is updated.
        is_new_max = scores > notes['MAX_SCORE']
        notes['MAX_SCORE_CAT'] = notes['MAX_SCORE_CAT'].mask(is_new_max, _k)
        notes['MAX_SCORE'] = notes['MAX_SCORE'].mask(is_new_max, scores)

        # Every category with a positive score is appended as '|<CAT>'.
        has_evidence = scores > 0
        notes['PREDICTED_CATEGORIES'] = notes['PREDICTED_CATEGORIES'].mask(
            has_evidence, notes['PREDICTED_CATEGORIES'] + '|' + _k)

    notes.loc[notes['MAX_SCORE'] > 0, 'FOUND_EVIDENCE_NEGATED'] = 1

    self._filtered_cohort_df = notes
    cohort = self._filtered_cohort_df

    # Remove the leading '|' separator of non-empty category lists.
    cohort['PREDICTED_CATEGORIES'] = cohort['PREDICTED_CATEGORIES'].apply(
        lambda x: x[1:] if len(x) > 1 else x)

    if 'FOUND_EVIDENCE' in cohort.columns:
        cohort['FOUND_EVIDENCE'] = (cohort['FOUND_EVIDENCE'] > 0) & (
            cohort['FOUND_EVIDENCE_NEGATED'] > 0)
    else:
        cohort['FOUND_EVIDENCE'] = cohort['FOUND_EVIDENCE_NEGATED'] > 0
    del cohort['FOUND_EVIDENCE_NEGATED']

    logger.log_info(
        'Total patients (after negex filtering): {} / Total admissions: {}'
        .format(cohort['SUBJECT_ID'].nunique(),
                cohort['HADM_ID'].nunique()))
def _parse_lexicons(self):
    """Read all positive and negative lexicon files into ``self._lexicon_map``.

    ``self._lexicon_dir`` must contain a ``positive`` and a ``negative``
    sub-directory. Every file in them becomes one category: the name is the
    file name up to the first dot, converted to snake case, with spaces
    replaced by underscores. Each non-empty line contributes the text
    before the first ``;;`` as one term; anything after ``;;`` is ignored.

    Raises:
        AssertionError: if the lexicon directory or one of the two
            sub-directories is missing, or the lexicon directory is empty.
    """
    assert os.path.isdir(
        self._lexicon_dir), 'Invalid lexicon dir. Does not exist.'
    assert len(os.listdir(self._lexicon_dir)) > 0, 'Lexicon dir is empty.'

    pos_dir = os.path.join(self._lexicon_dir, 'positive')
    neg_dir = os.path.join(self._lexicon_dir, 'negative')
    assert os.path.isdir(
        pos_dir
    ), 'There needs to be a positive lexicon. If you just want to use one variant of the lexicons, create a positive lexicon folders with empty text files.'
    # This message used to tell the user to create a *positive* folder.
    assert os.path.isdir(
        neg_dir
    ), 'There needs to be a negative lexicon. If you just want to use one variant of the lexicons, create a negative lexicon folders with empty text files.'

    pos_files = os.listdir(pos_dir)
    neg_files = os.listdir(neg_dir)

    def parse_dir(dirlist, prefix):
        """Parse every file of ``dirlist`` into ``_lexicon_map[prefix]``."""
        for _lexi in dirlist:
            category = re.sub(r'\..*', '', _lexi.strip())
            category = helper_classes.Module.camelcase_to_snakecase(
                category)
            category = category.replace(' ', '_')

            # The context manager closes the file even if parsing fails.
            with open(os.path.join(self._lexicon_dir, prefix, _lexi),
                      'r') as lexicon_file:
                stripped_lines = [line.strip() for line in lexicon_file]

            # Only the part before ';;' is the term; blank lines and lines
            # starting with ';;' yield an empty term and are skipped.
            terms = [line.split(';;')[0] for line in stripped_lines]
            self._lexicon_map[prefix][category] = [
                term for term in terms if term
            ]

    logger.log_debug('Parsing the lexicons..')
    parse_dir(pos_files, 'positive')
    parse_dir(neg_files, 'negative')

    for cat in ['positive', 'negative']:
        for _k, _v in self._lexicon_map[cat].items():
            logger.log_info('{} {} lexicon: {} entries'.format(
                cat, _k, len(_v)))
    logger.log_info('Parsed and stored all lexicons.')
def _change_cohort_mappings(self):
    """Run ``self._process_note`` over the notes for every lexicon group.

    For each ``(search type, lexicons)`` pair in ``self._lexicon_map`` the
    three ``<search type>_POSITIVE_LEXICON_*`` columns must exist in
    ``self._loaded_df``. The frame is then cut into slices of 7000 rows,
    the slices are processed in parallel, and ``self._loaded_df`` is
    rebuilt from the returned rows with the same columns. With
    ``self._debug_check`` set, only the first 100 rows are processed.

    Raises:
        AssertionError: if one of the required columns is missing.
    """
    chunk_rows = 7000
    required_suffixes = (
        '_POSITIVE_LEXICON_SENTENCES',
        '_POSITIVE_LEXICON_NEGATED_PHRASES',
        '_POSITIVE_LEXICON_AFFIRMED_PHRASES',
    )

    for _search_type, _lexicons in self._lexicon_map.items():
        for suffix in required_suffixes:
            assert _search_type + suffix in self._loaded_df.columns, "Missing column in dataframe: {}. This module only suports inputs from the negex_negation_filter module.".format(_search_type + suffix)

        if self._debug_check:
            self._loaded_df = self._loaded_df.iloc[:100]

        logger.log_info('Long dist. matching for: ' + str(_lexicons.keys()))

        # Slice the frame up front so each worker gets one chunk of rows.
        chunks = []
        for start in range(0, len(self._loaded_df), chunk_rows):
            stop = min(start + chunk_rows, len(self._loaded_df))
            chunks.append(self._loaded_df.iloc[start:stop, :])

        worker = Parallel(n_jobs=self._njobs)
        per_chunk_rows = worker(
            delayed(self._process_note)(chunk, _search_type, _lexicons)
            for chunk in chunks)

        # Every worker returns a list of rows; concatenate them in order.
        parsed_rows = []
        for rows in per_chunk_rows:
            parsed_rows.extend(rows)

        self._loaded_df = pd.DataFrame(parsed_rows,
                                       columns=self._loaded_df.columns)
def _parse_lexicons(self):
    """Read the lexicon files into ``_lexicon_map`` and ``_lexicon_weights``.

    ``self._lexicon_dir`` must contain a ``positive`` and a ``negative``
    sub-directory, and every file name in ``negative`` must also exist in
    ``positive``. Every file becomes one category: the name is the file
    name up to the first dot, converted to snake case, with spaces replaced
    by underscores. Each non-empty line has the form ``term`` or
    ``term;;weight``; a missing weight defaults to 2. Terms are stored as
    written, weights are keyed by the lower-cased term.

    Raises:
        AssertionError: if a directory is missing or empty, or the negative
            directory contains a file that the positive one lacks.
        ValueError: if a weight is not an integer.
    """
    default_weight = 2

    assert os.path.isdir(
        self._lexicon_dir), 'Invalid lexicon dir. Does not exist.'
    assert len(os.listdir(self._lexicon_dir)) > 0, 'Lexicon dir is empty.'

    pos_dir = os.path.join(self._lexicon_dir, 'positive')
    neg_dir = os.path.join(self._lexicon_dir, 'negative')
    assert os.path.isdir(pos_dir), 'There needs to be a positive lexicon.'
    assert os.path.isdir(neg_dir), 'There needs to be a negative lexicon.'

    pos_files = os.listdir(pos_dir)
    neg_files = os.listdir(neg_dir)

    unknown_files = [_ for _ in neg_files if _ not in pos_files]
    assert len(
        unknown_files
    ) == 0, 'The lexicon filenames in the positive and negative dirs need to match! Found: ' + str(
        unknown_files)

    def parse_dir(dirlist, prefix):
        """Parse every file of ``dirlist`` into the maps under ``prefix``."""
        for _lexi in dirlist:
            category = re.sub(r'\..*', '', _lexi.strip())
            category = helper_classes.Module.camelcase_to_snakecase(
                category)
            category = category.replace(' ', '_')

            # The context manager closes the file even if parsing fails.
            with open(os.path.join(self._lexicon_dir, prefix, _lexi),
                      'r') as lexicon_file:
                stripped_lines = [line.strip() for line in lexicon_file]

            terms = []
            weights = {}
            self._lexicon_map[prefix][category] = terms
            self._lexicon_weights[prefix][category] = weights

            for line in stripped_lines:
                fields = line.split(';;')
                term = fields[0]
                if not term:
                    # Blank line, or a line that starts with ';;'.
                    continue
                if len(fields) < 2:
                    weight = default_weight
                else:
                    try:
                        weight = int(fields[1])
                    except ValueError as err:
                        # Say which file and line carry the bad weight.
                        raise ValueError(
                            'Invalid weight found in {} lexicon {}: {}'.
                            format(prefix, _lexi, line)) from err
                terms.append(term)
                weights[term.lower()] = weight

    logger.log_debug('Parsing the lexicons now..')
    parse_dir(pos_files, 'positive')
    parse_dir(neg_files, 'negative')

    for cat in ['positive', 'negative']:
        for _k, _v in self._lexicon_map[cat].items():
            logger.log_info('{} {} lexicon: {} entries'.format(
                cat, _k, len(_v)))
    logger.log_info('Parsed and stored all lexicons.')
def _dump_df(self):
    """Write ``self._labeled_df`` to the configured output parquet file."""
    logger.log_info('Dumping the extracted notes into a parquet file.')

    # The output name is resolved through the shared naming helper.
    parquet_target = utils.default_dataframe_name(self._output_note_file)
    self._labeled_df.to_parquet(parquet_target)

    logger.log_info(
        'DONE: Dumping the extracted notes into a parquet file.')
def _query_bigquery(self):
    """Query MIMIC-III ``noteevents`` and flag notes that hit a lexicon.

    One boolean result column is produced per positive lexicon category
    (the positive and negative terms of a category are merged into one
    case-insensitive, word-bounded regular expression). Categories listed
    in ``self._ignore_cat_list`` are excluded. If ``self._cohort_file``
    points to an existing CSV, only its ``hadm_id`` values are queried.
    Unless ``self._dump_all`` is set, only notes with at least one lexicon
    hit are kept. The result is stored in ``self._labeled_df`` with
    upper-cased column names.

    Fixes over the previous version:

    * Without a cohort file the query contained ``hadm_id IN ()``, which
      is not valid SQL; the restriction is now omitted in that case. A
      cohort file that contains no ids selects no notes.
    * A result without lexicon columns no longer indexes the frame with
      ``None``; it simply keeps no notes when ``self._dump_all`` is off.
    * A category that only exists in the positive lexicon no longer
      raises ``KeyError``.
    """
    # NOTE(review): lexicon terms are pasted into the SQL text unescaped;
    # a term containing a single quote breaks the query. Acceptable only
    # while the lexicon files are trusted local input.
    negative_lexicons = self._lexicon_map['negative']
    merged_lexicon_map = {
        _k: _terms + negative_lexicons.get(_k, [])
        for _k, _terms in self._lexicon_map['positive'].items()
    }

    select_parts = []
    for _name, _terms in merged_lexicon_map.items():
        if not _terms:
            select_parts.append(" FALSE AS " + _name)
        else:
            lex = [r'\\b' + x + r'\\b' for x in _terms]
            select_parts.append(" REGEXP_CONTAINS(text, '(?i)(" +
                                '|'.join(lex) + ")') AS " + _name)
    # Every extra column is preceded by a comma, as it follows 'text'.
    sql_search = ''.join("," + part for part in select_parts)

    ignore_str = '\n'.join([
        'AND category NOT LIKE "%{}%"'.format(_)
        for _ in self._ignore_cat_list
    ])

    use_bqstorage_api = config.get_pipeline_config_item(
        self.module_name(), "use_bqstorage_api", False)

    limitstr = ""
    if config.get_pipeline_config_item(self.module_name(), "debug_download",
                                       False):
        limitstr = 'LIMIT 10'

    cohort_clause = ""
    if self._cohort_file and os.path.isfile(self._cohort_file):
        cohort = pd.read_csv(self._cohort_file)
        cohort.columns = [_.lower() for _ in cohort.columns]
        cohort_ids = list(cohort.loc[:, 'hadm_id'])
        if cohort_ids:
            cohort_clause = 'AND hadm_id IN ({})'.format(','.join(
                [str(_) for _ in cohort_ids]))
        else:
            # An empty cohort must select nothing, not everything.
            cohort_clause = 'AND FALSE'

    sql = """
        SELECT row_id, subject_id, hadm_id, chartdate, category, text{}
        FROM `physionet-data.mimiciii_notes.noteevents`
        WHERE hadm_id IS NOT NULL
        {}
        {}
        {}
    """.format(sql_search, cohort_clause, ignore_str, limitstr)

    logger.log_info('Querying noteevents for lexicon occurences.')
    self._labeled_df = pandas_gbq.read_gbq(
        sql,
        project_id=google_tools.PROJECT_ID,
        dialect='standard',
        use_bqstorage_api=use_bqstorage_api)
    self._labeled_df.columns = [
        _.upper() for _ in self._labeled_df.columns
    ]

    if not self._dump_all:
        note_columns = {
            'subject_id', 'row_id', 'hadm_id', 'chartdate', 'category',
            'text'
        }
        # Start from "no hit" so the mask is defined without flag columns.
        mask = pd.Series(False, index=self._labeled_df.index)
        for _ in self._labeled_df.columns:
            if _.lower() in note_columns:
                continue
            mask = mask | self._labeled_df[_].astype(bool)
        self._labeled_df = self._labeled_df[mask].copy()

    logger.log_info('DONE: Querying noteevents for lexicon occurences.')
    logger.log_debug('Number of admissions {}, number of notes {}.'.format(
        self._labeled_df['HADM_ID'].nunique(), len(self._labeled_df)))
    for _key in self._lexicon_map['positive'].keys():
        _key = _key.upper()
        logger.log_debug('Number of notes with {}: {}.'.format(
            _key.lower(), self._labeled_df[_key].sum()))
def print_row(*args):
    """Log all positional arguments on one line, separated by spaces."""
    row_text = ' '.join(map(str, args))
    logger.log_info(row_text)
def _do_statistics(self):
    """Compare predicted note categories with the validation labels.

    Joins ``self._loaded_validation`` (``ROW_ID``, ``NOTE_TYPES``) with
    ``self._loaded_df`` (``ROW_ID``, ``PREDICTED_CATEGORIES``) on
    ``ROW_ID``; both columns hold lists of integer label ids. Logs the
    number of exactly matching notes, the Hamming score, subset accuracy
    and Hamming loss, followed by precision, recall and accuracy for
    every label id from 1 to the largest id in
    ``self._loaded_validation_label_map``. Label id 0 is not scored.

    Changes over the previous version: an empty validation set is reported
    instead of ending in a division by zero, a label id without a name is
    logged under its number instead of raising ``IndexError``, and the
    unused ``MATCHED`` column and the commented-out code are gone.
    """
    validset = self._loaded_validation.sort_values('ROW_ID').reset_index(
        drop=True)[['ROW_ID', 'NOTE_TYPES']].copy()
    validset = validset.drop_duplicates(subset=['ROW_ID'])

    predicted = self._loaded_df[['ROW_ID', 'PREDICTED_CATEGORIES']].copy()
    predicted = predicted.rename(
        columns={'PREDICTED_CATEGORIES': 'PREDICTED_CAT'})
    predicted = predicted.drop_duplicates(subset=['ROW_ID'])

    validset = validset.merge(predicted, how='left', on='ROW_ID')

    if validset.empty:
        logger.log_info('Validation set is empty, no statistics computed.')
        return

    # Rows without a prediction / validation label get the list [1].
    # NOTE(review): id 1 is the first real label, not 'NONE' - confirm
    # that this default is intended.
    validset.loc[validset['PREDICTED_CAT'].isnull(),
                 'PREDICTED_CAT'] = pd.Series([[1]] * validset.shape[0])
    validset.loc[validset['NOTE_TYPES'].isnull(),
                 'NOTE_TYPES'] = pd.Series([[1]] * validset.shape[0])

    label_pairs = list(zip(validset['NOTE_TYPES'],
                           validset['PREDICTED_CAT']))
    validset['UNMATCHED_VALID'] = [[
        label for label in valid if label not in pred
    ] for valid, pred in label_pairs]
    validset['UNMATCHED_PREDICTED'] = [[
        label for label in pred if label not in valid
    ] for valid, pred in label_pairs]

    # A note is correct when neither side has a label the other lacks.
    validset['CORRECT_NOTE'] = False
    validset.loc[(validset.UNMATCHED_VALID.str.len() == 0) &
                 (validset.UNMATCHED_PREDICTED.str.len() == 0),
                 'CORRECT_NOTE'] = True

    # One-hot encode the label lists; id k is stored in column k - 1 and
    # id 0 is left out.
    max_index = max(self._loaded_validation_label_map.values())
    one_hot_valid = np.zeros((validset.shape[0], max_index))
    one_hot_pred = np.zeros((validset.shape[0], max_index))
    for row_pos, (valid_ids, pred_ids) in enumerate(label_pairs):
        pred_cols = [label - 1 for label in pred_ids if label != 0]
        valid_cols = [label - 1 for label in valid_ids if label != 0]
        if valid_cols:
            one_hot_valid[row_pos, valid_cols] = 1
        if pred_cols:
            one_hot_pred[row_pos, pred_cols] = 1

    assert len(
        validset.groupby('ROW_ID').first().reset_index()) == len(validset)

    logger.log_info('Correctly identified notes: {}/{} ({}%)'.format(
        validset['CORRECT_NOTE'].sum(), len(validset),
        validset['CORRECT_NOTE'].sum() * 100 / len(validset)))

    def hamming_score(y_true, y_pred):
        '''
        Compute the Hamming score (a.k.a. label-based accuracy) for the
        multi-label case: the mean over all samples of
        |true & predicted| / |true | predicted|, counting a sample
        without any label on both sides as 1.
        https://stackoverflow.com/q/32239577/395857
        '''
        acc_list = []
        for i in range(y_true.shape[0]):
            set_true = set(np.where(y_true[i])[0])
            set_pred = set(np.where(y_pred[i])[0])
            if len(set_true) == 0 and len(set_pred) == 0:
                tmp_a = 1
            else:
                tmp_a = len(set_true.intersection(set_pred)) / float(
                    len(set_true.union(set_pred)))
            acc_list.append(tmp_a)
        return np.mean(acc_list)

    logger.log_info('Hamming score: {0}'.format(
        hamming_score(one_hot_valid, one_hot_pred)))
    # Subset accuracy: a sample counts only if all its labels match.
    logger.log_info('Subset accuracy: {0}'.format(
        sklearn.metrics.accuracy_score(one_hot_valid,
                                       one_hot_pred,
                                       normalize=True,
                                       sample_weight=None)))
    logger.log_info('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(one_hot_valid, one_hot_pred)))
    logger.log_info('Total instances: {0}'.format(len(one_hot_valid)))
    logger.log_info('')

    def print_report(y_true, y_pred):
        """Log precision, recall and accuracy for one label column."""

        def print_row(*args):
            logger.log_info(' '.join([str(_) for _ in args]))

        print_row('Precision',
                  sklearn.metrics.precision_score(y_true, y_pred))
        print_row('Recall', sklearn.metrics.recall_score(y_true, y_pred))
        print_row('Accuracy', sklearn.metrics.accuracy_score(y_true, y_pred),
                  np.sum(y_true == y_pred), '/', len(y_pred),
                  '({}, {})'.format(y_true.sum(), y_pred.sum()))

    # Invert the label map once; the first name found for an id wins.
    name_by_id = {}
    for name, label_id in self._loaded_validation_label_map.items():
        name_by_id.setdefault(label_id, name)

    for _cat in range(1, max_index + 1):
        mapname = name_by_id.get(_cat, str(_cat))
        logger.log_info('{} =================='.format(mapname))
        print_report(one_hot_valid[:, _cat - 1], one_hot_pred[:, _cat - 1])
        logger.log_info('')
def __init__(self):
    """Load validation labels and several note files, then print a CSV table.

    Reads ``validation_set_file`` and ``input_note_files`` from this
    module's pipeline configuration, runs the two loading stages and calls
    ``self._do_statistics`` once per entry of ``self._loaded_df``. The
    collected lines are written to stdout with ``print``.
    """
    super().__init__()
    self._validation_set = config.get_pipeline_config_item(
        self.module_name(), 'validation_set_file', None)
    self._df_notes_labeled_paths = config.get_pipeline_config_item(
        self.module_name(), 'input_note_files', [])
    # A list here: one entry per configured note file is iterated below.
    self._loaded_df = []
    self._compare_df = None
    self._orig_df = None
    self._loaded_validation = None
    self._loaded_validation_labels = None
    self._loaded_validation_label_map = None
    logger.log_info('Loading validation note labeling file')
    self._loading_validation_labeling_file()
    logger.log_info('DONE: Loading validation note labeling file')
    logger.log_info('Loading NLP pipeline processed note files')
    self._loading_note_files()
    logger.log_info('DONE: NLP pipeline processed note files')
    logger.log_info('Computing and outputting statistics')
    line_list = []
    for _ in self._loaded_df:
        # A lone ';' line separates the tables in the output.
        line_list.append(';')
        # table is indexed as table[column][row]; its cells must be
        # strings because they are joined with ';' below.
        table = self._do_statistics(_)
        # The number of rows is taken from the first column.
        for _r in range(len(table[0])):
            elems = [_c[_r] for _c in table]
            line_list.append(';'.join(elems))
        # NOTE(review): these two separators are assumed to follow each
        # table (outer loop level) - confirm against the original layout.
        line_list.append(';')
        line_list.append(';')
    logger.log_info('CSV Table Output:')
    # The table goes to stdout rather than through the logger.
    for _ in line_list:
        print(_)
def __init__(self):
    """Configure the word filter module and run its whole pipeline.

    Reads the module settings from the pipeline configuration, then loads
    the notes, parses the word filter lexicons, applies the filtering and
    dumps the changed notes.
    """
    super().__init__()

    def setting(key, default):
        # One config lookup per setting, scoped to this module's name.
        return config.get_pipeline_config_item(self.module_name(), key,
                                               default)

    self._output_note_file = setting('output_note_file',
                                     '_negex_filtered_notes.parquet')
    self._df_notes_labeled_path = setting('input_note_file', None)
    self._word_distance = setting('word_distance', 10)
    self._debug_check = setting('debug_check', True)
    self._lexicon_dir = setting('word_filter_dir', './word_cohort_filters')

    # Filled by the loading and parsing stages below.
    self._loaded_df = None
    self._lexicon_map = {}

    # Worker count: global 'njobs' setting, else one per CPU.
    self._njobs = config.get('njobs', multiprocessing.cpu_count())

    logger.log_info('Loading note file')
    self._load_note_input_file()
    logger.log_info('DONE: Loading note file.')

    logger.log_info('Parsing word filter lexicons.')
    self._load_word_matching_lexicons()
    logger.log_info('DONE: Parsing word filter lexicons.')

    logger.log_info('Word filtering and cohort adaptions.')
    self._change_cohort_mappings()
    logger.log_info('DONE: Word filtering and cohort adaptions.')

    logger.log_info('Dumping changed notes.')
    self._dump_filtered_df()
    logger.log_info('DONE: Dumping changed notes.')
def __init__(self):
    """Configure the negex filter module and run its whole pipeline.

    Downloads the NLTK ``punkt`` data, reads the module settings from the
    pipeline configuration, then loads the notes, parses the lexicons,
    checks the notes for negations, labels the cohort and dumps the
    filtered notes.
    """
    super().__init__()
    nltk.download('punkt')

    def setting(key, default):
        # One config lookup per setting, scoped to this module's name.
        return config.get_pipeline_config_item(self.module_name(), key,
                                               default)

    self._output_note_file = setting('output_note_file',
                                     '_negex_filtered_notes.parquet')
    self._df_notes_labeled_path = setting('input_note_file', None)
    self._keep_categories = setting('keep_categories', [])
    self._dont_include_predicted_categories = setting(
        'dont_include_predicted_categories', False)
    self._debug_row_id = setting('debug_row_id', None)
    self._lexicon_dir = setting('lexicon_dir', './used_lexicons')
    self._debug_check = setting('debug_check', False)

    # Frames produced by the individual stages.
    self._loaded_df = None
    self._pre_filtered_df = None
    self._filtered_cohort_df = None
    self._labeled_df = None

    # Term lists per lexicon polarity, filled by _parse_lexicons().
    self._lexicon_map = {'positive': {}, 'negative': {}}

    # Worker count: global 'njobs' setting, else one per CPU.
    self._njobs = config.get('njobs', multiprocessing.cpu_count())

    logger.log_info('Loading note file')
    self._load_note_input_file()
    logger.log_info('DONE: Loading note file.')

    logger.log_info('Negex note filtering.')
    self._parse_lexicons()
    self._check_note_negations()
    logger.log_info('DONE: Negex note filtering.')

    logger.log_info('New cohort labeling.')
    self._label_improve_cohort()
    logger.log_info('DONE: New cohort labeling.')

    logger.log_info('Dumping filtered notes.')
    self._dump_filtered_df()
    logger.log_info('DONE: Dumping filtered notes.')
def _check_note_negations(self):
    """Run ``self._process_note`` on all flagged notes and attach the results.

    A note is selected when any column other than the note and section
    meta columns is truthy. Each selected note is processed in parallel.
    ``_process_note`` is expected to return a dict
    ``{row_id: {category: {'occurences': [...], 'sentences': [...]}}}``.
    For every positive lexicon category five columns are appended to the
    selected notes: ``<CAT>_TOTAL_SCORE_SUM`` and
    ``<CAT>_SCORE_SUM_POSITIVE`` (both the number of occurrences),
    ``<CAT>_POSITIVE_LEXICON_SENTENCES``,
    ``<CAT>_POSITIVE_LEXICON_AFFIRMED_PHRASES`` and the always empty
    ``<CAT>_POSITIVE_LEXICON_NEGATED_PHRASES``. The result is stored in
    ``self._pre_filtered_df``; ``self._loaded_df`` is deleted to free
    memory.

    Fixes over the previous version:

    * Input without any flag column no longer indexes the frame with
      ``None``; it selects no notes.
    * Missing values in the two score columns are filled with 0 instead
      of ``''``, so these columns stay numeric for later comparisons.
    * The unused ``negated`` lookup was removed.
    """
    meta_columns = {
        'subject_id', 'row_id', 'hadm_id', 'chartdate', 'category', 'text',
        'section_id', 'section_group', 'section_group_new', 'section_name'
    }
    # Start from "not flagged" so the mask exists without flag columns.
    mask = pd.Series(False, index=self._loaded_df.index)
    for column in self._loaded_df.columns:
        if column.lower() in meta_columns:
            continue
        mask = mask | self._loaded_df[column].astype(bool)

    logger.log_info('Starting negation checking loop')
    logger.log_debug(
        str(self._njobs) + ' processes used for check routine.')

    note_pos_df = self._loaded_df[mask].copy()
    logger.log_info(
        'Total patients (before negex filtering): {} / Total admissions: {}'
        .format(note_pos_df['SUBJECT_ID'].nunique(),
                note_pos_df['HADM_ID'].nunique()))
    # Only the flagged copy is needed from here on.
    del self._loaded_df

    if self._debug_check:
        note_pos_df = note_pos_df.iloc[0:10]

    note_infos = Parallel(n_jobs=self._njobs)(
        delayed(self._process_note)(note)
        for _, note in progressbar(note_pos_df.iterrows(),
                                   total=len(note_pos_df)))
    logger.log_debug('Found {} note infos.'.format(len(note_infos)))
    if note_infos:
        logger.log_debug(str(note_infos[0]))

    categories = list(self._lexicon_map['positive'].keys())

    # One row per processed note: ROW_ID plus five values per category,
    # in the same order as the column names built below.
    rows = []
    for _entry in note_infos:
        for _id, _cat_dict in _entry.items():
            row = [_id]
            for _cat in categories:
                if _cat not in _cat_dict:
                    row.extend([0, 0, '', '', ''])
                    continue
                hits = _cat_dict[_cat]
                positive_terms = len(hits['occurences'])
                row.extend([
                    positive_terms,
                    positive_terms,
                    '\n'.join(hits['sentences']),
                    '\n'.join([str(_) for _ in hits['occurences']]),
                    '',
                ])
            rows.append(row)

    score_suffixes = ['TOTAL_SCORE_SUM', 'SCORE_SUM_POSITIVE']
    text_suffixes = [
        'POSITIVE_LEXICON_SENTENCES', 'POSITIVE_LEXICON_AFFIRMED_PHRASES',
        'POSITIVE_LEXICON_NEGATED_PHRASES'
    ]
    cols = ['ROW_ID']
    score_cols = []
    text_cols = []
    for _key in categories:
        for _suff in score_suffixes:
            score_cols.append(_key.upper() + '_' + _suff)
            cols.append(score_cols[-1])
        for _suff in text_suffixes:
            text_cols.append(_key.upper() + '_' + _suff)
            cols.append(text_cols[-1])

    info_df = pd.DataFrame(rows, columns=cols)
    note_pos_df = note_pos_df.merge(info_df, how='left', on='ROW_ID')

    # Notes without a result row get neutral values of the right type.
    if score_cols:
        note_pos_df[score_cols] = note_pos_df[score_cols].fillna(0)
    if text_cols:
        note_pos_df[text_cols] = note_pos_df[text_cols].fillna('')

    self._pre_filtered_df = note_pos_df
def _loading_note_files(self):
    """Load prediction files A and B and map all class labels to integers.

    Loads both prediction frames and both sentence info frames, builds
    ``self._loaded_validation_label_map`` (label string -> int) together
    with its inverse, converts the label lists of both prediction frames
    and of ``self._loaded_validation`` to integer ids, and normalizes
    ``self._get_examples_for_categories``.

    Raises:
        RuntimeError: if one of the two prediction file paths is not set.
        AssertionError: if a file or a required column is missing.
    """
    if not self._df_path_a or not self._df_path_b:
        raise RuntimeError('Please specify a valid note input file.')

    def load_prediction_file(path):
        """Read one prediction parquet file and split its category lists."""
        filename = utils.default_dataframe_name(path)
        assert os.path.isfile(
            filename), 'Could not find note parquet file: {}'.format(
                filename)
        df = pd.read_parquet(filename)
        df.columns = [_.upper() for _ in df.columns]
        assert 'ROW_ID' in df.columns, 'Notes file need to have columns: Row_id, predicted_categories'
        assert 'PREDICTED_CATEGORIES' in df.columns, "Processed note file needs to have the PREDICTED_CATEGORIES column generated by e.g. the negation module."
        # Normalize the '|'-separated category string to a list of
        # upper-case labels with underscores instead of spaces.
        df['PREDICTED_CATEGORIES'] = df.PREDICTED_CATEGORIES.str.upper()
        df['PREDICTED_CATEGORIES'] = df.PREDICTED_CATEGORIES.str.replace(
            ' ', '_')
        df['PREDICTED_CATEGORIES'] = df.PREDICTED_CATEGORIES.str.split('|')
        # When present, FOUND_EVIDENCE restricts the frame to truthy rows.
        if 'FOUND_EVIDENCE' in df.columns:
            df['FOUND_EVIDENCE'] = df['FOUND_EVIDENCE'].astype(bool)
            df = df[df['FOUND_EVIDENCE']]
        return df

    def load_sentence_info_file(path, allowed_class_labels):
        """Read one sentence info parquet file and check its columns."""
        filename = utils.default_dataframe_name(path)
        assert os.path.isfile(
            filename), 'Could not find note parquet file: {}'.format(
                filename)
        df = pd.read_parquet(filename)
        df.columns = [_.upper() for _ in df.columns]
        assert 'ROW_ID' in df.columns, 'Notes file need to have columns: Row_id, predicted_categories'
        # Every class label needs its own column plus one column per
        # entry of self._required_tag_list, named '<LABEL>_<TAG>'.
        for _ in allowed_class_labels:
            assert _ in df.columns, "Processed note file has no {} column - class label not found!".format(
                _)
            for __ in self._required_tag_list:
                assert _ + '_' + __ in df.columns, "Processed note file has no {} column - the file needs to be generated by the negex_negation_filter module!".format(
                    _ + '_' + __)
        return df

    self._df_a = load_prediction_file(self._df_path_a)
    self._df_b = load_prediction_file(self._df_path_b)

    # Identify and map all class labels to integer numbers
    unique_labels = []
    # Each element is one collection of labels: the per-note lists of
    # both frames, followed by the validation labels as a whole.
    for _ in [
            *self._df_a.PREDICTED_CATEGORIES,
            *self._df_b.PREDICTED_CATEGORIES, self._loaded_validation_labels
    ]:
        unique_labels.extend(_)
    unique_labels = set(unique_labels)
    unique_labels = set([_.upper() for _ in unique_labels])
    # The set difference requires self._loaded_validation_labels to be a
    # set as well.
    unique_labels_unmatched = unique_labels - self._loaded_validation_labels
    logger.log_info(
        'Found the following labels which are present in the predicted notes but not in the validation set: '
        + str(unique_labels_unmatched))

    # Ids start at 1; 0 is reserved for 'NONE'.
    lbl_id = 1
    self._loaded_validation_label_map = {'NONE': 0}
    self._inv_loaded_validation_label_map = {0: 'NONE'}
    for _lbl in unique_labels:
        self._loaded_validation_label_map[_lbl] = lbl_id
        self._inv_loaded_validation_label_map[lbl_id] = _lbl
        lbl_id += 1
    # Labels unknown to the validation set are folded into id 0.
    # NOTE(review): this overwrites the inverse entry 0 -> 'NONE' with
    # the unmatched label and leaves the label's former id in the inverse
    # map, so the id range has unused numbers - confirm this is intended.
    for _lbl in unique_labels_unmatched:
        self._loaded_validation_label_map[_lbl] = 0
        self._inv_loaded_validation_label_map[0] = _lbl
    logger.log_info('Label string to int map: {}'.format(
        str(self._loaded_validation_label_map)))

    class_labels = [
        _ for _ in self._loaded_validation_label_map.keys() if _ != 'NONE'
    ]
    self._df_sents_a = load_sentence_info_file(self._df_path_a_negated,
                                               class_labels)
    self._df_sents_b = load_sentence_info_file(self._df_path_b_negated,
                                               class_labels)

    # Replace the label strings by their integer ids; a label missing
    # from the map raises KeyError here.
    self._df_a[
        'PREDICTED_CATEGORIES'] = self._df_a.PREDICTED_CATEGORIES.apply(
            lambda x: [self._loaded_validation_label_map[_] for _ in x])
    self._df_b[
        'PREDICTED_CATEGORIES'] = self._df_b.PREDICTED_CATEGORIES.apply(
            lambda x: [self._loaded_validation_label_map[_] for _ in x])
    self._loaded_validation[
        'NOTE_TYPES'] = self._loaded_validation.NOTE_TYPES.apply(
            lambda x: [self._loaded_validation_label_map[_] for _ in x])

    # Default to all class labels when no categories were configured.
    if not self._get_examples_for_categories:
        self._get_examples_for_categories = [*class_labels, 'NONE']
    else:
        self._get_examples_for_categories = [
            _.upper() for _ in self._get_examples_for_categories
        ]
    # NOTE(review): the 'NONE' filter is assumed to apply to both
    # branches above (which makes the 'NONE' added in the default branch
    # redundant) - confirm against the original layout.
    self._get_examples_for_categories = [
        _ for _ in self._get_examples_for_categories if _ != 'NONE'
    ]
    logger.log_info(
        'Dumping the following class labels of interest: {}'.format(
            str(self._get_examples_for_categories)))
def __init__(self):
    """Configure the catchall module and run its whole pipeline.

    Reads the module settings from the pipeline configuration, then loads
    the notes, replaces the catchall labels and dumps the processed notes.
    """
    super().__init__()

    def setting(key, default):
        # One config lookup per setting, scoped to this module's name.
        return config.get_pipeline_config_item(self.module_name(), key,
                                               default)

    self._output_note_file = setting('output_note_file',
                                     '_negex_filtered_notes.parquet')
    self._df_notes_labeled_path = setting('input_note_file', None)
    self._catchall = setting('catchall', False)

    # Frames produced by the individual stages.
    self._loaded_df = self._filtered_cohort_df = self._labeled_df = None

    logger.log_info('Loading note file')
    self._load_note_input_file()
    logger.log_info('DONE: Loading note file.')

    logger.log_info('Replacing catchall labels.')
    self._replace_catchall()
    logger.log_info('DONE: Replacing catchall labels.')

    logger.log_info('Dumping processed notes.')
    self._dump_processed_df()
    logger.log_info('DONE: Dumping processed notes.')
def __init__(self):
    """Configure the comparison module and dump example sentences.

    Reads the paths of the validation set, the two prediction files, the
    two sentence info files and the optional section info file from the
    pipeline configuration. It then loads the validation file, the section
    file (only if configured) and the prediction files, and finally dumps
    the sentences for comparison.
    """
    super().__init__()

    def setting(key, default):
        # One config lookup per setting, scoped to this module's name.
        return config.get_pipeline_config_item(self.module_name(), key,
                                               default)

    self._validation_set = setting('validation_set_file', None)
    self._df_path_a = setting('file_prediction_a', None)
    self._df_path_b = setting('file_prediction_b', None)
    self._ignore_b = setting('ignore_b', False)
    self._section_info__file = setting('section_info__file', None)
    self._df_path_a_negated = setting('file_sentence_info_a', None)
    self._df_path_b_negated = setting('file_sentence_info_b', None)
    self._get_examples_for_categories = setting(
        'get_examples_for_categories', None)

    # Template with two placeholders for the output directory name.
    self._compare_dumping_dir_name = 'dumped_validaton_sentences_between__{}_and_{}'

    # Column suffixes every class label needs in a sentence info file.
    self._required_tag_list = [
        'POSITIVE_LEXICON_SENTENCES',
        'POSITIVE_LEXICON_AFFIRMED_PHRASES',
        'POSITIVE_LEXICON_NEGATED_PHRASES',
        'NEGATIVE_LEXICON_SENTENCES',
        'NEGATIVE_LEXICON_AFFIRMED_PHRASES',
        'NEGATIVE_LEXICON_NEGATED_PHRASES',
    ]

    # Frames and label data produced by the loading stages.
    self._df_a = self._df_b = None
    self._df_sents_a = self._df_sents_b = None
    self._loaded_validation = None
    self._loaded_validation_labels = None
    self._loaded_validation_label_map = None

    logger.log_info('Processing validation file')
    self._loading_validation_labeling_file()
    logger.log_info('DONE: Processing validation file')

    # Section information is optional.
    if self._section_info__file:
        logger.log_info(
            'Loading section file to include section information')
        self._loading_section_info_file()
        logger.log_info(
            'DONE: Loading section file to include section information')

    logger.log_info(
        'Loading prediction and sentence info file for dataframes A and B to be compared.'
    )
    self._loading_note_files()
    logger.log_info(
        'DONE: Loading prediction and sentence info file for dataframes A and B to be compared.'
    )

    logger.log_info('Dumping sentences into folder')
    self._dump_examples_for_comparison()
    logger.log_info('DONE: Dumping sentences into folder')
def _label_improve_cohort(self):
    """Derive category predictions from the per-category score columns.

    Works on ``self._pre_filtered_df`` and stores the result in
    ``self._filtered_cohort_df``. Four instance flags steer the behavior:

    * ``_use_only_positive_lexicons``: the total score of a category is
      replaced by its ``_SCORE_SUM_POSITIVE`` column.
    * ``_use_old_negation_scheme``: the total score is recomputed as
      -1 / 0 / 1 from the occurrence columns.
    * ``_dont_include_predicted_categories``: no ``PREDICTED_CATEGORIES``
      column is written and no note counts as positive.
    * ``_dump_all_notes``: keep all notes instead of only the positive
      ones.

    NOTE(review): the nesting of the flag branches was reconstructed from
    the statement order; places where another nesting would also be valid
    are marked below - confirm.
    """
    self._pre_filtered_df['FOUND_EVIDENCE_NEGATED'] = 0
    if not self._dont_include_predicted_categories:
        self._pre_filtered_df['PREDICTED_CATEGORIES'] = ''
    self._pre_filtered_df['MAX_SCORE_CAT'] = ''
    # -inf guarantees that any real score becomes the first maximum.
    self._pre_filtered_df['MAX_SCORE'] = -np.inf
    for _k in self._lexicon_map['positive'].keys():
        _k = _k.upper()
        if self._use_only_positive_lexicons:
            self._pre_filtered_df[
                _k + '_TOTAL_SCORE_SUM'] = self._pre_filtered_df[
                    _k + '_SCORE_SUM_POSITIVE']
        # NOTE(review): assumed to be independent of the flag above, not
        # nested inside it - confirm.
        if self._use_old_negation_scheme:
            # Rows where both occurrence columns equal 1 score -1, all
            # other rows score 1, and rows without any positive
            # occurrence are reset to 0 afterwards.
            mask = (self._pre_filtered_df[
                _k + '_POSITIVE_LEXICONS_NEGATED_OCCURENCES']
                    == 1) & (self._pre_filtered_df[
                        _k + '_POSITIVE_LEXICON_OCCURENCES'] == 1)
            self._pre_filtered_df[
                _k + '_TOTAL_SCORE_SUM'] = self._pre_filtered_df[
                    _k + '_POSITIVE_LEXICON_OCCURENCES']
            self._pre_filtered_df.loc[mask, _k + '_TOTAL_SCORE_SUM'] = -1
            self._pre_filtered_df.loc[~mask, _k + '_TOTAL_SCORE_SUM'] = 1
            self._pre_filtered_df.loc[
                self._pre_filtered_df[_k +
                                      '_POSITIVE_LEXICON_OCCURENCES'] == 0,
                _k + '_TOTAL_SCORE_SUM'] = 0
        # The category must be updated before MAX_SCORE, because both
        # compare against the previous maximum. The strict '>' keeps the
        # first category on ties.
        # NOTE(review): row-wise apply is slow on large frames and may
        # return a DataFrame for an empty frame, which cannot be assigned
        # to a single column - verify if empty input can occur.
        self._pre_filtered_df[
            'MAX_SCORE_CAT'] = self._pre_filtered_df.apply(
                lambda x: _k if x[_k + '_TOTAL_SCORE_SUM'] > x[
                    'MAX_SCORE'] else x['MAX_SCORE_CAT'],
                axis=1)
        self._pre_filtered_df['MAX_SCORE'] = self._pre_filtered_df.apply(
            lambda x: x[_k + '_TOTAL_SCORE_SUM']
            if x[_k + '_TOTAL_SCORE_SUM'] > x['MAX_SCORE'] else x[
                'MAX_SCORE'],
            axis=1)
        if not self._dont_include_predicted_categories:
            # Temporary marker column: the category name where the score
            # is positive, '' otherwise.
            self._pre_filtered_df[
                '_PREDICTED_CATEGORIES'] = self._pre_filtered_df[
                    _k + '_TOTAL_SCORE_SUM'].apply(lambda x: _k
                                                   if x > 0 else '')
            # Append '|<CATEGORY>' for every marked row.
            self._pre_filtered_df[
                'PREDICTED_CATEGORIES'] = self._pre_filtered_df.apply(
                    lambda x: x['PREDICTED_CATEGORIES'] + '|' + _k
                    if len(x['_PREDICTED_CATEGORIES']) > 0 else x[
                        'PREDICTED_CATEGORIES'],
                    axis=1)
            del self._pre_filtered_df['_PREDICTED_CATEGORIES']
    if not self._dont_include_predicted_categories:
        # A note is positive when its best category score is above zero.
        self._pre_filtered_df.loc[self._pre_filtered_df['MAX_SCORE'] > 0,
                                  'FOUND_EVIDENCE_NEGATED'] = 1
        if not self._dump_all_notes:
            self._filtered_cohort_df = self._pre_filtered_df[
                self._pre_filtered_df['FOUND_EVIDENCE_NEGATED'] ==
                1].copy()
        else:
            # No copy: both attributes refer to the same frame.
            self._filtered_cohort_df = self._pre_filtered_df
    else:
        self._filtered_cohort_df = self._pre_filtered_df
    if not self._dont_include_predicted_categories:
        # Remove the leading '|' separator of non-empty category lists.
        self._filtered_cohort_df[
            'PREDICTED_CATEGORIES'] = self._filtered_cohort_df[
                'PREDICTED_CATEGORIES'].apply(lambda x: x[1:]
                                              if len(x) > 1 else x)
    # NOTE(review): assumed to run regardless of
    # _dont_include_predicted_categories; in that mode
    # FOUND_EVIDENCE_NEGATED is 0 for every row - confirm.
    if 'FOUND_EVIDENCE' in self._filtered_cohort_df.columns:
        # Existing evidence only survives if this module confirms it.
        self._filtered_cohort_df['FOUND_EVIDENCE'] = (
            self._filtered_cohort_df['FOUND_EVIDENCE'] >
            0) & (self._filtered_cohort_df['FOUND_EVIDENCE_NEGATED'] > 0)
    else:
        # Here the 0/1 values are copied as they are, not as booleans.
        self._filtered_cohort_df[
            'FOUND_EVIDENCE'] = self._filtered_cohort_df[
                'FOUND_EVIDENCE_NEGATED']
    del self._filtered_cohort_df['FOUND_EVIDENCE_NEGATED']
    logger.log_info(
        'Total patients (after negex filtering): {} / Total admissions: {}'
        .format(self._filtered_cohort_df['SUBJECT_ID'].nunique(),
                self._filtered_cohort_df['HADM_ID'].nunique()))