def _parse_lexicons(self):
    assert os.path.isdir(
        self._lexicon_dir), 'Invalid lexicon dir. Does not exist.'
    assert len(os.listdir(self._lexicon_dir)) > 0, 'Lexicon dir is empty.'
    pos_dir = os.path.join(self._lexicon_dir, 'positive')
    neg_dir = os.path.join(self._lexicon_dir, 'negative')
    assert os.path.isdir(pos_dir), (
        'There needs to be a positive lexicon. If you just want to use one '
        'variant of the lexicons, create a positive lexicon folder with '
        'empty text files.')
    assert os.path.isdir(neg_dir), (
        'There needs to be a negative lexicon. If you just want to use one '
        'variant of the lexicons, create a negative lexicon folder with '
        'empty text files.')
    pos_files = os.listdir(pos_dir)
    neg_files = os.listdir(neg_dir)
    unknown_files = [_ for _ in neg_files if _ not in pos_files]
    assert len(unknown_files) == 0, (
        'The lexicon filenames in the positive and negative dirs need to '
        'match! Found: ' + str(unknown_files))

    def parse_dir(dirlist, prefix):
        for _lexi in dirlist:
            file = open(os.path.join(self._lexicon_dir, prefix, _lexi), 'r')
            filename = _lexi.strip()
            filename = re.sub(r'\..*', '', filename)
            filename = helper_classes.Module.camelcase_to_snakecase(filename)
            filename = filename.replace(' ', '_')
            lines = file.readlines()
            lines = [_.strip() for _ in lines if len(_) > 0]
            self._lexicon_map[prefix][filename] = []
            self._lexicon_weights[prefix][filename] = {}
            if lines:
                for _ in lines:
                    term = _.split(';;')
                    assert len(term) > 0, (
                        'Invalid line found in {} lexicon: {}'.format(
                            prefix, _))
                    if len(term[0]) < 1:
                        continue
                    if len(term) < 2:
                        self._lexicon_map[prefix][filename].append(term[0])
                        self._lexicon_weights[prefix][filename][
                            term[0].lower()] = 2
                    else:
                        self._lexicon_map[prefix][filename].append(term[0])
                        self._lexicon_weights[prefix][filename][
                            term[0].lower()] = int(term[1])
            file.close()

    logger.log_debug('Parsing the lexicons now..')
    parse_dir(pos_files, 'positive')
    parse_dir(neg_files, 'negative')
    for cat in ['positive', 'negative']:
        for _k, _v in self._lexicon_map[cat].items():
            logger.log_info('{} {} lexicon: {} entries'.format(
                cat, _k, len(_v)))
    logger.log_info('Parsed and stored all lexicons.')
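# Illustrative sketch only (nothing below is part of the pipeline): the on-disk
# layout that _parse_lexicons() expects. The lexicon dir holds a 'positive' and a
# 'negative' subfolder with matching filenames (every file in 'negative' must also
# exist in 'positive'). Each non-empty line of a lexicon file is either "term"
# (stored with the default weight 2) or "term;;<integer weight>". The lexicon terms
# below are hypothetical.
_example_lexicon_file_contents = (
    'angina;;3\n'      # term with an explicit integer weight
    'chest pain\n'     # no weight given -> parsed with the default weight 2
)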
def _query_bigquery(self):
    sql_search = ""
    merged_lexicon_map = {
        _k: self._lexicon_map['positive'][_k] + self._lexicon_map['negative'][_k]
        for _k in self._lexicon_map['positive'].keys()
    }
    for _name, _terms in merged_lexicon_map.items():
        if not _terms:
            sql_search = sql_search + "," + " FALSE AS " + _name
        else:
            lex = [r'\\b' + x + r'\\b' for x in _terms]
            sql_search = (sql_search + "," + " REGEXP_CONTAINS(text, '(?i)(" +
                          '|'.join(lex) + ")') AS " + _name)
    ignore_str = '\n'.join([
        'AND category NOT LIKE "%{}%"'.format(_) for _ in self._ignore_cat_list
    ])
    use_bqstorage_api = config.get_pipeline_config_item(
        self.module_name(), "use_bqstorage_api", False)
    limitstr = ""
    if config.get_pipeline_config_item(self.module_name(), "debug_download",
                                       False):
        limitstr = 'LIMIT 10'
    cohort_ids = []
    if self._cohort_file and os.path.isfile(self._cohort_file):
        cohort_ids = pd.read_csv(self._cohort_file)
        cohort_ids.columns = [_.lower() for _ in cohort_ids.columns]
        cohort_ids = list(cohort_ids.loc[:, 'hadm_id'])
    sql = """
        SELECT row_id, subject_id, hadm_id, chartdate, category, text{}
        FROM `physionet-data.mimiciii_notes.noteevents`
        WHERE hadm_id IS NOT NULL
        AND hadm_id IN ({})
        {}
        {}
        """.format(sql_search, ','.join([str(_) for _ in cohort_ids]),
                   ignore_str, limitstr)
    logger.log_info('Querying noteevents for lexicon occurrences.')
    self._labeled_df = pandas_gbq.read_gbq(
        sql,
        project_id=google_tools.PROJECT_ID,
        dialect='standard',
        use_bqstorage_api=use_bqstorage_api
    )  # , progress_bar_type=utils.PROGRESSBAR_TYPE)
    self._labeled_df.columns = [_.upper() for _ in self._labeled_df.columns]
    if not self._dump_all:
        mask = None
        for _ in self._labeled_df.columns:
            if _.lower() in [
                    'subject_id', 'row_id', 'hadm_id', 'chartdate', 'category',
                    'text'
            ]:
                continue
            if mask is None:
                mask = self._labeled_df[_].astype(bool)
            else:
                mask = mask | self._labeled_df[_].astype(bool)
        self._labeled_df = self._labeled_df[mask].copy()
    logger.log_info('DONE: Querying noteevents for lexicon occurrences.')
    logger.log_debug('Number of admissions {}, number of notes {}.'.format(
        self._labeled_df['HADM_ID'].nunique(), len(self._labeled_df)))
    for _key in self._lexicon_map['positive'].keys():
        _key = _key.upper()
        logger.log_debug('Number of notes with {}: {}.'.format(
            _key.lower(), self._labeled_df[_key].sum()))
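# Illustrative sketch only: rough shape of the SQL that _query_bigquery() assembles.
# The lexicon name, terms, cohort ids and ignored category below are hypothetical;
# the real cohort ids come from _cohort_file (the IN (...) clause assumes a
# non-empty cohort list), and each lexicon becomes one boolean column computed by a
# case-insensitive, word-boundary RE2 match.
_example_generated_sql = r"""
SELECT row_id, subject_id, hadm_id, chartdate, category, text,
       REGEXP_CONTAINS(text, '(?i)(\\bangina\\b|\\bchest pain\\b)') AS chest_pain
FROM `physionet-data.mimiciii_notes.noteevents`
WHERE hadm_id IS NOT NULL
AND hadm_id IN (100001, 100002)
AND category NOT LIKE "%Discharge summary%"
"""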
def _check_note_negations(self):
    mask = None
    for _ in self._loaded_df.columns:
        if _.lower() in [
                'subject_id', 'row_id', 'hadm_id', 'chartdate', 'category',
                'text', 'section_id', 'section_group', 'section_group_new',
                'section_name'
        ]:
            continue
        if mask is None:
            mask = self._loaded_df[_].astype(bool)
        else:
            mask = mask | self._loaded_df[_].astype(bool)
    logger.log_info('Starting negation checking loop')
    logger.log_debug(str(self._njobs) + ' processes used for check routine.')
    note_pos_df = self._loaded_df[mask].copy()
    logger.log_info(
        'Total patients (before negex filtering): {} / Total admissions: {}'.
        format(note_pos_df['SUBJECT_ID'].nunique(),
               note_pos_df['HADM_ID'].nunique()))
    del self._loaded_df
    if self._debug_check:
        # note_pos_df = note_pos_df[note_pos_df['ROW_ID'] == 33059]
        note_pos_df = note_pos_df.iloc[0:10]
    note_infos = Parallel(n_jobs=self._njobs)(
        delayed(self._process_note)(note)
        for _, note in progressbar(note_pos_df.iterrows(),
                                   total=len(note_pos_df)))
    logger.log_debug('Found {} note infos.'.format(len(note_infos)))
    if note_infos:
        example_note = note_infos[0]
        logger.log_debug(str(example_note))
    note_infos_df = []
    cols = ['ROW_ID']
    for _entry in note_infos:
        for _id, _cat_dict in _entry.items():
            lis = [_id]
            for _cat in self._lexicon_map['positive'].keys():
                if _cat not in _cat_dict:
                    lis.append(0)
                    lis.append(0)
                    lis.append('')
                    lis.append('')
                    lis.append('')
                    continue
                _negated = _cat_dict[_cat]['negated']
                _key = _cat.upper()
                positive_terms = len(_cat_dict[_cat]['occurences'])
                lis.append(positive_terms)
                lis.append(positive_terms)
                lis.append('\n'.join(_cat_dict[_cat]['sentences']))
                lis.append('\n'.join(
                    [str(_) for _ in _cat_dict[_cat]['occurences']]))
                lis.append('')
            note_infos_df.append(lis)
    cols_suffix = [
        'TOTAL_SCORE_SUM', 'SCORE_SUM_POSITIVE', 'POSITIVE_LEXICON_SENTENCES',
        'POSITIVE_LEXICON_AFFIRMED_PHRASES', 'POSITIVE_LEXICON_NEGATED_PHRASES'
    ]
    for _key in self._lexicon_map['positive'].keys():
        for _suff in cols_suffix:
            cols.append(_key.upper() + '_' + _suff.upper())
    info_df = pd.DataFrame(note_infos_df, columns=cols)
    note_pos_df = note_pos_df.merge(info_df, how='left', on='ROW_ID')
    note_pos_df.loc[:, cols] = note_pos_df[cols].fillna('')
    self._pre_filtered_df = note_pos_df
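# Illustrative sketch only: the per-note structure that _check_note_negations()
# assumes _process_note() (defined elsewhere in this module) returns. The row id,
# lexicon category and phrases below are hypothetical; 'negated' is read above but
# not yet used when filling the *_NEGATED_PHRASES columns, which stay empty.
_example_note_info = {
    33059: {                                    # ROW_ID of the note
        'chest_pain': {                         # one entry per matched lexicon
            'negated': ['denies chest pain'],   # phrases found in a negated context
            'occurences': ['chest pain', 'angina'],  # affirmed phrases (key spelling as in the code)
            'sentences': ['Pt reports chest pain radiating to the left arm.'],
        },
    },
}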