    def _parse_lexicons(self):
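        """Read the term lexicons from ``self._lexicon_dir``.

        Expects matching ``positive/`` and ``negative/`` subdirectories
        with one term per line per file. A line may carry an optional
        integer weight as ``term;;weight``; unweighted terms default to a
        weight of 2. Terms are stored in ``self._lexicon_map`` and weights
        in ``self._lexicon_weights``, keyed by polarity and the snake_cased
        file name.
        """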
        assert os.path.isdir(
            self._lexicon_dir), 'Invalid lexicon dir. Does not exist.'
        assert len(os.listdir(self._lexicon_dir)) > 0, 'Lexicon dir is empty.'
        pos_dir = os.path.join(self._lexicon_dir, 'positive')
        neg_dir = os.path.join(self._lexicon_dir, 'negative')
        assert os.path.isdir(
            pos_dir
        ), 'There needs to be a positive lexicon. To use only one polarity, create the other folder with empty text files.'
        assert os.path.isdir(
            neg_dir
        ), 'There needs to be a negative lexicon. To use only one polarity, create the other folder with empty text files.'

        pos_files = os.listdir(pos_dir)
        neg_files = os.listdir(neg_dir)

        unknown_files = sorted(set(pos_files) ^ set(neg_files))
        assert len(unknown_files) == 0, (
            'The lexicon filenames in the positive and negative dirs need '
            'to match! Found: ' + str(unknown_files))

        def parse_dir(dirlist, prefix):
            for _lexi in dirlist:
                # Normalize the file name into a snake_case lexicon key.
                filename = _lexi.strip()
                filename = re.sub(r'\..*', '', filename)
                filename = helper_classes.Module.camelcase_to_snakecase(
                    filename)
                filename = filename.replace(' ', '_')

                with open(os.path.join(self._lexicon_dir, prefix, _lexi),
                          'r') as file:
                    lines = [_.strip() for _ in file.readlines()]
                lines = [_ for _ in lines if _]

                self._lexicon_map[prefix][filename] = []
                self._lexicon_weights[prefix][filename] = {}
                for _ in lines:
                    term = _.split(';;')
                    if not term[0]:
                        # A weight without a term; skip the line.
                        continue
                    self._lexicon_map[prefix][filename].append(term[0])
                    if len(term) < 2:
                        # No explicit weight given; default to 2.
                        self._lexicon_weights[prefix][filename][
                            term[0].lower()] = 2
                    else:
                        self._lexicon_weights[prefix][filename][
                            term[0].lower()] = int(term[1])

        logger.log_debug('Parsing the lexicons now...')
        parse_dir(pos_files, 'positive')
        parse_dir(neg_files, 'negative')

        for cat in ['positive', 'negative']:
            for _k, _v in self._lexicon_map[cat].items():
                logger.log_info('{} {} lexicon: {} entries'.format(
                    cat, _k, len(_v)))

        logger.log_info('Parsed and stored all lexicons.')
    def _query_bigquery(self):
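        """Label MIMIC-III noteevents with per-lexicon match columns.

        Builds one boolean REGEXP_CONTAINS column per lexicon (matching any
        of its positive or negative terms, case-insensitively, on word
        boundaries), restricts the query to the HADM_IDs of the cohort
        file, and stores the result in ``self._labeled_df``. Unless
        ``self._dump_all`` is set, notes without any lexicon hit are
        dropped.
        """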

        sql_search = ""

        # Pool positive and negative terms per lexicon so the query flags
        # any occurrence of either polarity.
        merged_lexicon_map = {
            _k: self._lexicon_map['positive'][_k] +
            self._lexicon_map['negative'][_k]
            for _k in self._lexicon_map['positive'].keys()
        }

        for _name, _terms in merged_lexicon_map.items():
            if not _terms:
                sql_search += ", FALSE AS " + _name
            else:
                # '\\b' survives SQL string escaping as the regex word
                # boundary '\b'.
                lex = [r'\\b' + x + r'\\b' for x in _terms]
                sql_search += (", REGEXP_CONTAINS(text, '(?i)(" +
                               '|'.join(lex) + ")') AS " + _name)

        # Exclude any note categories listed in self._ignore_cat_list.
        ignore_str = '\n'.join([
            'AND category NOT LIKE "%{}%"'.format(_)
            for _ in self._ignore_cat_list
        ])

        use_bqstorage_api = config.get_pipeline_config_item(
            self.module_name(), "use_bqstorage_api", False)

        limitstr = ""
        if config.get_pipeline_config_item(self.module_name(),
                                           "debug_download", False):
            limitstr = 'LIMIT 10'

        # The query below requires a non-empty cohort: with no HADM_IDs the
        # generated 'IN ()' clause is invalid SQL.
        cohort_ids = []
        if self._cohort_file and os.path.isfile(self._cohort_file):
            cohort_ids = pd.read_csv(self._cohort_file)
            cohort_ids.columns = [_.lower() for _ in cohort_ids.columns]
            cohort_ids = list(cohort_ids.loc[:, 'hadm_id'])

        sql = """
        SELECT row_id, subject_id, hadm_id, chartdate, category, text{}
        FROM `physionet-data.mimiciii_notes.noteevents`
        WHERE hadm_id IS NOT NULL 
        AND hadm_id IN ({})
        {}
        {}
        """.format(sql_search, ','.join([str(_) for _ in cohort_ids]),
                   ignore_str, limitstr)

        logger.log_info('Querying noteevents for lexicon occurrences.')
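        # use_bqstorage_api speeds up large result downloads via the
        # BigQuery Storage API (needs google-cloud-bigquery-storage
        # installed).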
        self._labeled_df = pandas_gbq.read_gbq(
            sql,
            project_id=google_tools.PROJECT_ID,
            dialect='standard',
            use_bqstorage_api=use_bqstorage_api
        )  #, progress_bar_type=utils.PROGRESSBAR_TYPE)
        self._labeled_df.columns = [
            _.upper() for _ in self._labeled_df.columns
        ]

        if not self._dump_all:
            # Keep only notes with at least one lexicon hit.
            mask = None
            for _ in self._labeled_df.columns:
                if _.lower() in [
                        'subject_id', 'row_id', 'hadm_id', 'chartdate',
                        'category', 'text'
                ]:
                    continue
                if mask is None:
                    mask = self._labeled_df[_].astype(bool)
                else:
                    mask = mask | self._labeled_df[_].astype(bool)
            self._labeled_df = self._labeled_df[mask].copy()

        logger.log_info('DONE: Querying noteevents for lexicon occurrences.')
        logger.log_debug('Number of admissions {}, number of notes {}.'.format(
            self._labeled_df['HADM_ID'].nunique(), len(self._labeled_df)))
        for _key in self._lexicon_map['positive'].keys():
            _key = _key.upper()
            logger.log_debug('Number of notes with {}: {}.'.format(
                _key.lower(), self._labeled_df[_key].sum()))
    def _check_note_negations(self):
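        """Run negation checking over all notes with lexicon hits.

        Filters ``self._loaded_df`` to notes where at least one lexicon
        column is set, runs ``self._process_note`` over them in parallel,
        and merges the per-note results (match counts, sentences and
        affirmed phrases per lexicon) back onto the notes as
        ``self._pre_filtered_df``.
        """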
        # Build a boolean mask of notes with at least one lexicon hit,
        # skipping metadata columns.
        mask = None
        for _ in self._loaded_df.columns:
            if _.lower() in [
                    'subject_id', 'row_id', 'hadm_id', 'chartdate', 'category',
                    'text', 'section_id', 'section_group', 'section_group_new',
                    'section_name'
            ]:
                continue
            if mask is None:
                mask = self._loaded_df[_].astype(bool)
            else:
                mask = mask | self._loaded_df[_].astype(bool)

        logger.log_info('Starting negation checking loop')
        logger.log_debug(
            str(self._njobs) + ' processes used for check routine.')
        note_pos_df = self._loaded_df[mask].copy()

        logger.log_info(
            'Total patients (before negex filtering): {} / Total admissions: {}'
            .format(note_pos_df['SUBJECT_ID'].nunique(),
                    note_pos_df['HADM_ID'].nunique()))
        del self._loaded_df

        if self._debug_check:
            #note_pos_df = note_pos_df[note_pos_df['ROW_ID'] == 33059]
            note_pos_df = note_pos_df.iloc[0:10]

        # Check each note in parallel; each worker returns a dict keyed by
        # ROW_ID.
        note_infos = Parallel(n_jobs=self._njobs)(
            delayed(self._process_note)(note)
            for _, note in progressbar(note_pos_df.iterrows(),
                                       total=len(note_pos_df)))
        logger.log_debug('Found {} note infos.'.format(len(note_infos)))
        if note_infos:
            example_note = note_infos[0]
            logger.log_debug(str(example_note))

        note_infos_df = []
        cols = ['ROW_ID']

        for _entry in note_infos:
            for _id, _cat_dict in _entry.items():
                lis = [_id]

                for _cat in self._lexicon_map['positive'].keys():
                    if _cat not in _cat_dict:
                        # No hits for this lexicon: zero scores and empty
                        # text columns, one value per suffix in cols_suffix
                        # below.
                        lis.extend([0, 0, '', '', ''])
                        continue

                    # NB: 'occurences' (sic) matches the keys emitted by
                    # self._process_note.
                    positive_terms = len(_cat_dict[_cat]['occurences'])

                    # Both score columns currently carry the affirmed match
                    # count; _cat_dict[_cat]['negated'] is not serialized
                    # yet, so the negated-phrases column stays empty.
                    lis.append(positive_terms)
                    lis.append(positive_terms)

                    lis.append('\n'.join(_cat_dict[_cat]['sentences']))
                    lis.append('\n'.join(
                        [str(_) for _ in _cat_dict[_cat]['occurences']]))
                    lis.append('')

                note_infos_df.append(lis)

        cols_suffix = [
            'TOTAL_SCORE_SUM', 'SCORE_SUM_POSITIVE',
            'POSITIVE_LEXICON_SENTENCES', 'POSITIVE_LEXICON_AFFIRMED_PHRASES',
            'POSITIVE_LEXICON_NEGATED_PHRASES'
        ]
        for _key in self._lexicon_map['positive'].keys():
            for _suff in cols_suffix:
                cols.append(_key.upper() + '_' + _suff.upper())

        info_df = pd.DataFrame(note_infos_df, columns=cols)
        note_pos_df = note_pos_df.merge(info_df, how='left', on='ROW_ID')

        # Notes with no entry in info_df get empty strings across all info
        # columns, including the score columns.
        note_pos_df.loc[:, cols] = note_pos_df[cols].fillna('')
        self._pre_filtered_df = note_pos_df