Example #1
    def normalize(
        self,
        string: str,
        check_english: bool = True,
        normalize_text: bool = True,
        normalize_entity: bool = True,
        normalize_url: bool = False,
        normalize_email: bool = False,
        normalize_year: bool = True,
        normalize_telephone: bool = True,
        logging: bool = False,
    ):
        """
        Normalize a string.

        Parameters
        ----------
        string : str
        check_english: bool, (default=True)
            if True, check whether a word exists in the English dictionary.
        normalize_text: bool, (default=True)
            if True, replace shortforms using the internal corpus.
        normalize_entity: bool, (default=True)
            normalize entities; only affects `date`, `datetime`, `time` and `money` pattern strings.
        normalize_url: bool, (default=False)
            if True, replace `://` with a space and `.` with ` dot `.
            `https://huseinhouse.com` -> `https huseinhouse dot com`.
        normalize_email: bool, (default=False)
            if True, replace `@` with ` di ` and `.` with ` dot `.
            `[email protected]` -> `husein dot zol kosong lima di gmail dot com`.
        normalize_year: bool, (default=True)
            if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
            if True, `1970-an` -> `sembilan belas tujuh puluh an`.
            if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
        normalize_telephone: bool, (default=True)
            if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`.
        logging: bool, (default=False)
            if True, log the index and token queue using `logging.warn`.

        Returns
        -------
        string: {'normalize', 'date', 'money'}
        """

        string = ' '.join(self._tokenizer(string))
        string = groupby(string)

        if normalize_text:
            string = replace_laugh(string)
            string = replace_mengeluh(string)
            string = _replace_compound(string)

        if hasattr(self._speller, 'normalize_elongated'):
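            # collapse elongated words (repeated characters) unless the token is
            # capitalized, starts with 'ke-', or is a number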
            string = [
                self._speller.normalize_elongated(word) if
                len(re.findall(r'(.)\1{1}', word)) and not word[0].isupper()
                and not word.lower().startswith('ke-')
                and not _is_number_regex(word) else word
                for word in string.split()
            ]
            string = ' '.join(string)

        result, normalized = [], []

        tokenized = self._tokenizer(string)
        index = 0
        while index < len(tokenized):
            word = tokenized[index]
            word_lower = word.lower()
            word_upper = word.upper()
            first_c = word[0].isupper()

            if logging:
                s = f'index: {index}, word: {word}, queue: {result}'
                warn(s)

            if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
                result.append(word)
                index += 1
                continue

            normalized.append(rules_normalizer.get(word_lower, word_lower))

            if word_lower in ignore_words:
                result.append(word)
                index += 1
                continue

            if first_c and not len(re.findall(_money, word_lower)):
                if word_lower in rules_normalizer and normalize_text:
                    result.append(case_of(word)(rules_normalizer[word_lower]))
                    index += 1
                    continue
                elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                    result.append(
                        _normalize_title(word) if normalize_text else word)
                    index += 1
                    continue

            if check_english:
                if word_lower in ENGLISH_WORDS:
                    result.append(word)
                    index += 1
                    continue

            if word_lower in MALAY_WORDS and word_lower not in ['pada', 'ke']:
                result.append(word)
                index += 1
                continue

            if len(word) > 2:
                if word[-2] in consonants and word[-1] == 'e':
                    word = word[:-1] + 'a'

            if word[0] == 'x' and len(word) > 1:
                result_string = 'tak '
                word = word[1:]
            else:
                result_string = ''

            if word_lower == 'ke' and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                tokenized[index + 2]))
                    index += 3
                    continue
                elif tokenized[index + 1] == '-' and re.match(
                        '.*(V|X|I|L|D)', tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                str(rom_to_int(tokenized[index + 2]))))
                    index += 3
                    continue
                else:
                    result.append('ke')
                    index += 1
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        to_cardinal(_string_to_num(word)) + ' hingga ' +
                        to_cardinal(_string_to_num(tokenized[index + 2])))
                    index += 3
                    continue

            if word_lower == 'pada' and index < (len(tokenized) - 3):
                if (_is_number_regex(tokenized[index + 1])
                        and tokenized[index + 2] in '/-'
                        and _is_number_regex(tokenized[index + 3])):
                    result.append('pada %s hari bulan %s' % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    ))
                    index += 4
                    continue

            if (word_lower in ['tahun', 'thun'] and index <
                (len(tokenized) - 1) and normalize_year):
                if (_is_number_regex(tokenized[index + 1])
                        and len(tokenized[index + 1]) == 4):
                    t = tokenized[index + 1]
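                    # read years like 1987 as two pairs ('sembilan belas' + 'lapan puluh tujuh');
                    # years with a zero second digit (e.g. 2005) fall back to the full cardinal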
                    if t[1] != '0':
                        l = to_cardinal(int(t[:2]))
                        r = to_cardinal(int(t[2:]))
                        c = f'{l} {r}'
                    else:
                        c = to_cardinal(int(t))
                    if (index < (len(tokenized) - 3)
                            and tokenized[index + 2] == '-'
                            and tokenized[index + 3].lower() == 'an'):
                        end = 'an'
                        plus = 4
                    else:
                        end = ''
                        plus = 2
                    result.append(f'tahun {c}{end}')
                    index += plus
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '/' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        fraction(word + tokenized[index + 1] +
                                 tokenized[index + 2]))
                    index += 3
                    continue

                if (tokenized[index + 1] == '-'
                        and tokenized[index + 2].lower() == 'an'
                        and normalize_year and len(word) == 4):
                    t = word
                    if t[1] != '0':
                        l = to_cardinal(int(t[:2]))
                        r = to_cardinal(int(t[2:]))
                        c = f'{l} {r}'
                    else:
                        c = to_cardinal(int(t))
                    result.append(f'{c}an')
                    index += 3
                    continue

            if re.findall(_money, word_lower):
                money_, _ = money(word)
                result.append(money_)
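                # money() already verbalizes the cents, so a trailing 'sen'/'cent' token is skipped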
                if index < (len(tokenized) - 1):
                    if tokenized[index + 1].lower() in ('sen', 'cent'):
                        index += 2
                    else:
                        index += 1
                else:
                    index += 1
                continue

            if re.findall(_date, word_lower):
                word = word_lower
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%d/%m/%Y'))
                else:
                    result.append(word)
                index += 1
                continue

            if re.findall(_expressions['time'], word_lower):
                word = word_lower
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%H:%M:%S'))
                else:
                    result.append(word)
                index += 1
                continue

            if re.findall(_expressions['hashtag'], word_lower):
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['url'], word_lower):
                if normalize_url:
                    word = word.replace('://', ' ').replace('.', ' dot ')
                    word = put_spacing_num(word)
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['email'], word_lower):
                if normalize_email:
                    word = (word.replace('://', ' ').replace('.',
                                                             ' dot ').replace(
                                                                 '@', ' di '))
                    word = put_spacing_num(word)
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['phone'], word_lower):
                if normalize_telephone:
                    splitted = word.split('-')
                    left = put_spacing_num(splitted[0])
                    right = put_spacing_num(splitted[1])
                    word = f'{left}, {right}'
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['user'], word_lower):
                result.append(word)
                index += 1
                continue

            if (re.findall(_expressions['temperature'], word_lower)
                    or re.findall(_expressions['distance'], word_lower)
                    or re.findall(_expressions['volume'], word_lower)
                    or re.findall(_expressions['duration'], word_lower)
                    or re.findall(_expressions['weight'], word_lower)):
                word = word.replace(' ', '')
                result.append(digit_unit(word))
                index += 1
                continue

            cardinal_ = cardinal(word)
            if cardinal_ != word:
                result.append(cardinal_)
                index += 1
                continue

            normalized_ke = ordinal(word)
            if normalized_ke != word:
                result.append(normalized_ke)
                index += 1
                continue

            word, end_result_string = _remove_postfix(word)
            word, repeat = check_repeat(word)

            if normalize_text:
                if word in sounds:
                    selected = sounds[word]
                elif word in rules_normalizer:
                    selected = rules_normalizer[word]
                elif self._speller:
                    selected = self._speller.correct(
                        word, string=' '.join(tokenized), index=index)
                else:
                    selected = word

            else:
                selected = word

            selected = '-'.join([selected] * repeat)
            result.append(result_string + selected + end_result_string)
            index += 1

        result = ' '.join(result)
        normalized = ' '.join(normalized)

        if normalize_entity:
            dates_, money_ = normalized_entity(normalized)

        else:
            dates_, money_ = {}, {}
        return {'normalize': result, 'date': dates_, 'money': money_}
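
A minimal usage sketch for this variant. The factory calls below are assumptions (older Malaya releases expose similar constructors; check your installed version):

    # hypothetical setup -- speller/normalizer factories differ between releases
    import malaya

    corrector = malaya.spell.probability()            # assumed spelling-corrector factory
    normalizer = malaya.normalize.normalizer(corrector)

    out = normalizer.normalize(
        'tahun 1987 xsuke makan ayam',
        normalize_year=True,   # 'tahun 1987' -> 'tahun sembilan belas lapan puluh tujuh'
    )
    print(out['normalize'])              # normalized sentence
    print(out['date'], out['money'])     # entity dictionaries
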
Example #2
    def normalize(
        self,
        string: str,
        normalize_text: bool = True,
        normalize_entity: bool = True,
        normalize_url: bool = False,
        normalize_email: bool = False,
        normalize_year: bool = True,
        normalize_telephone: bool = True,
        normalize_date: bool = True,
        normalize_time: bool = True,
        check_english_func=is_english,
        check_malay_func=is_malay,
        **kwargs,
    ):
        """
        Normalize a string.

        Parameters
        ----------
        string : str
        normalize_text: bool, (default=True)
            if True, replace shortforms using the internal corpus.
        normalize_entity: bool, (default=True)
            normalize entities; only affects `date`, `datetime`, `time` and `money` pattern strings.
        normalize_url: bool, (default=False)
            if True, replace `://` with a space and `.` with ` dot `.
            `https://huseinhouse.com` -> `https huseinhouse dot com`.
        normalize_email: bool, (default=False)
            if True, replace `@` with ` di ` and `.` with ` dot `.
            `[email protected]` -> `husein dot zol kosong lima di gmail dot com`.
        normalize_year: bool, (default=True)
            if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
            if True, `1970-an` -> `sembilan belas tujuh puluh an`.
            if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
        normalize_telephone: bool, (default=True)
            if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`.
        normalize_date: bool, (default=True)
            if True, `01/12/2001` -> `satu disember dua ribu satu`.
            if True, `Jun 2017` -> `satu Jun dua ribu tujuh belas`.
            if True, `2017 Jun` -> `satu Jun dua ribu tujuh belas`.
            if False, `2017 Jun` -> `01/06/2017`.
            if False, `Jun 2017` -> `01/06/2017`.
        normalize_time: bool, (default=True)
            if True, `pukul 2.30` -> `pukul dua tiga puluh minit`.
            if False, `pukul 2.30` -> `02:00:00`.
        check_english_func: Callable, (default=malaya.text.is_english)
            function to check whether a word exists in the English dictionary; default is malaya.text.is_english.
        check_malay_func: Callable, (default=malaya.text.is_malay)
            function to check whether a word exists in the Malay dictionary; default is malaya.text.is_malay.

        Returns
        -------
        string: {'normalize', 'date', 'money'}
        """
        tokenized = self._tokenizer(string)
        s = f'tokenized: {tokenized}'
        logger.debug(s)
        string = ' '.join(tokenized)
        string = groupby(string)

        if normalize_text:
            string = replace_laugh(string)
            string = replace_mengeluh(string)
            string = _replace_compound(string)

        if hasattr(self._speller, 'normalize_elongated'):
            string = [
                self._speller.normalize_elongated(word)
                if len(re.findall(r'(.)\1{1}', word))
                and not word[0].isupper()
                and not word.lower().startswith('ke-')
                and not _is_number_regex(word)
                else word
                for word in string.split()
            ]
            string = ' '.join(string)

        result, normalized = [], []

        tokenized = self._tokenizer(string)
        index = 0
        while index < len(tokenized):
            word = tokenized[index]
            word_lower = word.lower()
            word_upper = word.upper()
            first_c = word[0].isupper()

            s = f'index: {index}, word: {word}, queue: {result}'
            logger.debug(s)

            if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
                s = f'index: {index}, word: {word}, condition punct'
                logger.debug(s)
                result.append(word)
                index += 1
                continue

            normalized.append(rules_normalizer.get(word_lower, word_lower))

            if word_lower in ignore_words:
                s = f'index: {index}, word: {word}, condition ignore words'
                logger.debug(s)
                result.append(word)
                index += 1
                continue

            if (
                first_c
                and not len(re.findall(_expressions['money'], word_lower))
                and not len(re.findall(_expressions['date'], word_lower))
            ):
                s = f'index: {index}, word: {word}, condition not in money and date'
                logger.debug(s)
                if word_lower in rules_normalizer and normalize_text:
                    result.append(case_of(word)(rules_normalizer[word_lower]))
                    index += 1
                    continue
                elif word_upper not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                    result.append(
                        _normalize_title(word) if normalize_text else word
                    )
                    index += 1
                    continue

            if check_english_func is not None:
                s = f'index: {index}, word: {word}, condition check english'
                logger.debug(s)
                if check_english_func(word_lower):
                    result.append(word)
                    index += 1
                    continue

            if check_malay_func is not None:
                s = f'index: {index}, word: {word}, condition check malay'
                logger.debug(s)
                if check_malay_func(word_lower) and word_lower not in ['pada', 'ke']:
                    result.append(word)
                    index += 1
                    continue

            if len(word) > 2 and normalize_text:
                s = f'index: {index}, word: {word}, condition len(word) > 2 and norm text'
                logger.debug(s)
                if word[-2] in consonants and word[-1] == 'e':
                    word = word[:-1] + 'a'

            if word[0] == 'x' and len(word) > 1 and normalize_text:
                s = f'index: {index}, word: {word}, condition word[0] == `x` and len(word) > 1 and norm text'
                logger.debug(s)
                result_string = 'tak '
                word = word[1:]
            else:
                s = f'index: {index}, word: {word}, condition else for (word[0] == `x` and len(word) > 1 and norm text)'
                logger.debug(s)
                result_string = ''

            if word_lower == 'ke' and index < (len(tokenized) - 2):
                s = f'index: {index}, word: {word}, condition ke'
                logger.debug(s)
                if tokenized[index + 1] == '-' and _is_number_regex(
                    tokenized[index + 2]
                ):
                    result.append(
                        ordinal(
                            word + tokenized[index + 1] + tokenized[index + 2]
                        )
                    )
                    index += 3
                    continue
                elif tokenized[index + 1] == '-' and re.match(
                    '.*(V|X|I|L|D)', tokenized[index + 2]
                ):
                    result.append(
                        ordinal(
                            word
                            + tokenized[index + 1]
                            + str(rom_to_int(tokenized[index + 2]))
                        )
                    )
                    index += 3
                    continue
                else:
                    result.append('ke')
                    index += 1
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                s = f'index: {index}, word: {word}, condition hingga'
                logger.debug(s)
                if tokenized[index + 1] == '-' and _is_number_regex(
                    tokenized[index + 2]
                ):
                    result.append(
                        to_cardinal(_string_to_num(word))
                        + ' hingga '
                        + to_cardinal(_string_to_num(tokenized[index + 2]))
                    )
                    index += 3
                    continue

            if word_lower == 'pada' and index < (len(tokenized) - 3):
                s = f'index: {index}, word: {word}, condition pada hari bulan'
                logger.debug(s)
                if (
                    _is_number_regex(tokenized[index + 1])
                    and tokenized[index + 2] in '/-'
                    and _is_number_regex(tokenized[index + 3])
                ):
                    result.append(
                        'pada %s hari bulan %s'
                        % (
                            to_cardinal(_string_to_num(tokenized[index + 1])),
                            to_cardinal(_string_to_num(tokenized[index + 3])),
                        )
                    )
                    index += 4
                    continue

            if (
                word_lower in ['tahun', 'thun']
                and index < (len(tokenized) - 1)
                and normalize_year
            ):
                s = f'index: {index}, word: {word}, condition tahun'
                logger.debug(s)
                if (
                    _is_number_regex(tokenized[index + 1])
                    and len(tokenized[index + 1]) == 4
                ):
                    t = tokenized[index + 1]
                    if t[1] != '0':
                        l = to_cardinal(int(t[:2]))
                        r = to_cardinal(int(t[2:]))
                        c = f'{l} {r}'
                    else:
                        c = to_cardinal(int(t))
                    if (
                        index < (len(tokenized) - 3)
                        and tokenized[index + 2] == '-'
                        and tokenized[index + 3].lower() == 'an'
                    ):
                        end = 'an'
                        plus = 4
                    else:
                        end = ''
                        plus = 2
                    result.append(f'tahun {c}{end}')
                    index += plus
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                s = f'index: {index}, word: {word}, condition fraction'
                logger.debug(s)
                if tokenized[index + 1] == '/' and _is_number_regex(
                    tokenized[index + 2]
                ):
                    result.append(
                        fraction(
                            word + tokenized[index + 1] + tokenized[index + 2]
                        )
                    )
                    index += 3
                    continue

                if (
                    tokenized[index + 1] == '-'
                    and tokenized[index + 2].lower() == 'an'
                    and normalize_year
                    and len(word) == 4
                ):
                    t = word
                    if t[1] != '0':
                        l = to_cardinal(int(t[:2]))
                        r = to_cardinal(int(t[2:]))
                        c = f'{l} {r}'
                    else:
                        c = to_cardinal(int(t))
                    result.append(f'{c}an')
                    index += 3
                    continue

            if re.findall(_expressions['money'], word_lower):
                s = f'index: {index}, word: {word}, condition money'
                logger.debug(s)
                money_, _ = money(word)
                result.append(money_)
                if index < (len(tokenized) - 1):
                    if tokenized[index + 1].lower() in ('sen', 'cent'):
                        index += 2
                    else:
                        index += 1
                else:
                    index += 1
                continue

            if re.findall(_expressions['date'], word_lower):
                s = f'index: {index}, word: {word}, condition date'
                logger.debug(s)
                word = word_lower
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                try:
                    s = f'index: {index}, word: {word}, parsing date'
                    logger.debug(s)
                    parsed = dateparser.parse(word)
                    if parsed:
                        word = parsed.strftime('%d/%m/%Y')
                        if normalize_date:
                            day, month, year = word.split('/')
                            day = cardinal(day)
                            month = bulan[int(month)].title()
                            year = cardinal(year)

                            word = f'{day} {month} {year}'
                except Exception as e:
                    logger.warning(str(e))
                result.append(word)

                index += 1
                continue

            if (
                re.findall(_expressions['time'], word_lower)
                or re.findall(_expressions['time_pukul'], word_lower)
            ):
                s = f'index: {index}, word: {word}, condition time'
                logger.debug(s)
                word = word_lower
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                try:
                    s = f'index: {index}, word: {word}, parsing time'
                    logger.debug(s)
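                    # rewrite '2.30'-style times as '2:30' so dateparser can parse them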
                    parsed = dateparser.parse(word.replace('.', ':'))
                    if parsed:
                        word = parsed.strftime('%H:%M:%S')
                        if normalize_time:
                            hour, minute, second = word.split(':')
                            hour = cardinal(hour)
                            if int(minute) > 0:
                                minute = cardinal(minute)
                                minute = f'{minute} minit'
                            else:
                                minute = ''
                            if int(second) > 0:
                                second = cardinal(second)
                                second = f'{second} saat'
                            else:
                                second = ''
                            word = f'pukul {hour} {minute} {second}'
                            word = re.sub(r'[ ]+', ' ', word).strip()
                except Exception as e:
                    logger.warning(str(e))
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['hashtag'], word_lower):
                s = f'index: {index}, word: {word}, condition hashtag'
                logger.debug(s)
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['url'], word_lower):
                s = f'index: {index}, word: {word}, condition url'
                logger.debug(s)
                if normalize_url:
                    word = word.replace('://', ' ').replace('.', ' dot ')
                    word = put_spacing_num(word)
                    word = (
                        word.replace('https', 'HTTPS')
                        .replace('http', 'HTTP')
                        .replace('www', 'WWW')
                    )
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['email'], word_lower):
                s = f'index: {index}, word: {word}, condition email'
                logger.debug(s)
                if normalize_email:
                    word = (
                        word.replace('://', ' ')
                        .replace('.', ' dot ')
                        .replace('@', ' di ')
                    )
                    word = put_spacing_num(word)
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['phone'], word_lower):
                s = f'index: {index}, word: {word}, condition phone'
                logger.debug(s)
                if normalize_telephone:
                    splitted = word.split('-')
                    if len(splitted) == 2:
                        left = put_spacing_num(splitted[0])
                        right = put_spacing_num(splitted[1])
                        word = f'{left}, {right}'
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['user'], word_lower):
                s = f'index: {index}, word: {word}, condition user'
                logger.debug(s)
                result.append(word)
                index += 1
                continue

            if (
                re.findall(_expressions['temperature'], word_lower)
                or re.findall(_expressions['distance'], word_lower)
                or re.findall(_expressions['volume'], word_lower)
                or re.findall(_expressions['duration'], word_lower)
                or re.findall(_expressions['weight'], word_lower)
            ):
                s = f'index: {index}, word: {word}, condition units'
                logger.debug(s)
                word = word.replace(' ', '')
                result.append(digit_unit(word))
                index += 1
                continue

            if (
                re.findall(_expressions['percent'], word_lower)
            ):
                s = f'index: {index}, word: {word}, condition percent'
                logger.debug(s)
                word = word.replace('%', '')
                result.append(cardinal(word) + ' peratus')
                index += 1
                continue

            if re.findall(_expressions['ic'], word_lower):
                s = f'index: {index}, word: {word}, condition IC'
                logger.debug(s)
                result.append(digit(word))
                index += 1
                continue

            if (
                re.findall(_expressions['number'], word_lower)
                and word_lower[0] == '0'
                and '.' not in word_lower
            ):
                s = f'index: {index}, word: {word}, condition digit and word[0] == `0`'
                logger.debug(s)
                result.append(digit(word))
                index += 1
                continue

            cardinal_ = cardinal(word)
            if cardinal_ != word:
                s = f'index: {index}, word: {word}, condition cardinal'
                logger.debug(s)
                result.append(cardinal_)
                index += 1
                continue

            normalized_ke = ordinal(word)
            if normalized_ke != word:
                s = f'index: {index}, word: {word}, condition normalized ke'
                logger.debug(s)
                result.append(normalized_ke)
                index += 1
                continue

            word, end_result_string = _remove_postfix(word)
            if normalize_text:
                word, repeat = check_repeat(word)
            else:
                repeat = 1

            if normalize_text:
                s = f'index: {index}, word: {word}, condition normalize text'
                logger.debug(s)
                if word in sounds:
                    selected = sounds[word]
                elif word in rules_normalizer:
                    selected = rules_normalizer[word]
                elif self._speller:
                    selected = self._speller.correct(
                        word, string=' '.join(tokenized), index=index
                    )
                else:
                    selected = word

            else:
                selected = word

            selected = '-'.join([selected] * repeat)
            result.append(result_string + selected + end_result_string)
            index += 1

        result = ' '.join(result)
        normalized = ' '.join(normalized)

        if normalize_entity:
            dates_, money_ = normalized_entity(normalized)

        else:
            dates_, money_ = {}, {}
        return {'normalize': result, 'date': dates_, 'money': money_}
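
A hedged sketch of the pluggable dictionary checks this variant adds; passing None disables a check, and any callable over a lowercased word works (setup reuses the hypothetical normalizer from the Example #1 sketch):

    out = normalizer.normalize(
        'pukul 2.30 sy ke kedai',
        normalize_time=True,       # 'pukul 2.30' -> 'pukul dua tiga puluh minit'
        check_english_func=None,   # skip the English-dictionary lookup
        check_malay_func=lambda w: w in {'kedai'},   # custom Malay check
    )
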
Example #3
    def normalize(self, string: str, check_english: bool = True):
        """
        Normalize a string.

        Parameters
        ----------
        string : str
        check_english: bool, (default=True)
            if True, check whether a word exists in the English dictionary.

        Returns
        -------
        string: {'normalize', 'date', 'money'}
        """

        result, normalized = [], []
        tokenized = _tokenizer(string)
        index = 0
        while index < len(tokenized):
            word = tokenized[index]
            if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
                result.append(word)
                index += 1
                continue
            normalized.append(rules_normalizer.get(word.lower(), word.lower()))
            if word.lower() in ignore_words:
                result.append(word)
                index += 1
                continue
            if word[0].isupper():
                if word.upper() not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                    result.append(_normalize_title(word))
                    index += 1
                    continue
            if check_english:
                if word.lower() in ENGLISH_WORDS:
                    result.append(word)
                    index += 1
                    continue
            if word.lower() in MALAY_WORDS and word.lower() not in [
                    'pada',
                    'ke',
            ]:
                result.append(word)
                index += 1
                continue
            if len(word) > 2:
                if word[-2] in consonants and word[-1] == 'e':
                    word = word[:-1] + 'a'
            if word[0] == 'x' and len(word) > 1:
                result_string = 'tak '
                word = word[1:]
            else:
                result_string = ''

            if word.lower() == 'ke' and index < (len(tokenized) - 2):
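                # 'ke - 3' becomes an ordinal; 'ke - IV' converts the Roman numeral first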
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                tokenized[index + 2]))
                    index += 3
                    continue
                elif tokenized[index + 1] == '-' and re.match(
                        '.*(V|X|I|L|D)', tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                str(rom_to_int(tokenized[index + 2]))))
                    index += 3
                    continue
                else:
                    result.append('ke')
                    index += 1
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        to_cardinal(_string_to_num(word)) + ' hingga ' +
                        to_cardinal(_string_to_num(tokenized[index + 2])))
                    index += 3
                    continue
            if word.lower() == 'pada' and index < (len(tokenized) - 3):
                if (_is_number_regex(tokenized[index + 1])
                        and tokenized[index + 2] in '/-'
                        and _is_number_regex(tokenized[index + 3])):
                    result.append('pada %s hari bulan %s' % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    ))
                    index += 4
                    continue
                else:
                    result.append('pada')
                    index += 1
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '/' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        fraction(word + tokenized[index + 1] +
                                 tokenized[index + 2]))
                    index += 3
                    continue

            if re.findall(_money, word.lower()):
                money_, _ = money(word)
                result.append(money_)
                index += 1
                continue

            if re.findall(_date, word.lower()):
                word = word.lower()
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%d/%m/%Y'))
                else:
                    result.append(word)
                index += 1
                continue

            if re.findall(_expressions['time'], word.lower()):
                word = word.lower()
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%H:%M:%S'))
                else:
                    result.append(word)
                index += 1
                continue

            cardinal_ = cardinal(word)
            if cardinal_ != word:
                result.append(cardinal_)
                index += 1
                continue

            normalized_ke = ordinal(word)
            if normalized_ke != word:
                result.append(normalized_ke)
                index += 1
                continue
            word, end_result_string = _remove_postfix(word)
            if word in sounds:
                result.append(result_string + sounds[word] + end_result_string)
                index += 1
                continue
            if word in rules_normalizer:
                result.append(result_string + rules_normalizer[word] +
                              end_result_string)
                index += 1
                continue
            selected = self._speller.correct(word,
                                             string=' '.join(tokenized),
                                             index=index)
            result.append(result_string + selected + end_result_string)
            index += 1

        result = ' '.join(result)
        normalized = ' '.join(normalized)
        money_ = re.findall(_money, normalized)
        money_ = [(s, money(s)[1]) for s in money_]
        dates_ = re.findall(_date, normalized)

        past_date_string_ = re.findall(_past_date_string, normalized)
        now_date_string_ = re.findall(_now_date_string, normalized)
        future_date_string_ = re.findall(_future_date_string, normalized)
        yesterday_date_string_ = re.findall(_yesterday_tomorrow_date_string,
                                            normalized)
        depan_date_string_ = re.findall(_depan_date_string, normalized)
        today_time_ = re.findall(_today_time, normalized)
        time_ = re.findall(_expressions['time'], normalized)

        left_datetime_ = [
            f'{i[0]} {i[1]}' for i in re.findall(_left_datetime, normalized)
        ]
        right_datetime_ = [
            f'{i[0]} {i[1]}' for i in re.findall(_right_datetime, normalized)
        ]
        today_left_datetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_left_datetodaytime, normalized)
        ]
        today_right_datetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_right_datetodaytime, normalized)
        ]
        left_yesterdaydatetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_left_yesterdaydatetime, normalized)
        ]
        right_yesterdaydatetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_right_yesterdaydatetime, normalized)
        ]
        left_yesterdaydatetodaytime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_left_yesterdaydatetodaytime, normalized)
        ]
        right_yesterdaydatetodaytime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_right_yesterdaydatetodaytime, normalized)
        ]

        dates_ = (dates_ + past_date_string_ + now_date_string_ +
                  future_date_string_ + yesterday_date_string_ +
                  depan_date_string_ + time_ + today_time_ + left_datetime_ +
                  right_datetime_ + today_left_datetime_ +
                  today_right_datetime_ + left_yesterdaydatetime_ +
                  right_yesterdaydatetime_ + left_yesterdaydatetodaytime_ +
                  right_yesterdaydatetodaytime_)
        dates_ = [multireplace(s, date_replace) for s in dates_]
        dates_ = [re.sub(r'[ ]+', ' ', s).strip() for s in dates_]
        dates_ = cluster_words(dates_)
        dates_ = {s: dateparser.parse(s) for s in dates_}
        money_ = {s[0]: s[1] for s in money_}
        return {'normalize': result, 'date': dates_, 'money': money_}
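
The hand-rolled entity extraction above is what later variants fold into `normalized_entity`. A sketch of the returned shape; the values are illustrative assumptions, not captured output:

    out = normalizer.normalize('harga rumah itu RM300k pada 2/2/2020')
    # out['normalize'] -> one normalized string
    # out['date']      -> e.g. {'2/2/2020': datetime.datetime(2020, 2, 2, 0, 0)}
    # out['money']     -> e.g. {'rm300k': 300000.0}
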
Example #4
    def normalize(self,
                  string: str,
                  check_english: bool = True,
                  normalize_entity: bool = True):
        """
        Normalize a string.

        Parameters
        ----------
        string : str
        check_english: bool, (default=True)
            if True, check whether a word exists in the English dictionary.
        normalize_entity: bool, (default=True)
            normalize entities; only affects `date`, `datetime`, `time` and `money` pattern strings.

        Returns
        -------
        string: {'normalize', 'date', 'money'}
        """

        result, normalized = [], []
        tokenized = self._tokenizer(string)
        index = 0
        while index < len(tokenized):
            word = tokenized[index]
            if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
                result.append(word)
                index += 1
                continue
            normalized.append(rules_normalizer.get(word.lower(), word.lower()))
            if word.lower() in ignore_words:
                result.append(word)
                index += 1
                continue
            if word[0].isupper() and not len(re.findall(_money, word.lower())):
                if word.upper() not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                    result.append(_normalize_title(word))
                    index += 1
                    continue
            if check_english:
                if word.lower() in ENGLISH_WORDS:
                    result.append(word)
                    index += 1
                    continue
            if word.lower() in MALAY_WORDS and word.lower() not in [
                    'pada',
                    'ke',
            ]:
                result.append(word)
                index += 1
                continue
            if len(word) > 2:
                if word[-2] in consonants and word[-1] == 'e':
                    word = word[:-1] + 'a'
            if word[0] == 'x' and len(word) > 1:
                result_string = 'tak '
                word = word[1:]
            else:
                result_string = ''

            if word.lower() == 'ke' and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                tokenized[index + 2]))
                    index += 3
                    continue
                elif tokenized[index + 1] == '-' and re.match(
                        '.*(V|X|I|L|D)', tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                str(rom_to_int(tokenized[index + 2]))))
                    index += 3
                    continue
                else:
                    result.append('ke')
                    index += 1
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        to_cardinal(_string_to_num(word)) + ' hingga ' +
                        to_cardinal(_string_to_num(tokenized[index + 2])))
                    index += 3
                    continue
            if word.lower() == 'pada' and index < (len(tokenized) - 3):
                if (_is_number_regex(tokenized[index + 1])
                        and tokenized[index + 2] in '/-'
                        and _is_number_regex(tokenized[index + 3])):
                    result.append('pada %s hari bulan %s' % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    ))
                    index += 4
                    continue
                else:
                    result.append('pada')
                    index += 1
                    continue

            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '/' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        fraction(word + tokenized[index + 1] +
                                 tokenized[index + 2]))
                    index += 3
                    continue

            if re.findall(_money, word.lower()):
                money_, _ = money(word)
                result.append(money_)
                if index < (len(tokenized) - 1):
                    if tokenized[index + 1].lower() in ('sen', 'cent'):
                        index += 2
                    else:
                        index += 1
                else:
                    index += 1
                continue

            if re.findall(_date, word.lower()):
                word = word.lower()
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%d/%m/%Y'))
                else:
                    result.append(word)
                index += 1
                continue

            if re.findall(_expressions['time'], word.lower()):
                word = word.lower()
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%H:%M:%S'))
                else:
                    result.append(word)
                index += 1
                continue

            if re.findall(_expressions['hashtag'], word.lower()):
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['url'], word.lower()):
                result.append(word)
                index += 1
                continue

            if re.findall(_expressions['user'], word.lower()):
                result.append(word)
                index += 1
                continue

            if (re.findall(_expressions['temperature'], word.lower())
                    or re.findall(_expressions['distance'], word.lower())
                    or re.findall(_expressions['volume'], word.lower())
                    or re.findall(_expressions['duration'], word.lower())
                    or re.findall(_expressions['weight'], word.lower())):
                word = word.replace(' ', '')
                result.append(digit_unit(word))
                index += 1
                continue

            cardinal_ = cardinal(word)
            if cardinal_ != word:
                result.append(cardinal_)
                index += 1
                continue

            normalized_ke = ordinal(word)
            if normalized_ke != word:
                result.append(normalized_ke)
                index += 1
                continue

            word, end_result_string = _remove_postfix(word)
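            # tokens like 'best2' expand to the base word repeated that many times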
            word, repeat = check_repeat(word)
            if word in sounds:
                selected = sounds[word]
            elif word in rules_normalizer:
                selected = rules_normalizer[word]
            else:
                selected = self._speller.correct(word,
                                                 string=' '.join(tokenized),
                                                 index=index)
            selected = ' - '.join([selected] * repeat)
            result.append(result_string + selected + end_result_string)
            index += 1

        result = ' '.join(result)
        normalized = ' '.join(normalized)

        if normalize_entity:
            dates_, money_ = normalized_entity(normalized)

        else:
            dates_, money_ = {}, {}
        return {'normalize': result, 'date': dates_, 'money': money_}
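
One behavioral difference worth noting: this variant joins repeats from `check_repeat` with ' - ', while Examples #1 and #2 join with '-'. A sketch, assuming `check_repeat('best2')` returns `('best', 2)`:

    # Example #1 and #2: '-'.join(['best'] * 2)   -> 'best-best'
    # Example #4:        ' - '.join(['best'] * 2) -> 'best - best'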