Example #1
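    # module-level dependencies assumed by this snippet: numpy as np, and
    # cosine_similarity / euclidean_distances / manhattan_distances from
    # sklearn.metrics.pairwise, plus a library-internal _tokenizer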
    def _predict(
        self,
        left_strings: List[str],
        right_strings: List[str],
        aggregation: str = 'mean',
        similarity: str = 'cosine',
        soft: bool = True,
    ):

        if len(left_strings) != len(right_strings):
            raise ValueError(
                'length of `left_strings` must equal length of `right_strings`'
            )
        # identical lists mean we only need to vectorize one side
        identical = left_strings == right_strings

        aggregation = aggregation.lower()
        if aggregation == 'mean':
            aggregation_function = np.mean
        elif aggregation == 'min':
            aggregation_function = np.min
        elif aggregation == 'max':
            aggregation_function = np.max
        elif aggregation == 'sum':
            aggregation_function = np.sum
        elif aggregation == 'sqrt':
            # assumption: 'sqrt' aggregates as the square root of the summed
            # vectors; plain np.sqrt is element-wise and rejects `axis`
            aggregation_function = lambda v, axis: np.sqrt(np.sum(v, axis=axis))
        else:
            raise ValueError(
                "aggregation only supports 'mean', 'min', 'max', 'sum' and 'sqrt'"
            )

        similarity = similarity.lower()
        if similarity == 'cosine':
            similarity_function = cosine_similarity
        elif similarity == 'euclidean':
            similarity_function = euclidean_distances
        elif similarity == 'manhattan':
            similarity_function = manhattan_distances
        else:
            raise ValueError(
                "similarity only supports 'cosine', 'euclidean', and 'manhattan'"
            )

        left_vectors, right_vectors = [], []
        for left_string, right_string in zip(left_strings, right_strings):
            left_tokenized = _tokenizer(left_string)
            if not len(left_tokenized):
                raise ValueError('left string must not be empty')
            right_tokenized = _tokenizer(right_string)
            if not len(right_tokenized):
                raise ValueError('right string must not be empty')

            in_vector = []
            for token in left_tokenized:
                try:
                    in_vector.append(
                        self._vectorizer.get_vector_by_name(token))
                except Exception:
                    # token is out-of-vocabulary
                    if not soft:
                        # hard mode: silently skip the token
                        continue
                    # soft mode: substitute the vocabulary word with the
                    # highest Jaro-Winkler similarity to the token
                    arr = np.array([
                        self._jarowinkler.similarity(token, k)
                        for k in self._vectorizer.words
                    ])
                    idx = int(np.argmax(arr))
                    in_vector.append(
                        self._vectorizer.get_vector_by_name(
                            self._vectorizer.words[idx]))
            left_vectors.append(aggregation_function(in_vector, axis=0))

            if not identical:
                in_vector = []
                for token in right_tokenized:
                    try:
                        in_vector.append(
                            self._vectorizer.get_vector_by_name(token))
                    except Exception:
                        if not soft:
                            continue
                        arr = np.array([
                            self._jarowinkler.similarity(token, k)
                            for k in self._vectorizer.words
                        ])
                        idx = int(np.argmax(arr))
                        in_vector.append(
                            self._vectorizer.get_vector_by_name(
                                self._vectorizer.words[idx]))

                right_vectors.append(aggregation_function(in_vector, axis=0))

        if identical:
            similar = similarity_function(left_vectors, left_vectors)
        else:
            similar = similarity_function(left_vectors, right_vectors)

        if similarity == 'cosine':
            # rescale cosine similarity from [-1, 1] to [0, 1]
            return (similar + 1) / 2
        else:
            # convert a distance (0 = identical) into a similarity in (0, 1]
            return 1 / (similar + 1)
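
A minimal usage sketch, assuming `model` is an instance of the class that defines `_predict` above; the inputs and the call pattern below are illustrative assumptions, not confirmed API:

left = ['saya suka makan ayam', 'kerajaan malaysia']
right = ['saya gemar makan ayam', 'kerajaan singapura']

# hypothetical: `model` is an instance of the similarity class above
scores = model._predict(left, right, aggregation='mean',
                        similarity='cosine', soft=True)
# scores is a len(left) x len(right) matrix mapped into [0, 1];
# the diagonal holds each aligned pair's score
print(scores.diagonal())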
Example #2
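# module-level dependencies assumed by this snippet: numpy as np, random,
# a keras-style pad_sequences, plus library-internal helpers
# _tokenizer, to_ids and string_function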
def transformer_augmentation(
    string: str,
    model,
    threshold: float = 0.5,
    top_p: float = 0.8,
    top_k: int = 100,
    temperature: float = 0.8,
    top_n: int = 5,
    cleaning_function: Callable = None,
):

    """
    augmenting a string using transformer + nucleus sampling / top-k sampling.

    Parameters
    ----------
    string: str
    model: object
        transformer interface object. Right now only supported BERT.
    threshold: float, optional (default=0.5)
        random selection for a word.
    top_p: float, optional (default=0.8)
        cumulative sum of probabilities to sample a word. If top_n bigger than 0, the model will use nucleus sampling, else top-k sampling.
    top_k: int, optional (default=100)
        k for top-k sampling.
    temperature: float, optional (default=0.8)
        logits * temperature.
    top_n: int, (default=5)
        number of nearest neighbors returned.
    cleaning_function: function, (default=None)
        function to clean text.

    Returns
    -------
    result: list
    """

    if not hasattr(model, 'samples'):
        raise ValueError('model must have a `samples` attribute')
    if not (threshold > 0 and threshold < 1):
        raise ValueError('threshold must be bigger than 0 and less than 1')
    if not top_p > 0:
        raise ValueError('top_p must be bigger than 0')
    if not top_k > 0:
        raise ValueError('top_k must be bigger than 0')
    if not (temperature > 0 and temperature < 1):
        raise ValueError('temperature must be bigger than 0 and less than 1')
    if not top_n > 0:
        raise ValueError('top_n must be bigger than 0')
    if top_n > top_k:
        raise ValueError('top_k must be bigger than or equal to top_n')

    if cleaning_function:
        string = cleaning_function(string)
    string = _tokenizer(string)
    results = []
    # randomly pick maskable positions, skipping punctuation,
    # capitalized words (likely proper nouns) and digits
    for token_idx, token in enumerate(string):
        if token in string_function.punctuation:
            continue
        if token[0].isupper():
            continue
        if token.isdigit():
            continue
        if random.random() > threshold:
            results.append(token_idx)

    if not len(results):
        raise ValueError(
            'no words can be augmented, make sure the words available are not punctuation or proper nouns.'
        )

    # build one masked copy of the token list per selected position
    maskeds, indices, input_masks = [], [], []
    for index in results:
        new = string[:]
        new[index] = '[MASK]'
        mask, ind = to_ids(new, model._tokenizer)
        maskeds.append(mask)
        indices.append(ind)
        input_masks.append([1] * len(mask))

    masked_padded = pad_sequences(maskeds, padding='post')
    input_masks = pad_sequences(input_masks, padding='post')
    # (row, position) pairs used to gather logits at every [MASK] token
    batch_indices = np.array([np.arange(len(indices)), indices]).T
    samples = model._sess.run(
        model.samples,
        feed_dict={
            model.X: masked_padded,
            model.MASK: input_masks,
            model.top_p: top_p,
            model.top_k: top_k,
            model.temperature: temperature,
            model.indices: batch_indices,
            model.k: top_n,
        },
    )
    outputs = []
    for i in range(samples.shape[1]):
        # the i-th sampled replacement for every masked position
        sample_i = samples[:, i]
        samples_tokens = model._tokenizer.convert_ids_to_tokens(
            sample_i.tolist()
        )
        # '▁' is the SentencePiece word-boundary marker
        new_splitted = ['▁' + w if len(w) > 1 else w for w in string]
        for no, index in enumerate(results):
            new_splitted[index] = samples_tokens[no]
        new = ''.join(model._tokenizer.sp_model.DecodePieces(new_splitted))
        outputs.append(new)
    return outputs
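
A minimal usage sketch; the loader below is a hypothetical stand-in for whatever object provides the `samples`, `_sess`, `_tokenizer` and placeholder attributes this function expects, not a confirmed API:

import malaya  # assumption: the library this snippet ships with

model = malaya.transformer.load(model='bert')  # hypothetical loader name
augmented = transformer_augmentation(
    'kerajaan sebenarnya sangat sayangkan rakyatnya',
    model,
    threshold=0.5,
    top_p=0.8,
    top_n=5,
)
for sentence in augmented:  # top_n augmented variants of the input
    print(sentence)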
Example #3
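    # module-level dependencies assumed by this snippet: re, dateparser, plus
    # library-internal helpers (_tokenizer, rules_normalizer, ordinal, cardinal,
    # fraction, money, the date/money regexes, cluster_words, multireplace, ...)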
    def normalize(self, string: str, check_english: bool = True):
        """
        Normalize a string.

        Parameters
        ----------
        string : str
        check_english: bool, (default=True)
            check whether a word exists in the English dictionary.

        Returns
        -------
        result: dict
            {'normalize': normalized string, 'date': parsed dates, 'money': parsed money values}
        """

        result, normalized = [], []
        tokenized = _tokenizer(string)
        index = 0
        while index < len(tokenized):
            word = tokenized[index]
            if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
                result.append(word)
                index += 1
                continue
            # keep a naively-normalized copy for the date/money second pass below
            normalized.append(rules_normalizer.get(word.lower(), word.lower()))
            if word.lower() in ignore_words:
                result.append(word)
                index += 1
                continue
            if word[0].isupper():
                if word.upper() not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                    result.append(_normalize_title(word))
                    index += 1
                    continue
            if check_english:
                if word.lower() in ENGLISH_WORDS:
                    result.append(word)
                    index += 1
                    continue
            if word.lower() in MALAY_WORDS and word.lower() not in [
                    'pada',
                    'ke',
            ]:
                result.append(word)
                index += 1
                continue
            if len(word) > 2:
                # colloquial spelling: trailing 'e' after a consonant becomes 'a', e.g. 'ape' -> 'apa'
                if word[-2] in consonants and word[-1] == 'e':
                    word = word[:-1] + 'a'
            if word[0] == 'x' and len(word) > 1:
                # shorthand negation prefix: 'x' stands for 'tak', e.g. 'xsuka' -> 'tak suka'
                result_string = 'tak '
                word = word[1:]
            else:
                result_string = ''

            # 'ke' + '-' + number (or roman numeral) is normalized as an ordinal
            if word.lower() == 'ke' and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                tokenized[index + 2]))
                    index += 3
                    continue
                elif tokenized[index + 1] == '-' and re.match(
                        '^[IVXLCDM]+$', tokenized[index + 2]):
                    result.append(
                        ordinal(word + tokenized[index + 1] +
                                str(rom_to_int(tokenized[index + 2]))))
                    index += 3
                    continue
                else:
                    result.append('ke')
                    index += 1
                    continue

            # '<number> - <number>' is read as a range: '... hingga ...'
            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '-' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        to_cardinal(_string_to_num(word)) + ' hingga ' +
                        to_cardinal(_string_to_num(tokenized[index + 2])))
                    index += 3
                    continue
            # 'pada <day> / <month>' (or with '-') is read out as a date phrase
            if word.lower() == 'pada' and index < (len(tokenized) - 3):
                if (_is_number_regex(tokenized[index + 1])
                        and tokenized[index + 2] in '/-'
                        and _is_number_regex(tokenized[index + 3])):
                    result.append('pada %s hari bulan %s' % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    ))
                    index += 4
                    continue
                else:
                    result.append('pada')
                    index += 1
                    continue

            # '<number> / <number>' is read as a fraction
            if _is_number_regex(word) and index < (len(tokenized) - 2):
                if tokenized[index + 1] == '/' and _is_number_regex(
                        tokenized[index + 2]):
                    result.append(
                        fraction(word + tokenized[index + 1] +
                                 tokenized[index + 2]))
                    index += 3
                    continue

            # money expressions, e.g. 'RM10'
            if re.findall(_money, word.lower()):
                money_, _ = money(word)
                result.append(money_)
                index += 1
                continue

            # date expressions are parsed and normalized to dd/mm/YYYY
            if re.findall(_date, word.lower()):
                word = word.lower()
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%d/%m/%Y'))
                else:
                    result.append(word)
                index += 1
                continue

            # time expressions are normalized to HH:MM:SS
            if re.findall(_expressions['time'], word.lower()):
                word = word.lower()
                word = multireplace(word, date_replace)
                word = re.sub(r'[ ]+', ' ', word).strip()
                parsed = dateparser.parse(word)
                if parsed:
                    result.append(parsed.strftime('%H:%M:%S'))
                else:
                    result.append(word)
                index += 1
                continue

            # spell out plain numbers as cardinals
            cardinal_ = cardinal(word)
            if cardinal_ != word:
                result.append(cardinal_)
                index += 1
                continue

            normalized_ke = ordinal(word)
            if normalized_ke != word:
                result.append(normalized_ke)
                index += 1
                continue
            word, end_result_string = _remove_postfix(word)
            if word in sounds:
                result.append(result_string + sounds[word] + end_result_string)
                index += 1
                continue
            if word in rules_normalizer:
                result.append(result_string + rules_normalizer[word] +
                              end_result_string)
                index += 1
                continue
            selected = self._speller.correct(word,
                                             string=' '.join(tokenized),
                                             index=index)
            result.append(result_string + selected + end_result_string)
            index += 1

        result = ' '.join(result)
        normalized = ' '.join(normalized)
        # second pass over the naively-normalized text: extract money and date mentions
        money_ = re.findall(_money, normalized)
        money_ = [(s, money(s)[1]) for s in money_]
        dates_ = re.findall(_date, normalized)

        past_date_string_ = re.findall(_past_date_string, normalized)
        now_date_string_ = re.findall(_now_date_string, normalized)
        future_date_string_ = re.findall(_future_date_string, normalized)
        yesterday_date_string_ = re.findall(_yesterday_tomorrow_date_string,
                                            normalized)
        depan_date_string_ = re.findall(_depan_date_string, normalized)
        today_time_ = re.findall(_today_time, normalized)
        time_ = re.findall(_expressions['time'], normalized)

        # join two-group regex matches back into single 'date time' strings
        left_datetime_ = [
            f'{i[0]} {i[1]}' for i in re.findall(_left_datetime, normalized)
        ]
        right_datetime_ = [
            f'{i[0]} {i[1]}' for i in re.findall(_right_datetime, normalized)
        ]
        today_left_datetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_left_datetodaytime, normalized)
        ]
        today_right_datetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_right_datetodaytime, normalized)
        ]
        left_yesterdaydatetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_left_yesterdaydatetime, normalized)
        ]
        right_yesterdaydatetime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_right_yesterdaydatetime, normalized)
        ]
        left_yesterdaydatetodaytime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_left_yesterdaydatetodaytime, normalized)
        ]
        right_yesterdaydatetodaytime_ = [
            f'{i[0]} {i[1]}'
            for i in re.findall(_right_yesterdaydatetodaytime, normalized)
        ]

        dates_ = (dates_ + past_date_string_ + now_date_string_ +
                  future_date_string_ + yesterday_date_string_ +
                  depan_date_string_ + time_ + today_time_ + left_datetime_ +
                  right_datetime_ + today_left_datetime_ +
                  today_right_datetime_ + left_yesterdaydatetime_ +
                  right_yesterdaydatetime_ + left_yesterdaydatetodaytime_ +
                  right_yesterdaydatetodaytime_)
        dates_ = [multireplace(s, date_replace) for s in dates_]
        dates_ = [re.sub(r'[ ]+', ' ', s).strip() for s in dates_]
        dates_ = cluster_words(dates_)
        # parse every date expression into a datetime (None when unparseable)
        dates_ = {s: dateparser.parse(s) for s in dates_}
        money_ = {s[0]: s[1] for s in money_}
        return {'normalize': result, 'date': dates_, 'money': money_}
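
A minimal usage sketch; the constructor below is a hypothetical stand-in, since the method clearly belongs to a normalizer class that also carries a `_speller`:

import malaya  # assumption: the library this snippet ships with

normalizer = malaya.normalize.normalizer()  # hypothetical constructor
output = normalizer.normalize('saya xsuka makan ayam pada 1 / 2')
print(output['normalize'])  # normalized text
print(output['date'])       # {date expression: parsed datetime or None}
print(output['money'])      # {money expression: numeric value}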
Example #4
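# module-level dependencies assumed by this snippet: random, inspect, plus
# library-internal helpers _tokenizer, string_function and _make_upper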
def wordvector_augmentation(
    string: str,
    wordvector,
    threshold: float = 0.5,
    top_n: int = 5,
    soft: bool = False,
    cleaning_function: Callable = None,
):
    """
    augmenting a string using wordvector.

    Parameters
    ----------
    string: str
    wordvector: object
        wordvector interface object.
    threshold: float, optional (default=0.5)
        random selection for a word.
    soft: bool, optional (default=False)
        if True, a word not in the dictionary will be replaced with nearest jarowrinkler ratio.
        if False, it will throw an exception if a word not in the dictionary.
    top_n: int, (default=5)
        number of nearest neighbors returned.
    cleaning_function: function, (default=None)
        function to clean text.

    Returns
    -------
    result: list
    """
    if not hasattr(wordvector, 'batch_n_closest'):
        raise ValueError('wordvector must have a `batch_n_closest` method')
    if not hasattr(wordvector, '_dictionary'):
        raise ValueError('wordvector must have a `_dictionary` attribute')

    if cleaning_function:
        string = cleaning_function(string)
    string = _tokenizer(string)
    # keep the tokenized original to restore capitalization later
    original_string = string[:]
    selected = []
    # randomly pick replaceable words, skipping punctuation and
    # capitalized words (likely proper nouns)
    for no, w in enumerate(string):
        if w in string_function.punctuation:
            continue
        if w[0].isupper():
            continue
        if random.random() > threshold:
            selected.append((no, w))

    if not len(selected):
        raise ValueError(
            'no words can be augmented, make sure the words available are not punctuation or proper nouns.'
        )

    indices, words = [i[0] for i in selected], [i[1] for i in selected]
    # older wordvector interfaces may not accept `soft`; pass it only when supported
    batch_parameters = list(
        inspect.signature(wordvector.batch_n_closest).parameters.keys()
    )
    if 'soft' in batch_parameters:
        results = wordvector.batch_n_closest(
            words, num_closest=top_n, soft=soft
        )
    else:
        results = wordvector.batch_n_closest(words, num_closest=top_n)

    augmented = []
    # build one augmented sentence per neighbor rank
    for i in range(top_n):
        string_ = string[:]
        for no in range(len(results)):
            string_[indices[no]] = results[no][i]
        # restore the original casing pattern
        augmented.append(
            _make_upper(' '.join(string_), ' '.join(original_string))
        )
    return augmented
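
A minimal usage sketch; the loader below is a hypothetical stand-in for whatever returns an object with `batch_n_closest` and `_dictionary`:

import malaya  # assumption: the library this snippet ships with

wordvector = malaya.wordvector.load()  # hypothetical loader name
augmented = wordvector_augmentation(
    'saya suka makan ayam dan ikan',
    wordvector,
    threshold=0.5,
    top_n=5,
)
for sentence in augmented:  # top_n variants, each selected word swapped for a neighbor
    print(sentence)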