Example #1
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only BERT and ALBERT are supported.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.TRANSFORMER class
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have a `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return TRANSFORMER(model, corpus, tokenizer)
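A hedged usage sketch for the loader above: the transformer model is assumed to be loaded through `malaya.transformer.load` and the returned corrector is assumed to expose a `correct` method; both names come from malaya's documented interface rather than from this snippet, so treat them as assumptions.

import malaya

# any transformer object exposing `_log_vectorize` (e.g. BERT or ALBERT
# loaded through malaya) satisfies the guard above; loader and method
# names here are assumptions, not guaranteed by the snippet itself
model = malaya.transformer.load(model='albert')
corrector = transformer(model, sentence_piece=True)
print(corrector.correct('suke'))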
Example #2
def probability(sentence_piece: bool = False, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.PROBABILITY class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return PROBABILITY(corpus, tokenizer)
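A similar hedged sketch for the probability corrector; `correct` is again an assumed method name based on malaya's documented spell-corrector interface.

# build the corrector from the bundled ngram corpus, optionally with the
# sentencepiece tokenizer, then query it; `correct` is an assumed method
corrector = probability(sentence_piece=True)
print(corrector.correct('kerajaun'))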
Example #3
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only BERT and ALBERT are supported.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _TransformerCorrector: malaya.spell._TransformerCorrector class
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have a `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _TransformerCorrector(model, corpus, tokenizer)
Example #4
def probability(sentence_piece: bool = False, validate: bool = True, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _SpellCorrector: malaya.spell._SpellCorrector class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        if validate:
            check_file(PATH_NGRAM['sentencepiece'],
                       S3_PATH_NGRAM['sentencepiece'])
        else:
            if not check_available(PATH_NGRAM['sentencepiece']):
                raise Exception(
                    'sentence piece is not available, please set `validate = True`'
                )

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _SpellCorrector(corpus, tokenizer)
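Example #4 differs from #2 mainly in the explicit `validate` flag that gates the sentencepiece download. Below is a minimal, library-independent sketch of that gating pattern; `ensure_asset` and its use of `urlretrieve` are illustrative stand-ins for malaya's `check_file`/`check_available` helpers, not the library's actual implementation.

import os
from urllib.request import urlretrieve

def ensure_asset(path: str, url: str, validate: bool = True) -> None:
    # when validate is True, fetch the asset if it is not cached yet;
    # otherwise only accept an existing local copy, mirroring Example #4
    if validate:
        if not os.path.exists(path):
            urlretrieve(url, path)
    elif not os.path.exists(path):
        raise Exception('asset is not available, please set `validate = True`')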
Example #5
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    **kwargs,
):
    """
    Augment a formal word into its social media form: purposely introduce typos, purposely delete
    some vowels, and purposely replace some subwords with slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each sample generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each sample generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.

    Returns
    -------
    result: list
    """

    if not 0 < prob_delete_vowel < 1:
        raise ValueError(
            'prob_delete_vowel must be greater than 0 and less than 1')
    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment into a shortform.')

    check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
               **kwargs)

    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = load_sentencepiece(vocab, vocab_model)

    # adjacent-key consonant substitutions used to introduce plausible typos;
    # each consonant maps to a single replacement
    replace_consonants = {
        'n': 'm',
        't': 'y',
        'r': 't',
        'j': 'k',
        'k': 'l',
        'd': 'f',
        'g': 'f',
        'b': 'n',
    }

    replace_vowels = {'u': 'i', 'i': 'o', 'o': 'u'}

    results = [word]

    # rule-based shortform variants of the word
    if len(word) > 1:

        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])  # drop the 'n' from the 'ng'

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants.items():
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels.items():
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    result_deleted = []
    for s in results:
        deleted = []
        for c in s:
            # drop vowels with probability `prob_delete_vowel`
            if random.random() < prob_delete_vowel and c in vowels:
                continue
            else:
                deleted.append(c)
        result_deleted.append(''.join(deleted))
    results.extend(result_deleted)

    # keep candidates that sentencepiece keeps as a single token or splits
    # only into subwords of length >= 3
    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any([len(w) < 3 for w in t]):
            continue
        filtered.append(s)

    return list(set(filtered))
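A hedged usage sketch for `shortform`: the output set varies between calls because vowel deletion is random, so the seed below is only there to make the run reproducible; the example word and the sample output are illustrative, not guaranteed results.

import random

random.seed(1)  # vowel deletion is random; seed only for reproducibility
candidates = shortform('sayang', prob_delete_vowel=0.5)
print(candidates)  # e.g. ['sayang', 'syg', 'sayng', ...] (illustrative)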