Example #1
def segmenter(max_split_length: int = 20, validate: bool = True):
    """
    Load Segmenter class.

    Parameters
    ----------
    max_split_length: int, (default=20)
        max length of words in a sentence to segment
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _Segmenter : malaya.preprocessing._Segmenter class
    """

    if validate:
        check_file(PATH_PREPROCESSING[1], S3_PATH_PREPROCESSING[1])
    else:
        if not check_available(PATH_PREPROCESSING[1]):
            raise Exception(
                'preprocessing is not available, please `validate = True`')
    if validate:
        check_file(PATH_PREPROCESSING[2], S3_PATH_PREPROCESSING[2])
    else:
        if not check_available(PATH_PREPROCESSING[2]):
            raise Exception(
                'preprocessing is not available, please `validate = True`')
    return _Segmenter(max_split_length=max_split_length)
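
A minimal usage sketch for the loader above; the segmentation method on the returned _Segmenter is an assumption, since it is not shown in this snippet.

# Usage sketch: load the segmenter defined above (downloads models via check_file).
seg = segmenter(max_split_length=20, validate=True)
# The returned _Segmenter is assumed to expose a segmentation method,
# e.g. seg.segment('sayasukamakan'); the exact method name is not shown here.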
Example #2
def probability(sentence_piece: bool = False, validate: bool = True, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _SpellCorrector: malaya.spell._SpellCorrector class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        if validate:
            check_file(PATH_NGRAM['sentencepiece'],
                       S3_PATH_NGRAM['sentencepiece'])
        else:
            if not check_available(PATH_NGRAM['sentencepiece']):
                raise Exception(
                    'sentence piece is not available, please `validate = True`'
                )

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _SpellCorrector(corpus, tokenizer)
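
A short usage sketch, assuming the n-gram corpus downloads as in the function above; the `correct` method is implied by the `speller` description in Example #4 and is not shown in this snippet.

# Usage sketch: build the probability-based spell corrector defined above.
corrector = probability(sentence_piece=False)
# _SpellCorrector is assumed to expose a `correct` method (see the `speller`
# parameter documented in Example #4); an illustrative call would be:
# corrector.correct('kpd')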
Example #3
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    validate: bool = True,
):
    """
    Augment a formal word into social-media form: introduce deliberate typos,
    delete some vowels, and replace some subwords with slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each sample generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each sample generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    result: list
    """

    if not 0 < prob_delete_vowel < 1:
        raise Exception(
            'prob_delete_vowel must be bigger than 0 and less than 1'
        )
    word = simple_textcleaning(word)
    if not len(word):
        raise Exception('word is too short to augment shortform.')

    if validate:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'])
    else:
        if not check_available(PATH_NGRAM['sentencepiece']):
            raise Exception(
                'sentence piece is not available, please `validate = True`'
            )

    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = load_sentencepiece(vocab, vocab_model)

    # keyboard-adjacent consonant substitutions used to simulate typos
    replace_consonants = {
        'n': 'm',
        't': 'y',
        'r': 't',
        'g': 'f',
        'j': 'k',
        'k': 'l',
        'd': 'f',
        'b': 'n',
    }

    # common vowel swaps used to simulate misspellings
    replace_vowels = {'u': 'i', 'i': 'o', 'o': 'u'}

    results = [word]

    if len(word) > 1:

        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants.items():
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels.items():
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    result_deleted = []
    for s in results:
        deleted = []
        for c in s:
            if random.random() > prob_delete_vowel and c in vowels:
                continue
            else:
                deleted.append(c)
        result_deleted.append(''.join(deleted))
    results.extend(result_deleted)

    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any([len(w) < 3 for w in t]):
            continue
        filtered.append(s)

    return list(set(filtered))
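
Since shortform returns a plain list, it can be called directly; a minimal sketch follows, with output that varies because vowel deletion is random.

# Usage sketch: generate social-media style variants of a formal Malay word.
variants = shortform('sayang', augment_vowel=True, prob_delete_vowel=0.5)
print(sorted(variants))  # contents vary from run to run due to random vowel deletion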
Example #4
def preprocessing(
    normalize: List[str] = [
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number',
    ],
    annotate: List[str] = [
        'allcaps',
        'elongated',
        'repeated',
        'emphasis',
        'censored',
        'hashtag',
    ],
    lowercase: bool = True,
    fix_unidecode: bool = True,
    expand_hashtags: bool = True,
    expand_english_contractions: bool = True,
    translate_english_to_bm: bool = True,
    remove_postfix: bool = True,
    maxlen_segmenter: int = 20,
    validate: bool = True,
    speller=None,
):
    """
    Load Preprocessing class.

    Parameters
    ----------
    normalize: list
        tokens to normalize; all supported normalizations can be checked at malaya.preprocessing.get_normalize()
    annotate: list
        annotate tokens with <open></open> tags, only accepts ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']
    lowercase: bool
    fix_unidecode: bool
    expand_hashtags: bool
        expand hashtags using the Viterbi algorithm, e.g. #mondayblues == monday blues
    expand_english_contractions: bool
        expand English contractions
    translate_english_to_bm: bool
        translate English words to Bahasa Malaysia words
    remove_postfix: bool
        remove the postfix from a word, a faster way to get the root word
    speller: object
        spelling correction object, must have a `correct` or `normalize_elongated` method
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _Preprocessing : malaya.preprocessing._Preprocessing class
    """

    if any([e not in _normalize for e in normalize]):
        raise ValueError(
            'unknown element in `normalize`, supported normalizations can be checked at get_normalize()'
        )
    if any([e not in _annotate for e in annotate]):
        raise ValueError(
            "annotate only accepts ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']"
        )
    if speller is not None:
        if not hasattr(speller, 'correct') and not hasattr(
                speller, 'normalize_elongated'):
            raise ValueError(
                'speller must have a `correct` or `normalize_elongated` method')

    if expand_hashtags:
        if validate:
            check_file(PATH_PREPROCESSING[1], S3_PATH_PREPROCESSING[1])
        else:
            if not check_available(PATH_PREPROCESSING[1]):
                raise Exception(
                    'preprocessing is not available, please `validate = True`')
        if validate:
            check_file(PATH_PREPROCESSING[2], S3_PATH_PREPROCESSING[2])
        else:
            if not check_available(PATH_PREPROCESSING[2]):
                raise Exception(
                    'preprocessing is not available, please `validate = True`')

    if translate_english_to_bm:
        if validate:
            check_file(
                PATH_PREPROCESSING['english-malay'],
                S3_PATH_PREPROCESSING['english-malay'],
            )
        else:
            if not check_available(PATH_PREPROCESSING['english-malay']):
                raise Exception(
                    'translator english-malay is not available, please `validate = True`'
                )

        with open(PATH_PREPROCESSING['english-malay']['model']) as fopen:
            translator = json.load(fopen)
    else:
        translator = None

    return _Preprocessing(
        normalize=normalize,
        annotate=annotate,
        lowercase=lowercase,
        fix_unidecode=fix_unidecode,
        expand_hashtags=expand_hashtags,
        expand_english_contractions=expand_english_contractions,
        remove_postfix=remove_postfix,
        maxlen_segmenter=maxlen_segmenter,
        translator=translator,
        speller=speller,
    )
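
A combined usage sketch that wires the spell corrector from Example #2 into the preprocessor above; the processing method on the returned _Preprocessing object is not shown in this snippet, so it is only noted as an assumption.

# Usage sketch: load a preprocessor with a spell corrector attached.
corrector = probability()  # from Example #2, provides a `correct` method
preproc = preprocessing(
    annotate=['allcaps', 'elongated', 'hashtag'],
    translate_english_to_bm=True,
    speller=corrector,
)
# preproc is a malaya.preprocessing._Preprocessing instance; its processing
# method is assumed and not defined in this snippet.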