def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only supports BERT and ALBERT.

    Parameters
    ----------
    model: object
        transformer model, must have a `_log_vectorize` method.
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.TRANSFORMER class
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have a `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None
    if sentence_piece:
        check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)

    return TRANSFORMER(model, corpus, tokenizer)
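
# Minimal usage sketch (illustration only, not part of this module): load a
# transformer encoder and wrap it with the corrector above. `malaya.transformer.load`
# and the `correct_text` method on the returned TRANSFORMER object are assumptions
# about the surrounding malaya API and may differ between versions.
def _example_transformer_corrector():
    import malaya

    # the loaded model is assumed to expose `_log_vectorize`, as required above
    model = malaya.transformer.load(model = 'bert')
    corrector = transformer(model, sentence_piece = True)
    return corrector.correct_text('kerajaan sgt prihatin dgn rakyat')
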
def probability(sentence_piece: bool = False, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.PROBABILITY class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None
    if sentence_piece:
        check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)

    return PROBABILITY(corpus, tokenizer)
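
# Minimal usage sketch (illustration only): build the probability corrector above
# and correct a misspelled word. The `correct` and `correct_text` method names on
# the returned PROBABILITY object are assumptions and may differ between versions.
def _example_probability_corrector():
    corrector = probability(sentence_piece = True)
    single = corrector.correct('suke')  # most probable correction for one word
    sentence = corrector.correct_text('kerajaan sgt prihatin dgn rakyat')
    return single, sentence
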
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only supports BERT and ALBERT.

    Parameters
    ----------
    model: object
        transformer model, must have a `_log_vectorize` method.
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _TransformerCorrector: malaya.spell._TransformerCorrector class
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have a `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None
    if sentence_piece:
        check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)

    return _TransformerCorrector(model, corpus, tokenizer)
def probability(sentence_piece: bool = False, validate: bool = True, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _SpellCorrector: malaya.spell._SpellCorrector class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None
    if sentence_piece:
        if validate:
            check_file(
                PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece']
            )
        else:
            if not check_available(PATH_NGRAM['sentencepiece']):
                raise Exception(
                    'sentence piece is not available, please set `validate = True`'
                )

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)

    return _SpellCorrector(corpus, tokenizer)
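
# Minimal usage sketch (illustration only) for the legacy `validate` flow above:
# with `validate = False` the corrector only works if the sentencepiece files are
# already available locally, otherwise the Exception raised above is expected.
def _example_probability_without_validation():
    try:
        return probability(sentence_piece = True, validate = False)
    except Exception:
        # files not available locally; fall back to downloading them
        return probability(sentence_piece = True, validate = True)
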
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    **kwargs,
):
    """
    Augment a formal word into social media form: purposely typo, purposely
    delete some vowels, purposely replace some subwords with slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each sample generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each sample generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.

    Returns
    -------
    result: list
    """
    if not 0 < prob_delete_vowel < 1:
        raise ValueError(
            'prob_delete_vowel must be bigger than 0 and less than 1'
        )
    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    check_file(
        PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
    )

    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = load_sentencepiece(vocab, vocab_model)

    # note: 'd' and 'g' appear twice; a dict literal keeps only the last
    # mapping, so effectively 'd' -> 'f' and 'g' -> 'f'.
    replace_consonants = {
        'n': 'm',
        't': 'y',
        'r': 't',
        'g': 'h',
        'j': 'k',
        'k': 'l',
        'd': 's',
        'd': 'f',
        'g': 'f',
        'b': 'n',
    }
    replace_vowels = {'u': 'i', 'i': 'o', 'o': 'u'}

    results = [word]

    if len(word) > 1:
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')
        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants.items():
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels.items():
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    result_deleted = []
    for s in results:
        deleted = []
        for c in s:
            # drop a vowel with probability `prob_delete_vowel`
            if random.random() < prob_delete_vowel and c in vowels:
                continue
            else:
                deleted.append(c)
        result_deleted.append(''.join(deleted))
    results.extend(result_deleted)

    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        # keep candidates that stay a single sentencepiece token
        if len(t) == 1:
            filtered.append(s)
            continue
        # drop candidates whose first piece is only the boundary marker
        if t[0] == '▁':
            continue
        # drop candidates that break into very short subwords
        if any([len(w) < 3 for w in t]):
            continue
        filtered.append(s)

    return list(set(filtered))
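
# Minimal usage sketch (illustration only): generate social-media style variants
# of a formal Malay word with the augmenter above. The output is nondeterministic
# because vowel deletion is random, so the exact list differs per run.
def _example_shortform():
    variants = shortform('sangat', prob_delete_vowel = 0.5)
    return variants
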