def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only supported BERT and ALBERT.

    Parameters
    ----------
    model: object
        transformer model, must expose a `_log_vectorize` attribute.
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded to every `check_file` call.

    Returns
    -------
    result: malaya.spell.Transformer class

    Raises
    ------
    ValueError
        if `model` does not have a `_log_vectorize` attribute.
    """
    supports_vectorize = hasattr(model, '_log_vectorize')
    if not supports_vectorize:
        raise ValueError('model must have `_log_vectorize` method')

    # The n-gram corpus is always required, the sentence piece files only on request.
    ngram_files = PATH_NGRAM[1]
    check_file(ngram_files, S3_PATH_NGRAM[1], **kwargs)

    if sentence_piece:
        sp_files = PATH_NGRAM['sentencepiece']
        check_file(sp_files, S3_PATH_NGRAM['sentencepiece'], **kwargs)
        tokenizer = SentencePieceTokenizer(
            vocab_file=sp_files['vocab'],
            spm_model_file=sp_files['model'],
        )
    else:
        tokenizer = None

    with open(ngram_files['model']) as corpus_file:
        corpus = json.load(corpus_file)

    return Transformer(model, corpus, tokenizer)
def probability(sentence_piece: bool = False, **kwargs):
    """
    Train a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded to every `check_file` call.

    Returns
    -------
    result: malaya.spell.Probability class
    """
    ngram_files = PATH_NGRAM[1]
    check_file(ngram_files, S3_PATH_NGRAM[1], **kwargs)

    if sentence_piece:
        sp_files = PATH_NGRAM['sentencepiece']
        check_file(sp_files, S3_PATH_NGRAM['sentencepiece'], **kwargs)
        tokenizer = SentencePieceTokenizer(
            vocab_file=sp_files['vocab'],
            spm_model_file=sp_files['model'],
        )
    else:
        # Without sentence piece the corrector is built with no tokenizer.
        tokenizer = None

    with open(ngram_files['model']) as corpus_file:
        corpus = json.load(corpus_file)

    return Probability(corpus, tokenizer)
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Constituency Parsing model,
    transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class

    Raises
    ------
    ValueError
        if the lower-cased `model` is not in `_transformer_availability`.
    """
    model = model.lower()
    supported = model in _transformer_availability
    if not supported:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )

    wanted_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module='constituency',
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        graph,
        ['input_ids', 'word_end_mask'],
        ['charts', 'tags'],
        extra={'vectorizer': _vectorizer_mapping[model]},
    )

    # Any name containing 'bert' (including the albert variants) uses 'bert' mode.
    if 'bert' in model:
        mode = 'bert'
    else:
        mode = 'xlnet'

    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        dictionary=settings.constituency,
        mode=mode,
    )
def load(model: str = 'xlnet', pool_mode: str = 'last', **kwargs):
    """
    Load xlnet model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'xlnet'`` - XLNET architecture from google.

    pool_mode : str, optional (default='last')
        Model logits architecture supported. Allowed values:

        * ``'last'`` - last of the sequence.
        * ``'first'`` - first of the sequence.
        * ``'mean'`` - mean of the sequence.
        * ``'attn'`` - attention of the sequence.

    **kwargs:
        forwarded to `check_file` and to the `Model` constructor.

    Returns
    -------
    result : malaya.transformers.xlnet.Model class

    Raises
    ------
    ValueError
        if `pool_mode` is not one of the allowed values. `ValueError` is a
        subclass of `Exception`, so callers catching `Exception` still work.
    """
    model = model.lower()
    pool_mode = pool_mode.lower()

    if pool_mode not in ('last', 'first', 'mean', 'attn'):
        raise ValueError(
            "pool_mode not supported, only support ['last', 'first', 'mean', 'attn']"
        )

    path = check_file(PATH_XLNET[model]['model'], S3_PATH_XLNET[model], **kwargs)

    directory = PATH_XLNET[model]['directory']
    xlnet_checkpoint = os.path.join(directory, 'model.ckpt')

    # Only unpack the archive when the checkpoint is not on disk yet.
    if not os.path.exists(xlnet_checkpoint):
        import tarfile

        # NOTE(review): extractall trusts member paths inside the archive;
        # this assumes the archive comes from a trusted source.
        with tarfile.open(path['model']) as tar:
            tar.extractall(path=PATH_XLNET[model]['path'])

    vocab_model = os.path.join(directory, 'sp10m.cased.v9.model')
    vocab = os.path.join(directory, 'sp10m.cased.v9.vocab')
    tokenizer = SentencePieceTokenizer(vocab_file=vocab, spm_model_file=vocab_model)

    xlnet_config = xlnet_lib.XLNetConfig(
        json_path=os.path.join(directory, 'config.json')
    )

    # Use a separate name so the `model` string argument is not shadowed.
    xlnet_model = Model(
        xlnet_config,
        tokenizer,
        xlnet_checkpoint,
        pool_mode=pool_mode,
        **kwargs,
    )
    xlnet_model._saver.restore(xlnet_model._sess, xlnet_checkpoint)
    return xlnet_model
def _transformer(model, bert_model, xlnet_model, quantized=False, siamese=False, **kwargs):
    """
    Load a frozen similarity graph and wrap it in the matching model class.

    Parameters
    ----------
    model : str
        architecture name, lower-cased before use.
    bert_model : callable
        class used for 'albert', 'bert', 'tiny-albert' and 'tiny-bert'.
    xlnet_model : callable
        class used for 'xlnet' and 'alxlnet'.
    quantized : bool, optional (default=False)
        forwarded to `check_file`.
    siamese : bool, optional (default=False)
        if True, use a fixed per-family vectorizer node instead of the
        entry from `_vectorizer_mapping`.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result : instance of `bert_model` or `xlnet_model`

    Raises
    ------
    ValueError
        if the lower-cased `model` is not in `_transformer_availability`.
    """
    model = model.lower()
    if not (model in _transformer_availability):
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    wanted_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module='similarity',
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    if model in ('albert', 'bert', 'tiny-albert', 'tiny-bert'):
        selected_model = bert_model
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    elif model in ('xlnet', 'alxlnet'):
        selected_model = xlnet_model
        if siamese:
            # NOTE(review): 'sequnece' spelling is kept verbatim; presumably it
            # matches the node name in the exported graph -- confirm before changing.
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        graph,
        ['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        ['logits'],
        extra={'vectorizer': selected_node},
    )

    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def load(model: str = 'base', **kwargs):
    """
    Load bert model.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'bert'`` - base bert-bahasa released by Malaya.
        * ``'tiny-bert'`` - tiny bert-bahasa released by Malaya.

        NOTE(review): the default ``'base'`` is not among the listed values;
        confirm it is a valid key of `PATH_BERT`.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result : malaya.transformers.bert.Model class
    """
    from malaya.path import PATH_BERT, S3_PATH_BERT
    from malaya.function import check_file

    model = model.lower()
    check_file(PATH_BERT[model]['model'], S3_PATH_BERT[model], **kwargs)

    directory = PATH_BERT[model]['directory']
    bert_checkpoint = directory + 'model.ckpt'

    # Only unpack the archive when the checkpoint is not on disk yet.
    if not os.path.exists(bert_checkpoint):
        import tarfile

        with tarfile.open(PATH_BERT[model]['model']['model']) as tar:
            tar.extractall(path=PATH_BERT[model]['path'])

    import sentencepiece as spm
    from malaya.text.bpe import SentencePieceTokenizer

    vocab_model = directory + 'sp10m.cased.bert.model'
    vocab = directory + 'sp10m.cased.bert.vocab'
    bert_config = directory + 'config.json'

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(vocab_model)

    # Read the vocab explicitly as UTF-8: it holds non-ASCII pieces, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(vocab, encoding='utf-8') as fopen:
        # Drop the last element, which is empty when the file ends with a newline.
        rows = fopen.read().split('\n')[:-1]

    # Each row is tab separated; map the first column to its row index.
    v = {row.split('\t')[0]: no for no, row in enumerate(rows)}

    tokenizer = SentencePieceTokenizer(v, sp_model)
    bert_config = modeling.BertConfig.from_json_file(bert_config)

    # Use a separate name so the `model` string argument is not shadowed.
    bert_model = Model(bert_config, tokenizer)
    bert_model._saver.restore(bert_model._sess, bert_checkpoint)
    return bert_model
def transformer_squad(module, model='bert', quantized=False, **kwargs):
    """
    Load a frozen graph for `module` and wrap it in a `SQUAD` object.

    Parameters
    ----------
    module : str
        module name, forwarded to `check_file` and to `SQUAD`.
    model : str, optional (default='bert')
        architecture name; 'xlnet' and 'alxlnet' take one extra placeholder.
    quantized : bool, optional (default=False)
        forwarded to `check_file`.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result : SQUAD
    """
    wanted_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module=module,
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    placeholders = ['Placeholder_3']
    if model in ('xlnet', 'alxlnet'):
        placeholders = placeholders + ['Placeholder_4']
    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2'] + placeholders

    outputs = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(graph, inputs, outputs)

    # Any name containing 'bert' (including the albert variants) uses 'bert' mode.
    mode = 'bert' if 'bert' in model else 'xlnet'

    return SQUAD(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        module=module,
        mode=mode,
        length=LENGTHS[mode],
    )
def transformer(module, model='xlnet', quantized=False, tok=None, **kwargs):
    """
    Load a frozen tagging graph for `module` and wrap it in a tagging class.

    Parameters
    ----------
    module : str
        module name, forwarded to `check_file` and used to look up
        `TAGGING_SETTING`.
    model : str, optional (default='xlnet')
        architecture name. 'albert', 'bert', 'tiny-albert' and 'tiny-bert'
        select `TaggingBERT`; 'xlnet' and 'alxlnet' select `TaggingXLNET`.
    quantized : bool, optional (default=False)
        forwarded to `check_file`.
    tok : optional (default=None)
        passed through unchanged to the selected class.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result : TaggingBERT or TaggingXLNET
    """
    wanted_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
        'setting': TAGGING_SETTING[module],
    }
    path = check_file(
        file=model,
        module=module,
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    # The JSON settings file is handed to the model as `settings`.
    with open(path['setting']) as setting_file:
        nodes = json.load(setting_file)

    if model in ('albert', 'bert', 'tiny-albert', 'tiny-bert'):
        selected_model = TaggingBERT
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
    elif model in ('xlnet', 'alxlnet'):
        selected_model = TaggingXLNET
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        graph, inputs, ['logits'], extra=vectorizer
    )

    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
        tok=tok,
    )
def load(model: str = 'base', **kwargs):
    """
    Load bert model.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'bert'`` - base bert-bahasa released by Malaya.
        * ``'tiny-bert'`` - tiny bert-bahasa released by Malaya.

        NOTE(review): the default ``'base'`` is not among the listed values;
        confirm it is a valid key of `PATH_BERT`.
    **kwargs:
        forwarded to `check_file` and to the `Model` constructor.

    Returns
    -------
    result : malaya.transformers.bert.Model class
    """
    from malaya.path import PATH_BERT, S3_PATH_BERT
    from malaya.function import check_file

    model = model.lower()
    check_file(PATH_BERT[model]['model'], S3_PATH_BERT[model], **kwargs)

    # Only unpack the archive when the checkpoint is not on disk yet.
    checkpoint_missing = not os.path.exists(
        PATH_BERT[model]['directory'] + 'model.ckpt'
    )
    if checkpoint_missing:
        import tarfile

        with tarfile.open(PATH_BERT[model]['model']['model']) as archive:
            archive.extractall(path=PATH_BERT[model]['path'])

    directory = PATH_BERT[model]['directory']
    bert_checkpoint = directory + 'model.ckpt'
    tokenizer = SentencePieceTokenizer(
        vocab_file=directory + 'sp10m.cased.bert.vocab',
        spm_model_file=directory + 'sp10m.cased.bert.model',
    )
    bert_config = modeling.BertConfig.from_json_file(directory + 'config.json')

    loaded = Model(bert_config, tokenizer, **kwargs)
    loaded._saver.restore(loaded._sess, bert_checkpoint)
    return loaded
def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    version : str, optional (default='v2')
        Version supported. Allowed values:

        * ``'v1'`` - version 1, maintain for knowledge graph.
        * ``'v2'`` - Trained on bigger dataset, better version.

    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.DependencyBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`.

    Raises
    ------
    ValueError
        if the lower-cased `model` is not available for the validated `version`.
    """
    version = _validate_version(version)
    model = model.lower()

    if model not in _transformer_availability[version]:
        # f-string so the message shows the actual version, not a literal `{version}`.
        raise ValueError(
            f"model not supported, please check supported models from `malaya.dependency.available_transformer(version='{version}')`."
        )

    # 'v1' uses the plain module name; other versions are suffixed and use a
    # different `minus` value, which is passed through to the model class.
    module = 'dependency'
    minus = 1
    if version != 'v1':
        module = f'{module}-{version}'
        minus = 2

    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        inputs = ['Placeholder']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_model = DependencyBERT
    if model in ['xlnet', 'alxlnet']:
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        selected_model = DependencyXLNET

    outputs = ['logits', 'heads_seq']
    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra=vectorizer
    )

    # NOTE(review): `label` is not defined in this function; presumably a
    # module-level name -- confirm.
    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=label,
        minus=minus,
    )
def transformer(model: str = 'bert', quantized: bool = False, **kwargs):
    """
    Load Transformer keyword similarity model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.KeyphraseBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.KeyphraseXLNET`.

    Raises
    ------
    ValueError
        if the lower-cased `model` is not in `_transformer_availability`.
    """
    model = model.lower()
    if not (model in _transformer_availability):
        raise ValueError(
            'model not supported, please check supported models from `malaya.keyword_extraction.available_transformer()`.'
        )

    wanted_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module='keyword-extraction',
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    outputs = ['logits']
    if model in ('bert', 'tiny-bert'):
        selected_class = KeyphraseBERT
        inputs = ['Placeholder'] + [f'Placeholder_{i}' for i in range(1, 4)]
        outputs += ['bert/summary']
    if model in ('xlnet', 'alxlnet'):
        # The xlnet family takes two more placeholders than the bert family.
        selected_class = KeyphraseXLNET
        inputs = ['Placeholder'] + [f'Placeholder_{i}' for i in range(1, 6)]
        outputs += ['xlnet/summary']

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(graph, inputs, outputs)

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    **kwargs,
):
    """
    augmenting a formal word into socialmedia form. Purposely typo, purposely delete some vowels,
    purposely replaced some subwords into slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each samples generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each samples generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: list
        unique augmented candidates, in no particular order.

    Raises
    ------
    ValueError
        if `prob_delete_vowel` is not strictly between 0 and 1, or if the
        cleaned word is empty.
    """
    import random

    if not 0 < prob_delete_vowel < 1:
        raise ValueError(
            'prob_delete_vowel must be bigger than 0 and less than 1')

    word = simple_textcleaning(word)
    if not word:
        raise ValueError('word is too short to augment shortform.')

    check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs)
    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = SentencePieceTokenizer(vocab_file=vocab, spm_model_file=vocab_model)

    # Pairs, not a dict: 'd' and 'g' each appear twice, and a dict literal
    # would silently keep only the last mapping for a repeated key.
    replace_consonants = (
        ('n', 'm'),
        ('t', 'y'),
        ('r', 't'),
        ('g', 'h'),
        ('j', 'k'),
        ('k', 'l'),
        ('d', 's'),
        ('d', 'f'),
        ('g', 'f'),
        ('b', 'n'),
    )
    replace_vowels = (('u', 'i'), ('i', 'o'), ('o', 'u'))

    results = [word]

    # Suffix / prefix rewrite rules; each guard checks the length its indices need.
    if len(word) > 1:
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')
        if word[1:3] == 'ng':
            # Drop the 'n' of the 'ng' that follows the first character.
            results.append(word[:1] + word[2:])

    if augment_consonant:
        # Swap each pair in both directions on every candidate gathered so far.
        result_consonants = []
        for k, v in replace_consonants:
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels:
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    # Each vowel is dropped with probability `prob_delete_vowel`.
    result_deleted = [
        ''.join(
            c for c in s
            if not (c in vowels and random.random() < prob_delete_vowel)
        )
        for s in results
    ]
    results.extend(result_deleted)

    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if not t:
            # Nothing to inspect (e.g. every character was deleted); skip.
            continue
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any(len(w) < 3 for w in t):
            continue
        filtered.append(s)

    return list(set(filtered))