def load(module, model, model_class, quantized=False, **kwargs):
    try:
        import tensorflow_text
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text not installed. Please install it by `pip install tensorflow-text` and try again. Also, make sure the tensorflow-text version matches the tensorflow version.'
        )

    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': VOCAB_MODEL.get(module, MS_EN_BPE_MODEL)},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], t5_graph=True, **kwargs)
    tokenizer = SentencePieceBatchEncoder(vocab_file=path['vocab'])
    inputs = ['inputs']
    outputs = []
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'decode': 'import/SelectV2_3:0'}
    )
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )

def load(module, model, encoder, model_class, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': LM_VOCAB[module]},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    encoder = ENCODER_MODEL[encoder](vocab_file=path['vocab'], id_mode=True)
    inputs = ['Placeholder']
    outputs = ['greedy', 'beam']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )

def _transformer(model, bert_class, xlnet_class, quantized=False, siamese=False, **kwargs):
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )
    path = check_file(
        file=model,
        module='similarity',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            tokenizer = AlbertTokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # 'sequnece_summary' is spelled this way in the exported graph node name
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'
    if not siamese:
        selected_node = _vectorizer_mapping[model]

    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'vectorizer': selected_node}
    )
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )

def load(module, model, model_class, maxlen, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': TRANSLATION_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    encoder = SentencePieceEncoder(vocab_file=path['vocab'])
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
        maxlen=maxlen,
    )

def load(module, model, encoder, model_class, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': LM_VOCAB[module]},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)
    inputs = ['Placeholder']
    outputs = ['greedy', 'beam']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )

def load_lm(path, s3_path, model, model_class, quantized=False, **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    model_path = 'quantized' if quantized else 'model'
    g = load_graph(path[model][model_path], **kwargs)
    X = g.get_tensor_by_name('import/Placeholder:0')
    top_p = g.get_tensor_by_name('import/Placeholder_2:0')
    greedy = g.get_tensor_by_name('import/greedy:0')
    beam = g.get_tensor_by_name('import/beam:0')
    nucleus = g.get_tensor_by_name('import/nucleus:0')
    tokenizer = SentencePieceEncoder(path[model]['vocab'])
    return model_class(
        X,
        top_p,
        greedy,
        beam,
        nucleus,
        generate_session(graph=g, **kwargs),
        tokenizer,
    )

def _transformer(model, bert_class, xlnet_class, quantized=False, siamese=False, **kwargs):
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )
    check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], quantized=quantized, **kwargs)
    model_path = 'quantized' if quantized else 'model'
    g = load_graph(PATH_SIMILARITY[model][model_path], **kwargs)
    path = PATH_SIMILARITY

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # 'sequnece_summary' is spelled this way in the exported graph node name
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'
    if not siamese:
        selected_node = _vectorizer_mapping[model]

    return selected_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
        logits=g.get_tensor_by_name('import/logits:0'),
        vectorizer=g.get_tensor_by_name(selected_node),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )

def load(path, s3_path, model, encoder, model_class, quantized=False, **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    model_path = 'quantized' if quantized else 'model'
    g = load_graph(path[model][model_path], **kwargs)
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)
    return model_class(
        g.get_tensor_by_name('import/Placeholder:0'),
        g.get_tensor_by_name('import/greedy:0'),
        g.get_tensor_by_name('import/beam:0'),
        generate_session(graph=g, **kwargs),
        encoder,
    )

def load_tatabahasa(module, model, model_class, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': T2T_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    tokenizer = SentencePieceEncoder(vocab_file=path['vocab'])
    inputs = ['x_placeholder']
    outputs = ['greedy', 'tag_greedy']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )

def load_pegasus(module, model, model_class, maxlen, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': PEGASUS_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder', 'top_p', 'temperature']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    tokenizer = WordPieceTokenizer(vocab_file=path['vocab'])
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        maxlen=maxlen,
    )

def load_lm(module, model, model_class, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': T2T_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    tokenizer = SentencePieceEncoder(path['vocab'])
    # the same placeholders and decode ops are fetched through nodes_session,
    # so the direct get_tensor_by_name lookups from the old API were dead code
    inputs = ['Placeholder', 'Placeholder_2']
    outputs = ['greedy', 'beam', 'nucleus']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )

def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model is not necessarily faster, it totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )
    path = check_file(
        file=model,
        module='constituency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['input_ids', 'word_end_mask']
    outputs = ['charts', 'tags']
    tokenizer = SentencePieceTokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'vectorizer': _vectorizer_mapping[model]}
    )
    mode = 'bert' if 'bert' in model else 'xlnet'
    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=settings.constituency,
        mode=mode,
    )

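# A minimal usage sketch, hedged: assumes this function is exposed as
# `malaya.constituency.transformer` (per the error message above) and that the
# returned Constituency object exposes a `parse_tree` method; the method name
# is an assumption for illustration, not confirmed here.
#
#   import malaya
#   model = malaya.constituency.transformer(model='tiny-bert', quantized=True)
#   tree = model.parse_tree('Dia suka makan ayam goreng di kedai itu.')
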
def load(path, s3_path, model, model_class, compressed=True, quantized=False, **kwargs):
    try:
        import tensorflow_text
        import tf_sentencepiece
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text and tf-sentencepiece not installed. Please install them by `pip install tensorflow-text==1.15.0 tf-sentencepiece==0.1.86` and try again. Also, make sure the tensorflow-text version matches the tensorflow version.'
        )

    if compressed and not quantized:
        path = path['t5-compressed']
        s3_path = s3_path['t5-compressed']
        check_file(path[model]['model'], s3_path[model], **kwargs)
        if not os.path.exists(path[model]['directory'] + 'saved_model.pb'):
            import tarfile

            with tarfile.open(path[model]['model']['model']) as tar:
                tar.extractall(path=path[model]['path'])
        X = None
        decode = None
        sess = generate_session(graph=None, **kwargs)
        meta_graph_def = tf.compat.v1.saved_model.load(
            sess, ['serve'], path[model]['directory']
        )
        signature_def = meta_graph_def.signature_def['serving_default']

        def pred(x):
            return sess.run(
                fetches=signature_def.outputs['outputs'].name,
                feed_dict={signature_def.inputs['input'].name: x},
            )

    else:
        path = path['t5']
        s3_path = s3_path['t5']
        check_file(path[model], s3_path[model], quantized=quantized, optimized=True, **kwargs)
        model_path = 'quantized' if quantized else 'model'
        g = load_graph(path[model][model_path], **kwargs)
        X = g.get_tensor_by_name('import/inputs:0')
        decode = g.get_tensor_by_name(
            'import/SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp:0'
        )
        sess = generate_session(graph=g, **kwargs)
        pred = None

    return model_class(X=X, decode=decode, sess=sess, pred=pred)

def transformer(path, s3_path, class_name, model='xlnet', quantized=False, **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    model_path = 'quantized' if quantized else 'model'
    g = load_graph(path[model][model_path], **kwargs)
    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        )
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )
        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    Returns
    -------
    result : malaya.model.tf.CONSTITUENCY class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )
    check_file(PATH_CONSTITUENCY[model], S3_PATH_CONSTITUENCY[model], **kwargs)
    g = load_graph(PATH_CONSTITUENCY[model]['model'], **kwargs)

    with open(PATH_CONSTITUENCY[model]['dictionary']) as fopen:
        dictionary = json.load(fopen)

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        tokenizer = sentencepiece_tokenizer_bert(
            PATH_CONSTITUENCY[model]['tokenizer'],
            PATH_CONSTITUENCY[model]['vocab'],
        )
        mode = 'bert'
    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(PATH_CONSTITUENCY[model]['tokenizer'])
        mode = 'xlnet'

    from malaya.model.tf import CONSTITUENCY

    return CONSTITUENCY(
        input_ids=g.get_tensor_by_name('import/input_ids:0'),
        word_end_mask=g.get_tensor_by_name('import/word_end_mask:0'),
        charts=g.get_tensor_by_name('import/charts:0'),
        tags=g.get_tensor_by_name('import/tags:0'),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )

def deep_model(quantized: bool = False, **kwargs): """ Load deep learning language detection model. Original size is 51.2MB, Quantized size 12.8MB. quantized : bool, optional (default=False) if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine. Returns ------- result : malaya.model.tf.DeepLang class """ path = check_file( file = 'lang-32', module = 'language-detection', keys = { 'model': 'model.pb', 'vector': LANGUAGE_DETECTION_BOW, 'bpe': LANGUAGE_DETECTION_VOCAB, }, quantized = quantized, **kwargs, ) g = load_graph(path['model'], **kwargs) bpe, subword_mode = load_yttm(path['bpe']) try: with open(path['vector'], 'rb') as fopen: vector = pickle.load(fopen) except: raise ValueError( "model corrupted due to some reasons, please run `malaya.clear_cache('language-detection/lang-32')` and try again" ) inputs = [ 'X_Placeholder/shape', 'X_Placeholder/values', 'X_Placeholder/indices', 'W_Placeholder/shape', 'W_Placeholder/values', 'W_Placeholder/indices', ] outputs = ['logits'] input_nodes, output_nodes = nodes_session(g, inputs, outputs) return DeepLang( input_nodes = input_nodes, output_nodes = output_nodes, sess = generate_session(graph = g, **kwargs), vectorizer = vector, bpe = bpe, type = subword_mode, label = lang_labels, )
def deep_model(quantized: bool = False, **kwargs): """ Load LSTM + Bahdanau Attention stemming model, this also include lemmatization. Original size 41.6MB, quantized size 10.6MB . Parameters ---------- quantized : bool, optional (default=False) if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine. Returns ------- result: malaya.stem.DeepStemmer class """ if check_tf_version() > 1: raise Exception( f'Tensorflow 2.0 and above not able to use `deep_model` for stemmer, use Tensorflow 1.15 instead.' ) path = check_file( file='lstm-bahdanau', module='stem', keys={ 'model': 'model.pb', 'vocab': STEMMER_VOCAB }, quantized=quantized, **kwargs, ) g = load_graph(path['model'], **kwargs) bpe, subword_mode = load_yttm(path['vocab'], id_mode=True) inputs = ['Placeholder'] outputs = [] input_nodes, output_nodes = nodes_session( g, inputs, outputs, extra={ 'greedy': 'import/decode_1/greedy:0', 'beam': 'import/decode_2/beam:0', }, ) tokenizer = Tokenizer().tokenize return DeepStemmer( input_nodes=input_nodes, output_nodes=output_nodes, sess=generate_session(graph=g, **kwargs), bpe=bpe, subword_mode=subword_mode, tokenizer=tokenizer, )
def transformer(class_name, model='xlnet', quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        )
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            tokenizer = AlbertTokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        Model = TaggingBERT
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )

def transformer(module, model='xlnet', quantized=False, tok=None, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_model = TaggingBERT
        selected_tokenizer = SentencePieceTokenizer
    elif model in ['xlnet', 'alxlnet']:
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        selected_model = TaggingXLNET
        selected_tokenizer = SentencePieceTokenizer
    elif model in ['fastformer', 'tiny-fastformer']:
        inputs = ['Placeholder']
        vectorizer_nodes = {
            'fastformer': 'import/fast_transformer/add_24:0',
            'tiny-fastformer': 'import/fast_transformer/add_8:0',
        }
        vectorizer = {'vectorizer': vectorizer_nodes[model]}
        selected_model = TaggingFastFormer
        selected_tokenizer = WordPieceTokenizer

    outputs = ['logits']
    tokenizer = selected_tokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=TAGGING_SETTING[module],
        tok=tok,
    )

def transformer_squad(class_name, model='bert', quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']
    if model in ['bert', 'tiny-bert']:
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
    if model in ['albert', 'tiny-albert']:
        tokenizer = AlbertTokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs.append('Placeholder_4')
    outputs = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    mode = 'bert' if 'bert' in model else 'xlnet'
    return SQUAD(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        class_name=class_name,
        mode=mode,
        length=LENGTHS[mode],
    )

def load(module, model, model_class, quantized=False, **kwargs):
    try:
        import tensorflow_text
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text not installed. Please install it by `pip install tensorflow-text` and try again. Also, make sure the tensorflow-text version matches the tensorflow version.'
        )

    if model.split('-')[-1] == '4k':
        default_vocab = MS_EN_4k_BPE_MODEL
    else:
        default_vocab = MS_EN_BPE_MODEL

    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': VOCAB_MODEL.get(module, default_vocab)},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], t5_graph=True, **kwargs)
    tokenizer = SentencePieceBatchEncoder(vocab_file=path['vocab'])
    inputs = ['inputs']
    outputs = []
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'decode': 'import/SelectV2_3:0'}
    )

    if module == 'kesalahan-tatabahasa':
        word_tokenizer = Tokenizer(date=False, time=False).tokenize
    elif module == 'spelling-correction':
        word_tokenizer = Tokenizer(duration=False, date=False).tokenize
    else:
        word_tokenizer = None

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        word_tokenizer=word_tokenizer,
    )

def wiki_load_model():
    if not os.path.isfile(PATH_SUMMARIZE['wiki']['model']):
        print('downloading SUMMARIZE wikipedia frozen model')
        download_file(S3_PATH_SUMMARIZE['wiki']['model'], PATH_SUMMARIZE['wiki']['model'])
    if not os.path.isfile(PATH_SUMMARIZE['wiki']['setting']):
        print('downloading SUMMARIZE wikipedia dictionary')
        download_file(
            S3_PATH_SUMMARIZE['wiki']['setting'],
            PATH_SUMMARIZE['wiki']['setting'],
        )
    g = load_graph(PATH_SUMMARIZE['wiki']['model'])
    x = g.get_tensor_by_name('import/Placeholder_1:0')
    logits = g.get_tensor_by_name('import/logits:0')
    attention = g.get_tensor_by_name('import/attention:0')
    sess = tf.InteractiveSession(graph=g)
    with open(PATH_SUMMARIZE['wiki']['setting']) as fopen:
        dictionary = json.load(fopen)
    return sess, x, logits, attention, dictionary, 50

def transformer(path, s3_path, class_name, model='xlnet', size='base', **kwargs):
    check_file(path[model][size], s3_path[model][size], **kwargs)
    try:
        with open(path[model][size]['setting']) as fopen:
            nodes = json.load(fopen)
        g = load_graph(path[model][size]['model'])
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}/{size}')` and try again"
        )
    if model in ['albert', 'bert']:
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            path[model][size]['tokenizer'], path[model][size]['vocab']
        )
        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=None,
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            cls=cls,
            sep=sep,
            settings=nodes,
        )
    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model][size]['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            settings=nodes,
        )

def load_char(module, model, left_dict, cleaning, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb'},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['x_placeholder']
    outputs = ['greedy', 'tag_greedy']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return TransformerChar(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        left_dict=left_dict,
        cleaning=cleaning,
    )

def transformer(model='base', **kwargs):
    """
    Load transformer encoder-decoder model to translate MS-to-EN.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'small'`` - transformer Small parameters.
        * ``'base'`` - transformer Base parameters.
        * ``'large'`` - transformer Large parameters.

    Returns
    -------
    result: malaya.model.tf.TRANSLATION class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from `malaya.translation.ms_en.available_transformer()`.'
        )
    path = PATH_TRANSLATION['ms-en']
    s3_path = S3_PATH_TRANSLATION['ms-en']
    check_file(path[model], s3_path[model], **kwargs)
    g = load_graph(path[model]['model'], **kwargs)

    from malaya.text.t2t import text_encoder
    from malaya.model.tf import TRANSLATION

    encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    return TRANSLATION(
        g.get_tensor_by_name('import/Placeholder:0'),
        g.get_tensor_by_name('import/greedy:0'),
        g.get_tensor_by_name('import/beam:0'),
        generate_session(graph=g, **kwargs),
        encoder,
    )

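# Usage sketch, hedged: assumes exposure as `malaya.translation.ms_en.transformer`
# (per the error message above); `greedy_decoder` on the returned TRANSLATION
# object is an assumption for illustration, older versions may expose a
# different decoding method name.
#
#   import malaya
#   translator = malaya.translation.ms_en.transformer(model='base')
#   translator.greedy_decoder(['Saya suka makan nasi ayam.'])
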
def deep_model(quantized: bool = False, **kwargs): """ Load LSTM + Bahdanau Attention stemming model, this also include lemmatization. Original size 41.6MB, quantized size 10.6MB . Parameters ---------- quantized : bool, optional (default=False) if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine. Returns ------- result: malaya.stem.DEEP_STEMMER class """ from malaya.preprocessing import _tokenizer check_file(PATH_STEM['deep'], S3_PATH_STEM['deep'], quantized=quantized, **kwargs) if quantized: model_path = 'quantized' else: model_path = 'model' g = load_graph(PATH_STEM['deep'][model_path], **kwargs) bpe, subword_mode = load_yttm(PATH_STEM['deep']['bpe'], id_mode=True) return DEEP_STEMMER( g.get_tensor_by_name('import/Placeholder:0'), g.get_tensor_by_name('import/decode_1/greedy:0'), g.get_tensor_by_name('import/decode_2/beam:0'), generate_session(graph=g, **kwargs), bpe, subword_mode, _tokenizer, )
def load_tatabahasa(path, s3_path, model, model_class, quantized=False, **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    model_path = 'quantized' if quantized else 'model'
    g = load_graph(path[model][model_path], **kwargs)
    tokenizer = SentencePieceEncoder(path[model]['vocab'])
    return model_class(
        X=g.get_tensor_by_name('import/x_placeholder:0'),
        greedy=g.get_tensor_by_name('import/greedy:0'),
        tag_greedy=g.get_tensor_by_name('import/tag_greedy:0'),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )

def deep_model(model: str = 'bahdanau', **kwargs):
    """
    Load seq2seq stemmer deep learning model.

    Returns
    -------
    DEEP_STEMMER: malaya.stemmer._DEEP_STEMMER class
    """
    check_file(PATH_STEM[model], S3_PATH_STEM[model], **kwargs)
    try:
        with open(PATH_STEM[model]['setting'], 'r') as fopen:
            dic_stemmer = json.load(fopen)
        g = load_graph(PATH_STEM[model]['model'])
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('stem/{model}')` and try again"
        )
    return _DEEP_STEMMER(
        g.get_tensor_by_name('import/Placeholder:0'),
        g.get_tensor_by_name('import/logits:0'),
        generate_session(graph=g),
        dic_stemmer,
    )

def load(model, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module='gpt2',
        keys={'model': 'model.pb', 'encoder': GPT2_ENCODER, 'vocab': GPT2_VOCAB},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    with open(path['encoder']) as f:
        en = json.load(f)
    with open(path['vocab'], encoding='utf-8') as f:
        bpe_data = f.read()
    # skip the version header on the first line and the trailing empty line
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    encoder = GPT2Encoder(
        encoder=en,
        bpe_merges=bpe_merges,
    )

    inputs = ['X', 'temp', 'top_k', 'top_p', 'maxlen', 'n_samples']
    outputs = ['output']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return GPT2(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )

def deep_model(**kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model.

    Returns
    -------
    DEEP_STEMMER: malaya.stem.DEEP_STEMMER class
    """
    from malaya.preprocessing import _tokenizer

    check_file(PATH_STEM['deep'], S3_PATH_STEM['deep'], **kwargs)
    g = load_graph(PATH_STEM['deep']['model'])
    bpe, subword_mode = load_yttm(PATH_STEM['deep']['bpe'], id_mode=True)
    return DEEP_STEMMER(
        g.get_tensor_by_name('import/Placeholder:0'),
        g.get_tensor_by_name('import/decode_1/greedy:0'),
        g.get_tensor_by_name('import/decode_2/beam:0'),
        generate_session(graph=g),
        bpe,
        subword_mode,
        _tokenizer,
    )