def load(module, model, encoder, model_class, quantized=False, **kwargs):
    """Load a frozen seq2seq graph plus its vocabulary-driven encoder.

    The encoder class is looked up from ``ENCODER_MODEL`` by the *encoder* key
    and instantiated in id mode over the downloaded vocab file.
    """
    paths = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': LM_VOCAB[module]},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    # replace the encoder *name* with an instantiated encoder object
    encoder = ENCODER_MODEL[encoder](vocab_file=paths['vocab'], id_mode=True)
    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], ['greedy', 'beam']
    )
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        encoder=encoder,
    )
def load_pegasus(module, model, model_class, maxlen, quantized=False, **kwargs):
    """Load a Pegasus-style frozen graph with nucleus-sampling placeholders.

    Feeds `Placeholder`, `top_p` and `temperature`, reads `logits`, and wraps
    everything in *model_class* together with a WordPiece tokenizer.
    """
    paths = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': PEGASUS_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    feed_names = ['Placeholder', 'top_p', 'temperature']
    fetch_names = ['logits']
    input_nodes, output_nodes = nodes_session(graph, feed_names, fetch_names)
    wordpiece = WordPieceTokenizer(vocab_file=paths['vocab'])
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=wordpiece,
        maxlen=maxlen,
    )
def load(module, model, model_class, quantized=False, **kwargs):
    """Load a T5-style frozen graph that requires the tensorflow-text ops.

    Parameters
    ----------
    module : str
        module name used to resolve files and the vocab via ``VOCAB_MODEL``.
    model : str
        model file key passed to ``check_file``.
    model_class : type
        wrapper class receiving the session, nodes and tokenizer.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Raises
    ------
    ModuleNotFoundError
        if tensorflow-text cannot be imported (the graph contains its custom ops).
    """
    try:
        import tensorflow_text
    # `except Exception` instead of `BaseException`: a version-mismatched
    # tensorflow-text can fail with errors other than ImportError, but we must
    # not swallow KeyboardInterrupt / SystemExit; chain the cause for debugging.
    except Exception as e:
        raise ModuleNotFoundError(
            'tensorflow-text not installed. Please install it by `pip install tensorflow-text` and try again. Also, make sure tensorflow-text version same as tensorflow version.'
        ) from e
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': VOCAB_MODEL.get(module, MS_EN_BPE_MODEL)},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], t5_graph=True, **kwargs)
    tokenizer = SentencePieceBatchEncoder(vocab_file=path['vocab'])
    inputs = ['inputs']
    outputs = []
    # decode output is not a named node; fetch it by its graph tensor path
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'decode': 'import/SelectV2_3:0'}
    )
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
def transformer(class_name, model='xlnet', quantized=False, **kwargs):
    """Load a Transformer tagging model (BERT/ALBERT or XLNET/ALXLNET family).

    Parameters
    ----------
    class_name : str
        module name, also keys into ``TAGGING_SETTING`` for the label mapping.
    model : str, optional (default='xlnet')
        one of the BERT/ALBERT/XLNET/ALXLNET variants.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Raises
    ------
    Exception
        if the downloaded setting JSON is unreadable (corrupted cache).
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    # was a bare `except:` whose message interpolated an undefined `size`
    # variable, so the handler itself crashed with NameError; narrow the catch,
    # chain the cause, and drop the bogus `{size}` segment
    except Exception as e:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        ) from e
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        Model = TaggingBERT
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
def _transformer(model, bert_class, xlnet_class, quantized=False, siamese=False, **kwargs):
    """Load a Transformer similarity model.

    Selects `bert_class` or `xlnet_class` depending on the architecture, and
    picks the vectorizer graph node: the pooled summary node when `siamese`,
    otherwise the per-model node from `_vectorizer_mapping`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )
    path = check_file(
        file=model,
        module='similarity',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            tokenizer = AlbertTokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # NOTE(review): 'sequnece_summary' looks misspelled but presumably
            # matches the actual node name baked into the exported graph —
            # do not "fix" without checking the frozen model.
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'
    if not siamese:
        # non-siamese models share a vectorizer node table keyed by model name
        selected_node = _vectorizer_mapping[model]
    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'vectorizer': selected_node})
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def load(module, model, encoder, model_class, quantized=False, **kwargs):
    """Load a seq2seq graph whose encoder is either 'subword' or 'yttm' BPE."""
    paths = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': LM_VOCAB[module]},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    # the `encoder` argument arrives as a string tag and is replaced by an object
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(paths['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(paths['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)
    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], ['greedy', 'beam']
    )
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        encoder=encoder,
    )
def load_lstm(module, left_dict, right_dict, cleaning, quantized=False, **kwargs):
    """Load the LSTM + Bahdanau seq2seq graph as a ``Seq2SeqLSTM`` wrapper."""
    paths = check_file(
        file='lstm-bahdanau',
        module=module,
        keys={'model': 'model.pb'},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    # decode outputs are not plainly-named nodes; fetch them by tensor path
    decode_nodes = {
        'greedy': 'import/decode_1/greedy:0',
        'beam': 'import/decode_2/beam:0',
    }
    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], [], extra=decode_nodes
    )
    return Seq2SeqLSTM(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        left_dict=left_dict,
        right_dict=right_dict,
        cleaning=cleaning,
    )
def load_lm(module, model, model_class, quantized=False, **kwargs):
    """Load a language-model graph with greedy / beam / nucleus decode outputs.

    Parameters
    ----------
    module : str
        module name for file resolution.
    model : str
        model file key passed to ``check_file``.
    model_class : type
        wrapper class receiving the session, nodes and tokenizer.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.
    """
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': T2T_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    # removed five dead `g.get_tensor_by_name(...)` locals (X, top_p, greedy,
    # beam, nucleus) — `nodes_session` below resolves the same tensors and the
    # locals were never used
    tokenizer = SentencePieceEncoder(path['vocab'])
    inputs = ['Placeholder', 'Placeholder_2']
    outputs = ['greedy', 'beam', 'nucleus']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
def load(module, model, model_class, maxlen, quantized=False, **kwargs):
    """Load a translation graph with a SentencePiece encoder and length cap."""
    paths = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': TRANSLATION_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    input_nodes, output_nodes = nodes_session(graph, ['Placeholder'], ['logits'])
    sp_encoder = SentencePieceEncoder(vocab_file=paths['vocab'])
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        encoder=sp_encoder,
        maxlen=maxlen,
    )
def load_tatabahasa(module, model, model_class, quantized=False, **kwargs):
    """Load a grammar-correction (tatabahasa) graph with tag outputs."""
    paths = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': T2T_BPE_MODEL},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    sp_tokenizer = SentencePieceEncoder(vocab_file=paths['vocab'])
    input_nodes, output_nodes = nodes_session(
        graph, ['x_placeholder'], ['greedy', 'tag_greedy']
    )
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=sp_tokenizer,
    )
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )
    paths = check_file(
        file=model,
        module='constituency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    sp_tokenizer = SentencePieceTokenizer(
        vocab_file=paths['vocab'], spm_model_file=paths['tokenizer']
    )
    input_nodes, output_nodes = nodes_session(
        graph,
        ['input_ids', 'word_end_mask'],
        ['charts', 'tags'],
        extra={'vectorizer': _vectorizer_mapping[model]},
    )
    # the wrapper only needs to know which tokenizer family it is driving
    if 'bert' in model:
        mode = 'bert'
    else:
        mode = 'xlnet'
    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=sp_tokenizer,
        dictionary=settings.constituency,
        mode=mode,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.DeepLang class

    Raises
    ------
    ValueError
        if the cached bag-of-words vectorizer pickle is unreadable.
    """
    path = check_file(
        file='lang-32',
        module='language-detection',
        keys={
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe, subword_mode = load_yttm(path['bpe'])
    try:
        with open(path['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    # was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit;
    # narrow to Exception and chain the original error for debugging
    except Exception as e:
        raise ValueError(
            "model corrupted due to some reasons, please run `malaya.clear_cache('language-detection/lang-32')` and try again"
        ) from e
    # the graph consumes two sparse placeholders (X and W), each fed as
    # shape / values / indices triples
    inputs = [
        'X_Placeholder/shape',
        'X_Placeholder/values',
        'X_Placeholder/indices',
        'W_Placeholder/shape',
        'W_Placeholder/values',
        'W_Placeholder/indices',
    ]
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return DeepLang(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        vectorizer=vector,
        bpe=bpe,
        type=subword_mode,
        label=lang_labels,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model, this also include lemmatization.
    Original size 41.6MB, quantized size 10.6MB .

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class

    Raises
    ------
    Exception
        when running under Tensorflow 2.x, which this graph does not support.
    """
    if check_tf_version() > 1:
        # was an f-string with no placeholders; a plain literal is identical
        raise Exception(
            'Tensorflow 2.0 and above not able to use `deep_model` for stemmer, use Tensorflow 1.15 instead.'
        )
    path = check_file(
        file='lstm-bahdanau',
        module='stem',
        keys={'model': 'model.pb', 'vocab': STEMMER_VOCAB},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe, subword_mode = load_yttm(path['vocab'], id_mode=True)
    inputs = ['Placeholder']
    outputs = []
    # decode outputs are fetched by tensor path, not plain node names
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra={
            'greedy': 'import/decode_1/greedy:0',
            'beam': 'import/decode_2/beam:0',
        },
    )
    tokenizer = Tokenizer().tokenize
    return DeepStemmer(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        bpe=bpe,
        subword_mode=subword_mode,
        tokenizer=tokenizer,
    )
def transformer(module, model='xlnet', quantized=False, tok=None, **kwargs):
    """Load a tagging Transformer, dispatching wrapper class, tokenizer and
    graph nodes by architecture family (BERT / XLNET / FastFormer)."""
    paths = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        feed_names = ['Placeholder', 'Placeholder_1']
        extra = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_model = TaggingBERT
        selected_tokenizer = SentencePieceTokenizer
    elif model in ['xlnet', 'alxlnet']:
        feed_names = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        extra = {'vectorizer': 'import/transpose_3:0'}
        selected_model = TaggingXLNET
        selected_tokenizer = SentencePieceTokenizer
    elif model in ['fastformer', 'tiny-fastformer']:
        feed_names = ['Placeholder']
        # fastformer variants expose different vectorizer depths
        extra = {
            'vectorizer': {
                'fastformer': 'import/fast_transformer/add_24:0',
                'tiny-fastformer': 'import/fast_transformer/add_8:0',
            }[model]
        }
        selected_model = TaggingFastFormer
        selected_tokenizer = WordPieceTokenizer
    tokenizer = selected_tokenizer(
        vocab_file=paths['vocab'], spm_model_file=paths['tokenizer']
    )
    input_nodes, output_nodes = nodes_session(
        graph, feed_names, ['logits'], extra=extra
    )
    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        settings=TAGGING_SETTING[module],
        tok=tok,
    )
def transformer_squad(class_name, model='bert', quantized=False, **kwargs):
    """Load a SQUAD-style question-answering Transformer graph."""
    paths = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    feed_names = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']
    if model in ['bert', 'tiny-bert']:
        tokenizer = sentencepiece_tokenizer_bert(paths['tokenizer'], paths['vocab'])
    if model in ['albert', 'tiny-albert']:
        tokenizer = AlbertTokenizer(
            vocab_file=paths['vocab'], spm_model_file=paths['tokenizer']
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(paths['tokenizer'])
        # xlnet family takes one extra placeholder
        feed_names.append('Placeholder_4')
    fetch_names = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]
    input_nodes, output_nodes = nodes_session(graph, feed_names, fetch_names)
    mode = 'bert' if 'bert' in model else 'xlnet'
    return SQUAD(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        class_name=class_name,
        mode=mode,
        length=LENGTHS[mode],
    )
def load(module, model, model_class, quantized=False, **kwargs):
    """Load a T5-style frozen graph requiring tensorflow-text, picking the
    4k-vocab BPE model for `-4k` variants and a per-module word tokenizer.

    Parameters
    ----------
    module : str
        module name; also selects a word tokenizer for grammar / spelling modules.
    model : str
        model file key; a ``-4k`` suffix selects the 4k BPE vocab.
    model_class : type
        wrapper class receiving the session, nodes and tokenizers.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Raises
    ------
    ModuleNotFoundError
        if tensorflow-text cannot be imported.
    """
    try:
        import tensorflow_text
    # `except Exception` instead of `BaseException`: version-mismatched
    # tensorflow-text may fail with non-ImportError errors, but we must not
    # swallow KeyboardInterrupt / SystemExit; chain the cause for debugging.
    except Exception as e:
        raise ModuleNotFoundError(
            'tensorflow-text not installed. Please install it by `pip install tensorflow-text` and try again. Also, make sure tensorflow-text version same as tensorflow version.'
        ) from e
    if model.split('-')[-1] == '4k':
        default_vocab = MS_EN_4k_BPE_MODEL
    else:
        default_vocab = MS_EN_BPE_MODEL
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': VOCAB_MODEL.get(module, default_vocab)},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], t5_graph=True, **kwargs)
    tokenizer = SentencePieceBatchEncoder(vocab_file=path['vocab'])
    inputs = ['inputs']
    outputs = []
    # decode output is fetched by tensor path, not a plain node name
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'decode': 'import/SelectV2_3:0'})
    if module == 'kesalahan-tatabahasa':
        word_tokenizer = Tokenizer(date=False, time=False).tokenize
    elif module == 'spelling-correction':
        word_tokenizer = Tokenizer(duration=False, date=False).tokenize
    else:
        word_tokenizer = None
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        word_tokenizer=word_tokenizer,
    )
def load_char(module, model, left_dict, cleaning, quantized=False, **kwargs):
    """Load a character-level Transformer graph as a ``TransformerChar``."""
    paths = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb'},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    input_nodes, output_nodes = nodes_session(
        graph, ['x_placeholder'], ['greedy', 'tag_greedy']
    )
    return TransformerChar(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        left_dict=left_dict,
        cleaning=cleaning,
    )
def load(model, quantized=False, **kwargs):
    """Load a GPT2 frozen graph plus its byte-pair encoder.

    Parameters
    ----------
    model : str
        model file key passed to ``check_file``.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.
    """
    path = check_file(
        file=model,
        module='gpt2',
        keys={'model': 'model.pb', 'encoder': GPT2_ENCODER, 'vocab': GPT2_VOCAB},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    # read the encoder JSON as UTF-8 explicitly (the vocab file already did):
    # GPT2 encoder files contain non-ASCII byte-pair symbols, and relying on
    # the platform default encoding breaks on non-UTF-8 locales (e.g. Windows)
    with open(path['encoder'], encoding='utf-8') as f:
        en = json.load(f)
    with open(path['vocab'], encoding='utf-8') as f:
        bpe_data = f.read()
    # skip the header line and the trailing empty line of the merges file
    bpe_merges = [
        tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]
    ]
    encoder = GPT2Encoder(
        encoder=en,
        bpe_merges=bpe_merges,
    )
    inputs = ['X', 'temp', 'top_k', 'top_p', 'maxlen', 'n_samples']
    outputs = ['output']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    return GPT2(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.DependencyBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.dependency.available_transformer()`.'
        )
    paths = check_file(
        file=model,
        module='dependency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    if model in ('bert', 'tiny-bert', 'albert', 'tiny-albert'):
        tokenizer = sentencepiece_tokenizer_bert(paths['tokenizer'], paths['vocab'])
        feed_names = ['Placeholder']
        extra = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_class = DependencyBERT
    elif model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(paths['tokenizer'])
        feed_names = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        extra = {'vectorizer': 'import/transpose_3:0'}
        selected_class = DependencyXLNET
    input_nodes, output_nodes = nodes_session(
        graph, feed_names, ['logits', 'heads_seq'], extra=extra
    )
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        settings=label,
    )
def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    version : str, optional (default='v2')
        Version supported. Allowed values:

        * ``'v1'`` - version 1, maintain for knowledge graph.
        * ``'v2'`` - Trained on bigger dataset, better version.

    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.DependencyBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`.
    """
    version = _validate_version(version)
    model = model.lower()
    if model not in _transformer_availability[version]:
        raise ValueError(
            # the `f` prefix was missing, so `{version}` was printed literally
            f"model not supported, please check supported models from `malaya.dependency.available_transformer(version='{version}')`."
        )
    module = 'dependency'
    # v1 graphs use a different head-index offset than later versions
    minus = 1
    if version != 'v1':
        module = f'{module}-{version}'
        minus = 2
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        inputs = ['Placeholder']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_model = DependencyBERT
    if model in ['xlnet', 'alxlnet']:
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        selected_model = DependencyXLNET
    outputs = ['logits', 'heads_seq']
    tokenizer = SentencePieceTokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=label,
        minus=minus,
    )
def transformer_ontonotes5(class_name, model='xlnet', quantized=False, **kwargs):
    """Load an OntoNotes5-style tagging Transformer with a custom word tokenizer.

    Parameters
    ----------
    class_name : str
        module name, also keys into ``TAGGING_SETTING`` for the label mapping.
    model : str, optional (default='xlnet')
        one of the BERT/ALBERT/XLNET/ALXLNET variants.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Raises
    ------
    Exception
        if the downloaded setting JSON is unreadable (corrupted cache).
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    # hyphenated-word patterns, with optional spaces around the dash
    hypen = r'\w+(?:-\w+)+'
    hypen_left = r'\w+(?: -\w+)+'
    hypen_right = r'\w+(?:- \w+)+'
    hypen_both = r'\w+(?: - \w+)+'
    pipeline = [
        hypen,
        hypen_left,
        hypen_right,
        hypen_both,
        _expressions['percent'],
        _expressions['money'],
        _expressions['time'],
        _expressions['date'],
        _expressions['repeat_puncts'],
        _expressions['number'],
        _expressions['word'],
    ]
    # fallback: any single non-whitespace char; raw string avoids the
    # invalid-escape-sequence DeprecationWarning of the original '(?:\S)'
    pipeline.append(r'(?:\S)')
    compiled = re.compile(r'({})'.format('|'.join(pipeline)))

    def tok(string):
        # NOTE(review): `t[0]` assumes `findall` yields tuples, i.e. that the
        # `_expressions` patterns contribute extra capturing groups — verify
        # against their definitions.
        tokens = compiled.findall(string)
        return [t[0] for t in tokens]

    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    # was a bare `except:` whose message interpolated an undefined `size`
    # variable, so the handler itself crashed with NameError; narrow the catch,
    # chain the cause, and drop the bogus `{size}` segment
    except Exception as e:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        ) from e
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )
        # removed dead `inputs` / `vectorizer` / `Model` assignments: this
        # branch returns directly with explicit tensors (legacy constructor)
        return TaggingBERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
            tok=tok,
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs, extra=vectorizer)
    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
def transformer(model: str = 'bert', quantized: bool = False, **kwargs):
    """
    Load Transformer keyword similarity model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.KeyphraseBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.KeyphraseXLNET`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.keyword_extraction.available_transformer()`.'
        )
    paths = check_file(
        file=model,
        module='keyword-extraction',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(paths['model'], **kwargs)
    fetch_names = ['logits']
    if model in ('bert', 'tiny-bert'):
        feed_names = [
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
        ]
        fetch_names.append('bert/summary')
        selected_class = KeyphraseBERT
    elif model in ('xlnet', 'alxlnet'):
        feed_names = [
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
            'Placeholder_4',
            'Placeholder_5',
        ]
        fetch_names.append('xlnet/summary')
        selected_class = KeyphraseXLNET
    sp_tokenizer = SentencePieceTokenizer(
        vocab_file=paths['vocab'], spm_model_file=paths['tokenizer']
    )
    input_nodes, output_nodes = nodes_session(graph, feed_names, fetch_names)
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=sp_tokenizer,
        label=['not similar', 'similar'],
    )
def transformer(
    module,
    label,
    model='bert',
    sigmoid=False,
    quantized=False,
    **kwargs,
):
    """Load a classification Transformer.

    Picks the wrapper class from SIGMOID/MULTICLASS/BINARY tables, then the
    graph feed nodes, vectorizer tensor and attention extractor per
    architecture family, and returns the instantiated wrapper.
    """
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    if sigmoid:
        selected_model = SIGMOID_MODEL[model]
    else:
        # relevancy is always treated as multiclass regardless of label count
        if len(label) > 2 or module == 'relevancy':
            selected_model = MULTICLASS_MODEL[model]
        else:
            selected_model = BINARY_MODEL[model]
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            attention = bert_attention_weights(bert_num_layers[model], g)
        elif model in ['albert', 'tiny-albert']:
            attention = albert_attention_weights(albert_num_layers[model], g)
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
    elif model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            weights_import = xlnet_attention_weights
        elif model in ['alxlnet']:
            weights_import = alxlnet_attention_weights
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        attention = weights_import(g)
    elif model in ['bigbird', 'tiny-bigbird']:
        inputs = ['Placeholder']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        # no attention extractor available for this family
        attention = None
    elif model in ['fnet', 'fnet-large']:
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/vectorizer:0'}
        attention = None
    elif model in ['fastformer', 'tiny-fastformer']:
        inputs = ['Placeholder']
        # fastformer variants expose different vectorizer depths
        vectorizer_nodes = {
            'fastformer': 'import/fast_transformer/add_24:0',
            'tiny-fastformer': 'import/fast_transformer/add_8:0'
        }
        vectorizer = {'vectorizer': vectorizer_nodes[model]}
        attention = None
    outputs = ['logits', 'logits_seq']
    tokenizer = TOKENIZER_MODEL[model](vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra=vectorizer,
        attention={'attention': attention},
    )
    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=label,
        module=module,
    )
def transformer(
    class_name,
    label,
    model='bert',
    sigmoid=False,
    quantized=False,
    **kwargs,
):
    """Load a classification Transformer (legacy variant).

    Chooses Sigmoid/Multiclass/Binary wrapper classes explicitly per
    architecture, imports the matching attention-weight extractor lazily, and
    returns the instantiated wrapper.
    """
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    if sigmoid:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = SigmoidBERT
        if model in ['xlnet', 'alxlnet']:
            selected_class = SigmoidXLNET
    else:
        # relevancy is always treated as multiclass regardless of label count
        if len(label) > 2 or class_name == 'relevancy':
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = MulticlassBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = MulticlassXLNET
            if model in ['bigbird', 'tiny-bigbird']:
                selected_class = MulticlassBigBird
        else:
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = BinaryBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = BinaryXLNET
    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            # lazy imports keep heavy transformer modules out of startup
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = _extract_attention_weights_import(bert_num_layers[model], g)
    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        attention = _extract_attention_weights_import(g)
    if model in ['bigbird', 'tiny-bigbird']:
        inputs = ['Placeholder']
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'], path['vocab'])
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        # no attention extractor available for this family
        attention = None
    outputs = ['logits', 'logits_seq']
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra=vectorizer,
        attention={'attention': attention},
    )
    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=label,
        class_name=class_name,
    )