def load(path, s3_path, model, encoder, model_class, quantized=False, **kwargs):
    """Fetch a (possibly quantized) frozen seq2seq graph and wrap it in `model_class`.

    Parameters
    ----------
    path : dict
        Local path mapping; `path[model]` holds 'model', 'quantized' and 'vocab' entries.
    s3_path : dict
        Remote path mapping used by `check_file` to download missing artifacts.
    model : str
        Key selecting which model entry to load from `path` / `s3_path`.
    encoder : str or object
        'subword' or 'yttm' to build the matching encoder from the vocab file,
        otherwise passed through unchanged.
    model_class : callable
        Wrapper class receiving the graph tensors, session and encoder.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph instead of the full one.

    Returns
    -------
    model_class instance
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    # Pick which serialized graph to read based on the quantization flag.
    graph_key = 'quantized' if quantized else 'model'
    graph = load_graph(path[model][graph_key], **kwargs)
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)
    return model_class(
        X=graph.get_tensor_by_name('import/Placeholder:0'),
        greedy=graph.get_tensor_by_name('import/greedy:0'),
        beam=graph.get_tensor_by_name('import/beam:0'),
        sess=generate_session(graph=graph, **kwargs),
        encoder=encoder,
    )
def multinomial(path, s3_path, module, label, sigmoid=False, **kwargs):
    """Load a pickled multinomial Naive-Bayes model plus its vectorizer and BPE encoder.

    Parameters
    ----------
    path : dict
        Local path mapping; `path['multinomial']` holds 'model', 'vector' and 'bpe'.
    s3_path : dict
        Remote path mapping used by `check_file` to download missing artifacts.
    module : str
        Module name, used only to build the cache-clearing hint in the error message.
    label : list
        Class labels; its length decides binary vs multiclass when `sigmoid` is False.
    sigmoid : bool, optional (default=False)
        If True, wrap as a multilabel (sigmoid) model regardless of label count.

    Returns
    -------
    MultilabelBayes, MulticlassBayes or BinaryBayes instance

    Raises
    ------
    Exception
        If the pickled artifacts are corrupted and cannot be loaded.
    """
    path = check_file(path['multinomial'], s3_path['multinomial'], **kwargs)
    try:
        with open(path['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(path['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    # Was `except BaseException`, which also swallowed KeyboardInterrupt /
    # SystemExit; `Exception` is wide enough for any pickle/IO corruption.
    except Exception:
        path = os.path.normpath(f'{module}/multinomial')
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{path}')` and try again"
        )
    bpe = YTTMEncoder(vocab_file=path['bpe'])
    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)
    # sigmoid => multilabel; otherwise pick multiclass/binary from label count.
    if sigmoid:
        selected_model = MultilabelBayes
    elif len(label) > 2:
        selected_model = MulticlassBayes
    else:
        selected_model = BinaryBayes
    return selected_model(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        cleaning=cleaning,
    )
def load(module, model, encoder, model_class, quantized=False, **kwargs):
    """Fetch a frozen seq2seq graph via `check_file` and wrap it in `model_class`.

    Parameters
    ----------
    module : str
        Module name used to resolve the vocab file through `LM_VOCAB`.
    model : str
        Model identifier passed to `check_file`.
    encoder : str or object
        'subword' or 'yttm' to build the matching encoder from the vocab file,
        otherwise passed through unchanged.
    model_class : callable
        Wrapper class receiving the resolved input/output nodes, session and encoder.
    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized graph.

    Returns
    -------
    model_class instance
    """
    file_keys = {'model': 'model.pb', 'vocab': LM_VOCAB[module]}
    path = check_file(
        file=model,
        module=module,
        keys=file_keys,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)
    # Resolve the graph's single input plus the two decoding outputs.
    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], ['greedy', 'beam']
    )
    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        encoder=encoder,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.DeepLang class
    """
    path = check_file(
        file='lang-32',
        module='language-detection',
        keys={
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)
    bpe = YTTMEncoder(vocab_file=path['bpe'])
    # Pretrained bag-of-words vectorizer pickled alongside the graph.
    with open(path['vector'], 'rb') as fopen:
        vector = pickle.load(fopen)
    # Two sparse placeholders (X and W), each given as shape/values/indices.
    placeholder_parts = ['shape', 'values', 'indices']
    input_names = [
        f'{prefix}_Placeholder/{part}'
        for prefix in ('X', 'W')
        for part in placeholder_parts
    ]
    input_nodes, output_nodes = nodes_session(graph, input_names, ['logits'])
    return DeepLang(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        vectorizer=vector,
        bpe=bpe,
        label=lang_labels,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model, this also include lemmatization.
    Original size 41.6MB, quantized size 10.6MB .

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class
    """
    path = check_file(
        file='lstm-bahdanau',
        module='stem',
        keys={'model': 'model.pb', 'vocab': STEMMER_VOCAB},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)
    bpe = YTTMEncoder(vocab_file=path['vocab'], id_mode=True)
    # Decoding outputs live under non-standard node names, so they are
    # supplied through `extra` instead of the plain outputs list.
    input_nodes, output_nodes = nodes_session(
        graph,
        ['Placeholder'],
        [],
        extra={
            'greedy': 'import/decode_1/greedy:0',
            'beam': 'import/decode_2/beam:0',
        },
    )
    return DeepStemmer(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        bpe=bpe,
        tokenizer=Tokenizer().tokenize,
    )
def load(path, s3_path, model, encoder, model_class, **kwargs):
    """Fetch a frozen seq2seq graph and wrap it in `model_class` (positional API).

    Parameters
    ----------
    path : dict
        Local path mapping; `path[model]` holds 'model' and 'vocab' entries.
    s3_path : dict
        Remote path mapping used by `check_file` to download missing artifacts.
    model : str
        Key selecting which model entry to load from `path` / `s3_path`.
    encoder : str or object
        'subword' or 'yttm' to build the matching encoder from the vocab file,
        otherwise passed through unchanged.
    model_class : callable
        Wrapper class called positionally with (X, greedy, beam, sess, encoder).

    Returns
    -------
    model_class instance
    """
    check_file(path[model], s3_path[model], **kwargs)
    graph = load_graph(path[model]['model'], **kwargs)
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)
    # Resolve tensors up front; model_class takes them positionally.
    input_tensor = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_tensor = graph.get_tensor_by_name('import/greedy:0')
    beam_tensor = graph.get_tensor_by_name('import/beam:0')
    session = generate_session(graph=graph, **kwargs)
    return model_class(input_tensor, greedy_tensor, beam_tensor, session, encoder)