Example #1
def load(path,
         s3_path,
         model,
         model_class,
         compressed=True,
         quantized=False,
         **kwargs):

    try:
        import tensorflow_text
        import tf_sentencepiece
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text and tf-sentencepiece not installed. Please install them with `pip install tensorflow-text==1.15.0 tf-sentencepiece==0.1.86` and try again. Also, make sure the tensorflow-text version matches the tensorflow version.'
        )

    if compressed and not quantized:
        path = path['t5-compressed']
        s3_path = s3_path['t5-compressed']
        check_file(path[model]['model'], s3_path[model], **kwargs)

        if not os.path.exists(path[model]['directory'] + 'saved_model.pb'):
            import tarfile

            with tarfile.open(path[model]['model']) as tar:
                tar.extractall(path=path[model]['path'])

        X = None
        decode = None
        sess = generate_session(graph=None, **kwargs)
        meta_graph_def = tf.compat.v1.saved_model.load(
            sess, ['serve'], path[model]['directory'])
        signature_def = meta_graph_def.signature_def['serving_default']
        pred = lambda x: sess.run(
            fetches=signature_def.outputs['outputs'].name,
            feed_dict={signature_def.inputs['input'].name: x},
        )

    else:
        path = path['t5']
        s3_path = s3_path['t5']
        check_file(path[model],
                   s3_path[model],
                   quantized=quantized,
                   optimized=True,
                   **kwargs)
        if quantized:
            model_path = 'quantized'
        else:
            model_path = 'model'
        g = load_graph(path[model][model_path], **kwargs)
        X = g.get_tensor_by_name('import/inputs:0')
        decode = g.get_tensor_by_name(
            'import/SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp:0'
        )
        sess = generate_session(graph=g, **kwargs)
        pred = None

    return model_class(X=X, decode=decode, sess=sess, pred=pred)
Example #2
def transformer(path,
                s3_path,
                class_name,
                model='xlnet',
                quantized=False,
                **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)

    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception:
        raise Exception(
            f"model corrupted for some reason, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        )

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])

        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
Example #3
def load(module, model, encoder, model_class, quantized=False, **kwargs):

    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': LM_VOCAB[module]
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    encoder = ENCODER_MODEL[encoder](vocab_file=path['vocab'], id_mode=True)

    inputs = ['Placeholder']
    outputs = ['greedy', 'beam']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
Example #4
def transformer(class_name, model='xlnet', quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[class_name],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    try:
        with open(path['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception:
        raise Exception(
            f"model corrupted for some reason, please run `malaya.clear_cache('{class_name}/{model}')` and try again"
        )

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])

        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )

        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        Model = TaggingBERT

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        Model = TaggingXLNET

    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g,
                                              inputs,
                                              outputs,
                                              extra=vectorizer)

    return Model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=nodes,
    )
Example #5
def load(module, model, encoder, model_class, quantized=False, **kwargs):

    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': LM_VOCAB[module]
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path['vocab'])

    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    inputs = ['Placeholder']
    outputs = ['greedy', 'beam']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
Example #6
def load_lm(module, model, model_class, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': T2T_BPE_MODEL
        },
        quantized=quantized,
        **kwargs,
    )

    g = load_graph(path['model'], **kwargs)
    X = g.get_tensor_by_name('import/Placeholder:0')
    top_p = g.get_tensor_by_name('import/Placeholder_2:0')
    greedy = g.get_tensor_by_name('import/greedy:0')
    beam = g.get_tensor_by_name('import/beam:0')
    nucleus = g.get_tensor_by_name('import/nucleus:0')

    tokenizer = SentencePieceEncoder(path['vocab'])

    inputs = ['Placeholder', 'Placeholder_2']
    outputs = ['greedy', 'beam', 'nucleus']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
Example #7
def load_lstm(module,
              left_dict,
              right_dict,
              cleaning,
              quantized=False,
              **kwargs):
    path = check_file(
        file='lstm-bahdanau',
        module=module,
        keys={'model': 'model.pb'},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder']
    outputs = []
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra={
            'greedy': 'import/decode_1/greedy:0',
            'beam': 'import/decode_2/beam:0',
        },
    )
    return Seq2SeqLSTM(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        left_dict=left_dict,
        right_dict=right_dict,
        cleaning=cleaning,
    )
Example #8
    def __init__(self, embed_matrix, dictionary: dict, **kwargs):
        """
        Parameters
        ----------
        embed_matrix: numpy array
        dictionary: dictionary
        """

        self._embed_matrix = embed_matrix
        self._dictionary = dictionary
        self._reverse_dictionary = {v: k for k, v in dictionary.items()}
        self.words = list(dictionary.keys())
        self._jarowinkler = JaroWinkler()
        device = get_device(**kwargs)
        _graph = tf.Graph()
        with _graph.as_default():
            with tf.device(device):
                self._embedding = tf.compat.v1.placeholder(
                    tf.float32, self._embed_matrix.shape)
                self._x = tf.compat.v1.placeholder(
                    tf.float32, [None, self._embed_matrix.shape[1]])
                normed_embedding = tf.nn.l2_normalize(self._embedding, axis=1)
                normed_array = tf.nn.l2_normalize(self._x, axis=1)
                self._cosine_similarity = tf.matmul(
                    normed_array, tf.transpose(normed_embedding, [1, 0]))
                self._sess = generate_session(_graph, **kwargs)
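
A short sketch of how the graph above might be queried, assuming `emb` is an instance of this class; the matrix and shapes are illustrative only:

import numpy as np

# Hypothetical 5-word vocabulary with 4-dimensional embeddings.
embed_matrix = np.random.rand(5, 4).astype('float32')

# Cosine similarity of the first word against every word in the matrix,
# fed through the placeholders built in __init__ above.
sims = emb._sess.run(
    emb._cosine_similarity,
    feed_dict={emb._embedding: embed_matrix, emb._x: embed_matrix[:1]},
)
# sims has shape (1, 5); each entry is a cosine similarity in [-1, 1].
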
Example #9
def _transformer(model,
                 bert_class,
                 xlnet_class,
                 quantized=False,
                 siamese=False,
                 **kwargs):
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    path = check_file(
        file=model,
        module='similarity',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])

        if model in ['albert', 'tiny-albert']:
            tokenizer = AlbertTokenizer(vocab_file=path['vocab'],
                                        spm_model_file=path['tokenizer'])
        selected_class = bert_class

        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'

    if model in ['xlnet', 'alxlnet']:

        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'vectorizer': selected_node})

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
Example #10
def _transformer(model,
                 bert_class,
                 xlnet_class,
                 quantized=False,
                 siamese=False,
                 **kwargs):
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    check_file(PATH_SIMILARITY[model],
               S3_PATH_SIMILARITY[model],
               quantized=quantized,
               **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_SIMILARITY[model][model_path], **kwargs)

    path = PATH_SIMILARITY

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])

        if model in ['albert', 'tiny-albert']:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'

    if model in ['xlnet', 'alxlnet']:

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    return selected_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
        logits=g.get_tensor_by_name('import/logits:0'),
        vectorizer=g.get_tensor_by_name(selected_node),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
Example #11
def load_lm(path, s3_path, model, model_class, quantized=False, **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'

    g = load_graph(path[model][model_path], **kwargs)
    X = g.get_tensor_by_name('import/Placeholder:0')
    top_p = g.get_tensor_by_name('import/Placeholder_2:0')
    greedy = g.get_tensor_by_name('import/greedy:0')
    beam = g.get_tensor_by_name('import/beam:0')
    nucleus = g.get_tensor_by_name('import/nucleus:0')

    tokenizer = SentencePieceEncoder(path[model]['vocab'])

    return model_class(
        X=X,
        top_p=top_p,
        greedy=greedy,
        beam=beam,
        nucleus=nucleus,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
Example #12
def load(module, model, model_class, maxlen, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': TRANSLATION_BPE_MODEL
        },
        quantized=quantized,
        **kwargs,
    )

    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    encoder = SentencePieceEncoder(vocab_file=path['vocab'])

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
        maxlen=maxlen,
    )
Example #13
def load_pegasus(module,
                 model,
                 model_class,
                 maxlen,
                 quantized=False,
                 **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': PEGASUS_BPE_MODEL
        },
        quantized=quantized,
        **kwargs,
    )

    g = load_graph(path['model'], **kwargs)

    inputs = ['Placeholder', 'top_p', 'temperature']
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)
    tokenizer = WordPieceTokenizer(vocab_file=path['vocab'])

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        maxlen=maxlen,
    )
Example #14
def load_tatabahasa(module, model, model_class, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': T2T_BPE_MODEL
        },
        quantized=quantized,
        **kwargs,
    )

    g = load_graph(path['model'], **kwargs)
    tokenizer = SentencePieceEncoder(vocab_file=path['vocab'])

    inputs = ['x_placeholder']
    outputs = ['greedy', 'tag_greedy']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
Example #15
def load(path,
         s3_path,
         model,
         encoder,
         model_class,
         quantized=False,
         **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'

    g = load_graph(path[model][model_path], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])

    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    return model_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        greedy=g.get_tensor_by_name('import/greedy:0'),
        beam=g.get_tensor_by_name('import/beam:0'),
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
Example #16
def load(module, model, model_class, quantized=False, **kwargs):

    try:
        import tensorflow_text
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text not installed. Please install it with `pip install tensorflow-text` and try again. Also, make sure the tensorflow-text version matches the tensorflow version.'
        )

    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb', 'vocab': VOCAB_MODEL.get(module, MS_EN_BPE_MODEL)},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], t5_graph=True, **kwargs)
    tokenizer = SentencePieceBatchEncoder(vocab_file=path['vocab'])
    inputs = ['inputs']
    outputs = []
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'decode': 'import/SelectV2_3:0'}
    )

    return model_class(
        input_nodes=input_nodes, output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
Example #17
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load an 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )

    path = check_file(
        file=model,
        module='constituency',
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    inputs = ['input_ids', 'word_end_mask']
    outputs = ['charts', 'tags']
    tokenizer = SentencePieceTokenizer(vocab_file=path['vocab'],
                                       spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'vectorizer': _vectorizer_mapping[model]})
    mode = 'bert' if 'bert' in model else 'xlnet'

    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=settings.constituency,
        mode=mode,
    )
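
A minimal usage sketch for the loader above; the `malaya.constituency` module path is inferred from the error message and is an assumption:

import malaya

# Hypothetical usage; assumes the pretrained files can be downloaded.
model = malaya.constituency.transformer(model='tiny-bert')
# model is a malaya.model.tf.Constituency instance wrapping the loaded graph
# session, SentencePiece tokenizer, and constituency dictionary.
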
Example #18
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    Returns
    -------
    result : malaya.model.tf.CONSTITUENCY class
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )

    check_file(PATH_CONSTITUENCY[model], S3_PATH_CONSTITUENCY[model], **kwargs)
    g = load_graph(PATH_CONSTITUENCY[model]['model'], **kwargs)

    with open(PATH_CONSTITUENCY[model]['dictionary']) as fopen:
        dictionary = json.load(fopen)

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:

        tokenizer = sentencepiece_tokenizer_bert(
            PATH_CONSTITUENCY[model]['tokenizer'],
            PATH_CONSTITUENCY[model]['vocab'],
        )
        mode = 'bert'

    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_CONSTITUENCY[model]['tokenizer'])
        mode = 'xlnet'

    from malaya.model.tf import CONSTITUENCY

    return CONSTITUENCY(
        input_ids=g.get_tensor_by_name('import/input_ids:0'),
        word_end_mask=g.get_tensor_by_name('import/word_end_mask:0'),
        charts=g.get_tensor_by_name('import/charts:0'),
        tags=g.get_tensor_by_name('import/tags:0'),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
Example #19
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, quantized size is 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load an 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result : malaya.model.tf.DeepLang class
    """

    path = check_file(
        file = 'lang-32',
        module = 'language-detection',
        keys = {
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized = quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe, subword_mode = load_yttm(path['bpe'])

    try:
        with open(path['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except Exception:
        raise ValueError(
            "model corrupted due to some reasons, please run `malaya.clear_cache('language-detection/lang-32')` and try again"
        )

    inputs = [
        'X_Placeholder/shape',
        'X_Placeholder/values',
        'X_Placeholder/indices',
        'W_Placeholder/shape',
        'W_Placeholder/values',
        'W_Placeholder/indices',
    ]
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return DeepLang(
        input_nodes = input_nodes,
        output_nodes = output_nodes,
        sess = generate_session(graph = g, **kwargs),
        vectorizer = vector,
        bpe = bpe,
        type = subword_mode,
        label = lang_labels,
    )
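
A usage sketch, assuming this loader is exposed as `malaya.language_detection.deep_model`; the module path is an assumption based on the `language-detection` cache key above:

import malaya

# Hypothetical usage; loads the 8-bit graph variant.
lang_model = malaya.language_detection.deep_model(quantized=True)
# lang_model is a malaya.model.tf.DeepLang instance combining the frozen graph,
# the pickled bag-of-words vectorizer, and the YTTM BPE loaded above.
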
Example #20
def deep_model(quantized: bool = False, **kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model; this also includes lemmatization.
    Original size is 41.6MB, quantized size is 10.6MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load an 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class
    """

    if check_tf_version() > 1:
        raise Exception(
            'Tensorflow 2.0 and above cannot use `deep_model` for the stemmer; use Tensorflow 1.15 instead.'
        )

    path = check_file(
        file='lstm-bahdanau',
        module='stem',
        keys={
            'model': 'model.pb',
            'vocab': STEMMER_VOCAB
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    bpe, subword_mode = load_yttm(path['vocab'], id_mode=True)
    inputs = ['Placeholder']
    outputs = []
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra={
            'greedy': 'import/decode_1/greedy:0',
            'beam': 'import/decode_2/beam:0',
        },
    )

    tokenizer = Tokenizer().tokenize

    return DeepStemmer(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        bpe=bpe,
        subword_mode=subword_mode,
        tokenizer=tokenizer,
    )
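
A usage sketch under the docstring's constraints, assuming the loader is exposed as `malaya.stem.deep_model` (inferred from the `stem` module key above):

import malaya

# Hypothetical usage; the version check above requires Tensorflow 1.15.
stemmer = malaya.stem.deep_model(quantized=False)
# stemmer is a malaya.stem.DeepStemmer wrapping the greedy and beam decode
# nodes pulled out via nodes_session above.
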
Example #21
def transformer(
    path, s3_path, class_name, model = 'xlnet', size = 'base', **kwargs
):
    check_file(path[model][size], s3_path[model][size], **kwargs)

    try:
        with open(path[model][size]['setting']) as fopen:
            nodes = json.load(fopen)
        g = load_graph(path[model][size]['model'])
    except Exception:
        raise Exception(
            f"model corrupted for some reason, please run `malaya.clear_cache('{class_name}/{model}/{size}')` and try again"
        )

    if model in ['albert', 'bert']:
        tokenizer, cls, sep = sentencepiece_tokenizer_bert(
            path[model][size]['tokenizer'], path[model][size]['vocab']
        )
        return TAGGING_BERT(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = None,
            input_masks = None,
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            cls = cls,
            sep = sep,
            settings = nodes,
        )

    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(
            path[model][size]['tokenizer']
        )
        return TAGGING_XLNET(
            X = g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks = g.get_tensor_by_name('import/Placeholder_2:0'),
            logits = g.get_tensor_by_name('import/logits:0'),
            sess = generate_session(graph = g),
            tokenizer = tokenizer,
            settings = nodes,
        )
Example #22
    def __init__(self,
                 xlnet_config,
                 tokenizer,
                 checkpoint,
                 pool_mode='last',
                 **kwargs):

        kwargs_config = dict(
            is_training=True,
            use_tpu=False,
            use_bfloat16=False,
            dropout=0.0,
            dropatt=0.0,
            init='normal',
            init_range=0.1,
            init_std=0.05,
            clamp_len=-1,
        )

        xlnet_parameters = xlnet_lib.RunConfig(**kwargs_config)

        self._tokenizer = tokenizer
        device = get_device(**kwargs)
        _graph = tf.Graph()
        with _graph.as_default():
            with tf.device(device):
                self.X = tf.placeholder(tf.int32, [None, None])
                self.segment_ids = tf.placeholder(tf.int32, [None, None])
                self.input_masks = tf.placeholder(tf.float32, [None, None])

                xlnet_model = xlnet_lib.XLNetModel(
                    xlnet_config=xlnet_config,
                    run_config=xlnet_parameters,
                    input_ids=tf.transpose(self.X, [1, 0]),
                    seg_ids=tf.transpose(self.segment_ids, [1, 0]),
                    input_mask=tf.transpose(self.input_masks, [1, 0]),
                )

                self.logits = xlnet_model.get_pooled_out(pool_mode, True)
                self._sess = generate_session(_graph, **kwargs)
                self._sess.run(tf.global_variables_initializer())
                tvars = tf.trainable_variables()
                assignment_map, _ = get_assignment_map_from_checkpoint(
                    tvars, checkpoint)
                self._saver = tf.train.Saver(var_list=assignment_map)
                attentions = [
                    n.name for n in tf.get_default_graph().as_graph_def().node
                    if 'rel_attn/Softmax' in n.name
                ]
                g = tf.get_default_graph()
                self.attention_nodes = [
                    g.get_tensor_by_name('%s:0' % (a)) for a in attentions
                ]
Example #23
def transformer(module, model='xlnet', quantized=False, tok=None, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_model = TaggingBERT
        selected_tokenizer = SentencePieceTokenizer

    elif model in ['xlnet', 'alxlnet']:
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        selected_model = TaggingXLNET
        selected_tokenizer = SentencePieceTokenizer

    elif model in ['fastformer', 'tiny-fastformer']:
        inputs = ['Placeholder']
        vectorizer_nodes = {'fastformer': 'import/fast_transformer/add_24:0',
                            'tiny-fastformer': 'import/fast_transformer/add_8:0'}
        vectorizer = {'vectorizer': vectorizer_nodes[model]}
        selected_model = TaggingFastFormer
        selected_tokenizer = WordPieceTokenizer

    outputs = ['logits']
    tokenizer = selected_tokenizer(vocab_file=path['vocab'], spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra=vectorizer
    )

    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        settings=TAGGING_SETTING[module],
        tok=tok
    )
Example #24
def transformer_squad(class_name, model = 'bert', quantized = False, **kwargs):
    path = check_file(
        file = model,
        module = class_name,
        keys = {
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized = quantized,
        **kwargs,
    )

    g = load_graph(path['model'], **kwargs)
    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']

    if model in ['bert', 'tiny-bert']:
        tokenizer = sentencepiece_tokenizer_bert(
            path['tokenizer'], path['vocab']
        )
    if model in ['albert', 'tiny-albert']:
        tokenizer = AlbertTokenizer(
            vocab_file = path['vocab'], spm_model_file = path['tokenizer']
        )
    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        inputs.append('Placeholder_4')

    outputs = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    mode = 'bert' if 'bert' in model else 'xlnet'
    return SQUAD(
        input_nodes = input_nodes,
        output_nodes = output_nodes,
        sess = generate_session(graph = g, **kwargs),
        tokenizer = tokenizer,
        class_name = class_name,
        mode = mode,
        length = LENGTHS[mode],
    )
Example #25
def load(module, model, model_class, quantized=False, **kwargs):

    try:
        import tensorflow_text
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text not installed. Please install it with `pip install tensorflow-text` and try again. Also, make sure the tensorflow-text version matches the tensorflow version.'
        )

    if model.split('-')[-1] == '4k':
        default_vocab = MS_EN_4k_BPE_MODEL
    else:
        default_vocab = MS_EN_BPE_MODEL

    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': VOCAB_MODEL.get(module, default_vocab)
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], t5_graph=True, **kwargs)
    tokenizer = SentencePieceBatchEncoder(vocab_file=path['vocab'])
    inputs = ['inputs']
    outputs = []
    input_nodes, output_nodes = nodes_session(
        g, inputs, outputs, extra={'decode': 'import/SelectV2_3:0'})

    if module == 'kesalahan-tatabahasa':
        word_tokenizer = Tokenizer(date=False, time=False).tokenize
    elif module == 'spelling-correction':
        word_tokenizer = Tokenizer(duration=False, date=False).tokenize
    else:
        word_tokenizer = None

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        word_tokenizer=word_tokenizer,
    )
Example #26
def load(path, s3_path, model, encoder, model_class, **kwargs):
    check_file(path[model], s3_path[model], **kwargs)
    g = load_graph(path[model]['model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])

    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    return model_class(
        g.get_tensor_by_name('import/Placeholder:0'),
        g.get_tensor_by_name('import/greedy:0'),
        g.get_tensor_by_name('import/beam:0'),
        generate_session(graph=g, **kwargs),
        encoder,
    )
Example #27
    def __init__(self, hparams, encoder, generate_length, temperature, top_k,
                 **kwargs):
        self._encoder = encoder
        device = get_device(**kwargs)
        self._graph = tf.Graph()
        with self._graph.as_default():
            with tf.device(device):
                self._X = tf.placeholder(tf.int32, [1, None])
                self._model = sample_sequence(
                    hparams=hparams,
                    length=generate_length,
                    context=self._X,
                    batch_size=1,
                    temperature=temperature,
                    top_k=top_k,
                )
                self._sess = generate_session(self._graph, **kwargs)
                self._sess.run(tf.global_variables_initializer())
                self._saver = tf.train.Saver(tf.trainable_variables())
Example #28
def load_lm(path, s3_path, model, model_class, **kwargs):
    check_file(path[model], s3_path[model], **kwargs)
    g = load_graph(path[model]['model'], **kwargs)
    X = g.get_tensor_by_name('import/Placeholder:0')
    top_p = g.get_tensor_by_name('import/Placeholder_2:0')
    greedy = g.get_tensor_by_name('import/greedy:0')
    beam = g.get_tensor_by_name('import/beam:0')
    nucleus = g.get_tensor_by_name('import/nucleus:0')

    tokenizer = SentencePieceEncoder(path[model]['vocab'])

    return model_class(
        X,
        top_p,
        greedy,
        beam,
        nucleus,
        generate_session(graph=g, **kwargs),
        tokenizer,
    )
Example #29
def load_char(module, model, left_dict, cleaning, quantized=False, **kwargs):
    path = check_file(
        file=model,
        module=module,
        keys={'model': 'model.pb'},
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    inputs = ['x_placeholder']
    outputs = ['greedy', 'tag_greedy']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return TransformerChar(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        left_dict=left_dict,
        cleaning=cleaning,
    )
Example #30
def transformer(model='base', **kwargs):
    """
    Load transformer encoder-decoder model to translate MS-to-EN.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'small'`` - transformer Small parameters.
        * ``'base'`` - transformer Base parameters.
        * ``'large'`` - transformer Large parameters.

    Returns
    -------
    result: malaya.model.tf.TRANSLATION class
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from `malaya.translation.ms_en.available_transformer()`.'
        )

    path = PATH_TRANSLATION['ms-en']
    s3_path = S3_PATH_TRANSLATION['ms-en']

    check_file(path[model], s3_path[model], **kwargs)
    g = load_graph(path[model]['model'], **kwargs)

    from malaya.text.t2t import text_encoder
    from malaya.model.tf import TRANSLATION

    encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    return TRANSLATION(
        g.get_tensor_by_name('import/Placeholder:0'),
        g.get_tensor_by_name('import/greedy:0'),
        g.get_tensor_by_name('import/beam:0'),
        generate_session(graph=g, **kwargs),
        encoder,
    )
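
A usage sketch, assuming this loader is exposed as `malaya.translation.ms_en.transformer`, as the error message suggests:

import malaya

# Hypothetical usage; downloads and loads the Base checkpoint.
translator = malaya.translation.ms_en.transformer(model='base')
# translator is a malaya.model.tf.TRANSLATION instance built from the
# Placeholder, greedy, and beam tensors loaded above.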