Example #1
0
def deep_model(**kwargs):
    """
    Load deep learning language detection model.

    Parameters
    ----------
    **kwargs :
        forwarded unchanged to ``check_file``.

    Returns
    -------
    result : malaya.model.tf.DEEP_LANG class

    Raises
    ------
    ValueError
        if the cached vectorizer file cannot be opened or unpickled.
    """

    deep_paths = PATH_LANG_DETECTION['deep']
    check_file(deep_paths, S3_PATH_LANG_DETECTION['deep'], **kwargs)
    try:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # presumably check_file populates this cache from the S3 location --
        # confirm that source is trusted.
        with open(deep_paths['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except Exception as e:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit propagate untouched; `from e` keeps the real cause in
        # the traceback.
        raise ValueError(
            "model corrupted due to some reasons, please run malaya.clear_cache('language-detection/deep') and try again"
        ) from e

    from malaya.text.bpe import load_yttm

    bpe, subword_mode = load_yttm(deep_paths['bpe'])

    import os

    # DEEP_LANG receives the directory that contains the model file,
    # not the model file path itself.
    return DEEP_LANG(
        os.path.dirname(deep_paths['model']),
        vector,
        lang_labels,
        bpe,
        subword_mode,
    )
Example #2
0
def multinomial(path, s3_path, class_name, label, **kwargs):
    """
    Load a pickled multinomial model with its vectorizer and BPE model.

    Parameters
    ----------
    path : dict
        must contain a 'multinomial' entry holding the 'model', 'vector'
        and 'bpe' file paths.
    s3_path : dict
        must contain a 'multinomial' entry; passed to ``check_file``.
    class_name : str
        only used to build the ``malaya.clear_cache`` hint in the error
        message.
    label : list
        class labels; more than 2 labels selects MULTICLASS_BAYES,
        otherwise BINARY_BAYES.
    **kwargs :
        forwarded unchanged to ``check_file``.

    Returns
    -------
    result : MULTICLASS_BAYES or BINARY_BAYES class

    Raises
    ------
    Exception
        if the cached model or vectorizer cannot be opened or unpickled.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)
    try:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # confirm the cache populated by check_file comes from a trusted source.
        with open(files['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # Not a bare except: KeyboardInterrupt / SystemExit must propagate,
        # and `from e` preserves the underlying failure for debugging.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/multinomial') and try again"
        ) from e
    bpe, subword_mode = load_yttm(files['bpe'])

    from malaya.stem import _classification_textcleaning_stemmer

    selected_class = MULTICLASS_BAYES if len(label) > 2 else BINARY_BAYES
    return selected_class(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=_classification_textcleaning_stemmer,
    )
Example #3
0
def load(path, s3_path, model, encoder, model_class, quantized=False,
         **kwargs):
    """
    Fetch the files for `model`, import its graph and wrap it in `model_class`.

    `encoder` may be the string 'subword' or 'yttm', in which case the
    matching encoder is built from the model's 'vocab' file; any other value
    is handed to `model_class` as-is. When `quantized` is true the
    'quantized' graph file is loaded instead of 'model'.
    """
    files = path[model]
    check_file(files, s3_path[model], quantized=quantized, **kwargs)

    graph_key = 'quantized' if quantized else 'model'
    graph = load_graph(files[graph_key], **kwargs)

    # Resolve string shortcuts into actual encoder objects.
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(files['vocab'])
    elif encoder == 'yttm':
        yttm_bpe, yttm_mode = load_yttm(files['vocab'], True)
        encoder = YTTMEncoder(yttm_bpe, yttm_mode)

    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_out = graph.get_tensor_by_name('import/greedy:0')
    beam_out = graph.get_tensor_by_name('import/beam:0')
    session = generate_session(graph=graph, **kwargs)

    return model_class(
        X=placeholder,
        greedy=greedy_out,
        beam=beam_out,
        sess=session,
        encoder=encoder,
    )
Example #4
0
def multinomial(path, s3_path, class_name, label, sigmoid=False, **kwargs):
    """
    Load a pickled multinomial model with its vectorizer and BPE model.

    Parameters
    ----------
    path : dict
        must contain a 'multinomial' entry holding the 'model', 'vector'
        and 'bpe' file paths.
    s3_path : dict
        must contain a 'multinomial' entry; passed to ``check_file``.
    class_name : str
        only used to build the ``malaya.clear_cache`` hint in the error
        message.
    label : list
        class labels; when `sigmoid` is False, more than 2 labels selects
        MulticlassBayes, otherwise BinaryBayes.
    sigmoid : bool, optional (default=False)
        if True, MultilabelBayes is used regardless of the number of labels.
    **kwargs :
        forwarded unchanged to ``check_file``.

    Returns
    -------
    result : MultilabelBayes, MulticlassBayes or BinaryBayes class

    Raises
    ------
    Exception
        if the cached model or vectorizer cannot be opened or unpickled.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)
    try:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # confirm the cache populated by check_file comes from a trusted source.
        with open(files['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # Not a bare except: KeyboardInterrupt / SystemExit must propagate,
        # and `from e` preserves the underlying failure for debugging.
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/multinomial')` and try again"
        ) from e
    bpe, subword_mode = load_yttm(files['bpe'])

    # Bind a stemmer instance into the text-cleaning function once, so the
    # returned model can call `cleaning` without extra arguments.
    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    if sigmoid:
        selected_class = MultilabelBayes
    elif len(label) > 2:
        selected_class = MulticlassBayes
    else:
        selected_class = BinaryBayes
    return selected_class(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=cleaning,
    )
Example #5
0
def load(module, model, encoder, model_class, quantized=False, **kwargs):
    """
    Resolve the files for `model` under `module`, import the graph and wrap
    it in `model_class`.

    `encoder` may be the string 'subword' or 'yttm', in which case the
    matching encoder is built from the resolved 'vocab' file; any other
    value is handed to `model_class` unchanged.
    """
    wanted_files = {
        'model': 'model.pb',
        'vocab': LM_VOCAB[module],
    }
    files = check_file(
        file=model,
        module=module,
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(files['model'], **kwargs)

    # Resolve string shortcuts into actual encoder objects.
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(files['vocab'])
    elif encoder == 'yttm':
        yttm_bpe, yttm_mode = load_yttm(files['vocab'], True)
        encoder = YTTMEncoder(yttm_bpe, yttm_mode)

    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], ['greedy', 'beam']
    )
    session = generate_session(graph=graph, **kwargs)

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=session,
        encoder=encoder,
    )
Example #6
0
def deep_model(quantized: bool = False, **kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model, this also include lemmatization.
    Original size 41.6MB, quantized size 10.6MB .

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class

    Raises
    ------
    Exception
        if ``check_tf_version()`` reports a version greater than 1.
    """

    # Guard clause: refuse to continue when check_tf_version() is above 1.
    tf_major = check_tf_version()
    if tf_major > 1:
        raise Exception(
            'Tensorflow 2.0 and above not able to use `deep_model` for stemmer, use Tensorflow 1.15 instead.'
        )

    wanted_files = {
        'model': 'model.pb',
        'vocab': STEMMER_VOCAB,
    }
    files = check_file(
        file='lstm-bahdanau',
        module='stem',
        keys=wanted_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(files['model'], **kwargs)
    bpe, subword_mode = load_yttm(files['vocab'], id_mode=True)

    # The decoder outputs are passed with full tensor names through `extra`;
    # the regular outputs list is left empty.
    decoder_tensors = {
        'greedy': 'import/decode_1/greedy:0',
        'beam': 'import/decode_2/beam:0',
    }
    input_nodes, output_nodes = nodes_session(
        graph, ['Placeholder'], [], extra=decoder_tensors
    )

    tokenize = Tokenizer().tokenize
    session = generate_session(graph=graph, **kwargs)

    return DeepStemmer(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=session,
        bpe=bpe,
        subword_mode=subword_mode,
        tokenizer=tokenize,
    )
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.
    **kwargs :
        forwarded to ``check_file``, ``load_graph`` and ``generate_session``.

    Returns
    -------
    result : malaya.model.tf.DeepLang class

    Raises
    ------
    ValueError
        if the cached vectorizer file cannot be opened or unpickled.
    """

    path = check_file(
        file='lang-32',
        module='language-detection',
        keys={
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe, subword_mode = load_yttm(path['bpe'])

    try:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # confirm the file resolved by check_file comes from a trusted source.
        with open(path['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except Exception as e:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit propagate untouched; `from e` keeps the real cause in
        # the traceback.
        raise ValueError(
            "model corrupted due to some reasons, please run `malaya.clear_cache('language-detection/lang-32')` and try again"
        ) from e

    # Two groups of shape / values / indices placeholders, one prefixed X_
    # and one prefixed W_.
    inputs = [
        'X_Placeholder/shape',
        'X_Placeholder/values',
        'X_Placeholder/indices',
        'W_Placeholder/shape',
        'W_Placeholder/values',
        'W_Placeholder/indices',
    ]
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return DeepLang(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        vectorizer=vector,
        bpe=bpe,
        type=subword_mode,  # keyword name is dictated by DeepLang's signature
        label=lang_labels,
    )
Example #8
0
def load(path, s3_path, model, encoder, model_class, **kwargs):
    """
    Fetch the files for `model`, import its graph and wrap it in `model_class`.

    `encoder` may be the string 'subword' or 'yttm', in which case the
    matching encoder is built from the model's 'vocab' file; any other value
    is handed to `model_class` as-is. `model_class` is called positionally
    with (placeholder, greedy, beam, session, encoder).
    """
    files = path[model]
    check_file(files, s3_path[model], **kwargs)
    graph = load_graph(files['model'], **kwargs)

    # Resolve string shortcuts into actual encoder objects.
    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(files['vocab'])
    elif encoder == 'yttm':
        yttm_bpe, yttm_mode = load_yttm(files['vocab'], True)
        encoder = YTTMEncoder(yttm_bpe, yttm_mode)

    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_out = graph.get_tensor_by_name('import/greedy:0')
    beam_out = graph.get_tensor_by_name('import/beam:0')
    session = generate_session(graph=graph, **kwargs)

    return model_class(placeholder, greedy_out, beam_out, session, encoder)
Example #9
0
def multinomial(**kwargs):
    """
    Load multinomial toxicity model.

    Parameters
    ----------
    **kwargs :
        forwarded unchanged to ``check_file``.

    Returns
    -------
    result : malaya.model.ml.MULTILABEL_BAYES class

    Raises
    ------
    Exception
        if the cached model or vectorizer cannot be opened or unpickled.
    """
    import pickle

    files = PATH_TOXIC['multinomial']
    check_file(files, S3_PATH_TOXIC['multinomial'], **kwargs)

    try:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # confirm the cache populated by check_file comes from a trusted source.
        with open(files['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # Not a bare except: KeyboardInterrupt / SystemExit must propagate,
        # and `from e` preserves the underlying failure for debugging.
        raise Exception(
            "model corrupted due to some reasons, please run malaya.clear_cache('toxic/multinomial') and try again"
        ) from e

    from malaya.text.bpe import load_yttm
    from malaya.stem import _classification_textcleaning_stemmer, naive

    # Bind a stemmer instance into the text-cleaning function once, so the
    # returned model can call `cleaning` without extra arguments.
    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    bpe, subword_mode = load_yttm(files['bpe'])

    # NOTE(review): `label` is not defined inside this function; presumably a
    # module-level list of toxicity labels -- verify it exists in this module.
    return MULTILABEL_BAYES(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=cleaning,
    )
Example #10
0
def deep_model(quantized: bool = False, **kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model, this also include lemmatization.
    Original size 41.6MB, quantized size 10.6MB .

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DEEP_STEMMER class
    """
    from malaya.preprocessing import _tokenizer

    stem_files = PATH_STEM['deep']
    check_file(stem_files, S3_PATH_STEM['deep'], quantized=quantized, **kwargs)

    # Pick which graph file to import based on the `quantized` flag.
    graph_key = 'quantized' if quantized else 'model'
    graph = load_graph(stem_files[graph_key], **kwargs)

    bpe, subword_mode = load_yttm(stem_files['bpe'], id_mode=True)

    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_out = graph.get_tensor_by_name('import/decode_1/greedy:0')
    beam_out = graph.get_tensor_by_name('import/decode_2/beam:0')
    session = generate_session(graph=graph, **kwargs)

    return DEEP_STEMMER(
        placeholder,
        greedy_out,
        beam_out,
        session,
        bpe,
        subword_mode,
        _tokenizer,
    )
Example #11
0
def deep_model(**kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model.

    Parameters
    ----------
    **kwargs :
        forwarded to ``check_file`` only; ``load_graph`` and
        ``generate_session`` are called without them.

    Returns
    -------
    DEEP_STEMMER: malaya.stem.DEEP_STEMMER class
    """
    from malaya.preprocessing import _tokenizer

    stem_files = PATH_STEM['deep']
    check_file(stem_files, S3_PATH_STEM['deep'], **kwargs)
    graph = load_graph(stem_files['model'])

    bpe, subword_mode = load_yttm(stem_files['bpe'], id_mode=True)

    placeholder = graph.get_tensor_by_name('import/Placeholder:0')
    greedy_out = graph.get_tensor_by_name('import/decode_1/greedy:0')
    beam_out = graph.get_tensor_by_name('import/decode_2/beam:0')
    session = generate_session(graph=graph)

    return DEEP_STEMMER(
        placeholder,
        greedy_out,
        beam_out,
        session,
        bpe,
        subword_mode,
        _tokenizer,
    )