Example #1
0
def load(path,
         s3_path,
         model,
         encoder,
         model_class,
         quantized=False,
         **kwargs):
    """Download (if needed) a frozen seq2seq graph and wrap it in `model_class`.

    Parameters
    ----------
    path : dict
        mapping of model name -> local file paths ('model', 'quantized', 'vocab').
    s3_path : dict
        mapping of model name -> remote (S3) file paths.
    model : str
        key into `path` / `s3_path` selecting the model to load.
    encoder : str or object
        'subword' or 'yttm' to build the matching text encoder; anything else
        is passed through unchanged.
    model_class : callable
        class instantiated with the graph tensors, session and encoder.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph instead of the full one.

    Returns
    -------
    result : instance of `model_class`
    """
    files = path[model]
    check_file(files, s3_path[model], quantized=quantized, **kwargs)

    # Quantized checkpoints live under a different key in the path mapping.
    graph_key = 'quantized' if quantized else 'model'
    g = load_graph(files[graph_key], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(files['vocab'])
    elif encoder == 'yttm':
        bpe, subword_mode = load_yttm(files['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    return model_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        greedy=g.get_tensor_by_name('import/greedy:0'),
        beam=g.get_tensor_by_name('import/beam:0'),
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
Example #2
0
def multinomial(path, s3_path, module, label, sigmoid=False, **kwargs):
    """Load a pickled multinomial Naive Bayes model + vectorizer and wrap them.

    Parameters
    ----------
    path : dict
        mapping containing local file paths under key 'multinomial'.
    s3_path : dict
        mapping containing remote (S3) paths under key 'multinomial'.
    module : str
        module name, used only to build the `clear_cache` hint on failure.
    label : list
        class labels; its length decides binary vs multiclass wrapping.
    sigmoid : bool, optional (default=False)
        if True, wrap as multilabel (sigmoid) instead of softmax-style classes.

    Returns
    -------
    result : MultilabelBayes, MulticlassBayes or BinaryBayes instance

    Raises
    ------
    Exception
        if the cached pickle files are corrupted / unreadable.
    """
    path = check_file(path['multinomial'], s3_path['multinomial'], **kwargs)
    try:
        with open(path['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(path['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    # Was `except BaseException`, which also swallowed KeyboardInterrupt and
    # SystemExit; narrow to Exception and chain the original cause so the
    # real unpickling error stays visible in the traceback.
    except Exception as e:
        path = os.path.normpath(f'{module}/multinomial')
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{path}')` and try again"
        ) from e

    bpe = YTTMEncoder(vocab_file=path['bpe'])

    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    # Multilabel takes precedence; otherwise pick by number of classes.
    if sigmoid:
        selected_model = MultilabelBayes
    elif len(label) > 2:
        selected_model = MulticlassBayes
    else:
        selected_model = BinaryBayes

    return selected_model(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        cleaning=cleaning,
    )
Example #3
0
def load(module, model, encoder, model_class, quantized=False, **kwargs):
    """Fetch a frozen seq2seq graph for `module`/`model` and wrap it.

    Parameters
    ----------
    module : str
        module name used by `check_file` and to pick the vocab from LM_VOCAB.
    model : str
        model file identifier.
    encoder : str or object
        'subword' or 'yttm' to build the matching text encoder; anything else
        is passed through unchanged.
    model_class : callable
        class instantiated with the session nodes and encoder.
    quantized : bool, optional (default=False)
        if True, load the 8-bit quantized graph.

    Returns
    -------
    result : instance of `model_class`
    """
    required = {'model': 'model.pb', 'vocab': LM_VOCAB[module]}
    path = check_file(
        file=model,
        module=module,
        keys=required,
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path['vocab'])
    elif encoder == 'yttm':
        bpe, subword_mode = load_yttm(path['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    input_nodes, output_nodes = nodes_session(
        g, ['Placeholder'], ['greedy', 'beam']
    )

    return model_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
Example #4
0
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result : malaya.model.tf.DeepLang class
    """

    path = check_file(
        file='lang-32',
        module='language-detection',
        keys={
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe = YTTMEncoder(vocab_file=path['bpe'])

    # Pickled bag-of-words vectorizer shipped alongside the graph.
    with open(path['vector'], 'rb') as fopen:
        vector = pickle.load(fopen)

    # The graph takes two sparse-tensor inputs (X and W), each fed as
    # shape/values/indices placeholders.
    inputs = [
        'X_Placeholder/shape',
        'X_Placeholder/values',
        'X_Placeholder/indices',
        'W_Placeholder/shape',
        'W_Placeholder/values',
        'W_Placeholder/indices',
    ]
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return DeepLang(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        vectorizer=vector,
        bpe=bpe,
        label=lang_labels,
    )
Example #5
0
def deep_model(quantized: bool = False, **kwargs):
    """
    Load LSTM + Bahdanau Attention stemming model, this also include lemmatization.
    Original size 41.6MB, quantized size 10.6MB .

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class
    """

    path = check_file(
        file='lstm-bahdanau',
        module='stem',
        keys={'model': 'model.pb', 'vocab': STEMMER_VOCAB},
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)
    bpe = YTTMEncoder(vocab_file=path['vocab'], id_mode=True)

    # Decode ops do not live under the default output names, so they are
    # exposed through the `extra` mapping instead of `outputs`.
    decode_tensors = {
        'greedy': 'import/decode_1/greedy:0',
        'beam': 'import/decode_2/beam:0',
    }
    input_nodes, output_nodes = nodes_session(
        graph,
        ['Placeholder'],
        [],
        extra=decode_tensors,
    )

    tokenizer = Tokenizer().tokenize

    return DeepStemmer(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        bpe=bpe,
        tokenizer=tokenizer,
    )
Example #6
0
def load(path, s3_path, model, encoder, model_class, **kwargs):
    """Download (if needed) a frozen seq2seq graph and construct `model_class`.

    Parameters
    ----------
    path : dict
        mapping of model name -> local file paths ('model', 'vocab').
    s3_path : dict
        mapping of model name -> remote (S3) file paths.
    model : str
        key into `path` / `s3_path` selecting the model to load.
    encoder : str or object
        'subword' or 'yttm' to build the matching text encoder; anything else
        is passed through unchanged.
    model_class : callable
        called positionally with (X, greedy, beam, sess, encoder).

    Returns
    -------
    result : instance of `model_class`
    """
    files = path[model]
    check_file(files, s3_path[model], **kwargs)
    graph = load_graph(files['model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(files['vocab'])
    elif encoder == 'yttm':
        bpe, subword_mode = load_yttm(files['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    tensor = graph.get_tensor_by_name
    return model_class(
        tensor('import/Placeholder:0'),
        tensor('import/greedy:0'),
        tensor('import/beam:0'),
        generate_session(graph=graph, **kwargs),
        encoder,
    )