Example #1
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Returns
    -------
    result: malaya.spell.Symspell class
    """

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except BaseException:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.')

    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)

    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
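A minimal usage sketch, assuming the returned malaya.spell.Symspell object exposes `correct` for a single word and `correct_text` for a sentence, as Malaya's other spell correctors do:

# assumed API: Symspell.correct / Symspell.correct_text
corrector = symspell(max_edit_distance_dictionary=2, top_k=10)
# top candidates for a single misspelled Malay word
print(corrector.correct('suke'))
# apply the corrector across a whole sentence
print(corrector.correct_text('kerajaan patut bagi pencen awal skt'))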
Example #2
def fasttext(quantization: bool = True, **kwargs):
    """
    Load Fasttext language detection model.
    
    Parameters
    ----------
    quantization: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.

    Returns
    -------
    result : malaya.model.ml.LANGUAGE_DETECTION class
    """

    try:
        import fasttext
    except BaseException:
        raise ModuleNotFoundError(
            'fasttext not installed. Please install it by `pip install fasttext` and try again.'
        )
    if quantization:
        model = 'fasttext-quantized'
    else:
        model = 'fasttext-original'
    check_file(PATH_LANG_DETECTION[model], S3_PATH_LANG_DETECTION[model],
               **kwargs)

    try:
        model_fasttext = fasttext.load_model(
            PATH_LANG_DETECTION[model]['model'])
    except BaseException:
        raise ValueError(
            f"model corrupted due to some reasons, please run malaya.clear_cache('language-detection/{model}') and try again"
        )
    return LANGUAGE_DETECTION(model_fasttext, lang_labels)
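A hedged usage sketch, assuming LANGUAGE_DETECTION exposes `predict` and `predict_proba` over lists of strings, like Malaya's other classifiers:

# assumed API: predict / predict_proba over a list of strings
detector = fasttext(quantization=True)
# one predicted language label per input string
print(detector.predict(['Saya suka makan nasi lemak']))
# full per-language probability breakdown
print(detector.predict_proba(['Saya suka makan nasi lemak']))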
Example #3
def multinomial(path, s3_path, class_name, label, **kwargs):
    check_file(path['multinomial'], s3_path['multinomial'], **kwargs)
    try:
        with open(path['multinomial']['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(path['multinomial']['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/multinomial') and try again"
        )
    bpe, subword_mode = load_yttm(path['multinomial']['bpe'])

    from malaya.stem import _classification_textcleaning_stemmer

    if len(label) > 2:
        selected_class = MULTICLASS_BAYES
    else:
        selected_class = BINARY_BAYES
    return selected_class(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=_classification_textcleaning_stemmer,
    )
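This helper is internal; public loaders pass in module-specific paths and labels. A hypothetical wrapper sketch (PATH_SENTIMENT, S3_PATH_SENTIMENT, and the label list are illustrative, not from the source):

def sentiment_multinomial(**kwargs):
    # hypothetical paths/labels; delegate to the shared multinomial loader
    return multinomial(PATH_SENTIMENT, S3_PATH_SENTIMENT, 'sentiment',
                       label=['negative', 'positive'], **kwargs)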
Example #4
def segmenter(max_split_length: int = 20, validate: bool = True):
    """
    Load Segmenter class.

    Parameters
    ----------
    max_split_length: int, (default=20)
        max length of words in a sentence to segment
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _Segmenter : malaya.preprocessing._Segmenter class
    """

    # the segmenter depends on two preprocessing artifacts; check both
    for i in [1, 2]:
        if validate:
            check_file(PATH_PREPROCESSING[i], S3_PATH_PREPROCESSING[i])
        elif not check_available(PATH_PREPROCESSING[i]):
            raise Exception(
                'preprocessing is not available, please set `validate = True`')
    return _Segmenter(max_split_length=max_split_length)
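A minimal sketch, assuming the returned _Segmenter exposes a `segment` method that splits concatenated words:

# assumed API: _Segmenter.segment
model = segmenter(max_split_length=20)
# illustrative: 'sayasukamakanayam' -> 'saya suka makan ayam'
print(model.segment('sayasukamakanayam'))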
Example #5
def load_lm(path, s3_path, model, model_class, quantized=False, **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'

    g = load_graph(path[model][model_path], **kwargs)
    X = g.get_tensor_by_name('import/Placeholder:0')
    top_p = g.get_tensor_by_name('import/Placeholder_2:0')
    greedy = g.get_tensor_by_name('import/greedy:0')
    beam = g.get_tensor_by_name('import/beam:0')
    nucleus = g.get_tensor_by_name('import/nucleus:0')

    tokenizer = SentencePieceEncoder(path[model]['vocab'])

    return model_class(
        X=X,
        top_p=top_p,
        greedy=greedy,
        beam=beam,
        nucleus=nucleus,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
    )
Example #6
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now it only supports BERT and ALBERT.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    _TransformerCorrector: malaya.spell._TransformerCorrector class
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have a `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab, vocab_model)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _TransformerCorrector(model, corpus, tokenizer)
Example #7
def multinomial(path, s3_path, class_name, label, sigmoid=False, **kwargs):
    check_file(path['multinomial'], s3_path['multinomial'], **kwargs)
    try:
        with open(path['multinomial']['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(path['multinomial']['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/multinomial')` and try again"
        )
    bpe, subword_mode = load_yttm(path['multinomial']['bpe'])

    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    if sigmoid:
        selected_class = MultilabelBayes
    else:
        if len(label) > 2:
            selected_class = MulticlassBayes
        else:
            selected_class = BinaryBayes
    return selected_class(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=cleaning,
    )
Example #8
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now it only supports BERT and ALBERT.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.TRANSFORMER class
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have a `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return TRANSFORMER(model, corpus, tokenizer)
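A hedged usage sketch; it assumes `malaya.transformer.load` returns a model exposing `_log_vectorize`, and that the returned TRANSFORMER corrector exposes `correct_text` like the other spell correctors:

import malaya

# assumed API: malaya.transformer.load yields a model with _log_vectorize
bert = malaya.transformer.load(model='bert')
corrector = transformer(bert, sentence_piece=True)
print(corrector.correct_text('kerajaan patut bagi pencen awal skt'))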
Example #9
def fasttext(quantized: bool = True, **kwargs):
    """
    Load Fasttext language detection model.
    Original size is 353MB, Quantized size 31.1MB.

    Parameters
    ----------
    quantized: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.

    Returns
    -------
    result : malaya.model.ml.LanguageDetection class
    """

    try:
        import fasttext
    except BaseException:
        raise ModuleNotFoundError(
            'fasttext not installed. Please install it by `pip install fasttext` and try again.'
        )
    if quantized:
        model = 'fasttext-quantized'
    else:
        model = 'fasttext-original'

    check_file(
        PATH_LANG_DETECTION[model], S3_PATH_LANG_DETECTION[model], **kwargs
    )
    model_fasttext = fasttext.load_model(
        PATH_LANG_DETECTION[model]['model']
    )
    return LanguageDetection(model_fasttext, lang_labels)
Example #10
def multinomial(path, s3_path, module, label, sigmoid=False, **kwargs):
    check_file(path['multinomial'], s3_path['multinomial'], **kwargs)
    try:
        with open(path['multinomial']['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(path['multinomial']['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except BaseException:
        # avoid shadowing `path`, which is used again after the try block
        location = os.path.normpath(f'{module}/multinomial')
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{location}')` and try again"
        )

    bpe = YTTMEncoder(vocab_file=path['multinomial']['bpe'])

    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    if sigmoid:
        selected_model = MultilabelBayes
    else:
        if len(label) > 2:
            selected_model = MulticlassBayes
        else:
            selected_model = BinaryBayes
    return selected_model(
        multinomial=multinomial,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        cleaning=cleaning,
    )
Example #11
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector.

    Returns
    -------
    result: malaya.spell.SYMSPELL class
    """

    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except BaseException:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
Example #12
def load(model: str = 'wikipedia', **kwargs):
    """
    Return vocabulary and vector to initialize a malaya.wordvector.WordVector object.

    Parameters
    ----------
    model : str, optional (default='wikipedia')
        Model architecture supported. Allowed values:

        * ``'wikipedia'`` - pretrained on Malay wikipedia word2vec size 256.
        * ``'socialmedia'`` - pretrained on cleaned Malay twitter and Malay instagram size 256.
        * ``'news'`` - pretrained on cleaned Malay news size 256.
        * ``'combine'`` - pretrained on cleaned Malay news + Malay social media + Malay wikipedia size 256.

    Returns
    -------
    vocabulary: indices dictionary for `vector`.
    vector: np.array, 2D.
    """

    model = model.lower()
    if model not in _wordvector_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.wordvector.available_wordvector()`.'
        )

    check_file(PATH_WORDVECTOR[model], S3_PATH_WORDVECTOR[model], **kwargs)
    return _load(PATH_WORDVECTOR[model]['vocab'],
                 PATH_WORDVECTOR[model]['model'])
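The loader hands back the raw pair described in the docstring; a minimal sketch of consuming it:

vocab, vector = load(model='wikipedia')
# indices dictionary plus a 2D embedding matrix of width 256
print(len(vocab), vector.shape)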
Example #13
def load(path,
         s3_path,
         model,
         encoder,
         model_class,
         quantized=False,
         **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'

    g = load_graph(path[model][model_path], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(path[model]['vocab'])
    elif encoder == 'yttm':
        bpe, subword_mode = load_yttm(path[model]['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    return model_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        greedy=g.get_tensor_by_name('import/greedy:0'),
        beam=g.get_tensor_by_name('import/beam:0'),
        sess=generate_session(graph=g, **kwargs),
        encoder=encoder,
    )
Example #14
def _transformer(model,
                 bert_class,
                 xlnet_class,
                 quantized=False,
                 siamese=False,
                 **kwargs):
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    check_file(PATH_SIMILARITY[model],
               S3_PATH_SIMILARITY[model],
               quantized=quantized,
               **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_SIMILARITY[model][model_path], **kwargs)

    path = PATH_SIMILARITY

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])

        if model in ['albert', 'tiny-albert']:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'

    if model in ['xlnet', 'alxlnet']:

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        selected_class = xlnet_class
        if siamese:
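            # 'sequnece_summary' below is kept verbatim: the scope name is
            # misspelled in the upstream XLNet graph, so the tensor lookup
            # must match it exactly.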
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    return selected_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
        logits=g.get_tensor_by_name('import/logits:0'),
        vectorizer=g.get_tensor_by_name(selected_node),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
Example #15
def deep_model(**kwargs):
    """
    Load deep learning language detection model.

    Returns
    -------
    result : malaya.model.tf.DEEP_LANG class
    """

    check_file(PATH_LANG_DETECTION['deep'], S3_PATH_LANG_DETECTION['deep'],
               **kwargs)
    try:
        with open(PATH_LANG_DETECTION['deep']['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except BaseException:
        raise ValueError(
            "model corrupted due to some reasons, please run malaya.clear_cache('language-detection/deep') and try again"
        )

    from malaya.text.bpe import load_yttm

    bpe, subword_mode = load_yttm(PATH_LANG_DETECTION['deep']['bpe'])

    import os

    return DEEP_LANG(
        os.path.dirname(PATH_LANG_DETECTION['deep']['model']),
        vector,
        lang_labels,
        bpe,
        subword_mode,
    )
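A hypothetical usage sketch, assuming DEEP_LANG mirrors the fasttext detector's `predict_proba` interface:

# assumed API: DEEP_LANG.predict_proba over a list of strings
model = deep_model()
print(model.predict_proba(['Saya suka makan nasi lemak', 'Hello world']))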
Example #16
def probability(sentence_piece: bool = False, **kwargs):
    """
    Load a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.PROBABILITY class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return PROBABILITY(corpus, tokenizer)
Example #17
def probability(sentence_piece: bool = False, **kwargs):
    """
    Load a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.

    Returns
    -------
    result: malaya.spell.Probability class
    """

    tokenizer = None
    if sentence_piece:
        path = check_file(PATH_NGRAM['sentencepiece'],
                          S3_PATH_NGRAM['sentencepiece'], **kwargs)

        vocab = path['vocab']
        vocab_model = path['model']
        tokenizer = SentencePieceTokenizer(vocab_file=vocab,
                                           spm_model_file=vocab_model)

    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Probability(corpus, tokenizer)
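A minimal sketch, assuming Probability exposes `correct` and `correct_text` like the other Malaya spell correctors:

# assumed API: Probability.correct / Probability.correct_text
corrector = probability(sentence_piece=False)
print(corrector.correct('suke'))
print(corrector.correct_text('kerajaan patut bagi pencen awal skt'))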
Example #18
def multinomial(**kwargs):
    """
    Load multinomial toxicity model.

    Returns
    -------
    BAYES : malaya._models._sklearn_model.MULTILABEL_BAYES class
    """
    import pickle

    check_file(PATH_TOXIC['multinomial'], S3_PATH_TOXIC['multinomial'],
               **kwargs)

    try:
        with open(PATH_TOXIC['multinomial']['model'], 'rb') as fopen:
            multinomial = pickle.load(fopen)
        with open(PATH_TOXIC['multinomial']['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except BaseException:
        raise Exception(
            "model corrupted due to some reasons, please run malaya.clear_cache('toxic/multinomial') and try again"
        )
    from .stem import _classification_textcleaning_stemmer

    return MULTILABEL_BAYES(
        models=multinomial,
        vectors=vectorize,
        cleaning=_classification_textcleaning_stemmer,
    )
Example #19
def synonym(string: str,
            threshold: float = 0.5,
            top_n=5,
            cleaning=augmentation_textcleaning,
            **kwargs):
    """
    Augment a string using synonyms, https://github.com/huseinzol05/Malaya-Dataset#90k-synonym

    Parameters
    ----------
    string: str
    threshold: float, optional (default=0.5)
        random selection probability for a word.
    top_n: int, (default=5)
        number of augmented strings returned. Length of returned result should be top_n.
    cleaning: function, (default=malaya.text.function.augmentation_textcleaning)
        function to clean text.

    Returns
    -------
    result: List[str]
    """
    if not isinstance(cleaning, Callable) and cleaning is not None:
        raise ValueError('cleaning must be a callable type or None')

    global _synonym_dict

    if _synonym_dict is None:
        check_file(PATH_AUGMENTATION['synonym'],
                   S3_PATH_AUGMENTATION['synonym'], **kwargs)
        synonyms = defaultdict(list)
        files = [
            PATH_AUGMENTATION['synonym']['model'],
            PATH_AUGMENTATION['synonym']['model2'],
        ]
        for file in files:
            with open(file) as fopen:
                data = json.load(fopen)

            for i in data:
                if not len(i[1]):
                    continue
                synonyms[i[0]].extend(i[1])
                for r in i[1]:
                    synonyms[r].append(i[0])
        for k, v in synonyms.items():
            synonyms[k] = list(set(v))
        _synonym_dict = synonyms

    original_string = string
    if cleaning:
        string = cleaning(string).split()

    augmented = []
    for i in range(top_n):
        string_ = replace(string, threshold)
        augmented.append(
            _make_upper(' '.join(string_), ' '.join(original_string)))
    return augmented
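A short usage sketch of the augmenter defined above:

augmented = synonym('saya suka makan ayam dan ikan',
                    threshold=0.5, top_n=5)
print(augmented)  # a list of top_n augmented variants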
Example #20
def t5(model: str = 'base', **kwargs):
    """
    Load T5 model to generate a string given an isu penting.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'base'`` - T5 Base parameters.
        * ``'small'`` - T5 Small parameters.

    Returns
    -------
    result: malaya.model.t5.GENERATOR class
    """

    model = model.lower()
    if model not in _t5_availability:
        raise Exception(
            'model not supported, please check supported models from malaya.generator.available_t5()'
        )

    from malaya.path import PATH_GENERATOR, S3_PATH_GENERATOR

    path = PATH_GENERATOR['sample']
    s3_path = S3_PATH_GENERATOR['sample']

    from malaya.function import check_file

    try:
        import tensorflow_text
        import tf_sentencepiece
        import tensorflow as tf
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text and tf-sentencepiece not installed. Please install it by `pip install tensorflow-text tf-sentencepiece` and try again. Also, make sure the tensorflow-text version matches your tensorflow version.'
        )

    check_file(path[model]['model'], s3_path[model], **kwargs)

    if not os.path.exists(path[model]['directory'] + 'saved_model.pb'):
        import tarfile

        with tarfile.open(path[model]['model']['model']) as tar:
            tar.extractall(path=path[model]['path'])

    sess = tf.InteractiveSession()
    meta_graph_def = tf.compat.v1.saved_model.load(sess, ['serve'],
                                                   path[model]['directory'])
    signature_def = meta_graph_def.signature_def['serving_default']
    pred = lambda x: sess.run(
        fetches=signature_def.outputs['outputs'].name,
        feed_dict={signature_def.inputs['input'].name: x},
    )

    from malaya.model.t5 import GENERATOR

    return GENERATOR(pred)
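A hypothetical usage sketch; the `generate` method name is assumed from Malaya's generator documentation:

# assumed API: GENERATOR.generate over a list of key points
model = t5(model='base')
# isu penting: the key points the model expands into running text
print(model.generate(['Neelofa tetap dengan keputusan berkahwin',
                      'isu mahkamah']))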
Example #21
def load(path,
         s3_path,
         model,
         model_class,
         compressed=True,
         quantized=False,
         **kwargs):

    try:
        import tensorflow_text
        import tf_sentencepiece
    except BaseException:
        raise ModuleNotFoundError(
            'tensorflow-text and tf-sentencepiece not installed. Please install it by `pip install tensorflow-text==1.15.0 tf-sentencepiece==0.1.86` and try again. Also, make sure the tensorflow-text version matches your tensorflow version.'
        )

    if compressed and not quantized:
        path = path['t5-compressed']
        s3_path = s3_path['t5-compressed']
        check_file(path[model]['model'], s3_path[model], **kwargs)

        if not os.path.exists(path[model]['directory'] + 'saved_model.pb'):
            import tarfile

            with tarfile.open(path[model]['model']['model']) as tar:
                tar.extractall(path=path[model]['path'])

        X = None
        decode = None
        sess = generate_session(graph=None, **kwargs)
        meta_graph_def = tf.compat.v1.saved_model.load(
            sess, ['serve'], path[model]['directory'])
        signature_def = meta_graph_def.signature_def['serving_default']
        pred = lambda x: sess.run(
            fetches=signature_def.outputs['outputs'].name,
            feed_dict={signature_def.inputs['input'].name: x},
        )

    else:
        path = path['t5']
        s3_path = s3_path['t5']
        check_file(path[model],
                   s3_path[model],
                   quantized=quantized,
                   optimized=True,
                   **kwargs)
        if quantized:
            model_path = 'quantized'
        else:
            model_path = 'model'
        g = load_graph(path[model][model_path], **kwargs)
        X = g.get_tensor_by_name('import/inputs:0')
        decode = g.get_tensor_by_name(
            'import/SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp:0'
        )
        sess = generate_session(graph=g, **kwargs)
        pred = None

    return model_class(X=X, decode=decode, sess=sess, pred=pred)
Example #22
def load(model: str = 'alxlnet', pool_mode: str = 'last', **kwargs):
    """
    Load alxlnet model.

    Parameters
    ----------
    model : str, optional (default='alxlnet')
        Model architecture supported. Allowed values:

        * ``'alxlnet'`` - XLNET architecture from google + Malaya.
    pool_mode : str, optional (default='last')
        Model logits architecture supported. Allowed values:

        * ``'last'`` - last of the sequence.
        * ``'first'`` - first of the sequence.
        * ``'mean'`` - mean of the sequence.
        * ``'attn'`` - attention of the sequence.

    Returns
    -------
    result : malaya.transformers.alxlnet.Model class
    """

    model = model.lower()
    pool_mode = pool_mode.lower()

    from malaya.path import PATH_ALXLNET, S3_PATH_ALXLNET
    from malaya.function import check_file
    import sentencepiece as spm

    if pool_mode not in ['last', 'first', 'mean', 'attn']:
        raise Exception(
            "pool_mode not supported, only support ['last', 'first', 'mean', 'attn']"
        )

    check_file(PATH_ALXLNET[model]['model'], S3_PATH_ALXLNET[model], **kwargs)

    if not os.path.exists(PATH_ALXLNET[model]['directory']):
        import tarfile

        with tarfile.open(PATH_ALXLNET[model]['model']['model']) as tar:
            tar.extractall(path = PATH_ALXLNET[model]['path'])

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(PATH_ALXLNET[model]['directory'] + 'sp10m.cased.v9.model')
    xlnet_config = xlnet_lib.XLNetConfig(
        json_path = PATH_ALXLNET[model]['directory'] + 'config.json'
    )
    xlnet_checkpoint = PATH_ALXLNET[model]['directory'] + 'model.ckpt'
    model = Model(
        xlnet_config,
        sp_model,
        xlnet_checkpoint,
        pool_mode = pool_mode,
        **kwargs
    )
    model._saver.restore(model._sess, xlnet_checkpoint)
    return model
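A hedged sketch, assuming the returned Model exposes `vectorize` (and `attention`) like Malaya's other transformer wrappers:

# assumed API: Model.vectorize over a list of strings
alxlnet = load(model='alxlnet', pool_mode='last')
vectors = alxlnet.vectorize(['saya suka makan ayam'])
print(vectors.shape)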
Example #23
def transformer(path,
                s3_path,
                class_name,
                model='xlnet',
                quantized=False,
                **kwargs):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)

    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except BaseException:
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/{model}') and try again"
        )

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])

        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
Example #24
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    Returns
    -------
    result : malaya.model.tf.CONSTITUENCY class
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from malaya.constituency.available_transformer()'
        )

    check_file(PATH_CONSTITUENCY[model], S3_PATH_CONSTITUENCY[model], **kwargs)
    g = load_graph(PATH_CONSTITUENCY[model]['model'], **kwargs)

    with open(PATH_CONSTITUENCY[model]['dictionary']) as fopen:
        dictionary = json.load(fopen)

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:

        tokenizer = sentencepiece_tokenizer_bert(
            PATH_CONSTITUENCY[model]['tokenizer'],
            PATH_CONSTITUENCY[model]['vocab'],
        )
        mode = 'bert'

    if model in ['xlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(
            PATH_CONSTITUENCY[model]['tokenizer'])
        mode = 'xlnet'

    from malaya.model.tf import CONSTITUENCY

    return CONSTITUENCY(
        input_ids=g.get_tensor_by_name('import/input_ids:0'),
        word_end_mask=g.get_tensor_by_name('import/word_end_mask:0'),
        charts=g.get_tensor_by_name('import/charts:0'),
        tags=g.get_tensor_by_name('import/tags:0'),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
Example #25
def load(model: str = 'xlnet', pool_mode: str = 'last', **kwargs):
    """
    Load xlnet model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'xlnet'`` - XLNET architecture from google.
    pool_mode : str, optional (default='last')
        Model logits architecture supported. Allowed values:

        * ``'last'`` - last of the sequence.
        * ``'first'`` - first of the sequence.
        * ``'mean'`` - mean of the sequence.
        * ``'attn'`` - attention of the sequence.

    Returns
    -------
    result : malaya.transformers.xlnet.Model class
    """

    model = model.lower()
    pool_mode = pool_mode.lower()

    if pool_mode not in ['last', 'first', 'mean', 'attn']:
        raise Exception(
            "pool_mode not supported, only support ['last', 'first', 'mean', 'attn']"
        )

    check_file(PATH_XLNET[model]['model'], S3_PATH_XLNET[model], **kwargs)

    if not os.path.exists(PATH_XLNET[model]['directory'] + 'model.ckpt'):
        import tarfile

        with tarfile.open(PATH_XLNET[model]['model']['model']) as tar:
            tar.extractall(path=PATH_XLNET[model]['path'])

    vocab_model = PATH_XLNET[model]['directory'] + 'sp10m.cased.v9.model'
    vocab = PATH_XLNET[model]['directory'] + 'sp10m.cased.v9.vocab'
    tokenizer = SentencePieceTokenizer(vocab_file=vocab, spm_model_file=vocab_model)
    xlnet_config = xlnet_lib.XLNetConfig(
        json_path=PATH_XLNET[model]['directory'] + 'config.json'
    )
    xlnet_checkpoint = PATH_XLNET[model]['directory'] + 'model.ckpt'
    model = Model(
        xlnet_config,
        tokenizer,
        xlnet_checkpoint,
        pool_mode=pool_mode,
        **kwargs
    )
    model._saver.restore(model._sess, xlnet_checkpoint)
    return model
Example #26
def load_news():
    """
    Return Malaya pretrained local Malaysian news word2vec, size 256. https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    result: tuple(vocabulary, vector)
    """

    check_file(PATH_WORDVECTOR['news'], S3_PATH_WORDVECTOR['news'])
    return _load(PATH_WORDVECTOR['news']['vocab'],
                 PATH_WORDVECTOR['news']['model'])
Example #27
def load_wiki_news_social_media():
    """
    Return Malaya pretrained local Malaysian Wikipedia + Social media + News word2vec, size 256. https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    tuple: (vocabulary, vector)
    """

    check_file(PATH_WORDVECTOR['combine'], S3_PATH_WORDVECTOR['combine'])
    return _load(
        PATH_WORDVECTOR['combine']['vocab'], PATH_WORDVECTOR['combine']['model']
    )
Example #28
def lexicon(**kwargs):
    """
    Load Lexicon NSFW model.

    Returns
    -------
    result : malaya.text.lexicon.nsfw.Lexicon class
    """

    check_file(PATH_NSFW['lexicon'], S3_PATH_NSFW['lexicon'], **kwargs)
    with open(PATH_NSFW['lexicon']['model']) as fopen:
        corpus = json.load(fopen)
    return nsfw.Lexicon(corpus)
Example #29
def load_news():
    """
    Return Malaya pretrained local Malaysian news word2vec, size 256. https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    vocabulary: indices dictionary for `vector`.
    vector: np.array, 2D.
    """

    check_file(PATH_WORDVECTOR['news'], S3_PATH_WORDVECTOR['news'])
    return _load(
        PATH_WORDVECTOR['news']['vocab'], PATH_WORDVECTOR['news']['model']
    )
Example #30
def load_wiki():
    """
    Return Malaya pretrained Wikipedia word2vec, size 256. https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    tuple: (vocabulary, vector)
    """

    check_file(PATH_WORDVECTOR['wikipedia'], S3_PATH_WORDVECTOR['wikipedia'])
    return _load(
        PATH_WORDVECTOR['wikipedia']['vocab'],
        PATH_WORDVECTOR['wikipedia']['model'],
    )