Python SentencePieceTokenizerの例、malaya.text.bpe.SentencePieceTokenizer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: spell.py プロジェクト: lantip/Malaya

def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only BERT and ALBERT are supported.

    Parameters
    ----------
    model: object
        Transformer model; it must expose a `_log_vectorize` attribute.
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded unchanged to every `check_file` call.

    Returns
    -------
    result: malaya.spell.Transformer class

    Raises
    ------
    ValueError
        if `model` has no `_log_vectorize` attribute.
    """
    # Validate the model before touching any file so a wrong argument
    # fails immediately.
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must have `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    # The tokenizer stays None unless sentence piece filtering was requested.
    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = SentencePieceTokenizer(vocab_file=vocab,
                                           spm_model_file=vocab_model)

    # JSON text is UTF-8; never depend on the platform's locale encoding.
    with open(PATH_NGRAM[1]['model'], encoding='utf-8') as fopen:
        corpus = json.load(fopen)
    return Transformer(model, corpus, tokenizer)

コード例 #2

0

ファイルを表示

ファイル: spell.py プロジェクト: lantip/Malaya

def probability(sentence_piece: bool = False, **kwargs):
    """
    Load the corpus at `PATH_NGRAM[1]['model']` and build a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded unchanged to every `check_file` call.

    Returns
    -------
    result: malaya.spell.Probability class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    # The tokenizer stays None unless sentence piece filtering was requested.
    tokenizer = None

    if sentence_piece:
        check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
                   **kwargs)

        vocab = PATH_NGRAM['sentencepiece']['vocab']
        vocab_model = PATH_NGRAM['sentencepiece']['model']
        tokenizer = SentencePieceTokenizer(vocab_file=vocab,
                                           spm_model_file=vocab_model)

    # JSON text is UTF-8; never depend on the platform's locale encoding.
    with open(PATH_NGRAM[1]['model'], encoding='utf-8') as fopen:
        corpus = json.load(fopen)
    return Probability(corpus, tokenizer)

コード例 #3

0

ファイルを表示

def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load a Transformer Constituency Parsing model (transfer learning Transformer + self attentive parsing).

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture, lower-cased before lookup. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it depends on the machine.

    Returns
    -------
    result : malaya.model.tf.Constituency class

    Raises
    ------
    ValueError
        if `model` is not listed in `_transformer_availability`.
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.constituency.available_transformer()`.'
        )

    # Files that must be present for the selected architecture.
    required_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module='constituency',
        keys=required_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        graph,
        ['input_ids', 'word_end_mask'],
        ['charts', 'tags'],
        extra={'vectorizer': _vectorizer_mapping[model]},
    )

    # Every name containing 'bert' (bert, tiny-bert, albert, tiny-albert)
    # is handled in 'bert' mode; everything else in 'xlnet' mode.
    if 'bert' in model:
        mode = 'bert'
    else:
        mode = 'xlnet'

    session = generate_session(graph=graph, **kwargs)
    return Constituency(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=session,
        tokenizer=tokenizer,
        dictionary=settings.constituency,
        mode=mode,
    )

コード例 #4

0

ファイルを表示

ファイル: __init__.py プロジェクト: huseinzol05/malaya

def load(model: str = 'xlnet', pool_mode: str = 'last', **kwargs):
    """
    Load xlnet model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture, lower-cased and used as key into `PATH_XLNET`. Allowed values:

        * ``'xlnet'`` - XLNET architecture from google.
    pool_mode : str, optional (default='last')
        Model logits architecture supported (case-insensitive). Allowed values:

        * ``'last'`` - last of the sequence.
        * ``'first'`` - first of the sequence.
        * ``'mean'`` - mean of the sequence.
        * ``'attn'`` - attention of the sequence.

    Returns
    -------
    result : malaya.transformers.xlnet.Model class

    Raises
    ------
    Exception
        if `pool_mode` is not one of the allowed values.
    """

    model = model.lower()
    pool_mode = pool_mode.lower()

    if pool_mode not in ('last', 'first', 'mean', 'attn'):
        raise Exception(
            "pool_mode not supported, only support ['last', 'first', 'mean', 'attn']"
        )

    local_paths = PATH_XLNET[model]
    path = check_file(local_paths['model'], S3_PATH_XLNET[model], **kwargs)

    directory = local_paths['directory']
    xlnet_checkpoint = os.path.join(directory, 'model.ckpt')

    # Unpack the archive only when the checkpoint is not on disk yet.
    if not os.path.exists(xlnet_checkpoint):
        import tarfile

        with tarfile.open(path['model']) as archive:
            archive.extractall(path=local_paths['path'])

    tokenizer = SentencePieceTokenizer(
        vocab_file=os.path.join(directory, 'sp10m.cased.v9.vocab'),
        spm_model_file=os.path.join(directory, 'sp10m.cased.v9.model'),
    )
    xlnet_config = xlnet_lib.XLNetConfig(
        json_path=os.path.join(directory, 'config.json'))

    loaded = Model(xlnet_config,
                   tokenizer,
                   xlnet_checkpoint,
                   pool_mode=pool_mode,
                   **kwargs)
    # Restore the checkpoint weights into the session owned by the model.
    loaded._saver.restore(loaded._sess, xlnet_checkpoint)
    return loaded

コード例 #5

0

ファイルを表示

def _transformer(model,
                 bert_model,
                 xlnet_model,
                 quantized=False,
                 siamese=False,
                 **kwargs):
    """
    Load a Transformer similarity model and wrap it in the matching class.

    Parameters
    ----------
    model : str
        Model architecture, lower-cased before lookup; must be listed in
        `_transformer_availability`.
    bert_model : callable
        class instantiated for 'albert', 'bert', 'tiny-albert' and 'tiny-bert'.
    xlnet_model : callable
        class instantiated for 'xlnet' and 'alxlnet'.
    quantized : bool, optional (default=False)
        forwarded to `check_file`.
    siamese : bool, optional (default=False)
        if True, use a fixed per-family vectorizer node instead of the
        entry from `_vectorizer_mapping`.

    Returns
    -------
    result : instance of `bert_model` or `xlnet_model`

    Raises
    ------
    ValueError
        if `model` is not listed in `_transformer_availability`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    required_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module='similarity',
        keys=required_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    is_bert_family = model in ('albert', 'bert', 'tiny-albert', 'tiny-bert')
    is_xlnet_family = model in ('xlnet', 'alxlnet')

    if is_bert_family:
        selected_model = bert_model
    elif is_xlnet_family:
        selected_model = xlnet_model

    if not siamese:
        selected_node = _vectorizer_mapping[model]
    elif is_bert_family:
        selected_node = 'import/bert/pooler/dense/BiasAdd:0'
    elif is_xlnet_family:
        # The spelling 'sequnece' is part of the node name and is kept
        # verbatim; presumably it matches the exported graph - verify
        # before "correcting" it.
        selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        graph,
        ['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        ['logits'],
        extra={'vectorizer': selected_node},
    )

    return selected_model(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )

コード例 #6

0

ファイルを表示

ファイル: __init__.py プロジェクト: madamroziyani/malaya

def load(model: str = 'base', **kwargs):
    """
    Load bert model.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture, lower-cased and used as key into `PATH_BERT`.
        Documented values:

        * ``'bert'`` - base bert-bahasa released by Malaya.
        * ``'tiny-bert'`` - tiny bert-bahasa released by Malaya.

        NOTE(review): the default ``'base'`` is not among the documented
        values - confirm it is a valid `PATH_BERT` key.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result : malaya.transformers.bert.Model class
    """

    from malaya.path import PATH_BERT, S3_PATH_BERT
    from malaya.function import check_file

    model = model.lower()
    check_file(PATH_BERT[model]['model'], S3_PATH_BERT[model], **kwargs)

    # Unpack the archive only when the checkpoint is not on disk yet.
    if not os.path.exists(PATH_BERT[model]['directory'] + 'model.ckpt'):
        import tarfile

        with tarfile.open(PATH_BERT[model]['model']['model']) as tar:
            tar.extractall(path=PATH_BERT[model]['path'])

    import sentencepiece as spm
    from malaya.text.bpe import SentencePieceTokenizer

    bert_checkpoint = PATH_BERT[model]['directory'] + 'model.ckpt'
    vocab_model = PATH_BERT[model]['directory'] + 'sp10m.cased.bert.model'
    vocab = PATH_BERT[model]['directory'] + 'sp10m.cased.bert.vocab'
    bert_config = PATH_BERT[model]['directory'] + 'config.json'

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(vocab_model)

    # The vocab holds non-ASCII pieces (e.g. the '▁' marker), so it must be
    # decoded as UTF-8 regardless of the platform's locale encoding.
    with open(vocab, encoding='utf-8') as fopen:
        # Split on '\n' only (not splitlines()) so pieces that are themselves
        # unusual line-break characters stay intact; the final element is
        # dropped, which assumes the file ends with a newline.
        lines = fopen.read().split('\n')[:-1]
    # First tab-separated column is the piece; its line index is its id.
    v = {line.split('\t')[0]: no for no, line in enumerate(lines)}
    tokenizer = SentencePieceTokenizer(v, sp_model)

    bert_config = modeling.BertConfig.from_json_file(bert_config)
    model = Model(bert_config, tokenizer)
    # Restore the checkpoint weights into the session owned by the model.
    model._saver.restore(model._sess, bert_checkpoint)
    return model

コード例 #7

0

ファイルを表示

def transformer_squad(module, model='bert', quantized=False, **kwargs):
    """
    Load a Transformer SQUAD graph and wrap it in a `SQUAD` instance.

    Parameters
    ----------
    module : str
        module name forwarded to `check_file` and to `SQUAD`.
    model : str, optional (default='bert')
        Model architecture; used as key into `MODEL_VOCAB` and `MODEL_BPE`.
    quantized : bool, optional (default=False)
        forwarded to `check_file`.

    Returns
    -------
    result : SQUAD instance
    """
    required_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module=module,
        keys=required_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']
    # The xlnet-style graphs take one additional placeholder.
    if model in ('xlnet', 'alxlnet'):
        inputs = inputs + ['Placeholder_4']

    outputs = [
        'start_top_log_probs',
        'start_top_index',
        'end_top_log_probs',
        'end_top_index',
        'cls_logits',
        'logits_vectorize',
    ]
    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(graph, inputs, outputs)

    # Every name containing 'bert' runs in 'bert' mode, the rest in 'xlnet'.
    mode = 'bert' if 'bert' in model else 'xlnet'

    session = generate_session(graph=graph, **kwargs)
    return SQUAD(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=session,
        tokenizer=tokenizer,
        module=module,
        mode=mode,
        length=LENGTHS[mode],
    )

コード例 #8

0

ファイルを表示

ファイル: tag.py プロジェクト: lantip/Malaya

def transformer(module, model='xlnet', quantized=False, tok=None, **kwargs):
    """
    Load a Transformer tagging graph and wrap it in the matching tagging class.

    Parameters
    ----------
    module : str
        module name forwarded to `check_file`; also the key into
        `TAGGING_SETTING`.
    model : str, optional (default='xlnet')
        Model architecture. Supported values: 'albert', 'bert',
        'tiny-albert', 'tiny-bert', 'xlnet', 'alxlnet'.
    quantized : bool, optional (default=False)
        forwarded to `check_file`.
    tok : optional (default=None)
        passed through unchanged to the tagging class.

    Returns
    -------
    result : TaggingBERT or TaggingXLNET instance

    Raises
    ------
    ValueError
        if `model` passes the file lookup but belongs to neither
        supported family.
    """
    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
            'setting': TAGGING_SETTING[module],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    # JSON text is UTF-8; never depend on the platform's locale encoding.
    with open(path['setting'], encoding='utf-8') as fopen:
        nodes = json.load(fopen)

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        selected_model = TaggingBERT
    elif model in ['xlnet', 'alxlnet']:
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        selected_model = TaggingXLNET
    else:
        # Previously this case fell through and surfaced later as an
        # UnboundLocalError on `inputs`; fail with a clear message instead.
        raise ValueError(
            'model not supported, only albert, bert, tiny-albert, tiny-bert, xlnet and alxlnet are supported.'
        )

    outputs = ['logits']
    tokenizer = SentencePieceTokenizer(vocab_file=path['vocab'],
                                       spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(g,
                                              inputs,
                                              outputs,
                                              extra=vectorizer)

    return selected_model(input_nodes=input_nodes,
                          output_nodes=output_nodes,
                          sess=generate_session(graph=g, **kwargs),
                          tokenizer=tokenizer,
                          settings=nodes,
                          tok=tok)

コード例 #9

0

ファイルを表示

def load(model: str = 'base', **kwargs):
    """
    Load bert model.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture, lower-cased and used as key into `PATH_BERT`.
        Documented values:

        * ``'bert'`` - base bert-bahasa released by Malaya.
        * ``'tiny-bert'`` - tiny bert-bahasa released by Malaya.

        NOTE(review): the default ``'base'`` is not among the documented
        values - confirm it is a valid `PATH_BERT` key.
    **kwargs:
        forwarded to `check_file` and to `Model`.

    Returns
    -------
    result : malaya.transformers.bert.Model class
    """

    from malaya.path import PATH_BERT, S3_PATH_BERT
    from malaya.function import check_file

    model = model.lower()
    local_paths = PATH_BERT[model]
    check_file(local_paths['model'], S3_PATH_BERT[model], **kwargs)

    # Paths are built by plain concatenation, so 'directory' is expected to
    # already end with a path separator.
    directory = local_paths['directory']
    bert_checkpoint = directory + 'model.ckpt'

    # Unpack the archive only when the checkpoint is not on disk yet.
    if not os.path.exists(bert_checkpoint):
        import tarfile

        with tarfile.open(local_paths['model']['model']) as archive:
            archive.extractall(path=local_paths['path'])

    tokenizer = SentencePieceTokenizer(
        vocab_file=directory + 'sp10m.cased.bert.vocab',
        spm_model_file=directory + 'sp10m.cased.bert.model',
    )
    config = modeling.BertConfig.from_json_file(directory + 'config.json')

    loaded = Model(config, tokenizer, **kwargs)
    # Restore the checkpoint weights into the session owned by the model.
    loaded._saver.restore(loaded._sess, bert_checkpoint)
    return loaded

コード例 #10

0

ファイルを表示

def transformer(version: str = 'v2',
                model: str = 'xlnet',
                quantized: bool = False,
                **kwargs):
    """
    Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.

    Parameters
    ----------
    version : str, optional (default='v2')
        Version supported. Allowed values:

        * ``'v1'`` - version 1, maintain for knowledge graph.
        * ``'v2'`` - Trained on bigger dataset, better version.

    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.DependencyBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`.

    Raises
    ------
    ValueError
        if `model` is not available for the requested `version`.
    """

    version = _validate_version(version)
    model = model.lower()
    if model not in _transformer_availability[version]:
        # Must be an f-string, otherwise '{version}' is shown literally.
        raise ValueError(
            f"model not supported, please check supported models from `malaya.dependency.available_transformer(version='{version}')`."
        )

    # 'v1' uses the plain module name and minus=1; every other version uses
    # a version-suffixed module name and minus=2.
    module = 'dependency'
    minus = 1
    if version != 'v1':
        module = f'{module}-{version}'
        minus = 2

    path = check_file(
        file=model,
        module=module,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if model in ['bert', 'tiny-bert', 'albert', 'tiny-albert']:
        inputs = ['Placeholder']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}

        selected_model = DependencyBERT

    if model in ['xlnet', 'alxlnet']:
        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        vectorizer = {'vectorizer': 'import/transpose_3:0'}

        selected_model = DependencyXLNET

    outputs = ['logits', 'heads_seq']
    tokenizer = SentencePieceTokenizer(vocab_file=path['vocab'],
                                       spm_model_file=path['tokenizer'])
    input_nodes, output_nodes = nodes_session(g,
                                              inputs,
                                              outputs,
                                              extra=vectorizer)

    # NOTE(review): `label` is not defined inside this function; it is
    # presumably a module-level name - confirm it exists in the full file.
    return selected_model(input_nodes=input_nodes,
                          output_nodes=output_nodes,
                          sess=generate_session(graph=g, **kwargs),
                          tokenizer=tokenizer,
                          settings=label,
                          minus=minus)

コード例 #11

0

ファイルを表示

ファイル: keyword_extraction.py プロジェクト: huseinzol05/malaya

def transformer(model: str = 'bert', quantized: bool = False, **kwargs):
    """
    Load Transformer keyword similarity model.

    Parameters
    ----------
    model : str, optional (default='bert')
        Model architecture, lower-cased before lookup. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it depends on the machine.

    Returns
    -------
    result: model
        List of model classes:

        * if `bert` in model, will return `malaya.model.bert.KeyphraseBERT`.
        * if `xlnet` in model, will return `malaya.model.xlnet.KeyphraseXLNET`.

    Raises
    ------
    ValueError
        if `model` is not listed in `_transformer_availability`.
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.keyword_extraction.available_transformer()`.'
        )

    required_files = {
        'model': 'model.pb',
        'vocab': MODEL_VOCAB[model],
        'tokenizer': MODEL_BPE[model],
    }
    path = check_file(
        file=model,
        module='keyword-extraction',
        keys=required_files,
        quantized=quantized,
        **kwargs,
    )
    graph = load_graph(path['model'], **kwargs)

    # Each family has its own placeholder count, summary node and class.
    if model in ('bert', 'tiny-bert'):
        inputs = [
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
        ]
        summary_node = 'bert/summary'
        selected_class = KeyphraseBERT
    elif model in ('xlnet', 'alxlnet'):
        inputs = [
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
            'Placeholder_4',
            'Placeholder_5',
        ]
        summary_node = 'xlnet/summary'
        selected_class = KeyphraseXLNET

    tokenizer = SentencePieceTokenizer(
        vocab_file=path['vocab'],
        spm_model_file=path['tokenizer'],
    )
    input_nodes, output_nodes = nodes_session(
        graph, inputs, ['logits', summary_node])

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )

コード例 #12

0

ファイルを表示

def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    **kwargs,
):
    """
    Augment a formal word into social media forms: deliberate typos,
    randomly deleted vowels and slang subword replacements.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each samples generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each samples generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel, strictly between 0 and 1.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: list
        unique candidates in arbitrary order (built from a set).

    Raises
    ------
    ValueError
        if `prob_delete_vowel` is not in the open interval (0, 1), or if
        the word is empty after cleaning.
    """

    if not 0 < prob_delete_vowel < 1:
        raise ValueError(
            'prob_delete_vowel must be bigger than 0 and less than 1')
    word = simple_textcleaning(word)
    if not word:
        raise ValueError('word is too short to augment shortform.')

    check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
               **kwargs)

    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = SentencePieceTokenizer(vocab_file=vocab,
                                       spm_model_file=vocab_model)

    # Pairs instead of a dict: 'd' and 'g' each have two substitutions, and
    # a dict literal with repeated keys silently keeps only the last one.
    replace_consonants = (
        ('n', 'm'),
        ('t', 'y'),
        ('r', 't'),
        ('g', 'h'),
        ('j', 'k'),
        ('k', 'l'),
        ('d', 's'),
        ('d', 'f'),
        ('g', 'f'),
        ('b', 'n'),
    )

    replace_vowels = (('u', 'i'), ('i', 'o'), ('o', 'u'))

    results = [word]

    # `consonants` and `vowels` are not defined in this function; they are
    # presumably module-level character collections - confirm in the file.
    if len(word) > 1:

        # consonant + final 'a' -> final 'e'
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        # leading 'f' with trailing 'r' -> leading 'p'
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        # consonant + final vowel -> append 'k'
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        # vowel + final 'h' -> drop the 'h'
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        # consonant + final 'ar' -> final 'o'
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        # leading 'h' + vowel + consonant -> drop the 'h'
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        # consonant + final 'ng' -> final 'g'
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        # 'ng' right after the first character -> drop the 'n'
        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants:
            for r in results:
                # Apply the substitution in both directions.
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels:
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    # Drop each vowel with probability `prob_delete_vowel` (the comparison
    # used to be inverted, deleting with probability 1 - prob_delete_vowel).
    result_deleted = []
    for s in results:
        kept = [
            c for c in s
            if not (c in vowels and random.random() < prob_delete_vowel)
        ]
        result_deleted.append(''.join(kept))
    results.extend(result_deleted)

    # Keep a candidate when it tokenizes to a single piece, or when it does
    # not start with a bare '▁' piece and every piece has >= 3 characters.
    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if not t:
            # Nothing to inspect (e.g. every character was deleted);
            # indexing t[0] below would raise IndexError.
            continue
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any(len(w) < 3 for w in t):
            continue
        filtered.append(s)

    return list(set(filtered))