Example 1
def transformer(
    path,
    s3_path,
    class_name,
    label,
    model='bert',
    quantized=False,
    **kwargs,
):
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(path[model][model_path], **kwargs)

    if len(label) > 2 or class_name == 'relevancy':
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = MULTICLASS_BERT
            selected_node = 'import/dense/BiasAdd:0'
        if model in ['xlnet', 'alxlnet']:
            selected_class = MULTICLASS_XLNET
            selected_node = 'import/transpose_3:0'

    else:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = BINARY_BERT
            selected_node = 'import/dense/BiasAdd:0'
        if model in ['xlnet', 'alxlnet']:
            selected_class = BINARY_XLNET
            selected_node = 'import/transpose_3:0'

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name(selected_node),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(bert_num_layers[model], g),
            class_name=class_name,
        )

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        return selected_class(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name(selected_node),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name=class_name,
        )
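
The loader above indexes its `path` and `s3_path` arguments as nested dicts keyed by model name, with `'model'`, `'quantized'`, `'vocab'`, and `'tokenizer'` entries. A minimal sketch of the expected shape, with purely illustrative file names (the real constants live in malaya's path module and differ):

# Hypothetical `path` argument for the loader above; every file name
# here is illustrative, not malaya's actual layout.
path = {
    'bert': {
        'model': 'bert/model.pb',          # frozen inference graph
        'quantized': 'bert/quantized.pb',  # 8-bit graph, used when quantized=True
        'vocab': 'bert/vocab.txt',         # wordpiece vocabulary
        'tokenizer': 'bert/spm.model',     # sentencepiece model file
    },
}
# `s3_path` mirrors the same structure with the remote keys for check_file.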
Example 2
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer toxicity model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - BERT architecture from google.
        * ``'tiny-bert'`` - BERT architecture from google with smaller parameters.
        * ``'albert'`` - ALBERT architecture from google.
        * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters.
        * ``'xlnet'`` - XLNET architecture from google.
        * ``'alxlnet'`` - XLNET architecture from google + Malaya.

    Returns
    -------
    result : malaya.model.bert.SIGMOID_BERT class (SIGMOID_XLNET for XLNET-based models)
    """

    model = model.lower()
    if model not in _availability:
        raise Exception(
            'model not supported, please check supported models from `malaya.toxicity.available_transformer()`.'
        )

    check_file(PATH_TOXIC[model], S3_PATH_TOXIC[model], **kwargs)
    g = load_graph(PATH_TOXIC[model]['model'])

    path = PATH_TOXIC

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'],
                                                     path[model]['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return SIGMOID_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=label,  # module-level list of toxicity class names (defined outside this snippet)
            attns=_extract_attention_weights_import(bert_num_layers[model], g),
            class_name='toxic',
        )

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        return SIGMOID_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            sess=generate_session(graph=g),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name='toxic',
        )
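
Since the docstring describes a toxicity model, this version is presumably exposed as `malaya.toxicity.transformer`, so a typical call looks like the sketch below; the `predict_proba` call is an assumption about malaya's usual classifier interface and does not appear in the snippet:

import malaya

# Load the toxicity classifier; the name must pass the _availability check.
model = malaya.toxicity.transformer(model='tiny-bert')

# Assumed usage: malaya classifiers generally accept a batch of raw strings.
proba = model.predict_proba(['contoh ayat untuk diuji'])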
Example 3
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):
    """
    Load Transformer toxicity model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
        * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.

    quantized : bool, optional (default=False)
        if True, will load an 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result : malaya.model.bert.SIGMOID_BERT class (SIGMOID_XLNET for XLNET-based models)
    """

    model = model.lower()
    if model not in _transformer_availability:
        raise Exception(
            'model not supported, please check supported models from `malaya.toxicity.available_transformer()`.'
        )

    check_file(
        PATH_TOXIC[model], S3_PATH_TOXIC[model], quantized=quantized, **kwargs
    )
    if quantized:
        model_path = 'quantized'
    else:
        model_path = 'model'
    g = load_graph(PATH_TOXIC[model][model_path], **kwargs)

    path = PATH_TOXIC

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(
                path[model]['tokenizer'], path[model]['vocab']
            )
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return SIGMOID_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,  # module-level list of toxicity class names (defined outside this snippet)
            attns=_extract_attention_weights_import(bert_num_layers[model], g),
            class_name='toxic',
        )

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )

        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        return SIGMOID_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            logits_seq=g.get_tensor_by_name('import/logits_seq:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            label=label,
            attns=_extract_attention_weights_import(g),
            class_name='toxic',
        )
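
Relative to Example 2, the only behavioural addition is the `quantized` flag, which selects `PATH_TOXIC[model]['quantized']` before `load_graph`. A short sketch of loading the 8-bit variant:

import malaya

# quantized=True swaps in the 8-bit graph; as the docstring notes,
# whether this is faster depends entirely on the machine.
quantized_model = malaya.toxicity.transformer(
    model='alxlnet', quantized=True
)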
Example 4
def transformer(
    class_name,
    label,
    model='bert',
    sigmoid=False,
    quantized=False,
    **kwargs,
):
    path = check_file(
        file=model,
        module=class_name,
        keys={
            'model': 'model.pb',
            'vocab': MODEL_VOCAB[model],
            'tokenizer': MODEL_BPE[model],
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)

    if sigmoid:
        if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
            selected_class = SigmoidBERT
        if model in ['xlnet', 'alxlnet']:
            selected_class = SigmoidXLNET
    else:
        if len(label) > 2 or class_name == 'relevancy':
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = MulticlassBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = MulticlassXLNET
            if model in ['bigbird', 'tiny-bigbird']:
                selected_class = MulticlassBigBird

        else:
            if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
                selected_class = BinaryBERT
            if model in ['xlnet', 'alxlnet']:
                selected_class = BinaryXLNET

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            from malaya.transformers.bert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.bert import bert_num_layers

            tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                     path['vocab'])
        if model in ['albert', 'tiny-albert']:
            from malaya.transformers.albert import (
                _extract_attention_weights_import,
            )
            from malaya.transformers.albert import bert_num_layers
            from malaya.transformers.albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path['vocab'],
                do_lower_case=False,
                spm_model_file=path['tokenizer'],
            )

        inputs = ['Placeholder', 'Placeholder_1']
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = _extract_attention_weights_import(bert_num_layers[model],
                                                      g)

    if model in ['xlnet', 'alxlnet']:
        if model in ['xlnet']:
            from malaya.transformers.xlnet import (
                _extract_attention_weights_import,
            )
        if model in ['alxlnet']:
            from malaya.transformers.alxlnet import (
                _extract_attention_weights_import,
            )

        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']
        tokenizer = sentencepiece_tokenizer_xlnet(path['tokenizer'])
        vectorizer = {'vectorizer': 'import/transpose_3:0'}
        attention = _extract_attention_weights_import(g)

    if model in ['bigbird', 'tiny-bigbird']:
        inputs = ['Placeholder']
        tokenizer = sentencepiece_tokenizer_bert(path['tokenizer'],
                                                 path['vocab'])
        vectorizer = {'vectorizer': 'import/dense/BiasAdd:0'}
        attention = None

    outputs = ['logits', 'logits_seq']
    input_nodes, output_nodes = nodes_session(
        g,
        inputs,
        outputs,
        extra=vectorizer,
        attention={'attention': attention},
    )

    return selected_class(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=label,
        class_name=class_name,
    )
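
Example 4 folds the earlier per-task loaders into one generic entry point: `check_file` now resolves the local paths itself, and tensors are collected through `nodes_session` instead of repeated `get_tensor_by_name` calls. A task module would delegate to it roughly as below; the wrapper name and label list are illustrative assumptions, not malaya's actual code:

# Hypothetical task-module wrapper around the generic loader above.
def load_toxicity_model(model='bert', quantized=False, **kwargs):
    # Multi-label toxicity wants independent sigmoid outputs, so
    # sigmoid=True routes to SigmoidBERT / SigmoidXLNET.
    return transformer(
        class_name='toxicity',
        label=['severe toxic', 'obscene', 'insult'],  # illustrative labels
        model=model,
        sigmoid=True,
        quantized=quantized,
        **kwargs,
    )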