Example no. 1
0
def auto_reduce(model_outputs: 'np.ndarray', mask_2d: 'np.ndarray',
                model_name: str) -> 'np.ndarray':
    """
    Automatically creates a sentence embedding from its token embeddings.
        * For BERT-like models (BERT, RoBERTa, DistillBERT, Electra ...) uses embedding of first token
        * For XLM and XLNet models uses embedding of last token
        * Assumes that other models are language-model like and uses embedding of last token

    :param model_outputs: per-token embeddings produced by the model
    :param mask_2d: attention mask marking valid (non-padding) tokens
    :param model_name: model identifier string used for substring dispatch
    :return: one pooled embedding per sequence
    """
    # Substring match deliberately catches variants, e.g. 'roberta'/'distilbert'
    # both contain 'bert'.
    if 'bert' in model_name or 'electra' in model_name:
        return reduce_cls(model_outputs, mask_2d)
    # Check 'xlm' as well as 'xlnet': the docstring promises last-token
    # pooling for both, and without it XLM models fell through to the
    # default branch and emitted a spurious warning.
    if 'xlnet' in model_name or 'xlm' in model_name:
        return reduce_cls(model_outputs, mask_2d, cls_pos='tail')
    default_logger.warning(
        'Using embedding of a last token as a sequence embedding. '
        'If that is not desirable, change `pooling_strategy`')
    return reduce_cls(model_outputs, mask_2d, cls_pos='tail')
Example no. 2
0
def test_reduce_cls_tail():
    """Tail pooling must return the embedding of the last valid (unmasked) token."""
    pooled = reduce_cls(test_data, test_mask, cls_pos='tail')
    for sample, sample_mask, vector in zip(test_data, test_mask, pooled):
        # The mask is 1 for valid tokens, so its sum is the count of
        # valid tokens; the last valid token sits at index count - 1.
        last_valid = int(sum(sample_mask)) - 1
        np.testing.assert_array_equal(sample[last_valid, :], vector)
Example no. 3
0
def test_reduce_cls_head():
    """Head pooling must return the embedding of the very first token."""
    pooled = reduce_cls(test_data, test_mask, cls_pos='head')
    # The mask is not needed for head pooling; keep the 3-way zip so the
    # iteration matches the tail test, but mark the value as unused.
    for sample, _mask, vector in zip(test_data, test_mask, pooled):
        np.testing.assert_array_equal(sample[0, :], vector)