Example #1
def test_unhashed_features_other():
    """ Check that when there are several candidates, they do not appear in "other"
    if at least one is found. If none are found, they should appear in "other"
    together.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 3))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[
                FW([{
                    'name': 'foo',
                    'sign': 1
                }, {
                    'name': 'see',
                    'sign': -1
                }], 2),
                FW([{
                    'name': 'zoo',
                    'sign': 1
                }, {
                    'name': 'bar',
                    'sign': 1
                }], 3),
            ],
            neg=[
                FW([{
                    'name': 'ree',
                    'sign': 1
                }, {
                    'name': 'tre',
                    'sign': 1
                }], -4),
            ],
        ))
    assert w_spans == WeightedSpans([
        DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[
                ('see', [(2, 5)], 2),
                ('tre', [(23, 26)], -4),
                ('ree', [(24, 27)], -4),
            ],
            preserve_density=True,
        )
    ],
                                    other=FeatureWeights(
                                        pos=[
                                            FW([{
                                                'name': 'zoo',
                                                'sign': 1
                                            }, {
                                                'name': 'bar',
                                                'sign': 1
                                            }], 3),
                                        ],
                                        neg=[FW(hl_in_text, -2)],
                                    ))
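These snippets depend on shared test fixtures that the listing itself never shows: the FW shorthand, the hl_in_text constant, and the vectorizer/eli5 imports. A minimal preamble along the lines below should make them runnable; the exact import paths are an assumption and vary between eli5 versions (older versions also use the flat WeightedSpans(analyzer=..., document=..., weighted_spans=...) form that appears in some examples further down).

# Assumed preamble for these test snippets; import paths are a guess and
# may differ between eli5 versions.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

from eli5.base import (
    DocWeightedSpans, FeatureWeight, FeatureWeights, TargetExplanation,
    WeightedSpans)
from eli5.formatters import FormattedFeatureName
from eli5.formatters.text_helpers import (
    PreparedWeightedSpans, prepare_weighted_spans)
from eli5.sklearn.text import get_weighted_spans

FW = FeatureWeight  # shorthand used throughout the tests
# Aggregate entry for all features that were highlighted in the text:
hl_in_text = FormattedFeatureName('Highlighted in text (sum)')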
Example #2
def test_weighted_spans_word():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('lemon', 4),
                            FW('bias', 8)],
                       neg=[FW('tree', -6)],
                       neg_remaining=10))
    assert w_spans == WeightedSpans([
        DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('lemon', [(17, 22)], 4),
                   ('tree', [(23, 27)], -6)],
            preserve_density=False,
        )
    ],
                                    other=FeatureWeights(
                                        pos=[FW('bias', 8),
                                             FW(hl_in_text, 0)],
                                        neg=[],
                                        neg_remaining=10,
                                    ))
Example #3
def test_no_weighted_spans():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(doc, vec, FeatureWeights(pos=[], neg=[]))
    assert w_spans == WeightedSpans(analyzer='char',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[],
                                    other=FeatureWeights(pos=[], neg=[]))
Example #4
def test_weighted_spans_feature_union():
    doc = {'text': 'I see: a leaning lemon tree', 'url': 'http://example.com'}
    vec = FeatureUnion([
        ('text',
         CountVectorizer(analyzer='word',
                         preprocessor=lambda x: x['text'].lower())),
        ('url',
         CountVectorizer(analyzer='char',
                         ngram_range=(4, 4),
                         preprocessor=lambda x: x['url'])),
    ])
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[
            FW('text__see', 2),
            FW('text__lemon', 4),
            FW('bias', 8),
            FW('url__ampl', 10),
            FW('url__mple', 7),
        ],
                       neg=[
                           FW('text__tree', -6),
                           FW('url__exam', -10),
                       ],
                       neg_remaining=10))
    assert w_spans == WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[('see', [(2, 5)], 2), ('lemon', [(17, 22)], 4),
                       ('tree', [(23, 27)], -6)],
                preserve_density=False,
                vec_name='text',
            ),
            DocWeightedSpans(
                document='http://example.com',
                spans=[('exam', [(7, 11)], -10), ('ampl', [(9, 13)], 10),
                       ('mple', [(10, 14)], 7)],
                preserve_density=True,
                vec_name='url',
            ),
        ],
        other=FeatureWeights(
            pos=[
                FW('bias', 8),
                FW(FormattedFeatureName('url: Highlighted in text (sum)'), 7),
                FW(FormattedFeatureName('text: Highlighted in text (sum)'), 0),
            ],
            neg=[],
            neg_remaining=10,
        ))
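The text__ / url__ prefixes follow scikit-learn's FeatureUnion naming convention (<transformer name>__<feature>); get_weighted_spans appears to split on that prefix to route each weight to the matching sub-vectorizer and its own sub-document, which is why the unprefixed 'bias' feature can only end up in other. A small, hypothetical illustration of the split (not eli5 code):

# Hypothetical helper illustrating the FeatureUnion naming convention.
def split_union_feature(name):
    """Split '<transformer>__<feature>' into its two parts."""
    vec_name, feature = name.split('__', 1)
    return vec_name, feature

assert split_union_feature('text__lemon') == ('text', 'lemon')
assert split_union_feature('url__exam') == ('url', 'exam')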
Example #5
File: text.py Project: soprof/eli5
def get_weighted_spans(doc, vec, feature_weights):
    # type: (Any, Any, FeatureWeights) -> Optional[WeightedSpans]
    """ If possible, return a dict with preprocessed document and a list
    of spans with weights, corresponding to features in the document.
    """
    if isinstance(vec, FeatureUnion):
        return _get_weighted_spans_from_union(doc, vec, feature_weights)
    else:
        result = _get_doc_weighted_spans(doc, vec, feature_weights)
        if result is not None:
            found_features, doc_weighted_spans = result
            return WeightedSpans(
                [doc_weighted_spans],
                other=_get_other(feature_weights, [('', found_features)]),
            )
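Note that the function only returns a value inside the "if result is not None" branch, so it falls through to None for vectorizers it cannot analyze (the older version in the next example returns early for anything that is not a VectorizerMixin), and callers should be prepared for that. A minimal usage sketch, assuming the preamble shown after Example #1:

# Minimal usage sketch; assumes the imports/fixtures from the preamble above.
doc = 'I see: a leaning lemon tree'
vec = CountVectorizer(analyzer='word')
vec.fit([doc])
w_spans = get_weighted_spans(
    doc, vec, FeatureWeights(pos=[FW('lemon', 4)], neg=[FW('tree', -6)]))
if w_spans is None:
    pass  # unsupported vectorizer: nothing can be highlighted
else:
    for dws in w_spans.docs_weighted_spans:
        print(dws.document, dws.spans)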
Example #6
def get_weighted_spans(doc, vec, feature_weights):
    """ If possible, return a dict with preprocessed document and a list
    of spans with weights, corresponding to features in the document.
    """
    if isinstance(vec, InvertableHashingVectorizer):
        vec = vec.vec
    if not isinstance(vec, VectorizerMixin):
        return

    def _get_features(feature):
        if isinstance(feature, list):
            return [f['name'] for f in feature]
        else:
            return [feature]

    # (group, idx) is a feature key here
    feature_weights_dict = {
        f: (fw.weight, (group, idx))
        for group in ['pos', 'neg']
        for idx, fw in enumerate(getattr(feature_weights, group))
        for f in _get_features(fw.feature)
    }

    span_analyzer, preprocessed_doc = _build_span_analyzer(doc, vec)
    if span_analyzer is None:
        return

    weighted_spans = []
    found_features = {}
    for spans, feature in span_analyzer(preprocessed_doc):
        try:
            weight, key = feature_weights_dict[feature]
        except KeyError:
            pass
        else:
            weighted_spans.append((feature, spans, weight))
            found_features[key] = weight

    return WeightedSpans(
        analyzer=vec.analyzer,
        document=preprocessed_doc,
        weighted_spans=weighted_spans,
        other=_get_other(feature_weights, feature_weights_dict,
                         found_features),
    )
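The heart of this version is the lookup step: every feature emitted by the span analyzer is checked against feature_weights_dict, and only the matches become weighted spans, while found_features records which (group, index) entries were consumed. A standalone illustration of that matching loop with plain data (not eli5 objects):

# Standalone sketch of the matching loop above, using plain tuples.
feature_weights_dict = {
    'see': (2, ('pos', 0)),    # weight, (group, index) key
    'tree': (-6, ('neg', 0)),
}
# Roughly what a word-level span analyzer could yield for the test document:
analyzed = [([(2, 5)], 'see'), ([(7, 8)], 'a'), ([(23, 27)], 'tree')]

weighted_spans, found_features = [], {}
for spans, feature in analyzed:
    if feature in feature_weights_dict:
        weight, key = feature_weights_dict[feature]
        weighted_spans.append((feature, spans, weight))
        found_features[key] = weight

assert weighted_spans == [('see', [(2, 5)], 2), ('tree', [(23, 27)], -6)]
assert found_features == {('pos', 0): 2, ('neg', 0): -6}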
Example #7
def test_weighted_spans_word_stopwords():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', stop_words='english')
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('lemon', 5),
                            FW('bias', 8)],
                       neg=[FW('tree', -6)]))
    assert w_spans == WeightedSpans(analyzer='word',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[('lemon', [(17, 22)], 5),
                                                    ('tree', [(23, 27)], -6)],
                                    other=FeatureWeights(
                                        pos=[FW('bias', 8),
                                             FW('see', 2)],
                                        neg=[FW(hl_in_text, -1)],
                                    ))
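With stop_words='english' the analyzer never emits 'see', so it stays in other; the hl_in_text entry carries the sum of the weights that were matched in the text, here 5 + (-6) = -1. The same bookkeeping explains the next example, where all four features are found and the sum is 2 + (-6) + 8 + 5 = 9.

# The aggregate 'Highlighted in text (sum)' weight is the sum of matched weights.
matched = [('lemon', [(17, 22)], 5), ('tree', [(23, 27)], -6)]
assert sum(weight for _, _, weight in matched) == -1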
Example #8
def test_weighted_spans_char():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('a le', 5),
                            FW('on ', 8)],
                       neg=[FW('lem', -6)]))
    assert w_spans == WeightedSpans(analyzer='char',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[('see', [(2, 5)], 2),
                                                    ('lem', [(17, 20)], -6),
                                                    ('on ', [(20, 23)], 8),
                                                    ('a le', [(7, 11)], 5)],
                                    other=FeatureWeights(
                                        pos=[FW(hl_in_text, 9)],
                                        neg=[],
                                    ))
Example #9
def test_weighted_spans_char_wb():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char_wb', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('a le', 5),
                            FW('on ', 8)],
                       neg=[FW('lem', -6), FW(' lem', -4)]))
    assert w_spans == WeightedSpans([
        DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('lem', [(17, 20)], -6),
                   ('on ', [(20, 23)], 8), (' lem', [(16, 20)], -4)],
            preserve_density=True,
        )
    ],
                                    other=FeatureWeights(
                                        pos=[FW('a le', 5),
                                             FW(hl_in_text, 0)],
                                        neg=[],
                                    ))
Example #10
def test_weighted_spans_word_bigrams():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', ngram_range=(1, 2))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2),
                 FW('leaning lemon', 5),
                 FW('lemon tree', 8)],
            neg=[FW('tree', -6)]))
    assert w_spans == WeightedSpans(analyzer='word',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[
                                        ('see', [(2, 5)], 2),
                                        ('tree', [(23, 27)], -6),
                                        ('leaning lemon', [(9, 16),
                                                           (17, 22)], 5),
                                        ('lemon tree', [(17, 22), (23, 27)], 8)
                                    ],
                                    other=FeatureWeights(
                                        pos=[FW(hl_in_text, 9)],
                                        neg=[],
                                    ))
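A word n-gram that covers more than one token maps to several (start, end) character ranges, one per token, which is why 'leaning lemon' and 'lemon tree' each carry two spans; the offsets index into the lowercased document:

doc = 'i see: a leaning lemon tree'
assert doc[9:16] == 'leaning' and doc[17:22] == 'lemon'   # 'leaning lemon'
assert doc[17:22] == 'lemon' and doc[23:27] == 'tree'     # 'lemon tree'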
def test_prepare_weighted_spans():
    targets = [
        TargetExplanation(target='one',
                          feature_weights=FeatureWeights(pos=[], neg=[]),
                          weighted_spans=WeightedSpans(docs_weighted_spans=[
                              DocWeightedSpans(
                                  document='ab',
                                  spans=[
                                      ('a', [(0, 1)], 1.5),
                                      ('b', [(1, 2)], 2.5),
                                  ],
                              ),
                              DocWeightedSpans(
                                  document='xy',
                                  spans=[
                                      ('xy', [(0, 2)], -4.5),
                                  ],
                              )
                          ])),
        TargetExplanation(
            target='two',
            feature_weights=FeatureWeights(pos=[], neg=[]),
            weighted_spans=WeightedSpans(
                docs_weighted_spans=[
                    DocWeightedSpans(
                        document='abc',
                        spans=[
                            ('a', [(0, 1)], 0.5),
                            ('c', [(2, 3)], 3.5),
                        ],
                    ),
                    DocWeightedSpans(
                        document='xz',
                        spans=[
                            # char_wb at the start of the document
                            (' xz', [(-1, 2)], 1.5),
                        ],
                    )
                ], )),
    ]
    assert prepare_weighted_spans(targets, preserve_density=False) == [
        [
            PreparedWeightedSpans(
                targets[0].weighted_spans.docs_weighted_spans[0],
                char_weights=np.array([1.5, 2.5]),
                weight_range=3.5),
            PreparedWeightedSpans(
                targets[0].weighted_spans.docs_weighted_spans[1],
                char_weights=np.array([-4.5, -4.5]),
                weight_range=4.5),
        ],
        [
            PreparedWeightedSpans(
                targets[1].weighted_spans.docs_weighted_spans[0],
                char_weights=np.array([0.5, 0, 3.5]),
                weight_range=3.5),
            PreparedWeightedSpans(
                targets[1].weighted_spans.docs_weighted_spans[1],
                char_weights=np.array([1.5, 1.5]),
                weight_range=4.5),
        ],
    ]
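Reading the expected values: every character covered by a span gets that span's weight (the negative char_wb offset is clamped to 0), and weight_range is shared across targets per document slot, so the first documents of both targets share 3.5 and the second documents share 4.5. A rough sketch of that reading, not the eli5 implementation:

import numpy as np

def sketch_char_weights(document, spans):
    # Each covered character accumulates the span weight; negative start
    # offsets (char_wb padding) are clamped to 0.
    weights = np.zeros(len(document))
    for _feature, ranges, weight in spans:
        for start, end in ranges:
            weights[max(0, start):end] += weight
    return weights

assert sketch_char_weights(
    'abc', [('a', [(0, 1)], 0.5), ('c', [(2, 3)], 3.5)]).tolist() == [0.5, 0.0, 3.5]
assert sketch_char_weights('xz', [(' xz', [(-1, 2)], 1.5)]).tolist() == [1.5, 1.5]
# weight_range is the max absolute char weight per document slot, across targets:
assert max(abs(w) for w in [1.5, 2.5] + [0.5, 0.0, 3.5]) == 3.5   # slot 0
assert max(abs(w) for w in [-4.5, -4.5] + [1.5, 1.5]) == 4.5      # slot 1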