Example #1
0
def test_weighted_spans_feature_union():
    """A FeatureUnion over a dict document: each sub-vectorizer yields its own
    DocWeightedSpans (word- and char-level), and per-vectorizer highlight sums
    plus the bias end up in ``other``.
    """
    doc = {'text': 'I see: a leaning lemon tree', 'url': 'http://example.com'}
    union = FeatureUnion([
        ('text',
         CountVectorizer(analyzer='word',
                         preprocessor=lambda x: x['text'].lower())),
        ('url',
         CountVectorizer(analyzer='char',
                         ngram_range=(4, 4),
                         preprocessor=lambda x: x['url'])),
    ])
    union.fit([doc])
    feature_weights = FeatureWeights(
        pos=[
            FW('text__see', 2),
            FW('text__lemon', 4),
            FW('bias', 8),
            FW('url__ampl', 10),
            FW('url__mple', 7),
        ],
        neg=[
            FW('text__tree', -6),
            FW('url__exam', -10),
        ],
        neg_remaining=10,
    )
    result = get_weighted_spans(doc, union, feature_weights)
    expected = WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[('see', [(2, 5)], 2), ('lemon', [(17, 22)], 4),
                       ('tree', [(23, 27)], -6)],
                preserve_density=False,
                vec_name='text',
            ),
            DocWeightedSpans(
                document='http://example.com',
                spans=[('exam', [(7, 11)], -10), ('ampl', [(9, 13)], 10),
                       ('mple', [(10, 14)], 7)],
                preserve_density=True,
                vec_name='url',
            ),
        ],
        other=FeatureWeights(
            pos=[
                FW('bias', 8),
                FW(FormattedFeatureName('url: Highlighted in text (sum)'), 7),
                FW(FormattedFeatureName('text: Highlighted in text (sum)'), 0),
            ],
            neg=[],
            neg_remaining=10,
        ))
    assert result == expected
Example #2
0
    def get_doc_weighted_spans(
            self,
            doc,  # type: str
            feature_weights,  # type: FeatureWeights
            feature_fn  # type: Callable[[str], str]
    ):
        # type: (...) -> Tuple[Dict[Tuple[str, int], float], DocWeightedSpans]
        """Match this object's tokens against *feature_weights*.

        Returns the mapping of matched feature keys to weights, together
        with a :class:`DocWeightedSpans` covering the matched spans of *doc*.
        """
        weights_by_name = _get_feature_weights_dict(
            feature_weights, feature_fn)
        matched = {}
        span_entries = []
        for token_idx, (span, feature) in enumerate(
                self.text_.spans_and_tokens):
            name = self._featname(token_idx, feature)
            try:
                weight, key = weights_by_name[name]
            except KeyError:
                # Token has no corresponding feature weight; skip it.
                continue
            span_entries.append((feature, [span], weight))
            matched[key] = weight

        return matched, DocWeightedSpans(
            document=doc,
            spans=span_entries,
            preserve_density=False,
        )
def test_render_weighted_spans_word():
    """Word-level rendering, including ngrams whose weight is split over
    several (start, end) spans."""
    weighted_spans = DocWeightedSpans(
        document='i see: a leaning lemon tree',
        spans=[
            ('see', [(2, 5)], 0.2),
            ('tree', [(23, 27)], -0.6),
            ('leaning lemon', [(9, 16), (17, 22)], 0.5),
            ('lemon tree', [(17, 22), (23, 27)], 0.8)],
        preserve_density=False,
    )
    rendered = _render_weighted_spans(weighted_spans)
    print(rendered)
    expected_prefix = (
        '<span style="opacity: 0.80">i </span>'
        '<span'
        ' style="background-color: hsl(120, 100.00%, 89.21%); opacity: 0.83"'
        ' title="0.200">see</span>'
    )
    assert rendered.startswith(expected_prefix)
    # Drop the style attributes so only span structure and titles remain.
    stripped = re.sub('style=".*?"', '', rendered)
    assert stripped == (
        '<span >i </span>'
        '<span  title="0.200">see</span>'
        '<span >: a </span>'
        '<span  title="0.500">leaning</span>'
        '<span > </span>'
        '<span  title="1.300">lemon</span>'
        '<span > </span>'
        '<span  title="0.200">tree</span>'
    )
Example #4
0
def test_unhashed_features_other():
    """ Check that when there are several candidates, they do not appear in "other"
    if at least one is found. If none are found, they should appear in "other"
    together.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 3))
    vec.fit([doc])
    # Each FW carries a list of unhashed feature candidates.
    pos_weights = [
        FW([{'name': 'foo', 'sign': 1},
            {'name': 'see', 'sign': -1}], 2),
        FW([{'name': 'zoo', 'sign': 1},
            {'name': 'bar', 'sign': 1}], 3),
    ]
    neg_weights = [
        FW([{'name': 'ree', 'sign': 1},
            {'name': 'tre', 'sign': 1}], -4),
    ]
    w_spans = get_weighted_spans(
        doc, vec, FeatureWeights(pos=pos_weights, neg=neg_weights))
    expected = WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[
                    ('see', [(2, 5)], 2),
                    ('tre', [(23, 26)], -4),
                    ('ree', [(24, 27)], -4),
                ],
                preserve_density=True,
            )
        ],
        other=FeatureWeights(
            pos=[
                # Neither 'zoo' nor 'bar' was found, so the whole FW
                # stays in "other".
                FW([{'name': 'zoo', 'sign': 1},
                    {'name': 'bar', 'sign': 1}], 3),
            ],
            neg=[FW(hl_in_text, -2)],
        ))
    assert w_spans == expected
Example #5
0
def test_weighted_spans_word():
    """Word analyzer: matched tokens become spans; 'bias' and the highlight
    sum stay in "other"; neg_remaining is carried through."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
        neg=[FW('tree', -6)],
        neg_remaining=10,
    )
    w_spans = get_weighted_spans(doc, vec, weights)
    expected = WeightedSpans(
        [DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('lemon', [(17, 22)], 4),
                   ('tree', [(23, 27)], -6)],
            preserve_density=False,
        )],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW(hl_in_text, 0)],
            neg=[],
            neg_remaining=10,
        ))
    assert w_spans == expected
Example #6
0
def test_no_weighted_spans():
    """Empty feature weights yield empty spans and an empty "other"."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    result = get_weighted_spans(doc, vec, FeatureWeights(pos=[], neg=[]))
    expected = WeightedSpans(
        [DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[],
            preserve_density=True,
        )],
        other=FeatureWeights(pos=[], neg=[]))
    assert result == expected
Example #7
0
def test_override_preserve_density():
    """The preserve_density argument to the renderer overrides the flag
    stored on the DocWeightedSpans."""
    spans = DocWeightedSpans(
        document='see',
        spans=[
            ('se', [(0, 2)], 0.2),
            ('ee', [(1, 3)], 0.1),
        ],
        preserve_density=True,
    )
    rendered = _render_weighted_spans(spans, preserve_density=False)
    expected_start = (
        '<span '
        'style="background-color: hsl(120, 100.00%, 69.88%); opacity: 0.93" '
        'title="0.200">s</span>')
    assert rendered.startswith(expected_start)
def test_render_weighted_spans_zero():
    """All-zero weights must render cleanly even with numpy errors raised
    as exceptions (no division-by-zero in the weight scaling)."""
    spans = DocWeightedSpans(
        document='ab',
        spans=[('a', [(0, 1)], 0.0),
               ('b', [(1, 2)], 0.0)],
        preserve_density=False,
    )
    saved_err = np.geterr()
    np.seterr(all='raise')
    try:
        rendered = _render_weighted_spans(spans)
    finally:
        # Always restore the previous numpy error state.
        np.seterr(**saved_err)
    assert rendered == '<span style="opacity: 0.80">ab</span>'
Example #9
0
def test_render_weighted_spans_char():
    """Char ngrams: overlapping spans are rendered per character with
    density preserved."""
    spans = DocWeightedSpans(
        document='see',
        spans=[
            ('se', [(0, 2)], 0.2),
            ('ee', [(1, 3)], 0.1),
        ],
        preserve_density=True,
    )
    rendered = _render_weighted_spans(spans)
    expected = (
        '<span'
        ' style="background-color: hsl(120, 100.00%, 69.88%); opacity: 0.93"'
        ' title="0.100">s</span>'
        '<span'
        ' style="background-color: hsl(120, 100.00%, 60.00%); opacity: 1.00"'
        ' title="0.150">e</span>'
        '<span'
        ' style="background-color: hsl(120, 100.00%, 81.46%); opacity: 0.87"'
        ' title="0.050">e</span>')
    assert rendered == expected
Example #10
0
def test_weighted_spans_char():
    """Char analyzer with mixed ngram sizes: matched ngrams become spans,
    density is preserved, and their sum goes to "other"."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('a le', 5), FW('on ', 8)],
        neg=[FW('lem', -6)])
    w_spans = get_weighted_spans(doc, vec, weights)
    expected = WeightedSpans(
        [DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('lem', [(17, 20)], -6),
                   ('on ', [(20, 23)], 8), ('a le', [(7, 11)], 5)],
            preserve_density=True,
        )],
        other=FeatureWeights(
            pos=[FW(hl_in_text, 9)],
            neg=[],
        ))
    assert w_spans == expected
Example #11
0
def _get_doc_weighted_spans(
        doc,
        vec,
        feature_weights,  # type: FeatureWeights
        feature_fn=None  # type: Callable[[str], str]
):
    # type: (...) -> Optional[Tuple[FoundFeatures, DocWeightedSpans]]
    """Compute highlighted spans for *doc* under vectorizer *vec*.

    Returns ``(found_features, doc_weighted_spans)``, or ``None`` when the
    vectorizer is not supported or no span analyzer can be built for it.
    """
    if isinstance(vec, InvertableHashingVectorizer):
        # Unwrap and work with the underlying vectorizer.
        vec = vec.vec

    if hasattr(vec, 'get_doc_weighted_spans'):
        # The vectorizer can produce spans itself; delegate to it.
        return vec.get_doc_weighted_spans(doc, feature_weights, feature_fn)

    if not isinstance(vec, VectorizerMixin):
        return None

    span_analyzer, preprocessed_doc = build_span_analyzer(doc, vec)
    if span_analyzer is None:
        return None

    weights_by_feature = _get_feature_weights_dict(feature_weights,
                                                   feature_fn)
    found = {}
    collected_spans = []
    for f_spans, feature in span_analyzer(preprocessed_doc):
        try:
            weight, key = weights_by_feature[feature]
        except KeyError:
            continue
        collected_spans.append((feature, f_spans, weight))
        # XXX: this assumes feature names are unique
        found[key] = weight

    return found, DocWeightedSpans(
        document=preprocessed_doc,
        spans=collected_spans,
        preserve_density=vec.analyzer.startswith('char'),
    )
Example #12
0
def test_weighted_spans_word_bigrams():
    """Word bigrams: a single ngram feature maps to multiple highlighted
    spans (one per word)."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', ngram_range=(1, 2))
    vec.fit([doc])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('leaning lemon', 5), FW('lemon tree', 8)],
        neg=[FW('tree', -6)])
    w_spans = get_weighted_spans(doc, vec, weights)
    expected = WeightedSpans(
        [DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('tree', [(23, 27)], -6),
                   ('leaning lemon', [(9, 16), (17, 22)], 5),
                   ('lemon tree', [(17, 22), (23, 27)], 8)],
            preserve_density=False,
        )],
        other=FeatureWeights(
            pos=[FW(hl_in_text, 9)],
            neg=[],
        ))
    assert w_spans == expected
def test_prepare_weighted_spans():
    """prepare_weighted_spans produces per-document char weights; the
    asserted weight_range values are shared per document position across
    targets."""
    target_one = TargetExplanation(
        target='one',
        feature_weights=FeatureWeights(pos=[], neg=[]),
        weighted_spans=WeightedSpans(docs_weighted_spans=[
            DocWeightedSpans(
                document='ab',
                spans=[
                    ('a', [(0, 1)], 1.5),
                    ('b', [(1, 2)], 2.5),
                ],
            ),
            DocWeightedSpans(
                document='xy',
                spans=[
                    ('xy', [(0, 2)], -4.5),
                ],
            ),
        ]))
    target_two = TargetExplanation(
        target='two',
        feature_weights=FeatureWeights(pos=[], neg=[]),
        weighted_spans=WeightedSpans(docs_weighted_spans=[
            DocWeightedSpans(
                document='abc',
                spans=[
                    ('a', [(0, 1)], 0.5),
                    ('c', [(2, 3)], 3.5),
                ],
            ),
            DocWeightedSpans(
                document='xz',
                spans=[
                    # char_wb at the start of the document
                    (' xz', [(-1, 2)], 1.5),
                ],
            ),
        ]))
    targets = [target_one, target_two]
    prepared = prepare_weighted_spans(targets, preserve_density=False)
    assert prepared == [
        [
            PreparedWeightedSpans(
                target_one.weighted_spans.docs_weighted_spans[0],
                char_weights=np.array([1.5, 2.5]),
                weight_range=3.5),
            PreparedWeightedSpans(
                target_one.weighted_spans.docs_weighted_spans[1],
                char_weights=np.array([-4.5, -4.5]),
                weight_range=4.5),
        ],
        [
            PreparedWeightedSpans(
                target_two.weighted_spans.docs_weighted_spans[0],
                char_weights=np.array([0.5, 0, 3.5]),
                weight_range=3.5),
            PreparedWeightedSpans(
                target_two.weighted_spans.docs_weighted_spans[1],
                char_weights=np.array([1.5, 1.5]),
                weight_range=4.5),
        ],
    ]