Beispiel #1
0
def test_unhashed_features_other():
    """ Check that when there are several candidates, they do not appear in "other"
    if at least one is found. If none are found, they should appear in "other"
    together.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 3))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[
                FW([{
                    'name': 'foo',
                    'sign': 1
                }, {
                    'name': 'see',
                    'sign': -1
                }], 2),
                FW([{
                    'name': 'zoo',
                    'sign': 1
                }, {
                    'name': 'bar',
                    'sign': 1
                }], 3),
            ],
            neg=[
                FW([{
                    'name': 'ree',
                    'sign': 1
                }, {
                    'name': 'tre',
                    'sign': 1
                }], -4),
            ],
        ))
    assert w_spans == WeightedSpans([
        DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[
                ('see', [(2, 5)], 2),
                ('tre', [(23, 26)], -4),
                ('ree', [(24, 27)], -4),
            ],
            preserve_density=True,
        )
    ],
                                    other=FeatureWeights(
                                        pos=[
                                            FW([{
                                                'name': 'zoo',
                                                'sign': 1
                                            }, {
                                                'name': 'bar',
                                                'sign': 1
                                            }], 3),
                                        ],
                                        neg=[FW(hl_in_text, -2)],
                                    ))
Beispiel #2
0
def test_weighted_spans_word():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('lemon', 4),
                            FW('bias', 8)],
                       neg=[FW('tree', -6)],
                       neg_remaining=10))
    assert w_spans == WeightedSpans([
        DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('lemon', [(17, 22)], 4),
                   ('tree', [(23, 27)], -6)],
            preserve_density=False,
        )
    ],
                                    other=FeatureWeights(
                                        pos=[FW('bias', 8),
                                             FW(hl_in_text, 0)],
                                        neg=[],
                                        neg_remaining=10,
                                    ))
Beispiel #3
0
def test_no_weighted_spans():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(doc, vec, FeatureWeights(pos=[], neg=[]))
    assert w_spans == WeightedSpans(analyzer='char',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[],
                                    other=FeatureWeights(pos=[], neg=[]))
Beispiel #4
0
def test_weighted_spans_feature_union():
    doc = {'text': 'I see: a leaning lemon tree', 'url': 'http://example.com'}
    vec = FeatureUnion([
        ('text',
         CountVectorizer(analyzer='word',
                         preprocessor=lambda x: x['text'].lower())),
        ('url',
         CountVectorizer(analyzer='char',
                         ngram_range=(4, 4),
                         preprocessor=lambda x: x['url'])),
    ])
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[
            FW('text__see', 2),
            FW('text__lemon', 4),
            FW('bias', 8),
            FW('url__ampl', 10),
            FW('url__mple', 7),
        ],
                       neg=[
                           FW('text__tree', -6),
                           FW('url__exam', -10),
                       ],
                       neg_remaining=10))
    assert w_spans == WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[('see', [(2, 5)], 2), ('lemon', [(17, 22)], 4),
                       ('tree', [(23, 27)], -6)],
                preserve_density=False,
                vec_name='text',
            ),
            DocWeightedSpans(
                document='http://example.com',
                spans=[('exam', [(7, 11)], -10), ('ampl', [(9, 13)], 10),
                       ('mple', [(10, 14)], 7)],
                preserve_density=True,
                vec_name='url',
            ),
        ],
        other=FeatureWeights(
            pos=[
                FW('bias', 8),
                FW(FormattedFeatureName('url: Highlighted in text (sum)'), 7),
                FW(FormattedFeatureName('text: Highlighted in text (sum)'), 0),
            ],
            neg=[],
            neg_remaining=10,
        ))
Beispiel #5
0
def test_weighted_spans_word_stopwords():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', stop_words='english')
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('lemon', 5),
                            FW('bias', 8)],
                       neg=[FW('tree', -6)]))
    assert w_spans == WeightedSpans(analyzer='word',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[('lemon', [(17, 22)], 5),
                                                    ('tree', [(23, 27)], -6)],
                                    other=FeatureWeights(
                                        pos=[FW('bias', 8),
                                             FW('see', 2)],
                                        neg=[FW(hl_in_text, -1)],
                                    ))
Beispiel #6
0
def test_weighted_spans_char():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('a le', 5),
                            FW('on ', 8)],
                       neg=[FW('lem', -6)]))
    assert w_spans == WeightedSpans(analyzer='char',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[('see', [(2, 5)], 2),
                                                    ('lem', [(17, 20)], -6),
                                                    ('on ', [(20, 23)], 8),
                                                    ('a le', [(7, 11)], 5)],
                                    other=FeatureWeights(
                                        pos=[FW(hl_in_text, 9)],
                                        neg=[],
                                    ))
Beispiel #7
0
def test_weighted_spans_char_wb():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char_wb', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(pos=[FW('see', 2),
                            FW('a le', 5),
                            FW('on ', 8)],
                       neg=[FW('lem', -6), FW(' lem', -4)]))
    assert w_spans == WeightedSpans([
        DocWeightedSpans(
            document='i see: a leaning lemon tree',
            spans=[('see', [(2, 5)], 2), ('lem', [(17, 20)], -6),
                   ('on ', [(20, 23)], 8), (' lem', [(16, 20)], -4)],
            preserve_density=True,
        )
    ],
                                    other=FeatureWeights(
                                        pos=[FW('a le', 5),
                                             FW(hl_in_text, 0)],
                                        neg=[],
                                    ))
Beispiel #8
0
def test_weighted_spans_word_bigrams():
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', ngram_range=(1, 2))
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2),
                 FW('leaning lemon', 5),
                 FW('lemon tree', 8)],
            neg=[FW('tree', -6)]))
    assert w_spans == WeightedSpans(analyzer='word',
                                    document='i see: a leaning lemon tree',
                                    weighted_spans=[
                                        ('see', [(2, 5)], 2),
                                        ('tree', [(23, 27)], -6),
                                        ('leaning lemon', [(9, 16),
                                                           (17, 22)], 5),
                                        ('lemon tree', [(17, 22), (23, 27)], 8)
                                    ],
                                    other=FeatureWeights(
                                        pos=[FW(hl_in_text, 9)],
                                        neg=[],
                                    ))
Beispiel #9
0
def _add_weighted_spans(doc, vec, target_expl):
    if isinstance(doc, six.string_types) and vec is not None:
        weighted_spans = get_weighted_spans(doc, vec,
                                            target_expl.feature_weights)
        if weighted_spans:
            target_expl.weighted_spans = weighted_spans
Beispiel #10
0
def test_feature_union_unsupported():
    doc = 'I see: a leaning lemon tree'
    vec = FeatureUnion([('vec', CountVectorizer(analyzer=lambda x: x))])
    vec.fit([doc])
    w_spans = get_weighted_spans(doc, vec, FeatureWeights(pos=[], neg=[]))
    assert w_spans is None