def test_unhashed_features_other():
    """
    Check that when there are several candidates, they do not appear in
    "other" if at least one is found. If none are found, they should appear
    in "other" together.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 3))
    vec.fit([doc])

    # 'foo'/'see' resolve to 'see' (found); 'zoo'/'bar' match nothing;
    # 'ree'/'tre' both occur in the document.
    weights = FeatureWeights(
        pos=[
            FW([{'name': 'foo', 'sign': 1}, {'name': 'see', 'sign': -1}], 2),
            FW([{'name': 'zoo', 'sign': 1}, {'name': 'bar', 'sign': 1}], 3),
        ],
        neg=[
            FW([{'name': 'ree', 'sign': 1}, {'name': 'tre', 'sign': 1}], -4),
        ],
    )
    w_spans = get_weighted_spans(doc, vec, weights)

    expected = WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[
                    ('see', [(2, 5)], 2),
                    ('tre', [(23, 26)], -4),
                    ('ree', [(24, 27)], -4),
                ],
                preserve_density=True,
            ),
        ],
        other=FeatureWeights(
            pos=[
                FW([{'name': 'zoo', 'sign': 1},
                    {'name': 'bar', 'sign': 1}], 3),
            ],
            neg=[FW(hl_in_text, -2)],
        ),
    )
    assert w_spans == expected
def test_weighted_spans_word():
    """Word analyzer: matched features become spans; 'bias' stays in "other"."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])

    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
            neg=[FW('tree', -6)],
            neg_remaining=10,
        ))

    expected = WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[
                    ('see', [(2, 5)], 2),
                    ('lemon', [(17, 22)], 4),
                    ('tree', [(23, 27)], -6),
                ],
                preserve_density=False,
            ),
        ],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW(hl_in_text, 0)],
            neg=[],
            neg_remaining=10,
        ),
    )
    assert w_spans == expected
def test_no_weighted_spans():
    """With no feature weights at all, no spans are produced."""
    # NOTE(review): this test uses the keyword-style WeightedSpans(analyzer=...,
    # document=..., weighted_spans=...) constructor, unlike the positional
    # DocWeightedSpans-list form used by sibling tests — confirm which API
    # version this file targets.
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(doc, vec, FeatureWeights(pos=[], neg=[]))
    assert w_spans == WeightedSpans(
        analyzer='char',
        document='i see: a leaning lemon tree',
        weighted_spans=[],
        other=FeatureWeights(pos=[], neg=[]),
    )
def test_weighted_spans_feature_union():
    """FeatureUnion: each sub-vectorizer gets its own DocWeightedSpans,
    prefixed features are routed by vectorizer name, 'bias' lands in "other".
    """
    doc = {'text': 'I see: a leaning lemon tree', 'url': 'http://example.com'}
    vec = FeatureUnion([
        ('text', CountVectorizer(
            analyzer='word',
            preprocessor=lambda x: x['text'].lower())),
        ('url', CountVectorizer(
            analyzer='char',
            ngram_range=(4, 4),
            preprocessor=lambda x: x['url'])),
    ])
    vec.fit([doc])

    weights = FeatureWeights(
        pos=[
            FW('text__see', 2),
            FW('text__lemon', 4),
            FW('bias', 8),
            FW('url__ampl', 10),
            FW('url__mple', 7),
        ],
        neg=[
            FW('text__tree', -6),
            FW('url__exam', -10),
        ],
        neg_remaining=10,
    )
    w_spans = get_weighted_spans(doc, vec, weights)

    text_spans = DocWeightedSpans(
        document='i see: a leaning lemon tree',
        spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6),
        ],
        preserve_density=False,
        vec_name='text',
    )
    url_spans = DocWeightedSpans(
        document='http://example.com',
        spans=[
            ('exam', [(7, 11)], -10),
            ('ampl', [(9, 13)], 10),
            ('mple', [(10, 14)], 7),
        ],
        preserve_density=True,
        vec_name='url',
    )
    assert w_spans == WeightedSpans(
        [text_spans, url_spans],
        other=FeatureWeights(
            pos=[
                FW('bias', 8),
                FW(FormattedFeatureName('url: Highlighted in text (sum)'), 7),
                FW(FormattedFeatureName('text: Highlighted in text (sum)'), 0),
            ],
            neg=[],
            neg_remaining=10,
        ),
    )
def get_weighted_spans(doc, vec, feature_weights):
    # type: (Any, Any, FeatureWeights) -> Optional[WeightedSpans]
    """ If possible, return a dict with preprocessed document and a list
    of spans with weights, corresponding to features in the document.
    """
    # NOTE(review): another definition of get_weighted_spans appears later in
    # this file and will shadow this one at import time — confirm which
    # version is intended to survive.
    if isinstance(vec, FeatureUnion):
        # Delegate: each sub-vectorizer of the union is handled separately.
        return _get_weighted_spans_from_union(doc, vec, feature_weights)
    result = _get_doc_weighted_spans(doc, vec, feature_weights)
    if result is None:
        return None
    found_features, doc_weighted_spans = result
    return WeightedSpans(
        [doc_weighted_spans],
        other=_get_other(feature_weights, [('', found_features)]),
    )
def get_weighted_spans(doc, vec, feature_weights):
    """ If possible, return a dict with preprocessed document and a list
    of spans with weights, corresponding to features in the document.
    """
    # NOTE(review): this is the second definition of get_weighted_spans in
    # this file and shadows the earlier one — confirm which version is
    # intended to survive.
    if isinstance(vec, InvertableHashingVectorizer):
        vec = vec.vec
    if not isinstance(vec, VectorizerMixin):
        return

    def _get_features(feature):
        # A feature may be a single name or a list of {'name': ..., 'sign': ...}
        # candidate dicts (unhashed features).
        if isinstance(feature, list):
            return [f['name'] for f in feature]
        return [feature]

    # Map each candidate feature name to its weight and a (group, idx) key
    # identifying the originating FeatureWeights entry.
    feature_weights_dict = {}
    for group in ['pos', 'neg']:
        for idx, fw in enumerate(getattr(feature_weights, group)):
            for name in _get_features(fw.feature):
                feature_weights_dict[name] = (fw.weight, (group, idx))

    span_analyzer, preprocessed_doc = _build_span_analyzer(doc, vec)
    if span_analyzer is None:
        return

    weighted_spans = []
    found_features = {}
    for spans, feature in span_analyzer(preprocessed_doc):
        entry = feature_weights_dict.get(feature)
        if entry is None:
            continue
        weight, key = entry
        weighted_spans.append((feature, spans, weight))
        found_features[key] = weight

    return WeightedSpans(
        analyzer=vec.analyzer,
        document=preprocessed_doc,
        weighted_spans=weighted_spans,
        other=_get_other(feature_weights, feature_weights_dict, found_features),
    )
def test_weighted_spans_word_stopwords():
    """Stop words ('see') are not highlighted and stay in "other"."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', stop_words='english')
    vec.fit([doc])

    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('lemon', 5), FW('bias', 8)],
            neg=[FW('tree', -6)],
        ))

    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('lemon', [(17, 22)], 5),
            ('tree', [(23, 27)], -6),
        ],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW('see', 2)],
            neg=[FW(hl_in_text, -1)],
        ),
    )
    assert w_spans == expected
def test_weighted_spans_char():
    """Char n-grams of mixed sizes are all located in the document."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])

    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('a le', 5), FW('on ', 8)],
            neg=[FW('lem', -6)],
        ))

    expected = WeightedSpans(
        analyzer='char',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lem', [(17, 20)], -6),
            ('on ', [(20, 23)], 8),
            ('a le', [(7, 11)], 5),
        ],
        other=FeatureWeights(
            pos=[FW(hl_in_text, 9)],
            neg=[],
        ),
    )
    assert w_spans == expected
def test_weighted_spans_char_wb():
    """char_wb analyzer: word-boundary n-grams match; mid-word 'a le' does not."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char_wb', ngram_range=(3, 4))
    vec.fit([doc])

    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('a le', 5), FW('on ', 8)],
            neg=[FW('lem', -6), FW(' lem', -4)],
        ))

    expected = WeightedSpans(
        [
            DocWeightedSpans(
                document='i see: a leaning lemon tree',
                spans=[
                    ('see', [(2, 5)], 2),
                    ('lem', [(17, 20)], -6),
                    ('on ', [(20, 23)], 8),
                    (' lem', [(16, 20)], -4),
                ],
                preserve_density=True,
            ),
        ],
        other=FeatureWeights(
            pos=[FW('a le', 5), FW(hl_in_text, 0)],
            neg=[],
        ),
    )
    assert w_spans == expected
def test_weighted_spans_word_bigrams():
    """Bigram features map to multiple (start, end) ranges per feature."""
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', ngram_range=(1, 2))
    vec.fit([doc])

    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('leaning lemon', 5), FW('lemon tree', 8)],
            neg=[FW('tree', -6)],
        ))

    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('tree', [(23, 27)], -6),
            # Each bigram covers both of its word spans.
            ('leaning lemon', [(9, 16), (17, 22)], 5),
            ('lemon tree', [(17, 22), (23, 27)], 8),
        ],
        other=FeatureWeights(
            pos=[FW(hl_in_text, 9)],
            neg=[],
        ),
    )
    assert w_spans == expected
def test_prepare_weighted_spans():
    """Per-character weights are built per document; weight_range is shared
    across targets for documents at the same position."""
    target_one = TargetExplanation(
        target='one',
        feature_weights=FeatureWeights(pos=[], neg=[]),
        weighted_spans=WeightedSpans(docs_weighted_spans=[
            DocWeightedSpans(
                document='ab',
                spans=[
                    ('a', [(0, 1)], 1.5),
                    ('b', [(1, 2)], 2.5),
                ],
            ),
            DocWeightedSpans(
                document='xy',
                spans=[
                    ('xy', [(0, 2)], -4.5),
                ],
            ),
        ]))
    target_two = TargetExplanation(
        target='two',
        feature_weights=FeatureWeights(pos=[], neg=[]),
        weighted_spans=WeightedSpans(docs_weighted_spans=[
            DocWeightedSpans(
                document='abc',
                spans=[
                    ('a', [(0, 1)], 0.5),
                    ('c', [(2, 3)], 3.5),
                ],
            ),
            DocWeightedSpans(
                document='xz',
                spans=[
                    # char_wb at the start of the document
                    (' xz', [(-1, 2)], 1.5),
                ],
            ),
        ]))
    targets = [target_one, target_two]

    prepared = prepare_weighted_spans(targets, preserve_density=False)

    assert prepared == [
        [
            PreparedWeightedSpans(
                target_one.weighted_spans.docs_weighted_spans[0],
                char_weights=np.array([1.5, 2.5]),
                weight_range=3.5),
            PreparedWeightedSpans(
                target_one.weighted_spans.docs_weighted_spans[1],
                char_weights=np.array([-4.5, -4.5]),
                weight_range=4.5),
        ],
        [
            PreparedWeightedSpans(
                target_two.weighted_spans.docs_weighted_spans[0],
                char_weights=np.array([0.5, 0, 3.5]),
                weight_range=3.5),
            PreparedWeightedSpans(
                target_two.weighted_spans.docs_weighted_spans[1],
                char_weights=np.array([1.5, 1.5]),
                weight_range=4.5),
        ],
    ]