def test_LicenseMatch_score_0(self):
    """A match with empty spans on a rule of relevance 0 scores 0."""
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    rule.relevance = 0
    rule.length = 6

    empty_match = LicenseMatch(rule=rule, qspan=Span(), ispan=Span())
    score = empty_match.score()
    assert score == 0
def test_LicenseMatch_score_100_non_contiguous(self):
    """A relevance-100 match whose query span has a gap is expected to score 80.77."""
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    rule.relevance = 100
    rule.length = 42

    # The query side is split in two pieces (0-19 and 30-51) while the rule
    # side is one contiguous run.
    gapped_qspan = Span(0, 19) | Span(30, 51)
    match = LicenseMatch(rule=rule, qspan=gapped_qspan, ispan=Span(0, 41))
    score = match.score()
    assert score == 80.77
def test_merge_overlapping_matches(self):
    """Two overlapping matches of one rule merge into a single 0-6 match."""
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    first = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    second = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    merged = merge_matches([first, second])

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == merged
def test_LicenseMatch_equality_2(self):
    """Exercise rule and match equality for rules whose expressions differ only in operand order."""
    apache_or_gpl = Rule(stored_text='r1', license_expression='apache-2.0 OR gpl')
    match_1 = LicenseMatch(rule=apache_or_gpl, qspan=Span(0, 2), ispan=Span(0, 2))

    gpl_or_apache = Rule(stored_text='r2', license_expression='gpl OR apache-2.0')
    match_2 = LicenseMatch(rule=gpl_or_apache, qspan=Span(0, 2), ispan=Span(0, 2))

    # Rule-level checks: the rules and their expression strings differ, but
    # the parsed expression objects compare equal.
    assert apache_or_gpl.licensing is gpl_or_apache.licensing
    assert apache_or_gpl != gpl_or_apache
    assert apache_or_gpl.license_expression != gpl_or_apache.license_expression
    assert apache_or_gpl.license_expression_object == gpl_or_apache.license_expression_object

    simplified_1 = str(apache_or_gpl.license_expression_object.simplify())
    simplified_2 = str(gpl_or_apache.license_expression_object.simplify())
    assert simplified_1 == simplified_2

    # Match-level checks: same spans and equal expressions compare equal.
    assert match_1 == match_2
    assert not (match_1 != match_2)

    # NOTE(review): this compares the second rule with itself; possibly the
    # first rule was intended as the receiver -- confirm.
    assert gpl_or_apache.same_licensing(gpl_or_apache)
    assert match_1.qspan == match_2.qspan
    assert match_1.ispan == match_2.ispan

    # Same expression as the second rule but a different ispan: not equal.
    third_rule = Rule(stored_text='r3', license_expression='gpl OR apache-2.0')
    match_3 = LicenseMatch(rule=third_rule, qspan=Span(0, 2), ispan=Span(0, 3))
    assert match_2 != match_3

    # Same spans as the third match but a different expression: not equal.
    fourth_rule = Rule(stored_text='r3', license_expression='gpl1 OR apache-2.0')
    match_4 = LicenseMatch(rule=fourth_rule, qspan=Span(0, 2), ispan=Span(0, 3))
    assert match_3 != match_4
def test_merge_merges_duplicate_matches(self):
    """Two matches with identical rule and spans collapse to one of them."""
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    original = LicenseMatch(rule=rule, qspan=Span(0, 8), ispan=Span(0, 8))
    duplicate = LicenseMatch(rule=rule, qspan=Span(0, 8), ispan=Span(0, 8))

    merged = merge_matches([original, duplicate])

    # Either of the two inputs is an acceptable survivor.
    assert ([original] == merged) or ([duplicate] == merged)
def test_LicenseMatch_score_0_relevance(self):
    """A non-empty match on a rule of relevance 0 still scores 0."""
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 0
    rule.length = 6

    match = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    score = match.score()
    assert score == 0
def test_merge_does_not_merge_overlapping_matches_in_sequence_with_assymetric_overlap(
        self):
    """Two matches of one rule with a small, asymmetric overlap are returned unmerged."""
    rule = Rule(text_file='r1', license_expression=u'lgpl-2.0-plus')

    # Reconstructed from a merge_matches debug trace on 'lgpl-2.0-plus_9.RULE':
    #   current: '3-seq' match, qlen=126, ilen=126, hilen=20, rlen=144,
    #            qreg=(50, 200), ireg=(5, 142)
    #   next:    '2-aho' match, qlen=144, ilen=144, hilen=21, rlen=144,
    #            qreg=(198, 341), ireg=(0, 143)
    # In that trace the pair was merged into a single '3-seq 2-aho' match with
    # qreg=(50, 341) and ireg=(0, 143); this test expects them to stay apart.

    seq_qspan = Span(50, 90) | Span(92, 142) | Span(151, 182) | Span(199, 200)
    seq_ispan = (
        Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93)
        | Span(95, 100) | Span(108, 128) | Span(130, 142)
    )
    seq_hispan = (
        Span(10) | Span(14) | Span(18) | Span(24) | Span(27) | Span(52)
        | Span(57) | Span(61) | Span(65, 66) | Span(68) | Span(70)
        | Span(80) | Span(88) | Span(96) | Span(111) | Span(113)
        | Span(115) | Span(131) | Span(141)
    )
    seq_match = LicenseMatch(
        rule=rule,
        qspan=seq_qspan,
        ispan=seq_ispan,
        hispan=seq_hispan,
    )

    aho_hispan = (
        Span(1) | Span(10) | Span(14) | Span(18) | Span(24) | Span(27)
        | Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68)
        | Span(70) | Span(80) | Span(88) | Span(96) | Span(111)
        | Span(113) | Span(115) | Span(131) | Span(141)
    )
    aho_match = LicenseMatch(
        rule=rule,
        qspan=Span(198, 341),
        ispan=Span(0, 143),
        hispan=aho_hispan,
    )

    merged = merge_matches([seq_match, aho_match])
    assert [seq_match, aho_match] == merged
def test_LicenseMatch_score_100_contiguous(self):
    """A contiguous match covering all 42 positions of a relevance-100 rule scores 100."""
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 100
    rule.length = 42

    full_match = LicenseMatch(rule=rule, qspan=Span(0, 41), ispan=Span(0, 41))
    score = full_match.score()
    assert score == 100
def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence(self):
    """Overlapping, in-sequence matches of one rule merge into a single 0-6 match."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    first = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    second = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
    merged = merge_matches([first, second])
    assert expected == merged
def test_match_has_correct_positions_basic(self):
    """Matching a notice repeated three times yields three 'gpl_69.RULE' matches at successive query positions."""
    index = cache.get_index()
    # NOTE(review): the expected start/end lines below (1, 2, 3) suggest this
    # query originally spanned three lines -- confirm against the upstream
    # source; the literal is kept exactly as found here.
    querys = u'''Licensed under the GNU General Public License (GPL). Licensed under the GNU General Public License (GPL). Licensed under the GNU General Public License (GPL).'''

    found = index.match(query_string=querys)

    candidates = [r for r in index.rules_by_rid if r.identifier == 'gpl_69.RULE']
    gpl_rule = candidates[0]

    first = LicenseMatch(
        rule=gpl_rule, qspan=Span(0, 7), ispan=Span(0, 7),
        start_line=1, end_line=1)
    second = LicenseMatch(
        rule=gpl_rule, qspan=Span(8, 15), ispan=Span(0, 7),
        start_line=2, end_line=2)
    third = LicenseMatch(
        rule=gpl_rule, qspan=Span(16, 23), ispan=Span(0, 7),
        start_line=3, end_line=3)

    assert [first, second, third] == found
def test_merge_should_not_merge_repeated_matches_out_of_sequence(self):
    """Three matches of one rule against the same rule span but different query spans stay separate."""
    gpl_rule = Rule(text_file='gpl-2.0_49.RULE', licenses=[u'gpl-2.0'])
    gpl_rule.rid = 2615

    # Every match covers the same rule positions (0-7) at a different place
    # in the query.
    chunk_1 = LicenseMatch(rule=gpl_rule, matcher='chunk1', qspan=Span(0, 7), ispan=Span(0, 7))
    chunk_2 = LicenseMatch(rule=gpl_rule, matcher='chunk2', qspan=Span(8, 15), ispan=Span(0, 7))
    chunk_3 = LicenseMatch(rule=gpl_rule, matcher='chunk3', qspan=Span(16, 23), ispan=Span(0, 7))

    merged = merge_matches([chunk_1, chunk_2, chunk_3])
    assert [chunk_1, chunk_2, chunk_3] == merged
def test_merge_contiguous_contained_matches(self):
    """Three back-to-back matches of one rule merge into a single 0-8 match."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    head = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    middle = LicenseMatch(rule=rule, qspan=Span(3, 6), ispan=Span(3, 6))
    tail = LicenseMatch(rule=rule, qspan=Span(7, 8), ispan=Span(7, 8))

    merged = merge_matches([head, middle, tail])

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 8), ispan=Span(0, 8))]
    assert expected == merged
def test_merge_contiguous_touching_matches_in_sequence(self):
    """Two touching matches of one rule: the first merged result spans 0-6."""
    rule = Rule(_text='r1', licenses=['apache-2.0', 'gpl'])
    left = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    right = LicenseMatch(rule=rule, qspan=Span(3, 6), ispan=Span(3, 6))

    merged = merge_matches([left, right])
    first_merged = merged[0]

    expected = LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))
    assert expected == first_merged
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_not_sequence(self):
    """Matches of one rule sharing the same ispan but far apart in the query are both kept."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    near = LicenseMatch(rule=rule, qspan=Span(1, 3), ispan=Span(1, 3))
    far = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(1, 3))

    merged = merge_matches([near, far])

    # Result order is not part of the contract checked here.
    assert sorted([near, far]) == sorted(merged)
def test_merge_does_not_merge_overlapping_matches_of_different_rules_with_different_licensing(self):
    """Overlapping matches from two rules with different licenses are returned unchanged."""
    gpl_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    gpl2_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl2'])

    gpl_match = LicenseMatch(rule=gpl_rule, qspan=Span(0, 5), ispan=Span(0, 5))
    gpl2_match = LicenseMatch(rule=gpl2_rule, qspan=Span(1, 6), ispan=Span(1, 6))

    expected = [gpl_match, gpl2_match]
    merged = merge_matches([gpl_match, gpl2_match])
    assert expected == merged
def test_merge_does_merge_non_contiguous_matches_in_sequence(self):
    """Two separated matches plus one bridging match of the same rule merge into 0-6."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    left = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    right = LicenseMatch(rule=rule, qspan=Span(4, 6), ispan=Span(4, 6))
    bridge = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    merged = merge_matches([left, right, bridge])

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == merged
def test_LicenseMatch_equals(self):
    """Matches differing only by matcher compare equal; a different qspan makes them unequal."""
    shared_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

    base = LicenseMatch(
        rule=shared_rule, matcher='chunk1',
        qspan=Span(0, 7), ispan=Span(0, 7),
        start_line=1, end_line=1)
    same_spans = LicenseMatch(
        rule=shared_rule, matcher='chunk2',
        qspan=Span(0, 7), ispan=Span(0, 7),
        start_line=1, end_line=1)
    assert base == same_spans

    other_position = LicenseMatch(
        rule=shared_rule, matcher='chunk3',
        qspan=Span(16, 23), ispan=Span(0, 7),
        start_line=3, end_line=3)
    assert base != other_position
def test_LicenseMatch_score_25_with_stored_relevance(self):
    """A 3-position match on a length-6 rule with relevance 50 scores 25."""
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 50
    rule.length = 6

    half_match = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    # NB: no query is attached to this match.
    score = half_match.score()
    assert score == 25
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
    """Two in-sequence matches separated by a gap come back as one match whose spans keep the gap."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule.length = 20

    before_gap = LicenseMatch(rule=rule, qspan=Span(1, 10), ispan=Span(1, 10))
    after_gap = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(14, 20))

    # The expected single match covers 1-10 and 14-20 on both sides.
    gapped_match = LicenseMatch(
        rule=rule,
        qspan=Span(1, 10) | Span(14, 20),
        ispan=Span(1, 10) | Span(14, 20),
    )
    expected = [gapped_match]

    merged = merge_matches([before_gap, after_gap])
    assert expected == merged
def test_merge_does_not_merge_contained_matches_of_different_rules_with_same_licensing(self):
    """Identical spans from two distinct rules with the same licenses are both kept by merge."""
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    match_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    match_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    merged = merge_matches([match_a, match_b])

    # Result order is not part of the contract checked here.
    assert sorted([match_a, match_b]) == sorted(merged)
def test_combine_matches_with_same_rules(self):
    """Combining two overlapping matches of one rule yields spans 0-6 on both sides."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    first = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    second = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    combined = first.combine(second)

    assert Span(0, 6) == combined.qspan
    assert Span(0, 6) == combined.ispan
def test_filter_matches_filters_multiple_nested_contained_matches_and_large_overlapping(self):
    """Nested and heavily overlapping matches of one rule are filtered down to the first match."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    outer = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    large_overlap = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))
    contained = LicenseMatch(rule=rule, qspan=Span(1, 4), ispan=Span(1, 4))
    in_contained = LicenseMatch(rule=rule, qspan=Span(2, 3), ispan=Span(2, 3))

    candidates = [outer, contained, in_contained, large_overlap]
    kept, discarded = filter_contained_matches(candidates)

    assert [outer] == kept
    assert discarded
def test_files_does_filter_contained_matches_of_different_rules_with_same_licensing(self):
    """Of two identical-span matches from different rules, the second is kept and the first discarded."""
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    match_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    match_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    kept, discarded = filter_contained_matches([match_a, match_b])

    assert [match_b] == kept
    assert [match_a] == discarded
def test_merge_does_not_merges_matches_with_same_spans_if_rules_are_different(self):
    """Matches of the first rule merge to 0-6 while a same-span match of another rule stays separate."""
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    a_short = LicenseMatch(rule=rule_a, qspan=Span(0, 2), ispan=Span(0, 2))
    a_long = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))

    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
    b_short = LicenseMatch(rule=rule_b, qspan=Span(0, 2), ispan=Span(0, 2))

    merged = merge_matches([a_short, b_short, a_long])

    a_merged = LicenseMatch(rule=rule_a, qspan=Span(0, 6), ispan=Span(0, 6))
    # Result order is not part of the contract checked here.
    assert sorted([a_merged, b_short]) == sorted(merged)
def test_merge_merges_contained_and_overlapping_match(self):
    """A base match, a contained match and an overlapping match of one rule merge into 0-6."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    base = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    contained = LicenseMatch(rule=rule, qspan=Span(1, 4), ispan=Span(1, 4))
    overlapping = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    # Precondition: the small match lies inside both of the larger ones.
    assert contained in overlapping
    assert contained in base

    merged = merge_matches([base, contained, overlapping])

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == merged
def test_filter_matches_filters_matches_with_medium_overlap_only_if_license_are_the_same(self):
    """The overlapping same-rule match is dropped; the overlapping match of another rule is kept."""
    apache_rule = Rule(text_file='r1', licenses=['apache-1.1'])
    apache_first = LicenseMatch(rule=apache_rule, qspan=Span(0, 10), ispan=Span(0, 10))
    apache_second = LicenseMatch(rule=apache_rule, qspan=Span(3, 11), ispan=Span(3, 11))

    mixed_rule = Rule(text_file='r2', licenses=['gpl', 'apache-2.0'])
    mixed_match = LicenseMatch(rule=mixed_rule, qspan=Span(7, 15), ispan=Span(7, 15))

    kept, discarded = filter_contained_matches(
        [apache_first, apache_second, mixed_match])

    # Result order is not part of the contract checked here.
    assert sorted([apache_first, mixed_match]) == sorted(kept)
    assert discarded
def test_filter_prefers_longer_overlaping_matches(self):
    """Among three overlapping matches, only the longest one (1-8) survives filtering."""
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    overlap = LicenseMatch(rule=rule_a, qspan=Span(0, 5), ispan=Span(0, 5))
    shorter = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    longest = LicenseMatch(rule=rule_b, qspan=Span(1, 8), ispan=Span(1, 8))

    kept, discarded = filter_contained_matches([overlap, shorter, longest])

    assert [longest] == kept
    assert discarded
def test_combine_raise_TypeError_for_matches_of_different_rules(self):
    """Combining matches that belong to different rules must raise TypeError.

    The test fails explicitly when combine() returns without raising;
    otherwise a missing exception would go unnoticed.
    """
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    r2 = Rule(text_file='r2', license_expression='apache-2.0 OR gpl2')

    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))

    try:
        m1.combine(m2)
    except TypeError:
        # Expected outcome: matches of different rules cannot be combined.
        pass
    else:
        raise AssertionError(
            'combine() did not raise TypeError for matches of different rules')
def test_filter_matches_filters_non_contiguous_or_overlapping__but_contained_matches(self):
    """Every match contained in the widest match (0-7) is filtered out, leaving only that one."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    small_left = LicenseMatch(rule=rule, qspan=Span(1, 2), ispan=Span(1, 2))
    small_right = LicenseMatch(rule=rule, qspan=Span(3, 6), ispan=Span(3, 6))
    inner = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))
    widest = LicenseMatch(rule=rule, qspan=Span(0, 7), ispan=Span(0, 7))
    inner_duplicate = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    candidates = [small_left, small_right, inner, widest, inner_duplicate]
    kept, discarded = filter_contained_matches(candidates)

    assert [widest] == kept
    assert discarded
def test_filter_does_filter_overlaping_matches_with_same_licensings(self):
    """With equal licensing across two rules, only the first overlapping match (0-5) is kept."""
    rule_a = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    rule_b = Rule(text_file='r2', license_expression='apache-2.0 OR gpl')

    overlap = LicenseMatch(rule=rule_a, qspan=Span(0, 5), ispan=Span(0, 5))
    same_span_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    same_span_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    candidates = [overlap, same_span_a, same_span_b]
    kept, discarded = filter_contained_matches(candidates)

    assert [overlap] == kept
    assert discarded