Example #1
    def test_merge_does_not_merge_overlapping_matches_in_sequence_with_assymetric_overlap(
            self):
        r1 = Rule(text_file='r1', license_expression=u'lgpl-2.0-plus')

        # ---> merge_matches: current: LicenseMatch<'3-seq', lines=(9, 28), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=87.5, qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142), qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(199, 200), ispan=Span(5, 21)|Span(23, 46)|Span(48, 77)|Span(79, 93)|Span(95, 100)|Span(108, 128)|Span(130, 142), hispan=Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
        # ---> merge_matches: next:    LicenseMatch<'2-aho', lines=(28, 44), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0, qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143), qspan=Span(198, 341), ispan=Span(0, 143), hispan=Span(1)|Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
        #     ---> ###merge_matches: next overlaps in sequence current, merged as new: LicenseMatch<'3-seq 2-aho', lines=(9, 44), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0, qlen=268, ilen=144, hilen=21, rlen=144, qreg=(50, 341), ireg=(0, 143), qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(198, 341), ispan=Span(0, 143), his

        # ---> merge_matches: current: qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142)
        # ---> merge_matches: next:    qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143)

        m1 = LicenseMatch(
            rule=r1,
            qspan=Span(50, 90) | Span(92, 142) | Span(151, 182)
            | Span(199, 200),
            ispan=Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93)
            | Span(95, 100) | Span(108, 128) | Span(130, 142),
            hispan=Span(10) | Span(14) | Span(18) | Span(24) | Span(27)
            | Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68)
            | Span(70) | Span(80) | Span(88) | Span(96) | Span(111) | Span(113)
            | Span(115) | Span(131) | Span(141),
        )
        m2 = LicenseMatch(rule=r1,
                          qspan=Span(198, 341),
                          ispan=Span(0, 143),
                          hispan=Span(1) | Span(10) | Span(14) | Span(18)
                          | Span(24) | Span(27) | Span(52) | Span(57)
                          | Span(61) | Span(65, 66) | Span(68) | Span(70)
                          | Span(80) | Span(88) | Span(96) | Span(111)
                          | Span(113) | Span(115) | Span(131) | Span(141))

        matches = merge_matches([m1, m2])
        assert [m1, m2] == matches
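
A note on why this overlap is "asymmetric" (a minimal sketch, assuming the set semantics of licensedcode.spans.Span): the query-side spans barely overlap while the index-side spans overlap almost entirely, i.e. the rule text repeats in the query instead of continuing in sequence.

    from licensedcode.spans import Span

    q_overlap = Span(199, 200) & Span(198, 341)  # query-side overlap: 2 positions
    i_overlap = Span(5, 142) & Span(0, 143)      # index-side overlap: 138 positions
    print(len(q_overlap), len(i_overlap))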
Example #2
    def test_merge_merges_duplicate_matches(self):
        r1 = Rule(text_file='r1', license_expression='apache-2.0')
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))
        m2 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))

        matches = merge_matches([m1, m2])
        assert ([m1] == matches) or ([m2] == matches)
Example #3
    def test_merge_merges_duplicate_matches(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))
        m2 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))

        matches = merge_matches([m1, m2])
        assert ([m1] == matches) or ([m2] == matches)
Example #4
    def test_merge_does_not_merge_overlapping_matches_in_sequence_with_assymetric_overlap(self):
        r1 = Rule(text_file='r1', licenses=[u'lgpl-2.0-plus'])

        # ---> merge_matches: current: LicenseMatch<'3-seq', lines=(9, 28), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=87.5, qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142), qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(199, 200), ispan=Span(5, 21)|Span(23, 46)|Span(48, 77)|Span(79, 93)|Span(95, 100)|Span(108, 128)|Span(130, 142), hispan=Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
        # ---> merge_matches: next:    LicenseMatch<'2-aho', lines=(28, 44), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0, qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143), qspan=Span(198, 341), ispan=Span(0, 143), hispan=Span(1)|Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
        #     ---> ###merge_matches: next overlaps in sequence current, merged as new: LicenseMatch<'3-seq 2-aho', lines=(9, 44), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0, qlen=268, ilen=144, hilen=21, rlen=144, qreg=(50, 341), ireg=(0, 143), qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(198, 341), ispan=Span(0, 143), his

        # ---> merge_matches: current: qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142)
        # ---> merge_matches: next:    qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143)

        m1 = LicenseMatch(
            rule=r1,
            qspan=Span(50, 90) | Span(92, 142) | Span(151, 182) | Span(199, 200),
            ispan=
                Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93) |
                Span(95, 100) | Span(108, 128) | Span(130, 142),
            hispan=
                Span(10) | Span(14) | Span(18) | Span(24) | Span(27) | Span(52) |
                Span(57) | Span(61) | Span(65, 66) | Span(68) | Span(70) | Span(80) |
                Span(88) | Span(96) | Span(111) | Span(113) | Span(115) | Span(131) |
                Span(141),
        )
        m2 = LicenseMatch(
            rule=r1,
            qspan=Span(198, 341),
            ispan=Span(0, 143),
            hispan=
                Span(1) | Span(10) | Span(14) | Span(18) | Span(24) | Span(27) |
                Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68) | Span(70) |
                Span(80) | Span(88) | Span(96) | Span(111) | Span(113) | Span(115) |
                Span(131) | Span(141))

        matches = merge_matches([m1, m2])
        assert [m1, m2] == matches
Example #5
    def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

        m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
        m2 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == merge_matches([m1, m2])
Example #6
    def get_fragments_matches(
        self,
        query,
        matched_qspans,
        deadline=sys.maxsize,
        **kwargs,
    ):
        """
        Approximate matching strategy breaking a query into query_runs and using
        fragment matching. Return a list of matches.
        """
        matches = []

        for query_run in query.query_runs:
            # we cannot do a sequence match in a query run without some high tokens left
            if not query_run.is_matchable(include_low=False,
                                          qspans=matched_qspans):
                continue
            qrun_matches = match_aho.match_fragments(self, query_run)
            matches.extend(match.merge_matches(qrun_matches))
            # break if deadline has passed
            if time() > deadline:
                break

        return matches
Example #7
    def test_merge_overlapping_matches(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m2 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        matches = merge_matches([m1, m2])
        assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == matches
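
The Span unions used throughout these tests behave like sets of integer positions: overlapping or touching ranges coalesce and gaps are preserved. A quick sketch (Span set semantics assumed):

    from licensedcode.spans import Span

    assert Span(0, 2) | Span(1, 6) == Span(0, 6)      # overlapping ranges coalesce
    assert Span(1, 3) | Span(14, 20) != Span(1, 20)   # gaps are preserved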
Example #8
 def test_merge_should_not_merge_repeated_matches_out_of_sequence(self):
     rule = Rule(text_file='gpl-2.0_49.RULE', licenses=[u'gpl-2.0'])
     rule.rid = 2615
     m1 = LicenseMatch(rule=rule, matcher='chunk1', qspan=Span(0, 7), ispan=Span(0, 7))
     m2 = LicenseMatch(rule=rule, matcher='chunk2', qspan=Span(8, 15), ispan=Span(0, 7))
     m3 = LicenseMatch(rule=rule, matcher='chunk3', qspan=Span(16, 23), ispan=Span(0, 7))
     result = merge_matches([m1, m2, m3])
     assert [m1, m2, m3] == result
Example #9
    def test_merge_contiguous_contained_matches(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m2 = LicenseMatch(rule=r1, qspan=Span(3, 6), ispan=Span(3, 6))
        m5 = LicenseMatch(rule=r1, qspan=Span(7, 8), ispan=Span(7, 8))

        result = merge_matches([m1, m2, m5])
        assert [LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))] == result
Example #10
    def test_merge_contiguous_touching_matches_in_sequence(self):
        r1 = Rule(_text='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m2 = LicenseMatch(rule=r1, qspan=Span(3, 6), ispan=Span(3, 6))

        result = merge_matches([m1, m2])
        match = result[0]
        assert LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)) == match
Example #11
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_not_sequence(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

        m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
        m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(1, 3))

        matches = merge_matches([m1, m2])
        assert sorted([m1, m2]) == sorted(matches)
Example #12
    def test_merge_does_not_merge_overlapping_matches_of_different_rules_with_different_licensing(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl2'])

        m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
        m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))

        assert [m1, m2] == merge_matches([m1, m2])
Example #13
    def test_merge_does_merge_non_contiguous_matches_in_sequence(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m2 = LicenseMatch(rule=r1, qspan=Span(4, 6), ispan=Span(4, 6))
        m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        results = merge_matches([m1, m2, m5])
        assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == results
Example #14
    def test_merge_does_not_merge_contained_matches_of_different_rules_with_same_licensing(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

        m1 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
        m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))

        matches = merge_matches([m1, m2])
        assert sorted([m1, m2]) == sorted(matches)
Example #15
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        r1.length = 20
        m1 = LicenseMatch(rule=r1, qspan=Span(1, 10), ispan=Span(1, 10))
        m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(14, 20))

        expected = [LicenseMatch(rule=r1, qspan=Span(1, 10) | Span(14, 20), ispan=Span(1, 10) | Span(14, 20))]
        results = merge_matches([m1, m2])
        assert expected == results
Example #16
 def test_merge_merges_contained_and_overlapping_match(self):
     r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
     m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
     contained = LicenseMatch(rule=r1, qspan=Span(1, 4), ispan=Span(1, 4))
     overlapping = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
     assert contained in overlapping
     assert contained in m1
     result = merge_matches([m1, contained, overlapping])
     expected = [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))]
     assert expected == result
Example #17
    def test_merge_does_not_merges_matches_with_same_spans_if_rules_are_different(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
        m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))

        result = merge_matches([m1, m2, m5])
        assert sorted([LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)), m2]) == sorted(result)
Example #18
 def test_merge_merges_contained_and_overlapping_match(self):
     r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
     m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
     contained = LicenseMatch(rule=r1, qspan=Span(1, 4), ispan=Span(1, 4))
     overlapping = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
     assert contained in overlapping
     assert contained in m1
     result = merge_matches([m1, contained, overlapping])
     expected = [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))]
     assert expected == result
Example #19
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps(
            self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

        m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
        m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(4, 10))

        assert [
            LicenseMatch(rule=r1,
                         qspan=Span(1, 3) | Span(14, 20),
                         ispan=Span(1, 10))
        ] == merge_matches([m1, m2])
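
The expected ispan above collapses to a single Span(1, 10) because the two matched index ranges, 1-3 and 4-10, are contiguous, so their union has no gap (Span set semantics assumed):

    assert Span(1, 3) | Span(4, 10) == Span(1, 10)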
Example #20
    def test_merge_does_not_merges_matches_with_same_spans_if_licenses_are_the_same_but_have_different_licenses_ordering(
            self):
        r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        r2 = Rule(text_file='r2', license_expression='gpl OR apache-2.0')
        m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))

        result = merge_matches([m1, m2, m5])
        assert sorted(
            [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)),
             m2]) == sorted(result)
Example #21
    def test_merge_does_not_merge_matches_with_same_spans_if_licenses_are_identical_but_rule_differ(
            self):
        r1 = Rule(text_file='r1', license_expression='apache-2.0')
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        r2 = Rule(text_file='r2', license_expression='apache-2.0')
        m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))

        matches = merge_matches([m1, m2, m5])
        assert sorted(
            [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)),
             m2]) == sorted(matches)
Example #22
    def test_merge_then_filter_matches_with_same_spans_if_licenses_are_identical_but_rule_differ(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))

        r2 = Rule(text_file='r2', licenses=['apache-2.0'])
        m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))

        matches = merge_matches([m1, m2, m5])
        matches, discarded = filter_contained_matches(matches)

        assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == matches
        assert discarded
Example #23
    def test_merge_does_not_merge_overlaping_matches_with_same_licensings(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

        overlap = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
        same_span1 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
        same_span2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))

        result = merge_matches([overlap, same_span1, same_span2])
        expected = [
            LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)),
            LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6)),
        ]
        assert sorted(expected) == sorted(result)
Example #24
    def get_approximate_matches(self, query):
        """
        Approximate matching strategy using query_runs and multiple local
        alignments (aka. diff). Return a list of matches.
        """
        matches = []
        # we exclude small and "weak" rules from the subset entirely: they are
        # unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids).difference(
            self.weak_rids)

        for query_run in query.query_runs:
            if not query_run.is_matchable(include_low=True):
                continue

            # per query run hash matching just in case we are lucky
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                matches.extend(hash_matches)
                continue

            # inverted index match and ranking, query run-level
            # FIXME: we should consider aho matches to exclude them from candidates
            # FIXME: also exclude from candidates any rule that is only aho-matchable
            run_matches = []
            MAX_CANDIDATES = 50
            candidates = match_set.compute_candidates(
                query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

            # multiple sequence matching/alignment, query run-level
            for candidate in candidates:
                start_offset = 0
                while True:
                    rule_matches = match_seq.match_sequence(
                        self, candidate, query_run, start_offset=start_offset)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

        return matches
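
The inner while loop implements a generic restart pattern: match a candidate against the run, then retry past the last matched query position until nothing more is found. A distilled, standalone sketch of that pattern (names are illustrative, not part of the licensedcode API):

    def match_repeatedly(match_once, run_end, start=0):
        """Call match_once(start) until exhausted, advancing past each batch."""
        results = []
        while True:
            found = match_once(start)
            if not found:
                break
            results.extend(found)
            last_end = max(m.qend for m in found)
            if last_end + 1 >= run_end:
                break
            start = last_end + 1
        return results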
Example #25
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps(
            self):
        r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        r1.length = 50

        m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
        m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(4, 10))

        expected = [
            LicenseMatch(rule=r1,
                         qspan=Span(1, 3) | Span(14, 20),
                         ispan=Span(1, 10))
        ]
        results = merge_matches([m1, m2])
        assert expected == results
Example #26
    def test_merge_does_not_merge_multiple_contained_matches_across_rules(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))

        r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
        contained1 = LicenseMatch(rule=r2, qspan=Span(1, 2), ispan=Span(1, 2))

        r3 = Rule(text_file='r3', licenses=['apache-2.0', 'gpl'])
        contained2 = LicenseMatch(rule=r3, qspan=Span(3, 4), ispan=Span(3, 4))

        r5 = Rule(text_file='r5', licenses=['apache-2.0', 'gpl'])
        m5 = LicenseMatch(rule=r5, qspan=Span(1, 6), ispan=Span(1, 6))

        result = merge_matches([m1, contained1, contained2, m5])
        assert sorted([m1, contained1, contained2, m5]) == sorted(result)
Example #27
    def get_approximate_matches(self, query, matched_qspans=None, **kwargs):
        """
        Approximate matching strategy breaking a query into query_runs and using
        exact matching, then multiple local alignments (aka. diff). Return a
        list of matches.
        """
        matches = []
        rules_subset = self.approx_matchable_rules_subset

        for query_run in query.query_runs:

            if not query_run.is_matchable(include_low=False,
                                          qspans=matched_qspans):
                continue

            # inverted index match and ranking, query run-level
            # FIXME: we should consider aho matches to exclude them from candidates
            # FIXME: also exclude from candidates any rule that is only aho-matchable
            run_matches = []
            MAX_CANDIDATES = 50
            candidates = match_set.compute_candidates(
                query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

            # multiple sequence matching/alignment, query run-level
            for candidate in candidates:
                start_offset = 0
                while True:
                    rule_matches = match_seq.match_sequence(
                        self, candidate, query_run, start_offset=start_offset)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

        return matches
Example #28
    def get_query_run_approximate_matches(
        self,
        query_run,
        candidates,
        matched_qspans,
        deadline=sys.maxsize,
        **kwargs,
    ):
        """
        Return a list of approximate matches for a single query run.
        """
        matches = []

        # we cannot do a sequence match in a query run without some high tokens left
        if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
            if TRACE_APPROX:
                logger_debug(
                    'get_query_run_approximate_matches: query_run not matchable:', query_run)

            return matches

        # Perform multiple sequence matching/alignment for each candidate,
        # query run-level for as long as we have more non-overlapping
        # matches returned

        for _score_vecs, rid, candidate_rule, high_intersection in candidates:
            if USE_DMP:
                # Myers diff works best when the differences are small; otherwise
                # it performs rather poorly as it is not aware of legalese
                match_blocks = match_blocks_dmp
                high_postings = None

            else:
                # we prefer to use the high-token-aware seq matching only
                # when the matches are not clear. It works best when things
                # are farther apart
                match_blocks = match_blocks_seq
                high_postings = self.high_postings_by_rid[rid]
                high_postings = {
                    tid: postings for tid, postings in high_postings.items()
                        if tid in high_intersection}

            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate_rule, query_run,
                    high_postings=high_postings,
                    start_offset=start_offset,
                    match_blocks=match_blocks,
                )

                if TRACE_APPROX_MATCHES:
                    self.debug_matches(
                        matches=rule_matches,
                        message='get_query_run_approximate_matches: rule_matches:',
                        with_text=True,
                        qry=query_run.query,
                    )

                if not rule_matches:
                    break

                matches_end = max(m.qend for m in rule_matches)
                matches.extend(rule_matches)

                # break if deadline has passed
                if time() > deadline:
                    break

                if matches_end + 1 < query_run.end:
                    start_offset = matches_end + 1
                else:
                    break

            # break if deadline has passed
            if time() > deadline:
                break

        # FIXME: is this really needed here?
        matches = match.merge_matches(matches)

        return matches
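
The postings filter in the else-branch above simply restricts the high-token posting lists to the token ids shared with the candidate. A toy illustration with made-up values:

    high_postings = {1: [0, 7], 2: [3], 9: [5, 11]}  # tid -> positions (hypothetical)
    high_intersection = {1, 9}
    filtered = {tid: p for tid, p in high_postings.items() if tid in high_intersection}
    assert filtered == {1: [0, 7], 9: [5, 11]}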
Example #29
    def match_query(
        self,
        qry,
        min_score=0,
        as_expression=False,
        expression_symbols=None,
        approximate=True,
        unknown_licenses=False,
        deadline=sys.maxsize,
        _skip_hash_match=False,
        **kwargs,
    ):
        """
        Return a sequence of LicenseMatch by matching the ``qry`` Query against
        this index. See Index.match() for arguments documentation.
        """

        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            return []

        if not _skip_hash_match:
            matches = match_hash.hash_match(self, whole_query_run)
            if matches:
                match.set_matched_lines(matches, qry.line_by_pos)
                return matches

        get_spdx_id_matches = partial(
            self.get_spdx_id_matches,
            expression_symbols=expression_symbols,
        )

        if as_expression:
            matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
            match.set_matched_lines(matches, qry.line_by_pos)
            return matches

        matches = []

        if USE_AHO_FRAGMENTS:
            approx = self.get_fragments_matches
        else:
            approx = self.get_approximate_matches

        matchers = [
            # matcher, include_low in post-matching remaining matchable check
            (self.get_exact_matches, False, 'aho'),
            (get_spdx_id_matches, True, 'spdx_lid'),
        ]

        if approximate:
            matchers += [(approx, False, 'seq'), ]

        already_matched_qspans = []
        for matcher, include_low, matcher_name in matchers:
            if TRACE:
                logger_debug()
                logger_debug('matching with matcher:', matcher_name)

            matched = matcher(
                qry,
                matched_qspans=already_matched_qspans,
                existing_matches=matches,
                deadline=deadline,
            )

            if TRACE:
                self.debug_matches(
                    matches=matched,
                    message='matched with: ' + matcher_name,
                    location=qry.location,
                    query_string=qry.query_string,
                )

            matched = match.merge_matches(matched)
            matches.extend(matched)

            # Subtract whole text matched if this is long enough
            for m in matched:
                if (m.rule.is_license_text
                    and m.rule.length > 120
                    and m.coverage() > 98
                ):
                    qry.subtract(m.qspan)

            # Check if we have some matchable text left; do not match further
            # if we do not. Collect qspans matched exactly, e.g. with coverage
            # 100%: this coverage check exists because we have provision to
            # match fragments (unused for now).

            already_matched_qspans.extend(
                m.qspan for m in matched if m.coverage() == 100)

            if not whole_query_run.is_matchable(
                include_low=include_low, qspans=already_matched_qspans):
                break

            # break if deadline has passed
            if time() > deadline:
                break

        # refining matches without filtering false positives
        matches, _discarded = match.refine_matches(
            matches=matches,
            query=qry,
            min_score=min_score,
            filter_false_positive=False,
            merge=True,
        )

        if unknown_licenses:
            good_matches, weak_matches = match.split_weak_matches(matches)
            # collect the positions that are "good matches" to exclude from
            # matching for unknown_licenses. Create a Span to check for unknown
            # based on this.
            original_qspan = Span(0, len(qry.tokens) - 1)
            good_qspans = (m.qspan for m in good_matches)
            good_qspan = Span().union(*good_qspans)

            unmatched_qspan = original_qspan.difference(good_qspan)

            # for each subspan, run unknown license detection
            unknown_matches = []
            for unspan in unmatched_qspan.subspans():
                unquery_run = query.QueryRun(
                    query=qry,
                    start=unspan.start,
                    end=unspan.end,
                )

                unknown_match = match_unknown.match_unknowns(
                    idx=self,
                    query_run=unquery_run,
                    automaton=self.unknown_automaton,
                )

                if unknown_match:
                    unknown_matches.append(unknown_match)

            unknown_matches = match.filter_invalid_contained_unknown_matches(
                unknown_matches=unknown_matches,
                good_matches=good_matches,
            )

            matches.extend(unknown_matches)
            # reinject weak matches and let refine matches keep the bests
            matches.extend(weak_matches)

        if not matches:
            return []

        if TRACE:
            logger_debug()
            self.debug_matches(
                matches=matches, message='matches before final merge',
                location=qry.location,
                query_string=qry.query_string,
                with_text=True, qry=qry)

        matches, _discarded = match.refine_matches(
            matches=matches,
            query=qry,
            min_score=min_score,
            filter_false_positive=True,
            merge=True,
        )

        matches.sort()

        if TRACE:
            self.debug_matches(
                matches=matches,
                message='final matches',
                location=qry.location,
                query_string=qry.query_string,
                with_text=True,
                qry=qry,
            )

        return matches
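
A hypothetical driver for this method (index construction elided; the build_query arguments are assumed from the other examples on this page):

    qry = query.build_query(location='/path/to/LICENSE', idx=idx)
    for m in idx.match_query(qry, min_score=50):
        print(m.rule.license_expression, m.score())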
Example #30
def match_fragments(idx, query_run):
    """
    Return a list of LicenseMatch by matching the `query_run` against the
    fragments automaton of the `idx` index.

    This uses a BLAST-like matching approach: we match ngram fragments of
    rules (seeds) and then extend them left and right.
    """
    if TRACE_FRAG:
        logger_debug('-------------->match_fragments')

    # Get matches using the AHO Fragments automaton
    matches = exact_match(idx,
                          query_run,
                          automaton=idx.fragments_automaton,
                          matcher=MATCH_AHO_FRAG)
    if TRACE_FRAG:
        logger_debug('match_fragments')
        for m in matches:
            print(m)

    # Discard fragments that have any already matched positions in previous matches
    from licensedcode.match import filter_already_matched_matches
    matches, _discarded = filter_already_matched_matches(
        matches, query_run.query)

    # Merge matches with a zero max distance, i.e. only contiguous or
    # overlapping matches to the same rule
    from licensedcode.match import merge_matches
    matches = merge_matches(matches, max_dist=0)

    # extend matched fragments left and right. We group by rule
    from licensedcode.seq import extend_match

    rules_by_rid = idx.rules_by_rid
    tids_by_rid = idx.tids_by_rid
    len_legalese = idx.len_legalese

    alo = qbegin = query_run.start
    ahi = query_run.end
    query = query_run.query
    qtokens = query.tokens
    matchables = query_run.matchables

    frag_matches = []

    keyf = lambda m: m.rule.rid
    matches.sort(key=keyf)
    matches_by_rule = groupby(matches, key=keyf)

    for rid, rule_matches in matches_by_rule:
        itokens = tids_by_rid[rid]
        blo, bhi = 0, len(itokens)
        rule = rules_by_rid[rid]

        for match in rule_matches:
            i, j, k = match.qstart, match.istart, match.len()
            # extend alignment left and right as long as we have matchables
            qpos, ipos, mlen = extend_match(i, j, k, qtokens, itokens, alo,
                                            ahi, blo, bhi, matchables)

            qspan = Span(range(qpos, qpos + mlen))
            ispan = Span(range(ipos, ipos + mlen))
            hispan = Span(p for p in ispan if itokens[p] < len_legalese)
            match = LicenseMatch(rule,
                                 qspan,
                                 ispan,
                                 hispan,
                                 qbegin,
                                 matcher=MATCH_AHO_FRAG,
                                 query=query)
            frag_matches.append(match)

    # Merge the extended fragment matches as usual
    frag_matches = merge_matches(frag_matches)

    return frag_matches
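
A toy, index-free illustration of the seed-and-extend idea behind extend_match: given a seed alignment (i, j, k) of k equal tokens at query position i and rule position j, grow it while the flanking tokens still agree (the bounds and matchability checks of the real implementation are omitted):

    def extend(i, j, k, qtokens, itokens):
        # extend left while the preceding tokens match
        while i > 0 and j > 0 and qtokens[i - 1] == itokens[j - 1]:
            i, j, k = i - 1, j - 1, k + 1
        # extend right while the following tokens match
        while (i + k < len(qtokens) and j + k < len(itokens)
               and qtokens[i + k] == itokens[j + k]):
            k += 1
        return i, j, k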
Example #31
    def match_query(
        self,
        qry,
        min_score=0,
        as_expression=False,
        expression_symbols=None,
        approximate=True,
        deadline=sys.maxsize,
        _skip_hash_match=False,
        **kwargs,
    ):
        """
        Return a sequence of LicenseMatch by matching the `qry` Query against
        this index. See Index.match() for arguments documentation.
        """

        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            return []

        if not _skip_hash_match:
            matches = match_hash.hash_match(self, whole_query_run)
            if matches:
                match.set_lines(matches, qry.line_by_pos)
                return matches

        get_spdx_id_matches = partial(
            self.get_spdx_id_matches,
            expression_symbols=expression_symbols,
        )

        if as_expression:
            matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
            match.set_lines(matches, qry.line_by_pos)
            return matches

        matches = []

        if USE_AHO_FRAGMENTS:
            approx = self.get_fragments_matches
        else:
            approx = self.get_approximate_matches

        matchers = [
            # matcher, include_low in post-matching remaining matchable check
            (self.get_exact_matches, False, 'aho'),
            (get_spdx_id_matches, True, 'spdx_lid'),
        ]

        if approximate:
            matchers += [
                (approx, False, 'seq'),
            ]

        already_matched_qspans = []
        for matcher, include_low, matcher_name in matchers:
            if TRACE:
                logger_debug()
                logger_debug('matching with matcher:', matcher_name)

            matched = matcher(
                qry,
                matched_qspans=already_matched_qspans,
                existing_matches=matches,
                deadline=deadline,
            )

            if TRACE:
                self.debug_matches(
                    matches=matched,
                    message='matched with: ' + matcher_name,
                    location=qry.location,
                    query_string=qry.query_string,
                )

            matched = match.merge_matches(matched)
            matches.extend(matched)

            # Subtract whole text matched if this is long enough
            for m in matched:
                if (m.rule.is_license_text and m.rule.length > 120
                        and m.coverage() > 98):
                    qry.subtract(m.qspan)

            # Check if we have some matchable text left; do not match further
            # if we do not. Collect qspans matched exactly, e.g. with coverage
            # 100%: this coverage check exists because we have provision to
            # match fragments (unused for now).

            already_matched_qspans.extend(m.qspan for m in matched
                                          if m.coverage() == 100)

            if not whole_query_run.is_matchable(include_low=include_low,
                                                qspans=already_matched_qspans):
                break

            # break if deadline has passed
            if time() > deadline:
                break

        if not matches:
            return []

        if TRACE:
            logger_debug()
            self.debug_matches(matches=matches,
                               message='matches before final merge',
                               location=qry.location,
                               query_string=qry.query_string,
                               with_text=True,
                               qry=qry)

        matches, _discarded = match.refine_matches(
            matches=matches,
            idx=self,
            query=qry,
            min_score=min_score,
            filter_false_positive=True,
            merge=True,
        )

        matches.sort()
        match.set_lines(matches, qry.line_by_pos)

        if TRACE:
            self.debug_matches(
                matches=matches,
                message='final matches',
                location=qry.location,
                query_string=qry.query_string,
                with_text=True,
                qry=qry,
            )

        return matches
Example #32
    def match(self, location=None, query_string=None, min_score=0,
              as_expression=False, deadline=sys.maxsize, _skip_hash_match=False,
              **kwargs):
        """
        This is the main entry point to match licenses.

        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        If `as_expression` is True, treat the whole text as a single SPDX
        license expression and use only expression matching.

        `deadline` is a time.time() value in seconds by which the processing should stop
        and return whatever was matched so far.

        `_skip_hash_match` is used only for testing.
        """
        assert 0 <= min_score <= 100

        if not location and not query_string:
            return []

        qry = query.build_query(location, query_string, idx=self,
            text_line_threshold=15, bin_line_threshold=50)
        if TRACE:
            logger_debug('match: for:', location, 'query:', qry)
        if not qry:
            return []

        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            return []

        if not _skip_hash_match:
            matches = match_hash.hash_match(self, whole_query_run)
            if matches:
                match.set_lines(matches, qry.line_by_pos)
                return matches

        # TODO: add match to degenerated expressions with custom symbols
        if as_expression:
            matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
            match.set_lines(matches, qry.line_by_pos)
            return matches

        negative_matches = []
        if self.negative_rids:
            negative_matches = self.negative_match(whole_query_run)
            for neg in negative_matches:
                whole_query_run.subtract(neg.qspan)
            if TRACE_NEGATIVE:
                self.debug_matches(
                    matches=negative_matches, message='negative_matches',
                    location=location, query_string=query_string)  # , with_text, query)

        matches = []

        if USE_AHO_FRAGMENTS:
            approx = self.get_fragments_matches
        else:
            approx = self.get_approximate_matches

        matchers = [
            # matcher, include_low in post-matching remaining matchable check
            (self.get_spdx_id_matches, True, 'spdx_lid'),
            (self.get_exact_matches, False, 'aho'),
            (approx, False, 'seq'),
        ]

        already_matched_qspans = []
        for matcher, include_low, matcher_name in matchers:
            if TRACE:
                logger_debug()
                logger_debug('matching with matcher:', matcher_name)

            matched = matcher(qry, matched_qspans=already_matched_qspans,
                              existing_matches=matches, deadline=deadline)
            if TRACE:
                self.debug_matches(
                    matches=matched, message='matched with: ' + matcher_name,
                    location=location, query_string=query_string)  # , with_text, query)

            matched = match.merge_matches(matched)
            matches.extend(matched)

            # subtract whole text matched if this is long enough
            for m in matched:
                if m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98:
                    qry.subtract(m.qspan)

            # check if we have some matchable left
            # do not match further if we do not need to
            # collect qspans matched exactly e.g. with coverage 100%
            # this coverage check is because we have provision to match fragments (unused for now)

            already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

            if not whole_query_run.is_matchable(
                include_low=include_low, qspans=already_matched_qspans):
                break

            # break if deadline has passed
            if time() > deadline:
                break

        if not matches:
            return []

        if TRACE:
            logger_debug()
            self.debug_matches(matches=matches, message='matches before final merge',
                               location=location, query_string=query_string,
                               with_text=True, qry=qry)

        matches, _discarded = match.refine_matches(
            matches, idx=self, query=qry, min_score=min_score,
            max_dist=MAX_DIST // 2, filter_false_positive=True, merge=True)

        matches.sort()
        match.set_lines(matches, qry.line_by_pos)

        if TRACE:
            print()
            self.debug_matches(matches=matches, message='final matches',
                               location=location, query_string=query_string,
                               with_text=True, qry=qry)
        return matches
Example #33
    def match(self,
              location=None,
              query_string=None,
              min_score=0,
              detect_negative=True):
        """
        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        `detect_negative` is for testing purpose only.
        """
        assert 0 <= min_score <= 100

        if TRACE:
            print()
            logger_debug('match start....')

        if not location and not query_string:
            return []

        qry = query.build_query(location, query_string, self)
        if not qry:
            logger_debug('#match: No query returned for:', location)
            return []

        #######################################################################
        # Whole file matching: hash, negative and exact matching
        #######################################################################
        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            logger_debug('#match: whole query not matchable')
            return []

        # hash
        hash_matches = match_hash(self, whole_query_run)
        if hash_matches:
            self.debug_matches(hash_matches, '#match FINAL Hash matched',
                               location, query_string)
            set_lines(hash_matches, qry.line_by_pos)
            return hash_matches

        # negative rules exact matching
        negative = []
        # note: detect_negative is False only when testing negative rules detection proper
        if detect_negative and self.negative_rids:
            if TRACE: logger_debug('#match: NEGATIVE')
            negative = self.negative_match(whole_query_run)
            for neg in negative:
                if TRACE_NEGATIVE:
                    self.debug_matches(negative,
                                       '   ##match: NEGATIVE subtracting #:',
                                       location, query_string)
                whole_query_run.subtract(neg.qspan)
            if TRACE_NEGATIVE:
                logger_debug('     #match: NEGATIVE found', negative)

        # exact matches
        if TRACE_EXACT: logger_debug('#match: EXACT')
        exact_matches = exact_match(self, whole_query_run,
                                    self.rules_automaton)
        if TRACE_EXACT:
            self.debug_matches(exact_matches, '  #match: EXACT matches#:',
                               location, query_string)

        exact_matches, exact_discarded = refine_matches(exact_matches,
                                                        self,
                                                        query=qry)

        if TRACE_EXACT:
            self.debug_matches(exact_matches,
                               '   #match: ===> exact matches refined')
        if TRACE_EXACT:
            self.debug_matches(exact_discarded,
                               '   #match: ===> exact matches discarded')

        matches = exact_matches
        discarded = exact_discarded

        #######################################################################
        # Per query run matching.
        #######################################################################
        if TRACE: logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

        # check if we have some matchable left
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [
            m.qspan for m in exact_matches if m.coverage() == 100
        ]
        # do not match further if we do not need to
        if whole_query_run.is_matchable(include_low=True,
                                        qspans=matched_qspans):

            rules_subset = (self.regular_rids | self.small_rids)

            for qrnum, query_run in enumerate(qry.query_runs, 1):
                if TRACE_QUERY_RUN_SIMPLE:
                    logger_debug('#match: ===> processing query run #:', qrnum)
                    logger_debug('  #match:', query_run)

                if not query_run.is_matchable(include_low=True):
                    if TRACE: logger_debug('#match: query_run NOT MATCHABLE')
                    continue

                # hash match
                #########################
                hash_matches = match_hash(self, query_run)
                if hash_matches:
                    if TRACE:
                        self.debug_matches(
                            hash_matches, '  #match Query run matches (hash)',
                            location, query_string)
                    matches.extend(hash_matches)
                    continue

                # query run match proper using sequence matching
                #########################################
                if TRACE:
                    logger_debug('  #match: Query run MATCHING proper....')

                run_matches = []
                candidates = compute_candidates(query_run,
                                                self,
                                                rules_subset=rules_subset,
                                                top=40)

                if TRACE_QUERY_RUN:
                    logger_debug(
                        '      #match: query_run: number of candidates for seq match #',
                        len(candidates))

                for candidate_num, candidate in enumerate(candidates):
                    if TRACE_QUERY_RUN:
                        logger_debug(
                            '         #match: query_run: seq matching candidate#:',
                            candidate_num, 'candidate:', candidate)
                    start_offset = 0
                    while True:
                        rule_matches = match_sequence(
                            self,
                            candidate,
                            query_run,
                            start_offset=start_offset)
                        if TRACE_QUERY_RUN and rule_matches:
                            self.debug_matches(
                                rule_matches,
                                '           #match: query_run: seq matches for candidate'
                            )
                        if not rule_matches:
                            break
                        else:
                            matches_end = max(m.qend for m in rule_matches)
                            run_matches.extend(rule_matches)

                            if matches_end + 1 < query_run.end:
                                start_offset = matches_end + 1
                                continue
                            else:
                                break

                ############################################################################
                if TRACE_QUERY_RUN:
                    self.debug_matches(run_matches,
                                       '    #match: ===> Query run matches',
                                       location,
                                       query_string,
                                       with_text=True)

                run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
                matches.extend(run_matches)

                if TRACE:
                    self.debug_matches(
                        run_matches, '     #match: Query run matches merged',
                        location, query_string)

        # final matching merge, refinement and filtering
        ################################################
        if matches:
            logger_debug()
            logger_debug(
                '!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            self.debug_matches(matches,
                               '#match: ALL matches from all query runs',
                               location, query_string)

            matches, whole_discarded = refine_matches(matches,
                                                      idx=self,
                                                      query=qry,
                                                      min_score=min_score,
                                                      max_dist=MAX_DIST // 2)
            if TRACE_MATCHES_DISCARD:
                discarded.extend(whole_discarded)
            matches.sort()
            set_lines(matches, qry.line_by_pos)
            self.debug_matches(matches, '#match: FINAL MERGED', location,
                               query_string)
            if TRACE_MATCHES_DISCARD:
                self.debug_matches(discarded, '#match: FINAL DISCARDED',
                                   location, query_string)

        self.debug_matches(matches,
                           '#match: FINAL MATCHES',
                           location,
                           query_string,
                           with_text=True)

        return matches
Example #34
    def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
        """
        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        `detect_negative` is for testing purpose only.
        """
        assert 0 <= min_score <= 100

        if TRACE:
            print()
            logger_debug('match start....')

        if not location and not query_string:
            return []

        qry = query.build_query(location, query_string, self)
        if not qry:
            logger_debug('#match: No query returned for:', location)
            return []

        #######################################################################
        # Whole file matching: hash and exact matching
        #######################################################################
        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            logger_debug('#match: whole query not matchable')
            return []

        # hash
        hash_matches = match_hash.hash_match(self, whole_query_run)
        if hash_matches:
            if TRACE: self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
            match.set_lines(hash_matches, qry.line_by_pos)
            return hash_matches

        # negative rules exact matching
        negative_matches = []
        # note: detect_negative is False only when testing negative rules detection proper
        if detect_negative and self.negative_rids:
            if TRACE: logger_debug('#match: NEGATIVE')
            negative_matches = self.negative_match(whole_query_run)
            for neg in negative_matches:
                whole_query_run.subtract(neg.qspan)

        # exact matches
        if TRACE_EXACT: logger_debug('#match: EXACT')
        exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
        if TRACE_EXACT: self.debug_matches(exact_matches, '  #match: EXACT matches#:', location, query_string)

        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)

        if TRACE_EXACT: self.debug_matches(exact_matches, '   #match: ===> exact matches refined')
        if TRACE_EXACT: self.debug_matches(exact_discarded, '   #match: ===> exact matches discarded')

        matches = exact_matches
        discarded = exact_discarded

        #######################################################################
        # Per query run matching.
        #######################################################################
        if TRACE: logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

        # check if we have some matchable left
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
        # do not match further if we do not need to
        if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):

            # FIXME: we should exclude small and "weak" rules from the subset entirely;
            # they are unlikely to be matchable with a seq match
            rules_subset = (self.regular_rids | self.small_rids)

            for qrnum, query_run in enumerate(qry.query_runs, 1):
                if TRACE_QUERY_RUN_SIMPLE:
                    logger_debug('#match: ===> processing query run #:', qrnum)
                    logger_debug('  #match:query_run:', query_run)

                if not query_run.is_matchable(include_low=True):
                    if TRACE: logger_debug('#match: query_run NOT MATCHABLE')
                    continue

                # hash match
                #########################
                hash_matches = match_hash.hash_match(self, query_run)
                if hash_matches:
                    if TRACE: self.debug_matches(hash_matches, '  #match Query run matches (hash)', location, query_string)
                    matches.extend(hash_matches)
                    continue

                # FIXME: why do we not aho match again here? This would avoid
                # going into the costly set and seq re-match that may not be needed at all.
                # Alternatively we should consider aho matches to exclude them from candidates.

                # query run match proper using sequence matching
                #########################################
                if TRACE: logger_debug('  #match: Query run MATCHING proper....')

                run_matches = []
                candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

                if TRACE_CANDIDATES: logger_debug('      #match: query_run: number of candidates for seq match #', len(candidates))

                for candidate_num, candidate in enumerate(candidates):
                    if TRACE_QUERY_RUN:
                        _, canrule, _ = candidate
                        logger_debug('         #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                    start_offset = 0
                    while True:
                        rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                        if TRACE_QUERY_RUN and rule_matches:
                            self.debug_matches(rule_matches, '           #match: query_run: seq matches for candidate', with_text=True, query=qry)
                        if not rule_matches:
                            break
                        else:
                            matches_end = max(m.qend for m in rule_matches)
                            run_matches.extend(rule_matches)

                            if matches_end + 1 < query_run.end:
                                start_offset = matches_end + 1
                                continue
                            else:
                                break

                ############################################################################
                if TRACE_QUERY_RUN: self.debug_matches(run_matches, '    #match: ===> Query run matches', location, query_string, with_text=True, query=qry)

                run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
                matches.extend(run_matches)

                if TRACE: self.debug_matches(run_matches, '     #match: Query run matches merged', location, query_string)

        # final matching merge, refinement and filtering
        ################################################
        if matches:
            logger_debug()
            logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

            matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
            if TRACE_MATCHES_DISCARD:
                discarded.extend(whole_discarded)
            matches.sort()
            match.set_lines(matches, qry.line_by_pos)
            self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
            if TRACE_MATCHES_DISCARD: self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

        self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)

        return matches