Example #1
    def test_match_license_performance_profiling_on_index_with_single_license(self):
        from time import time
        from licensedcode import query

        # pre-build the index: we are profiling only the detection, not the indexing
        rule_dir = self.get_test_loc('perf/idx/rules')
        rules = models.load_rules(rule_dir)
        idx = index.LicenseIndex(rules)
        location = self.get_test_loc('perf/idx/query.txt')
        with open(location, 'rb') as qf:
            querys = qf.read()

        qry = query.build_query(query_string=querys, idx=idx)

        def mini_seq_match(idx):
            list(idx.get_approximate_matches(qry, [], []))


        # qtokens_as_str = array('h', tokens).tostring()
        start = time()
        for _ in range(100):
            mini_seq_match(idx)
        duration = time() - start
        values = ('ScanCode diff:', duration)
        print(*values)
        raise Exception(values)
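
A hedged alternative to the wall-clock timing above (the raised Exception simply surfaces the measured duration in the test output): profile the same approximate-matching call with the standard library cProfile. This sketch assumes the pre-built `idx` and `qry` from the test are in scope.

import cProfile
import pstats

# profile only the detection step, reusing the index built once up front
profiler = cProfile.Profile()
profiler.enable()
for _ in range(100):
    list(idx.get_approximate_matches(qry, [], []))
profiler.disable()

# show the 20 most expensive calls by cumulative time
pstats.Stats(profiler).sort_stats('cumulative').print_stats(20)
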
Example #2
    def test_match_freertos(self):
        rule_dir = self.get_test_loc('mach_aho/rtos_exact/')
        idx = index.LicenseIndex(models.load_rules(rule_dir))

        query_loc = self.get_test_loc('mach_aho/rtos_exact/gpl-2.0-freertos.RULE')

        qry = query.build_query(location=query_loc, idx=idx)

        matches = match_aho.exact_match(idx, qry.whole_query_run(), idx.rules_automaton)
        assert 1 == len(matches)
        match = matches[0]
        assert match_aho.MATCH_AHO_EXACT == match.matcher
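
For context, here is a minimal, hedged sketch of the exact-matching idea behind match_aho.exact_match, written directly against the pyahocorasick package. This is only an illustration of the technique: ScanCode's rules_automaton works on token ids and query runs rather than raw text, and the rule strings below are hypothetical.

import ahocorasick

automaton = ahocorasick.Automaton()
# hypothetical "rule" texts keyed by a rule id
for rid, rule_text in enumerate([
    'gnu general public license version 2',
    'mit license',
]):
    automaton.add_word(rule_text, (rid, rule_text))
automaton.make_automaton()

queried = 'this code is under the gnu general public license version 2 or later'
for end, (rid, rule_text) in automaton.iter(queried):
    start = end - len(rule_text) + 1
    print('matched rule', rid, 'at positions', start, end)
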
Example #3
    def test_match_freertos(self):
        rule_dir = self.get_test_loc('mach_aho/rtos_exact/')
        idx = index.LicenseIndex(models.load_rules(rule_dir))

        query_loc = self.get_test_loc(
            'mach_aho/rtos_exact/gpl-2.0-freertos.RULE')

        qry = query.build_query(location=query_loc, idx=idx)

        matches = match_aho.exact_match(idx, qry.whole_query_run(),
                                        idx.rules_automaton)
        assert len(matches) == 1
        match = matches[0]
        assert match.matcher == match_aho.MATCH_AHO_EXACT
Example #4
def get_license_matches_from_query_string(query_string, start_line=1):
    """
    Return a sequence of LicenseMatch objects from license detection of the
    ``query_string``, with line numbers starting at ``start_line``. This is
    useful when matching a text fragment on its own when it is part of a
    larger text.
    """
    if not query_string:
        return []
    from licensedcode import cache

    idx = cache.get_index()
    qry = query.build_query(
        query_string=query_string,
        idx=idx,
        start_line=start_line,
    )

    return idx.match_query(qry=qry)
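
A hedged usage sketch for the helper above, assuming a scancode-toolkit environment where the cached license index can be built. The LicenseMatch attributes printed here (rule.identifier, score(), start_line, end_line) are assumptions based on how matches are used elsewhere in this module.

notice = '''
This component is licensed under the GNU General Public License,
version 2.0 or (at your option) any later version.
'''

# line numbers in the returned matches are offset as if the fragment
# started at line 120 of the larger text it was extracted from
for m in get_license_matches_from_query_string(notice, start_line=120):
    print(m.rule.identifier, m.score(), m.start_line, m.end_line)
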
Example #5
    def match(
        self,
        location=None,
        query_string=None,
        min_score=0,
        as_expression=False,
        expression_symbols=None,
        approximate=True,
        unknown_licenses=False,
        deadline=sys.maxsize,
        _skip_hash_match=False,
        **kwargs,
    ):
        """
        This is the main entry point to match licenses.

        Return a sequence of LicenseMatch by matching the file at ``location`` or
        the ``query_string`` string against this index. Only include matches with
        scores greater or equal to ``min_score``.

        If ``as_expression`` is True, treat the whole text as a single SPDX
        license expression and use only expression matching.

        Use the ``expression_symbols`` mapping of {lowered key: LicenseSymbol}
        if provided. Otherwise use the standard SPDX license symbols mapping.

        If ``approximate`` is True, perform approximate matching as a last
        matching step. Otherwise, only do hash, exact and expression matching.

        If ``unknown_licenses`` is True, perform unknown licenses matching after
        all regular matching steps.

        ``deadline`` is a time.time() value in seconds by which the processing
        should stop and return whatever was matched so far.

        ``_skip_hash_match`` is used only for testing.
        """
        assert 0 <= min_score <= 100

        if not location and not query_string:
            return []

        qry = query.build_query(
            location=location,
            query_string=query_string,
            idx=self,
            text_line_threshold=15,
            bin_line_threshold=50,
        )

        if TRACE:
            logger_debug('Index.match: for:', location, 'query:', qry)

        if not qry:
            return []

        return self.match_query(
            qry=qry,
            min_score=min_score,
            as_expression=as_expression,
            expression_symbols=expression_symbols,
            approximate=approximate,
            unknown_licenses=unknown_licenses,
            deadline=deadline,
            _skip_hash_match=_skip_hash_match,
            **kwargs,
        )
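
A hedged usage sketch for this entry point, assuming the index is obtained from the licensedcode cache as in the earlier example; the keyword arguments mirror the signature above, and the time budget illustrates the deadline contract described in the docstring. The printed match attributes are assumptions for illustration.

from time import time
from licensedcode import cache

idx = cache.get_index()

# hypothetical budget: return whatever has been matched after ~2 minutes
deadline = time() + 120

matches = idx.match(
    location='/path/to/COPYING',  # or query_string='...'
    min_score=80,
    approximate=True,
    unknown_licenses=False,
    deadline=deadline,
)
for m in matches:
    print(m.rule.identifier, m.score(), m.coverage())
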
Example #6
    def match(self, location=None, query_string=None, min_score=0,
              as_expression=False, deadline=sys.maxsize, _skip_hash_match=False,
              **kwargs):
        """
        This is the main entry point to match licenses.

        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        If `as_expression` is True, treat the whole text as a single SPDX
        license expression and use only expression matching.

        `deadline` is a time.time() value in seconds by which the processing should stop
        and return whatever was matched so far.

        `_skip_hash_match` is used only for testing.
        """
        assert 0 <= min_score <= 100

        if not location and not query_string:
            return []

        qry = query.build_query(location, query_string, idx=self,
            text_line_threshold=15, bin_line_threshold=50)
        if TRACE:
            logger_debug('match: for:', location, 'query:', qry)
        if not qry:
            return []

        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            return []

        if not _skip_hash_match:
            matches = match_hash.hash_match(self, whole_query_run)
            if matches:
                match.set_lines(matches, qry.line_by_pos)
                return matches

        # TODO: add match to degenerated expressions with custom symbols
        if as_expression:
            matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
            match.set_lines(matches, qry.line_by_pos)
            return matches

        negative_matches = []
        if self.negative_rids:
            negative_matches = self.negative_match(whole_query_run)
            for neg in negative_matches:
                whole_query_run.subtract(neg.qspan)
            if TRACE_NEGATIVE:
                self.debug_matches(
                    matches=negative_matches, message='negative_matches',
                    location=location, query_string=query_string)  # , with_text, query)

        matches = []

        if USE_AHO_FRAGMENTS:
            approx = self.get_fragments_matches
        else:
            approx = self.get_approximate_matches

        matchers = [
            # matcher, include_low in post-matching remaining matchable check
            (self.get_spdx_id_matches, True, 'spdx_lid'),
            (self.get_exact_matches, False, 'aho'),
            (approx, False, 'seq'),
        ]

        already_matched_qspans = []
        for matcher, include_low, matcher_name in matchers:
            if TRACE:
                logger_debug()
                logger_debug('matching with matcher:', matcher_name)

            matched = matcher(qry, matched_qspans=already_matched_qspans,
                              existing_matches=matches, deadline=deadline)
            if TRACE:
                self.debug_matches(
                    matches=matched, message='matched with: ' + matcher_name,
                    location=location, query_string=query_string)  # , with_text, query)

            matched = match.merge_matches(matched)
            matches.extend(matched)

            # subtract whole text matched if this is long enough
            for m in matched:
                if m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98:
                    qry.subtract(m.qspan)

            # check if we have some matchable left
            # do not match further if we do not need to
            # collect qspans matched exactly e.g. with coverage 100%
            # this coverage check is because we have provision to match fragments (unused for now)

            already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

            if not whole_query_run.is_matchable(
                include_low=include_low, qspans=already_matched_qspans):
                break

            # break if deadline has passed
            if time() > deadline:
                break

        if not matches:
            return []

        if TRACE:
            logger_debug()
            self.debug_matches(matches=matches, message='matches before final merge',
                               location=location, query_string=query_string,
                               with_text=True, qry=qry)

        matches, _discarded = match.refine_matches(
            matches, idx=self, query=qry, min_score=min_score,
            max_dist=MAX_DIST // 2, filter_false_positive=True, merge=True)

        matches.sort()
        match.set_lines(matches, qry.line_by_pos)

        if TRACE:
            print()
            self.debug_matches(matches=matches, message='final matches',
                               location=location, query_string=query_string,
                               with_text=True, qry=qry)
        return matches
Example #7
    def match(self,
              location=None,
              query_string=None,
              min_score=0,
              detect_negative=True):
        """
        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        `detect_negative` is for testing purposes only.
        """
        assert 0 <= min_score <= 100

        if TRACE:
            print()
            logger_debug('match start....')

        if not location and not query_string:
            return []

        qry = query.build_query(location, query_string, self)
        if not qry:
            logger_debug('#match: No query returned for:', location)
            return []

        #######################################################################
        # Whole file matching: hash, negative and exact matching
        #######################################################################
        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            logger_debug('#match: whole query not matchable')
            return []

        # hash
        hash_matches = match_hash(self, whole_query_run)
        if hash_matches:
            self.debug_matches(hash_matches, '#match FINAL Hash matched',
                               location, query_string)
            set_lines(hash_matches, qry.line_by_pos)
            return hash_matches

        # negative rules exact matching
        negative = []
        # note: detect_negative is false only to test negative rules detection proper
        if detect_negative and self.negative_rids:
            if TRACE: logger_debug('#match: NEGATIVE')
            negative = self.negative_match(whole_query_run)
            for neg in negative:
                if TRACE_NEGATIVE:
                    self.debug_matches(negative,
                                       '   ##match: NEGATIVE subtracting #:',
                                       location, query_string)
                whole_query_run.subtract(neg.qspan)
            if TRACE_NEGATIVE:
                logger_debug('     #match: NEGATIVE found', negative)

        # exact matches
        if TRACE_EXACT: logger_debug('#match: EXACT')
        exact_matches = exact_match(self, whole_query_run,
                                    self.rules_automaton)
        if TRACE_EXACT:
            self.debug_matches(exact_matches, '  #match: EXACT matches#:',
                               location, query_string)

        exact_matches, exact_discarded = refine_matches(exact_matches,
                                                        self,
                                                        query=qry)

        if TRACE_EXACT:
            self.debug_matches(exact_matches,
                               '   #match: ===> exact matches refined')
        if TRACE_EXACT:
            self.debug_matches(exact_discarded,
                               '   #match: ===> exact matches discarded')

        matches = exact_matches
        discarded = exact_discarded

        #######################################################################
        # Per query run matching.
        #######################################################################
        if TRACE: logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

        # check if we have some matchable left
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [
            m.qspan for m in exact_matches if m.coverage() == 100
        ]
        # do not match further if we do not need to
        if whole_query_run.is_matchable(include_low=True,
                                        qspans=matched_qspans):

            rules_subset = (self.regular_rids | self.small_rids)

            for qrnum, query_run in enumerate(qry.query_runs, 1):
                if TRACE_QUERY_RUN_SIMPLE:
                    logger_debug('#match: ===> processing query run #:', qrnum)
                    logger_debug('  #match:', query_run)

                if not query_run.is_matchable(include_low=True):
                    if TRACE: logger_debug('#match: query_run NOT MATCHABLE')
                    continue

                # hash match
                #########################
                hash_matches = match_hash(self, query_run)
                if hash_matches:
                    if TRACE:
                        self.debug_matches(
                            hash_matches, '  #match Query run matches (hash)',
                            location, query_string)
                    matches.extend(hash_matches)
                    continue

                # query run match proper using sequence matching
                #########################################
                if TRACE:
                    logger_debug('  #match: Query run MATCHING proper....')

                run_matches = []
                candidates = compute_candidates(query_run,
                                                self,
                                                rules_subset=rules_subset,
                                                top=40)

                if TRACE_QUERY_RUN:
                    logger_debug(
                        '      #match: query_run: number of candidates for seq match #',
                        len(candidates))

                for candidate_num, candidate in enumerate(candidates):
                    if TRACE_QUERY_RUN:
                        logger_debug(
                            '         #match: query_run: seq matching candidate#:',
                            candidate_num, 'candidate:', candidate)
                    start_offset = 0
                    while True:
                        rule_matches = match_sequence(
                            self,
                            candidate,
                            query_run,
                            start_offset=start_offset)
                        if TRACE_QUERY_RUN and rule_matches:
                            self.debug_matches(
                                rule_matches,
                                '           #match: query_run: seq matches for candidate'
                            )
                        if not rule_matches:
                            break
                        else:
                            matches_end = max(m.qend for m in rule_matches)
                            run_matches.extend(rule_matches)

                            if matches_end + 1 < query_run.end:
                                start_offset = matches_end + 1
                                continue
                            else:
                                break

                ############################################################################
                if TRACE_QUERY_RUN:
                    self.debug_matches(run_matches,
                                       '    #match: ===> Query run matches',
                                       location,
                                       query_string,
                                       with_text=True)

                run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
                matches.extend(run_matches)

                if TRACE:
                    self.debug_matches(
                        run_matches, '     #match: Query run matches merged',
                        location, query_string)

        # final matching merge, refinement and filtering
        ################################################
        if matches:
            logger_debug()
            logger_debug(
                '!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            self.debug_matches(matches,
                               '#match: ALL matches from all query runs',
                               location, query_string)

            matches, whole_discarded = refine_matches(matches,
                                                      idx=self,
                                                      query=qry,
                                                      min_score=min_score,
                                                      max_dist=MAX_DIST // 2)
            if TRACE_MATCHES_DISCARD:
                discarded.extend(whole_discarded)
            matches.sort()
            set_lines(matches, qry.line_by_pos)
            self.debug_matches(matches, '#match: FINAL MERGED', location,
                               query_string)
            if TRACE_MATCHES_DISCARD:
                self.debug_matches(discarded, '#match: FINAL DISCARDED',
                                   location, query_string)

        self.debug_matches(matches,
                           '#match: FINAL MATCHES',
                           location,
                           query_string,
                           with_text=True)

        return matches
Example #8
    def match(self,
              location=None,
              query_string=None,
              min_score=0,
              as_expression=False,
              **kwargs):
        """
        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        If `as_expression` is True, treat the whole text as a single SPDX
        license expression and use only expression matching.
        """
        assert 0 <= min_score <= 100

        if not location and not query_string:
            return []

        qry = query.build_query(location,
                                query_string,
                                idx=self,
                                text_line_threshold=15,
                                bin_line_threshold=50)

        if not qry:
            return []

        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            return []

        hash_matches = match_hash.hash_match(self, whole_query_run)
        if hash_matches:
            match.set_lines(hash_matches, qry.line_by_pos)
            return hash_matches

        if as_expression:
            matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
            match.set_lines(matches, qry.line_by_pos)
            return matches

        negative_matches = []
        if self.negative_rids:
            negative_matches = self.negative_match(whole_query_run)
            for neg in negative_matches:
                whole_query_run.subtract(neg.qspan)
            if TRACE_NEGATIVE:
                self.debug_matches(negative_matches, 'negative_matches',
                                   location,
                                   query_string)  #, with_text, query)

        matches = []

        matchers = [
            self.get_spdx_id_matches, self.get_exact_matches,
            self.get_approximate_matches
        ]

        for matcher in matchers:
            matched = matcher(qry)
            if TRACE:
                logger_debug('matching with matcher:', matcher)
                self.debug_matches(matched, 'matched', location,
                                   query_string)  #, with_text, query)

            matches.extend(matched)
            # check if we have some matchable left
            # do not match further if we do not need to
            # collect qspans matched exactly e.g. with coverage 100%
            # this coverage check is because we have provision to match fragments (unused for now)
            matched_qspans = [m.qspan for m in matches if m.coverage() == 100]
            if not whole_query_run.is_matchable(include_low=True,
                                                qspans=matched_qspans):
                break

        if not matches:
            return []

        matches, _discarded = match.refine_matches(matches,
                                                   idx=self,
                                                   query=qry,
                                                   min_score=min_score,
                                                   max_dist=MAX_DIST // 2,
                                                   filter_false_positive=True)

        matches.sort()
        match.set_lines(matches, qry.line_by_pos)
        return matches
Example #9
    def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
        """
        Return a sequence of LicenseMatch by matching the file at `location` or
        the `query_string` text against the index. Only include matches with
        scores greater or equal to `min_score`.

        `detect_negative` is for testing purposes only.
        """
        assert 0 <= min_score <= 100

        if TRACE:
            print()
            logger_debug('match start....')

        if not location and not query_string:
            return []

        qry = query.build_query(location, query_string, self)
        if not qry:
            logger_debug('#match: No query returned for:', location)
            return []

        #######################################################################
        # Whole file matching: hash and exact matching
        #######################################################################
        whole_query_run = qry.whole_query_run()
        if not whole_query_run or not whole_query_run.matchables:
            logger_debug('#match: whole query not matchable')
            return []

        # hash
        hash_matches = match_hash.hash_match(self, whole_query_run)
        if hash_matches:
            if TRACE: self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
            match.set_lines(hash_matches, qry.line_by_pos)
            return hash_matches

        # negative rules exact matching
        negative_matches = []
        # note: detect_negative is false only to test negative rules detection proper
        if detect_negative and self.negative_rids:
            if TRACE: logger_debug('#match: NEGATIVE')
            negative_matches = self.negative_match(whole_query_run)
            for neg in negative_matches:
                whole_query_run.subtract(neg.qspan)

        # exact matches
        if TRACE_EXACT: logger_debug('#match: EXACT')
        exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
        if TRACE_EXACT: self.debug_matches(exact_matches, '  #match: EXACT matches#:', location, query_string)

        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)

        if TRACE_EXACT: self.debug_matches(exact_matches, '   #match: ===> exact matches refined')
        if TRACE_EXACT: self.debug_matches(exact_discarded, '   #match: ===> exact matches discarded')

        matches = exact_matches
        discarded = exact_discarded

        #######################################################################
        # Per query run matching.
        #######################################################################
        if TRACE: logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

        # check if we have some matchable left
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
        # do not match further if we do not need to
        if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):

            # FIXME: we should exclude small and "weak" rules from the subset entirely
            # they are unlikely to be matchable with a seq match
            rules_subset = (self.regular_rids | self.small_rids)

            for qrnum, query_run in enumerate(qry.query_runs, 1):
                if TRACE_QUERY_RUN_SIMPLE:
                    logger_debug('#match: ===> processing query run #:', qrnum)
                    logger_debug('  #match:query_run:', query_run)

                if not query_run.is_matchable(include_low=True):
                    if TRACE: logger_debug('#match: query_run NOT MATCHABLE')
                    continue

                # hash match
                #########################
                hash_matches = match_hash.hash_match(self, query_run)
                if hash_matches:
                    if TRACE: self.debug_matches(hash_matches, '  #match Query run matches (hash)', location, query_string)
                    matches.extend(hash_matches)
                    continue

                # FIXME: why do we not aho match again here? This would avoid
                # going into the costly set and seq re-match that may not be needed at all.
                # Alternatively we should consider aho matches to exclude them from candidates.

                # query run match proper using sequence matching
                #########################################
                if TRACE: logger_debug('  #match: Query run MATCHING proper....')

                run_matches = []
                candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

                if TRACE_CANDIDATES: logger_debug('      #match: query_run: number of candidates for seq match #', len(candidates))

                for candidate_num, candidate in enumerate(candidates):
                    if TRACE_QUERY_RUN:
                        _, canrule, _ = candidate
                        logger_debug('         #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                    start_offset = 0
                    while True:
                        rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                        if TRACE_QUERY_RUN and rule_matches:
                            self.debug_matches(rule_matches, '           #match: query_run: seq matches for candidate', with_text=True, query=qry)
                        if not rule_matches:
                            break
                        else:
                            matches_end = max(m.qend for m in rule_matches)
                            run_matches.extend(rule_matches)

                            if matches_end + 1 < query_run.end:
                                start_offset = matches_end + 1
                                continue
                            else:
                                break

                ############################################################################
                if TRACE_QUERY_RUN: self.debug_matches(run_matches, '    #match: ===> Query run matches', location, query_string, with_text=True, query=qry)

                run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
                matches.extend(run_matches)

                if TRACE: self.debug_matches(run_matches, '     #match: Query run matches merged', location, query_string)

        # final matching merge, refinement and filtering
        ################################################
        if matches:
            logger_debug()
            logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

            matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
            if TRACE_MATCHES_DISCARD:
                discarded.extend(whole_discarded)
            matches.sort()
            match.set_lines(matches, qry.line_by_pos)
            self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
            if TRACE_MATCHES_DISCARD: self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

        self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)

        return matches