コード例 #1
0
    def get_spdx_id_matches(self, query, from_spdx_id_lines=True, **kwargs):
        """
        Matching strategy for SPDX-License-Identifier style of expressions.

        If `from_spdx_id_lines` is True, detect only in the SPDX license
        identifier lines found in the query. Otherwise use the whole query
        text for detection.

        Return a list of the matches found, with at most one match per query
        run. As a side effect, the span of each match is subtracted from its
        query run. Extra keyword arguments are accepted and ignored.
        """
        matches = []

        if from_spdx_id_lines:
            qrs_and_texts = query.spdx_lid_query_runs_and_text()
        else:
            # If we are not specifically looking at a single SPDX-License-
            # Identifier line, then use the whole query run with the whole text.
            # Note this can only work for small texts or this will likely make
            # the expression parser choke if you feed it large texts
            query_lines = [ln for _, ln
                in tokenize.query_lines(query.location, query.query_string)]
            qrs_and_texts = [(query.whole_query_run(), u'\n'.join(query_lines))]

        for query_run, detectable_text in qrs_and_texts:
            if not query_run.matchables:
                # this could happen if there was some negative match applied
                continue

            spdx_match = match_spdx_lid.spdx_id_match(
                self, query_run, detectable_text)

            if spdx_match is None:
                # Nothing was detected in this text: there is no span to
                # subtract and nothing to report for this query run.
                continue

            query_run.subtract(spdx_match.qspan)
            matches.append(spdx_match)

        return matches
コード例 #2
0
 def get_exact_matches(self, query, deadline=sys.maxsize, **kwargs):
     """
     Exact matching strategy: feed the whole query run to the rules
     automaton for multimatching at once.

     Return the matches kept after a refinement step that is run here with
     both merging and false positive filtering turned off. Extra keyword
     arguments are accepted and ignored.
     """
     whole_run = query.whole_query_run()
     found = match_aho.exact_match(
         self, whole_run, self.rules_automaton, deadline=deadline)

     # refinement returns a pair: only its first item is reported to callers
     refine_options = dict(query=query, filter_false_positive=False, merge=False)
     kept, _dropped = match.refine_matches(found, self, **refine_options)
     return kept
コード例 #3
0
    def get_spdx_id_matches(
        self,
        query,
        from_spdx_id_lines=True,
        expression_symbols=None,
        **kwargs,
    ):
        """
        Return a list of matches for the SPDX-License-Identifier style of
        expressions detected in ``query``.

        When ``from_spdx_id_lines`` is True, only the SPDX license identifier
        lines of the query are considered. Otherwise the whole query text is
        handed over for detection.

        ``expression_symbols`` is an optional mapping of
        {lowered key: LicenseSymbol}. The standard SPDX license symbols are
        used when it is not provided.

        The span of each reported match is subtracted from its query run.
        """
        if from_spdx_id_lines:
            runs_with_text = query.spdx_lid_query_runs_and_text()
        else:
            # Not looking at a single SPDX-License-Identifier line: pair the
            # whole query run with the whole text. This can only work for
            # small texts: a large text will likely make the expression
            # parser choke.
            lines = [
                line for _, line
                in tokenize.query_lines(query.location, query.query_string)
            ]
            whole_run = query.whole_query_run()
            runs_with_text = [(whole_run, u'\n'.join(lines))]

        found = []
        for run, text in runs_with_text:
            # a query run with nothing left to match is skipped entirely
            if run.matchables:
                if TRACE_SPDX_LID:
                    logger_debug(
                        'get_spdx_id_matches:',
                        'query_run:', run,
                        'detectable_text:', text,
                    )

                detected = match_spdx_lid.spdx_id_match(
                    idx=self,
                    query_run=run,
                    text=text,
                    expression_symbols=expression_symbols,
                )

                # only a truthy result is recorded and subtracted
                if detected:
                    run.subtract(detected.qspan)
                    found.append(detected)

        return found
コード例 #4
0
    def get_approximate_matches(self, query, matched_qspans, existing_matches,
                                deadline=sys.maxsize, **kwargs):
        """
        Approximate matching strategy breaking a query in query_runs and using
        multiple local alignments (aka. diff). Return a list of matches.

        Matching proceeds in two phases:

        1. The whole query run is ranked against the index for candidates
           with a high resemblance. If there are any, the whole query run is
           matched against them and each matched span is subtracted from the
           query.
        2. Each query run of the query is then ranked and matched separately.

        `matched_qspans` is a list of query spans. It is copied and the
        caller's list is never modified here. `deadline` is compared to
        `time()`: the matches collected so far are returned as soon as it has
        passed. `existing_matches` and extra keyword arguments are not used
        here.
        """
        matches = []
        matchable_rids = self.approx_matchable_rids

        # work on a copy so that the caller's list is left untouched
        already_matched_qspans = matched_qspans[:]

        MAX_NEAR_DUPE_CANDIDATES = 10

        # first check if the whole file may be close, near-dupe match
        whole_query_run = query.whole_query_run()
        near_dupe_candidates = match_set.compute_candidates(
            query_run=whole_query_run,
            idx=self,
            matchable_rids=matchable_rids,
            top=MAX_NEAR_DUPE_CANDIDATES,
            high_resemblance=True,
            _use_bigrams=USE_BIGRAM_MULTISETS,
        )

        # if near duplicates, we only match the whole file at once against these
        # candidates
        if near_dupe_candidates:
            if TRACE_APPROX_CANDIDATES:
                logger_debug('get_query_run_approximate_matches: near dupe candidates:')
                for rank, ((sv1, sv2), _rid, can, _inter) in enumerate(near_dupe_candidates, 1):
                    logger_debug(rank, sv1, sv2, can.identifier)

            matched = self.get_query_run_approximate_matches(
                whole_query_run, near_dupe_candidates, already_matched_qspans, deadline)

            matches.extend(matched)

            # subtract these and remember their spans for the next phase
            for near_dupe_match in matched:
                qspan = near_dupe_match.qspan
                query.subtract(qspan)
                already_matched_qspans.append(qspan)

            # break if deadline has passed
            if time() > deadline:
                return matches

        # otherwise, and in all cases we break things in smaller query runs and
        # match each separately

        if USE_RULE_STARTS:
            query.refine_runs()

        if TRACE_APPROX:
            logger_debug('get_approximate_matches: len(query.query_runs):', len(query.query_runs))

        MAX_CANDIDATES = 70
        for query_run in query.query_runs:
            # inverted index match and ranking, query run-level
            candidates = match_set.compute_candidates(
                query_run=query_run,
                idx=self,
                matchable_rids=matchable_rids,
                top=MAX_CANDIDATES,
                high_resemblance=False,
                _use_bigrams=USE_BIGRAM_MULTISETS,
            )

            if TRACE_APPROX_CANDIDATES:
                logger_debug('get_query_run_approximate_matches: candidates:')
                for rank, ((sv1, sv2), _rid, can, _inter) in enumerate(candidates, 1):
                    logger_debug(rank, sv1, sv2, can.identifier)

            # Use the accumulated spans, which include the spans matched in
            # the near-duplicate phase, and not only the spans received from
            # the caller.
            matched = self.get_query_run_approximate_matches(
                query_run, candidates, already_matched_qspans, deadline)

            matches.extend(matched)

            # break if deadline has passed
            if time() > deadline:
                break

        return matches