Example #1
    def __init__(self,
                 rules=None,
                 _ranked_tokens=global_tokens_by_ranks,
                 _spdx_tokens=None):
        """
        Initialize the index with an iterable of Rule objects.
        """
        # total number of unique known tokens
        self.len_tokens = 0

        # count of "junk" token ids: a token with an id smaller than len_junk
        # is considered a very common, "junk" token
        self.len_junk = 0
        # corresponding number of non-junk tokens: len_tokens = len_junk + len_good
        self.len_good = 0

        # mapping of token string -> token id
        self.dictionary = {}

        # mapping of token id -> token string as a list where the index is the
        # token id and the value is the actual token string.
        # This is the reverse of the dictionary.
        self.tokens_by_tid = []

        # Note: all the following are list-based, mapping-like structures of
        # rid -> data, where the list index is the rule id.

        # rule objects proper
        self.rules_by_rid = []

        # token_id sequences
        self.tids_by_rid = []

        # mapping of rule_id -> (mapping of token_id -> [positions, ...]).
        # We track only high/good tokens there. This is a "traditional"
        # positional inverted index
        self.high_postings_by_rid = []

        # mapping of rule_id -> tuple of low and high token id sets/multisets
        # (low_tids_set, high_tids_set)
        self.tids_sets_by_rid = []
        # (low_tids_mset, high_tids_mset)
        self.tids_msets_by_rid = []

        # mapping of hash -> single rid: duplicate rules are not allowed
        self.rid_by_hash = {}

        # Aho-Corasick automatons for negative and small rules
        self.rules_automaton = match_aho.get_automaton()
        self.negative_automaton = match_aho.get_automaton()

        # disjunctive sets of rule ids: regular, negative, small, false positive
        self.regular_rids = set()
        self.negative_rids = set()
        self.small_rids = set()
        self.false_positive_rids = set()

        # these rule ids are for rules made entirely of low, junk tokens
        self.weak_rids = set()

        # length of the largest false_positive rule
        self.largest_false_positive_length = 0

        # if True, the index has been optimized and becomes read-only:
        # no new rules can be added
        self.optimized = False

        if rules:
            if TRACE_INDEXING_PERF:
                start = time()
                print('LicenseIndex: building index.')

            # index all and optimize
            self._add_rules(rules, _ranked_tokens, _spdx_tokens)

            if TRACE_INDEXING_PERF:
                duration = time() - start
                len_rules = len(self.rules_by_rid)
                print(
                    'LicenseIndex: built index with %(len_rules)d rules in %(duration)f seconds.'
                    % locals())
                self._print_index_stats()
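
As an illustration of the mappings described in the comments above (not part of the index code), here is a minimal, hypothetical sketch of how a dictionary/tokens_by_tid pair and a per-rule positional inverted index such as high_postings_by_rid fit together; all tokens and names are invented:

# Hypothetical sketch: forward/reverse token mappings and a positional
# inverted index for a single made-up rule.
tokens_by_tid = ['mit', 'license', 'permission', 'granted']       # tid -> token string
dictionary = {tok: tid for tid, tok in enumerate(tokens_by_tid)}  # token string -> tid

# token id sequence for one rule, as stored per rule in tids_by_rid
rule_tids = [dictionary[t] for t in 'permission granted mit license permission'.split()]

# positional inverted index for that rule: token id -> list of positions
postings = {}
for pos, tid in enumerate(rule_tids):
    postings.setdefault(tid, []).append(pos)

assert postings[dictionary['permission']] == [0, 4]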
Example #2
    def __init__(
        self,
        rules=None,
        _legalese=common_license_words,
        _spdx_tokens=frozenset(),
        _license_tokens=frozenset(),
    ):
        """
        Initialize the index with an iterable of Rule objects.
        ``_legalese`` is a set of common license-specific words, a.k.a. legalese.
        ``_spdx_tokens`` is a set of tokens used in SPDX license identifiers.
        ``_license_tokens`` is a set of "license" tokens used at the start or end of a rule.
        """
        # total number of unique known tokens
        self.len_tokens = 0

        # largest token ID for a "legalese" token: a token with an id larger
        # than len_legalese is considered a very common, "junk" token
        self.len_legalese = 0

        # mapping of token string -> token id
        self.dictionary = {}

        # set of token ids made entirely of digits
        self.digit_only_tids = set()

        # mapping-like of token id -> token string as a list where the index is
        # the token id and the value is the actual token string.
        # This is the reverse of the dictionary.
        self.tokens_by_tid = []

        # Note: all the following are list-based, mapping-like structures of
        # rid -> data, where the list index is the rule id.

        # mapping-like of rule_id -> rule object proper
        self.rules_by_rid = []

        # mapping-like of rule_id -> sequence of token_ids
        self.tids_by_rid = []

        # mapping-like of rule_id -> (mapping of token_id -> [positions, ...]).
        # We track only high/good tokens there. This is a "traditional"
        # inverted index postings list
        self.high_postings_by_rid = []

        # mapping-like of rule_id -> token id sets/multisets
        self.sets_by_rid = []
        self.msets_by_rid = []

        # mapping of hash -> single rid for hash match: duplicate rules are not allowed
        self.rid_by_hash = {}

        # Aho-Corasick automatons for regular rules and experimental fragments
        self.rules_automaton = match_aho.get_automaton()
        self.fragments_automaton = USE_AHO_FRAGMENTS and match_aho.get_automaton()
        self.starts_automaton = USE_RULE_STARTS and match_aho.get_automaton()
        self.unknown_automaton = match_unknown.get_automaton()

        # disjunctive sets of rule ids: regular and false positive

        # TODO: consider using intbitset instead
        self.regular_rids = set()
        self.false_positive_rids = set()

        # These rule ids are for rules that can be matched with a sequence
        # match. Other rules can only be matched exactly
        self.approx_matchable_rids = set()

        # if True, the index has been optimized and becomes read-only:
        # no new rules can be added
        self.optimized = False

        if rules:
            if TRACE_INDEXING_PERF:
                start = time()
                logger_debug('LicenseIndex: building index.')
            # index all and optimize
            self._add_rules(
                rules,
                _legalese=_legalese,
                _spdx_tokens=_spdx_tokens,
                _license_tokens=_license_tokens,
            )

            if TRACE_TOKEN_DOC_FREQ:
                logger_debug('LicenseIndex: token, frequency')
                from itertools import chain
                # token id -> total number of occurrences across the token id
                # sequences of all regular rules
                tf = Counter(chain.from_iterable(
                    tids for rid, tids in enumerate(self.tids_by_rid)
                    if rid in self.regular_rids))

            if TRACE_INDEXING_PERF:
                duration = time() - start
                len_rules = len(self.rules_by_rid)
                logger_debug('LicenseIndex: built index with %(len_rules)d rules in '
                      '%(duration)f seconds.' % locals())
                self._print_index_stats()
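
To illustrate the sets_by_rid/msets_by_rid distinction noted in the comments above, here is a minimal, hypothetical sketch (with made-up token ids, not taken from a real index) of comparing a query against one rule using a token id set and a token id multiset:

from collections import Counter

query_tids = [5, 7, 7, 9, 12]
rule_tids = [7, 7, 7, 9, 9, 21]

# set view: which token ids are shared at all (cheap candidate screening)
shared_ids = set(query_tids) & set(rule_tids)                  # {9, 7}

# multiset view: how many occurrences are shared (a tighter resemblance bound)
shared_occurrences = Counter(query_tids) & Counter(rule_tids)  # Counter({7: 2, 9: 1})

print(sorted(shared_ids), sum(shared_occurrences.values()))    # [7, 9] 3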
Example #3
    def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
        """
        Initialize the index with an iterable of Rule objects.
        """
        # total number of unique known tokens
        self.len_tokens = 0

        # count of "junk" token ids: a token with an id smaller than len_junk
        # is considered a very common, "junk" token
        self.len_junk = 0
        # corresponding number of non-junk tokens: len_tokens = len_junk + len_good
        self.len_good = 0

        # mapping of token string -> token id
        self.dictionary = {}

        # mapping of token id -> token string as a list where the index is the
        # token id and the value is the actual token string
        self.tokens_by_tid = []

        # Note: all the following are list-based, mapping-like structures of
        # rid -> data, where the list index is the rule id.

        # rule objects proper
        self.rules_by_rid = []

        # token_id sequences
        self.tids_by_rid = []

        # mapping of rule_id -> (mapping of token_id -> [positions, ...]).
        # We track only high/good tokens there. This is a "traditional"
        # positional inverted index
        self.high_postings_by_rid = []

        # mapping of rule_id -> tuple of low and high token id sets/multisets
        # (low_tids_set, high_tids_set)
        self.tids_sets_by_rid = []
        # (low_tids_mset, high_tids_mset)
        self.tids_msets_by_rid = []

        # mapping of hash -> single rid: duplicate rules are not allowed
        self.rid_by_hash = {}

        # Aho-Corasick automatons for negative and small rules
        self.rules_automaton = match_aho.get_automaton()
        self.negative_automaton = match_aho.get_automaton()

        # disjunctive sets of rule ids: regular, negative, small, false positive
        self.regular_rids = set()
        self.negative_rids = set()
        self.small_rids = set()
        self.false_positive_rids = set()

        # length of the largest false_positive rule
        self.largest_false_positive_length = 0

        # if True, the index has been optimized and becomes read-only:
        # no new rules can be added
        self.optimized = False

        if rules:
            if TRACE_INDEXING_PERF:
                start = time()
                print('LicenseIndex: building index.')

            # index all and optimize
            self._add_rules(rules, _ranked_tokens)

            if TRACE_INDEXING_PERF:
                duration = time() - start
                len_rules = len(self.rules_by_rid)
                print('LicenseIndex: built index with %(len_rules)d rules in %(duration)f seconds.' % locals())
                self._print_index_stats()
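
For context on the rules_automaton and negative_automaton attributes above, here is a minimal, hypothetical sketch of exact multi-pattern matching with an Aho-Corasick automaton. It assumes the third-party pyahocorasick package; the index itself obtains its automatons through match_aho.get_automaton(), and the rule ids and texts below are invented:

import ahocorasick

automaton = ahocorasick.Automaton()
rules = {0: 'mit license', 1: 'apache license version 2 0'}
for rid, text in rules.items():
    # store the rule id and text as the payload returned on a match
    automaton.add_word(text, (rid, text))
automaton.make_automaton()

query = 'released under the mit license as published by mit'
for end_index, (rid, text) in automaton.iter(query):
    start_index = end_index - len(text) + 1
    print(rid, start_index, end_index)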