Example #1
0
    def _add_rules(self,
                   rules,
                   _ranked_tokens=global_tokens_by_ranks,
                   _spdx_tokens=None):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.

        `_spdx_tokens`, if provided, is a set of token strings from known SPDX
        keys: these receive special treatment.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: the rid is the index of the rule in this list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule token strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency.
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            if rul.is_false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.is_negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Add SPDX key tokens to the dictionary. Track which tokens come only
        # from SPDX keys
        ########################################################################
        spdx_tokens = None
        if _spdx_tokens:
            spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
            frequencies_by_token.update(_spdx_tokens)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned arbitrarily here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        ########################################################################
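        # For illustration (hypothetical values): if frequencies_by_token were
        # Counter({'license': 3, 'gpl': 2}), izip(*items) would yield the token
        # tuple ('license', 'gpl') and the aligned counts tuple (3, 2).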
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        # initial dictionary mapping to old/arbitrary token ids
        ########################################################################
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) integer ids
        ########################################################################
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        # Get SPDX-only token ids
        ########################################################################
        spdx_token_ids = None
        if spdx_tokens:
            spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
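        # After renumbering, token ids below len_junk correspond to the frequent,
        # less discriminant ("junk") tokens and ids at or above len_junk to the
        # more discriminant ("good"/high) tokens; the postings and set indexes
        # built below rely on this split.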
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens,
                                             _spdx_token_ids=spdx_token_ids)
        (
            self.len_junk,
            self.dictionary,
            self.tokens_by_tid,
            self.tids_by_rid,
            self.weak_rids,
        ) = renumbered

        len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

        #######################################################################
        # build index structures
        #######################################################################
        self.len_good = len_good = len_tokens - len_junk
        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            rule_is_weak = rid in weak_rids

            if rule.is_negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by

                # FIXME: we do not want to keep small rules and rules that
                # cannot be seq-matched in the index

                # no postings for junk only rules
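                # For illustration (hypothetical values): with len_junk == 8 and
                # rule_token_ids == [2, 9, 9, 12], postings would end up as
                # {9: array('h', [1, 2]), 12: array('h', [3])}, i.e. the
                # positions of each high token id within the rule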
                if not rule_is_weak:
                    postings = defaultdict(list)
                    for pos, tid in enumerate(rule_token_ids):
                        if tid >= len_junk:
                            postings[tid].append(pos)
                    # OPTIMIZED: for speed and memory: convert postings to arrays
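                    # note: 'h' is a signed short (at least 16 bits), which
                    # presumes rule token positions stay below 32768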
                    postings = {
                        tid: array('h', value)
                        for tid, value in postings.items()
                    }
                    # OPTIMIZED: for speed, sparsify dict
                    sparsify(postings)
                    self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                    rule_token_ids, len_junk, len_good)

                # no set indexes for junk only rules
                if not rule_is_weak:
                    self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                    self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
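                # For illustration (hypothetical values): with NGRAM_LEN == 3,
                # ngrams([1, 2, 3, 4], ngram_length=3) would yield (1, 2, 3) and
                # (2, 3, 4); select_ngrams(..., with_pos=True) then yields a
                # selected subset of these as (start_position, ngram) pairs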
                if (USE_AHO_FRAGMENTS and rule.minimum_coverage < 100
                        and len(rule_token_ids) > NGRAM_LEN):
                    all_ngrams = tokenize.ngrams(rule_token_ids,
                                                 ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams,
                                                             with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # FIXME: this may not be updated for a rule that is created at
                # match time such as SPDX rules

                # update rule thresholds
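                # For illustration (hypothetical values): for rule_token_ids
                # [2, 9, 9, 12] with len_junk == 8, and assuming the set/multiset
                # counters return distinct vs. total token counts, this gives
                # low_unique=1, high_unique=2, low_length=1 and high_length=3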
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(
                    sorted([('file://' + rule.text_file) if rule.text_file else
                            ('text: ' + rule.stored_text) for rule in rules]))
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
Example #2
0
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: the rid is the index of the rule in this list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule token strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency.
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            rul_len = rul.length
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
                if rul_len > self.largest_false_positive_length:
                    self.largest_false_positive_length = rul_len
            elif rul.negative():
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
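        # For illustration (hypothetical values): the resulting mapping goes from
        # token string to temporary integer id, e.g. {'license': 0, 'gpl': 1}.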
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
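        # functools.partial pre-binds the target automaton, so the calls below
        # only need to pass the rule token ids and the rid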
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.false_positive:
                # FP rules are not used for any matching
                # there is nothing else for these rules
                self.false_positive_rid_by_hash[rule_hash] = rid
            else:
                # negative, small and regular

                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {
                    tid: array('h', value)
                    for tid, value in postings.items()
                }
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                    rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automatons...
                if rule.negative():
                    # ... with only the whole rule tokens sequence
                    negative_automaton_add(tids=rule_token_ids, rid=rid)
                else:
                    # ... or with the whole rule tokens sequence
                    rules_automaton_add(tids=rule_token_ids, rid=rid)
                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
                    if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(
                            rule_token_ids) > NGRAM_LEN:
                        all_ngrams = ngrams(rule_token_ids,
                                            ngram_length=NGRAM_LEN)
                        selected_ngrams = select_ngrams(all_ngrams,
                                                        with_pos=True)
                        for pos, ngram in selected_ngrams:
                            rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = tids_set_counter(rlow_set)
                rule.high_unique = tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = tids_multiset_counter(rlow_mset)
                rule.high_length = tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)
        sparsify(self.false_positive_rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules]
                               for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' +
                   u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True
Example #3
0
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: the rid is the index of the rule in this list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule token strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency.
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)

            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
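                # index_token_sets presumably partitions the rule token ids at
                # len_junk into a low ("junk") set and a high set, plus the
                # corresponding multisets of token occurrences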
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True