def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks, _spdx_tokens=None):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_ranked_tokens` is a callable returning common token strings ranked by
    frequency; it is used by `renumber_token_ids` to decide which token ids
    fall in the "junk" (low/common) range.

    `_spdx_tokens` if provided is a set of token strings from known SPDX
    keys: these receive a special treatment (they are force-added to the
    token dictionary and their ids are passed to the renumbering step).
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.is_false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.is_negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Add SPDX key tokens to the dictionary. Track which tokens are only
    # from SPDX keys (i.e. not seen in any rule text).
    ########################################################################
    spdx_tokens = None
    if _spdx_tokens:
        spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
        frequencies_by_token.update(_spdx_tokens)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    ########################################################################
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    # initial dictionary mapping to old/arbitrary token ids
    ########################################################################
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) integer ids
    ########################################################################
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

    # Get SPDX-only token ids
    ########################################################################
    spdx_token_ids = None
    if spdx_tokens:
        spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    # After renumbering, ids below `len_junk` are "junk" (common/low-value)
    # tokens and ids at or above it are "good" (discriminant) tokens.
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens, _spdx_token_ids=spdx_token_ids)
    (
        self.len_junk,
        self.dictionary,
        self.tokens_by_tid,
        self.tids_by_rid,
        self.weak_rids,
    ) = renumbered
    # local aliases for the same values, used in the hot loop below
    len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

    #######################################################################
    # build index structures
    #######################################################################
    self.len_good = len_good = len_tokens - len_junk
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicated rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        # weak rids come from the renumbering step above; per the comments
        # below these are the junk-only rules
        rule_is_weak = rid in weak_rids

        if rule.is_negative:
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            # FIXME: we do not want to keep small rules and rules that
            # cannot be seq matches in the index

            # no postings for junk only rules
            # we do not want to keep small rules and rules that
            # cannot be seq matches in the index
            if not rule_is_weak:
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)

            # no set indexes for junk only rules
            if not rule_is_weak:
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            if (USE_AHO_FRAGMENTS
                    and rule.minimum_coverage < 100
                    and len(rule_token_ids) > NGRAM_LEN):
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # FIXME: this may not be updated for a rule that is created at
            # match time such as SPDX rules

            # update rule thresholds
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    # fail loudly if any two rules hash to the same token sequence,
    # reporting all duplicate groups at once
    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [
            '\n'.join(
                sorted([
                    ('file://' + rule.text_file)
                    if rule.text_file
                    # for rules with no text file: use a text
                    else ('text: ' + rule.stored_text)
                    for rule in rules])
            )
            for rules in dupe_rules
        ]
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_ranked_tokens` is a callable returning common token strings ranked by
    frequency; it is used by `renumber_token_ids` to decide which token ids
    fall in the "junk" (low/common) range.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        rul_len = rul.length
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
            # keep track of the longest false positive rule seen so far
            if rul_len > self.largest_false_positive_length:
                self.largest_false_positive_length = rul_len
        elif rul.negative():
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

    # initial dictionary mapping to old/random token ids
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) random integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    # After renumbering, ids below `len_junk` are "junk" (common/low-value)
    # tokens and ids at or above it are "good" (discriminant) tokens.
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    # local aliases for the same values, used in the hot loop below
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicated rule texts
        rule_hash = index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.false_positive:
            # FP rules are not used for any matching
            # there is nothing else for these rules
            self.false_positive_rid_by_hash[rule_hash] = rid
        else:
            # negative, small and regular

            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automatons...
            if rule.negative():
                # ... with only the whole rule tokens sequence
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # ... or with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                    all_ngrams = ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds
            rule.low_unique = tids_set_counter(rlow_set)
            rule.high_unique = tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = tids_multiset_counter(rlow_mset)
            rule.high_length = tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)
    sparsify(self.false_positive_rid_by_hash)

    # fail loudly if any two rules hash to the same token sequence,
    # reporting all duplicate groups at once
    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        # NOTE(review): this assumes every rule has a non-None text_file;
        # a rule built from text only would make the '+' raise — confirm
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    self.optimized = True
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_ranked_tokens` is a callable returning common token strings ranked by
    frequency; it is used by `renumber_token_ids` to decide which token ids
    fall in the "junk" (low/common) range.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that tokens ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

    # initial dictionary mapping to old/random token ids
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) random integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    # After renumbering, ids below `len_junk` are "junk" (common/low-value)
    # tokens and ids at or above it are "good" (discriminant) tokens.
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    # local aliases for the same values, used in the hot loop below
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicated rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.negative:
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory: convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    # fail loudly if any two rules hash to the same token sequence,
    # reporting all duplicate groups at once
    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        # NOTE(review): this assumes every rule has a non-None text_file;
        # a rule built from text only would make the '+' raise — confirm
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    self.optimized = True