def test_select_ngrams_with_unicode_inputs(self):
    result = list(select_ngrams(x for x in [
        ('b', 'ä', 'c'),
        ('ä', 'ä', 'c'),
        ('e', 'ä', 'c'),
        ('b', 'f', 'ä'),
        ('g', 'c', 'd'),
    ]))
    expected = [
        ('b', 'ä', 'c'),
        ('ä', 'ä', 'c'),
        ('e', 'ä', 'c'),
        ('b', 'f', 'ä'),
        ('g', 'c', 'd'),
    ]
    assert result == expected
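# A hedged companion sketch, not part of the original suite: the indexing code
# further below consumes select_ngrams(..., with_pos=True) as (position, ngram)
# pairs. Assuming the same passthrough behavior as the test above (every ngram
# is kept), the positions should simply be the enumeration indexes.
def test_select_ngrams_with_pos_passthrough_sketch(self):
    input_ngrams = [
        ('b', 'ä', 'c'),
        ('ä', 'ä', 'c'),
        ('e', 'ä', 'c'),
        ('b', 'f', 'ä'),
        ('g', 'c', 'd'),
    ]
    result = list(select_ngrams(iter(input_ngrams), with_pos=True))
    # hypothetical expectation: each kept ngram is paired with its position
    assert result == list(enumerate(input_ngrams))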
def _add_rules(
    self,
    rules,
    _legalese=common_license_words,
    _spdx_tokens=frozenset(),
    _license_tokens=frozenset(),
):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_legalese` is a set of common license-specific words aka. legalese.
    `_spdx_tokens` is a set of token strings used in SPDX license identifiers.
    `_license_tokens` is a set of "license" tokens used as start or end of a rule.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # initial dictionary mapping for known legalese tokens
    ########################################################################
    # FIXME: we should start enumerating at 1 below: token ids then become
    # valid "unichr" values, making it easier downstream when used in
    # automatons
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(sorted(_legalese))
    }
    dictionary_get = dictionary.get

    self.len_legalese = len_legalese = len(dictionary)
    highest_tid = len_legalese - 1

    # Add SPDX key tokens to the dictionary. These are always treated as
    # non-legalese. This may seem weird, but they are detected in expressions
    # alright and some of their tokens exist as rules too (e.g. GPL)
    ########################################################################
    for sts in sorted(_spdx_tokens):
        stid = dictionary_get(sts)
        if stid is None:
            # we have a never yet seen token, so we assign a new token id
            highest_tid += 1
            stid = highest_tid
            dictionary[sts] = stid

    self.rules_by_rid = rules_by_rid = list(rules)
    # ensure that rules are sorted
    rules_by_rid.sort()
    len_rules = len(rules_by_rid)

    # create index data structures
    # OPTIMIZATION: bind frequently used methods to the local scope for
    # index structures
    ########################################################################
    tids_by_rid_append = self.tids_by_rid.append

    false_positive_rids_add = self.false_positive_rids.add
    regular_rids_add = self.regular_rids.add
    approx_matchable_rids_add = self.approx_matchable_rids.add

    # since we only use these for regular rules, these lists may be sparse:
    # their index is the rule rid
    self.high_postings_by_rid = high_postings_by_rid = [None] * len_rules
    self.sets_by_rid = sets_by_rid = [None] * len_rules
    self.msets_by_rid = msets_by_rid = [None] * len_rules

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # create a set of known "license" words used to determine if a rule
    # starts or ends with a "license" word/token
    ########################################################################
    license_tokens = set()
    for t in _license_tokens:
        tid = dictionary_get(t)
        if tid is not None:
            license_tokens.add(tid)

    rules_automaton_add = partial(
        match_aho.add_sequence,
        automaton=self.rules_automaton,
        with_duplicates=False,
    )

    if USE_AHO_FRAGMENTS:
        fragments_automaton_add = partial(
            match_aho.add_sequence,
            automaton=self.fragments_automaton,
            with_duplicates=True,
        )

    if USE_RULE_STARTS:
        starts_automaton_add_start = partial(
            match_aho.add_start,
            automaton=self.starts_automaton,
        )

    # OPTIMIZED: bind frequently used objects to local scope
    rid_by_hash = self.rid_by_hash
    match_hash_index_hash = match_hash.index_hash
    match_set_tids_set_counter = match_set.tids_set_counter
    match_set_multiset_counter = match_set.multiset_counter

    len_starts = SMALL_RULE
    min_len_starts = SMALL_RULE * 6

    ngram_len = AHO_FRAGMENTS_NGRAM_LEN

    # Index each rule
    ########################################################################
    for rid, rule in enumerate(rules_by_rid):

        # assign rid
        rule.rid = rid

        rule_token_ids = array('h', [])
        tids_by_rid_append(rule_token_ids)
        rule_token_ids_append = rule_token_ids.append

        rule_tokens = []
        rule_tokens_append = rule_tokens.append

        # A rule is weak if it does not contain at least one legalese word:
        # we consider all rules to be weak until proven otherwise below.
        # "weak" rules can only be matched with an automaton.
        is_weak = True

        for rts in rule.tokens():
            rule_tokens_append(rts)
            rtid = dictionary_get(rts)
            if rtid is None:
                # we have a never yet seen token, so we assign a new token id
                # note: we could use the length of the dictionary instead
                highest_tid += 1
                rtid = highest_tid
                dictionary[rts] = rtid
            if is_weak and rtid < len_legalese:
                is_weak = False
            rule_token_ids_append(rtid)

        rule_length = rule.length
        is_tiny = rule_length < TINY_RULE

        # build hashes index and check for duplicate rule texts
        rule_hash = match_hash_index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        ####################
        # populate automaton with the whole rule tokens sequence, for all
        # RULEs, be they "standard"/regular, weak, false positive or small
        ####################
        rules_automaton_add(tids=rule_token_ids, rid=rid)

        if rule.is_false_positive:
            # False positive rules do not participate in the set or sequence
            # matching at all: they are used for exact matching and in post-
            # matching filtering
            false_positive_rids_add(rid)
            continue

        # from now on, we have regular rules
        rid_by_hash[rule_hash] = rid
        regular_rids_add(rid)

        # Does the rule start or end with a "license" word? We track this
        # to help disambiguate some overlapping false positive short rules
        # OPTIMIZED: the last rtid above IS the last token id
        if license_tokens:
            if rtid in license_tokens:
                rule.ends_with_license = True
            if rule_token_ids[0] in license_tokens:
                rule.starts_with_license = True

        # populate the unknown_automaton: this only makes sense for rules
        # that are also sequence matchable.
        ####################
        match_unknown.add_ngrams(
            automaton=self.unknown_automaton,
            tids=rule_token_ids,
            tokens=rule_tokens,
            len_legalese=len_legalese,
            rule_length=rule_length,
        )

        # Some rules cannot be matched as a sequence: "weak" rules, or rules
        # that must be matched only as a continuous sequence of tokens. This
        # includes tiny, is_continuous or is_license_reference rules. We skip
        # adding these to the data structures used for sequence matching.
        can_match_as_sequence = not (
            is_weak
            or is_tiny
            or rule.is_continuous
            or (rule.is_small and (rule.is_license_reference or rule.is_license_tag))
        )

        if can_match_as_sequence:
            approx_matchable_rids_add(rid)

            ####################
            # update high postings: positions by high tids used to
            # speed up sequence matching
            ####################
            # no postings for rules that cannot be matched as a sequence (too short and weak)
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid < len_legalese:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory, convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            high_postings_by_rid[rid] = postings

            ####################
            # ... and ngram fragments: compute ngrams and populate an automaton with ngrams
            ####################
            if (USE_AHO_FRAGMENTS
                and rule.minimum_coverage < 100
                and rule_length > ngram_len
            ):
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=ngram_len)
                all_ngrams_with_pos = tokenize.select_ngrams(all_ngrams, with_pos=True)
                # all_ngrams_with_pos = enumerate(all_ngrams)
                for pos, ngram in all_ngrams_with_pos:
                    fragments_automaton_add(tids=ngram, rid=rid, start=pos)

            ####################
            # use the start and end of this rule as a break point for query runs
            ####################
            if USE_RULE_STARTS and rule_length > min_len_starts:
                starts_automaton_add_start(
                    tids=rule_token_ids[:len_starts],
                    rule_identifier=rule.identifier,
                    rule_length=rule_length,
                )

        ####################
        # build sets and multisets indexes, for all regular rules as we need
        # the thresholds
        ####################
        tids_set, mset = match_set.build_set_and_mset(
            rule_token_ids, _use_bigrams=USE_BIGRAM_MULTISETS)
        sets_by_rid[rid] = tids_set
        msets_by_rid[rid] = mset

        ####################################################################
        ####################################################################
        # FIXME!!!!!!! we should store them: we need them and we recompute
        # them later at match time
        tids_set_high = match_set.high_tids_set_subset(
            tids_set, len_legalese)
        mset_high = match_set.high_multiset_subset(
            mset, len_legalese, _use_bigrams=USE_BIGRAM_MULTISETS)
        # FIXME!!!!!!!
        ####################################################################
        ####################################################################

        ####################
        # update rule thresholds
        ####################
        rule.length_unique = match_set_tids_set_counter(tids_set)
        rule.high_length_unique = match_set_tids_set_counter(tids_set_high)
        rule.high_length = match_set_multiset_counter(mset_high)
        rule.compute_thresholds()

    ########################################################################
    # Finalize index data structures
    ########################################################################

    # Create the tid -> token string lookup structure.
    ########################################################################
    self.tokens_by_tid = tokens_by_tid = [
        ts for ts, _tid in sorted(dictionary.items(), key=itemgetter(1))]
    self.len_tokens = len_tokens = len(tokens_by_tid)

    # some tokens are made entirely of digits and these can create some
    # worst case behavior when there are long runs of these
    ########################################################################
    self.digit_only_tids = intbitset([
        i for i, s in enumerate(self.tokens_by_tid) if s.isdigit()])

    # Finalize automatons
    ########################################################################
    self.rules_automaton.make_automaton()
    if USE_AHO_FRAGMENTS:
        self.fragments_automaton.make_automaton()
    if USE_RULE_STARTS:
        match_aho.finalize_starts(self.starts_automaton)
    self.unknown_automaton.make_automaton()

    ########################################################################
    # Do some sanity checks
    ########################################################################
    msg = 'Inconsistent structure lengths'
    assert len_tokens == highest_tid + 1 == len(dictionary), msg

    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [
            '\n'.join(
                sorted([
                    ('file://' + rule.text_file)
                    if rule.text_file
                    else ('text: ' + rule.stored_text)
                    for rule in rules])
            )
            for rules in dupe_rules
        ]
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
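# A minimal, self-contained sketch of the token-id scheme used by the
# _add_rules variant above: sorted legalese tokens occupy the lowest ids,
# every other token gets the next free id, and a rule is "weak" when none of
# its token ids falls below len_legalese. The helper name build_token_ids is
# hypothetical and is not part of the index; it only restates the pattern.
def build_token_ids(legalese, rule_tokens):
    dictionary = {ts: tid for tid, ts in enumerate(sorted(legalese))}
    len_legalese = len(dictionary)
    highest_tid = len_legalese - 1
    tids = []
    for tok in rule_tokens:
        tid = dictionary.get(tok)
        if tid is None:
            # never seen token: assign the next free id, above all legalese ids
            highest_tid += 1
            tid = highest_tid
            dictionary[tok] = tid
        tids.append(tid)
    # weak: the rule contains no legalese token at all
    is_weak = all(tid >= len_legalese for tid in tids)
    return dictionary, tids, is_weak

# e.g. build_token_ids({'license', 'gpl'}, ['this', 'license', 'text'])
# -> ({'gpl': 0, 'license': 1, 'this': 2, 'text': 3}, [2, 1, 3], False)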
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks, _spdx_tokens=None):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.

    `_spdx_tokens` if provided is a set of token strings from known SPDX
    keys: these receive special treatment.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.is_false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.is_negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Add SPDX key tokens to the dictionary. Track which are only from SPDX keys
    ########################################################################
    spdx_tokens = None
    if _spdx_tokens:
        spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
        frequencies_by_token.update(_spdx_tokens)

    # Create the tokens lookup structure at once. Note that token ids are
    # assigned arbitrarily here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    ########################################################################
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
    assert len_tokens <= MAX_TOKENS, msg

    # initial dictionary mapping to old/arbitrary token ids
    ########################################################################
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(tokens_by_tid)
    }
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) integer ids
    ########################################################################
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                        for rule_tok in token_strings_by_rid]

    # Get SPDX-only token ids
    ########################################################################
    spdx_token_ids = None
    if spdx_tokens:
        spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(
        frequencies_by_tid, _ranked_tokens, _spdx_token_ids=spdx_token_ids)
    (
        self.len_junk,
        self.dictionary,
        self.tokens_by_tid,
        self.tids_by_rid,
        self.weak_rids,
    ) = renumbered
    len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

    #######################################################################
    # build index structures
    #######################################################################
    self.len_good = len_good = len_tokens - len_junk
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse:
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence,
                                     automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence,
                                  automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicate rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        rule_is_weak = rid in weak_rids

        if rule.is_negative:
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            # FIXME: we do not want to keep small rules and rules that
            # cannot be seq matched in the index

            # no postings for junk-only rules
            if not rule_is_weak:
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory, convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                rule_token_ids, len_junk, len_good)

            # no set indexes for junk-only rules
            if not rule_is_weak:
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            if (USE_AHO_FRAGMENTS
                and rule.minimum_coverage < 100
                and len(rule_token_ids) > NGRAM_LEN):
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # FIXME: this may not be updated for a rule that is created at
            # match time such as SPDX rules

            # update rule thresholds
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    dupe_rules = [
        rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
    ]
    if dupe_rules:
        dupe_rule_paths = [
            '\n'.join(
                sorted([
                    ('file://' + rule.text_file)
                    if rule.text_file
                    else ('text: ' + rule.stored_text)
                    for rule in rules]))
            for rules in dupe_rules
        ]
        msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
        raise AssertionError(msg)

    self.optimized = True
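# A self-contained sketch of the high-postings structure built in the loop
# above: for one rule's token-id sequence, keep the positions of each "high"
# token (tid >= len_junk) as compact 16-bit arrays. The helper name
# high_postings is hypothetical; the real code stores the same mapping in
# self.high_postings_by_rid[rid].
from array import array
from collections import defaultdict

def high_postings(rule_token_ids, len_junk):
    postings = defaultdict(list)
    for pos, tid in enumerate(rule_token_ids):
        if tid >= len_junk:
            postings[tid].append(pos)
    # convert to arrays, as the indexing loop does for speed and memory
    return {tid: array('h', positions) for tid, positions in postings.items()}

# e.g. high_postings([5, 9, 5, 2, 9], len_junk=4)
# -> {5: array('h', [0, 2]), 9: array('h', [1, 4])}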
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        rul_len = rul.length
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
            if rul_len > self.largest_false_positive_length:
                self.largest_false_positive_length = rul_len
        elif rul.negative():
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that token ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, (
        'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS)

    # initial dictionary mapping to old/random token ids
    self.dictionary = dictionary = {
        ts: tid for tid, ts in enumerate(tokens_by_tid)
    }
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) random integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                        for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse:
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence,
                                     automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence,
                                  automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicate rule texts
        rule_hash = index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.false_positive:
            # FP rules are not used for any matching
            # there is nothing else for these rules
            self.false_positive_rid_by_hash[rule_hash] = rid
        else:
            # negative, small and regular rules
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory, convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automatons...
            if rule.negative():
                # ... with only the whole rule tokens sequence
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # ... or with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if (USE_AHO_FRAGMENTS
                    and rule.minimum_coverage < 100
                    and len(rule_token_ids) > NGRAM_LEN):
                    all_ngrams = ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds
            rule.low_unique = tids_set_counter(rlow_set)
            rule.high_unique = tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = tids_multiset_counter(rlow_mset)
            rule.high_length = tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)
    sparsify(self.false_positive_rid_by_hash)

    dupe_rules = [
        rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
    ]
    if dupe_rules:
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules]
                           for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    self.optimized = True
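# A hedged, self-contained sketch of the fragment ngrams fed to the automaton
# above: plain sliding-window ngrams over a rule's token ids, of which
# select_ngrams keeps a subset together with their positions. The helper name
# sliding_ngrams is hypothetical; tokenize.ngrams is assumed to behave this way.
def sliding_ngrams(token_ids, ngram_length):
    return [tuple(token_ids[i:i + ngram_length])
            for i in range(len(token_ids) - ngram_length + 1)]

# e.g. sliding_ngrams([1, 2, 3, 4], 3) -> [(1, 2, 3), (2, 3, 4)]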
def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
    """
    Add a list of Rule objects to the index and construct optimized and
    immutable index structures.
    """
    if self.optimized:
        raise Exception('Index has been optimized and cannot be updated.')

    # this assigns the rule ids implicitly: this is the index in the list
    self.rules_by_rid = list(rules)

    #######################################################################
    # classify rules, collect tokens and frequencies
    #######################################################################
    # accumulate all rule tokens strings. This is used only during indexing
    token_strings_by_rid = []
    # collect the unique token strings and compute their global frequency
    # This is used only during indexing
    frequencies_by_token = Counter()

    for rid, rul in enumerate(self.rules_by_rid):
        rul_tokens = list(rul.tokens())
        token_strings_by_rid.append(rul_tokens)
        frequencies_by_token.update(rul_tokens)
        # assign the rid to the rule object for sanity
        rul.rid = rid

        # classify rules and build disjuncted sets of rids
        if rul.false_positive:
            # false positive rules do not participate in the matches at all
            # they are used only in post-matching filtering
            self.false_positive_rids.add(rid)
        elif rul.negative:
            # negative rules are matched early and their exactly matched
            # tokens are removed from the token stream
            self.negative_rids.add(rid)
        elif rul.small():
            # small rules are best matched with a specialized approach
            self.small_rids.add(rid)
        else:
            # regular rules are matched using a common approach
            self.regular_rids.add(rid)

    # Create the tokens lookup structure at once. Note that token ids are
    # assigned randomly here at first by unzipping: we get the frequencies
    # and tokens->id at once this way
    tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
    self.tokens_by_tid = tokens_by_tid
    self.len_tokens = len_tokens = len(tokens_by_tid)
    assert len_tokens <= MAX_TOKENS, (
        'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS)

    # initial dictionary mapping to old/random token ids
    self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
    sparsify(dictionary)

    # replace token strings with arbitrary (and temporary) random integer ids
    self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                        for rule_tok in token_strings_by_rid]

    #######################################################################
    # renumber token ids based on frequencies and common words
    #######################################################################
    renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
    self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
    len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
    self.len_good = len_good = len_tokens - len_junk

    #######################################################################
    # build index structures
    #######################################################################
    len_rules = len(self.rules_by_rid)

    # since we only use these for regular rules, these lists may be sparse:
    # their index is the rule rid
    self.high_postings_by_rid = [None for _ in range(len_rules)]
    self.tids_sets_by_rid = [None for _ in range(len_rules)]
    self.tids_msets_by_rid = [None for _ in range(len_rules)]

    # track all duplicate rules: fail and report dupes at once at the end
    dupe_rules_by_hash = defaultdict(list)

    # build closures for methods that populate automatons
    negative_automaton_add = partial(match_aho.add_sequence,
                                     automaton=self.negative_automaton)
    rules_automaton_add = partial(match_aho.add_sequence,
                                  automaton=self.rules_automaton)

    # build by-rule index structures over the token ids seq of each rule
    for rid, rule_token_ids in enumerate(tids_by_rid):
        rule = self.rules_by_rid[rid]

        # build hashes index and check for duplicate rule texts
        rule_hash = match_hash.index_hash(rule_token_ids)
        dupe_rules_by_hash[rule_hash].append(rule)

        if rule.negative:
            negative_automaton_add(tids=rule_token_ids, rid=rid)
        else:
            # update hashes index
            self.rid_by_hash[rule_hash] = rid

            # update high postings index: positions by high tids
            # TODO: this could be optimized with a group_by
            postings = defaultdict(list)
            for pos, tid in enumerate(rule_token_ids):
                if tid >= len_junk:
                    postings[tid].append(pos)
            # OPTIMIZED: for speed and memory, convert postings to arrays
            postings = {tid: array('h', value) for tid, value in postings.items()}
            # OPTIMIZED: for speed, sparsify dict
            sparsify(postings)
            self.high_postings_by_rid[rid] = postings

            # build high and low tids sets and multisets
            rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                rule_token_ids, len_junk, len_good)
            self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
            self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

            # populate automaton with the whole rule tokens sequence
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            # ... and ngrams: compute ngrams and populate the automaton with ngrams
            if (USE_AHO_FRAGMENTS
                and rule.minimum_coverage < 100
                and len(rule_token_ids) > NGRAM_LEN):
                all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                for pos, ngram in selected_ngrams:
                    rules_automaton_add(tids=ngram, rid=rid, start=pos)

            # update rule thresholds
            rule.low_unique = match_set.tids_set_counter(rlow_set)
            rule.high_unique = match_set.tids_set_counter(rhigh_set)
            rule.length_unique = rule.high_unique + rule.low_unique
            rule.low_length = match_set.tids_multiset_counter(rlow_mset)
            rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
            assert rule.length == rule.low_length + rule.high_length

    # finalize automatons
    self.negative_automaton.make_automaton()
    self.rules_automaton.make_automaton()

    # sparser dicts for faster lookup
    sparsify(self.rid_by_hash)

    dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
    if dupe_rules:
        dupe_rule_paths = [['file://' + rule.text_file for rule in rules]
                           for rules in dupe_rules]
        msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
        raise AssertionError(msg)

    self.optimized = True
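# A self-contained sketch of the duplicate-rule check shared by every
# _add_rules variant above: rules whose token-id sequences hash identically
# are grouped, and any group with more than one rule is reported in a single
# AssertionError at the end. find_duplicate_rules and its hash_func parameter
# are hypothetical stand-ins; the real code uses match_hash.index_hash and
# Rule objects instead of names.
from collections import defaultdict

def find_duplicate_rules(token_ids_by_rule, hash_func=tuple):
    dupes_by_hash = defaultdict(list)
    for rule_name, token_ids in token_ids_by_rule.items():
        dupes_by_hash[hash_func(token_ids)].append(rule_name)
    return [names for names in dupes_by_hash.values() if len(names) > 1]

# e.g. find_duplicate_rules({'a.RULE': [1, 2, 3], 'b.RULE': [1, 2, 3], 'c.RULE': [4]})
# -> [['a.RULE', 'b.RULE']]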