def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks, _spdx_tokens=None):
    """
    Initialize the index with an iterable of Rule objects.
    """
    # total number of unique known tokens
    self.len_tokens = 0

    # largest token ID for a "junk" token. A token with a smaller id than
    # len_junk is considered a "junk", very common token
    self.len_junk = 0
    # corresponding number of non-junk tokens: len_tokens = len_junk + len_good
    self.len_good = 0

    # mapping of token string -> token id
    self.dictionary = {}

    # mapping of token id -> token string as a list where the index is the
    # token id and the value the actual token string.
    # This is the reverse of the dictionary.
    self.tokens_by_tid = []

    # Note: all the following are list-based mappings of rid -> data:
    # lists of data where the index is the rule id.

    # rule objects proper
    self.rules_by_rid = []

    # token_id sequences
    self.tids_by_rid = []

    # mapping of rule_id -> (mapping of token_id -> [positions, ...])
    # We track only high/good tokens there. This is a "traditional"
    # positional inverted index
    self.high_postings_by_rid = []

    # mapping of rule_id -> tuple of low and high token ids sets/multisets
    # (low_tids_set, high_tids_set)
    self.tids_sets_by_rid = []
    # (low_tids_mset, high_tids_mset)
    self.tids_msets_by_rid = []

    # mapping of hash -> single rid: duplicated rules are not allowed
    self.rid_by_hash = {}

    # Aho-Corasick automatons for negative and small rules
    self.rules_automaton = match_aho.get_automaton()
    self.negative_automaton = match_aho.get_automaton()

    # disjunctive sets of rule ids: regular, negative, small, false positive
    self.regular_rids = set()
    self.negative_rids = set()
    self.small_rids = set()
    self.false_positive_rids = set()

    # these rule ids are for rules made entirely of low, junk tokens
    self.weak_rids = set()

    # length of the largest false_positive rule
    self.largest_false_positive_length = 0

    # if True the index has been optimized and becomes read-only:
    # no new rules can be added
    self.optimized = False

    if rules:
        if TRACE_INDEXING_PERF:
            start = time()
            print('LicenseIndex: building index.')

        # index all and optimize
        self._add_rules(rules, _ranked_tokens, _spdx_tokens)

        if TRACE_INDEXING_PERF:
            duration = time() - start
            len_rules = len(self.rules_by_rid)
            print(
                'LicenseIndex: built index with %(len_rules)d rules '
                'in %(duration)f seconds.' % locals())
            self._print_index_stats()
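
# Illustrative sketch (not part of the index): the two-way token mapping
# described in the comments above. ``dictionary`` maps a token string to its
# integer token id, and ``tokens_by_tid`` is the reverse, a list indexed by
# token id. The token strings and ids below are made up for the example.
_example_dictionary = {'license': 0, 'mit': 1, 'permission': 2}
_example_tokens_by_tid = ['license', 'mit', 'permission']
assert all(
    _example_tokens_by_tid[tid] == token
    for token, tid in _example_dictionary.items())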
def __init__(
    self,
    rules=None,
    _legalese=common_license_words,
    _spdx_tokens=frozenset(),
    _license_tokens=frozenset(),
):
    """
    Initialize the index with an iterable of Rule objects.
    ``_legalese`` is a set of common license-specific words aka legalese.
    ``_spdx_tokens`` is a set of tokens used in SPDX license identifiers.
    ``_license_tokens`` is a set of "license" tokens used as start or end of a rule.
    """
    # total number of unique known tokens
    self.len_tokens = 0

    # largest token ID for a "legalese" token. A token with a larger id than
    # len_legalese is considered a "junk", very common token
    self.len_legalese = 0

    # mapping of token string -> token id
    self.dictionary = {}

    # set of token ids made entirely of digits
    self.digit_only_tids = set()

    # mapping-like of token id -> token string as a list where the index is
    # the token id and the value the actual token string.
    # This is the reverse of the dictionary.
    self.tokens_by_tid = []

    # Note: all the following are list-based mappings of rid -> data:
    # lists of data where the index is the rule id.

    # mapping-like of rule_id -> rule objects proper
    self.rules_by_rid = []

    # mapping-like of rule_id -> sequence of token_ids
    self.tids_by_rid = []

    # mapping-like of rule_id -> (mapping of token_id -> [positions, ...])
    # We track only high/good tokens there. This is a "traditional"
    # inverted index postings list
    self.high_postings_by_rid = []

    # mapping-like of rule_id -> token ids sets/multisets
    self.sets_by_rid = []
    self.msets_by_rid = []

    # mapping of hash -> single rid for hash match: duplicated rules are not allowed
    self.rid_by_hash = {}

    # Aho-Corasick automatons for regular rules and experimental fragments
    self.rules_automaton = match_aho.get_automaton()
    self.fragments_automaton = USE_AHO_FRAGMENTS and match_aho.get_automaton()
    self.starts_automaton = USE_RULE_STARTS and match_aho.get_automaton()
    self.unknown_automaton = match_unknown.get_automaton()

    # disjunctive sets of rule ids: regular and false positive
    # TODO: consider using intbitset instead
    self.regular_rids = set()
    self.false_positive_rids = set()

    # These rule ids are for rules that can be matched with a sequence
    # match. Other rules can only be matched exactly.
    self.approx_matchable_rids = set()

    # if True the index has been optimized and becomes read-only:
    # no new rules can be added
    self.optimized = False

    if rules:
        if TRACE_INDEXING_PERF:
            start = time()
            logger_debug('LicenseIndex: building index.')

        # index all and optimize
        self._add_rules(
            rules,
            _legalese=_legalese,
            _spdx_tokens=_spdx_tokens,
            _license_tokens=_license_tokens,
        )

        if TRACE_TOKEN_DOC_FREQ:
            logger_debug('LicenseIndex: token, frequency')
            from itertools import chain
            # token document frequencies across regular rules
            tf = Counter(chain.from_iterable(
                tids for rid, tids in enumerate(self.tids_by_rid)
                if rid in self.regular_rids))

        if TRACE_INDEXING_PERF:
            duration = time() - start
            len_rules = len(self.rules_by_rid)
            logger_debug(
                'LicenseIndex: built index with %(len_rules)d rules in '
                '%(duration)f seconds.' % locals())
            self._print_index_stats()
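
# Illustrative sketch of Aho-Corasick multi-pattern matching, assuming the
# automatons above wrap the pyahocorasick library (an assumption for this
# example; the real ``match_aho.get_automaton()`` may configure it
# differently). Each rule text is added as a pattern, then a single pass over
# the query text reports every pattern found and the index where it ends.
import ahocorasick

_automaton = ahocorasick.Automaton()
_automaton.add_word('gpl 2 0', 'rule-gpl-2.0')
_automaton.add_word('mit license', 'rule-mit')
_automaton.make_automaton()

_query = 'this package is released under the mit license'
_matches = list(_automaton.iter(_query))
_end = _query.index('mit license') + len('mit license') - 1
assert _matches == [(_end, 'rule-mit')]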
def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
    """
    Initialize the index with an iterable of Rule objects.
    """
    # total number of unique known tokens
    self.len_tokens = 0

    # largest token ID for a "junk" token. A token with a smaller id than
    # len_junk is considered a "junk", very common token
    self.len_junk = 0
    # corresponding number of non-junk tokens: len_tokens = len_junk + len_good
    self.len_good = 0

    # mapping of token string -> token id
    self.dictionary = {}

    # mapping of token id -> token string as a list where the index is the
    # token id and the value the actual token string
    self.tokens_by_tid = []

    # Note: all the following are list-based mappings of rid -> data:
    # lists of data where the index is the rule id.

    # rule objects proper
    self.rules_by_rid = []

    # token_id sequences
    self.tids_by_rid = []

    # mapping of rule_id -> (mapping of token_id -> [positions, ...])
    # We track only high/good tokens there. This is a "traditional"
    # positional inverted index
    self.high_postings_by_rid = []

    # mapping of rule_id -> tuple of low and high token ids sets/multisets
    # (low_tids_set, high_tids_set)
    self.tids_sets_by_rid = []
    # (low_tids_mset, high_tids_mset)
    self.tids_msets_by_rid = []

    # mapping of hash -> single rid: duplicated rules are not allowed
    self.rid_by_hash = {}

    # Aho-Corasick automatons for negative and small rules
    self.rules_automaton = match_aho.get_automaton()
    self.negative_automaton = match_aho.get_automaton()

    # disjunctive sets of rule ids: regular, negative, small, false positive
    self.regular_rids = set()
    self.negative_rids = set()
    self.small_rids = set()
    self.false_positive_rids = set()

    # length of the largest false_positive rule
    self.largest_false_positive_length = 0

    # if True the index has been optimized and becomes read-only:
    # no new rules can be added
    self.optimized = False

    if rules:
        if TRACE_INDEXING_PERF:
            start = time()
            print('LicenseIndex: building index.')

        # index all and optimize
        self._add_rules(rules, _ranked_tokens)

        if TRACE_INDEXING_PERF:
            duration = time() - start
            len_rules = len(self.rules_by_rid)
            print(
                'LicenseIndex: built index with %(len_rules)d rules '
                'in %(duration)f seconds.' % locals())
            self._print_index_stats()
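
# Illustrative sketch (an assumption, not the real matching code) of how the
# token id sets and multisets stored per rule above can be compared against a
# query: set intersection yields the distinct shared token ids, while Counter
# (multiset) intersection also accounts for repeated occurrences.
from collections import Counter

_rule_tids = [4, 9, 9, 17, 4]
_query_tids = [9, 4, 21, 9]
assert set(_rule_tids) & set(_query_tids) == {4, 9}
assert Counter(_rule_tids) & Counter(_query_tids) == Counter({9: 2, 4: 1})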