Example #1
def index_token_sets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of low & high tids sets, low & high tids multisets given a
    token_ids sequence.
    """
    # For multisets, we use a defaultdict, rather than a Counter. This is mildly
    # faster than a Counter for sparse sets.

    # this variant uses intbitset to evaluate its performance with respect to bitarray

    low_tids_set = intbitset(len_junk)
    low_tids_set_add = low_tids_set.add
    high_tids_set = intbitset(len_good)
    high_tids_set_add = high_tids_set.add
    low_tids_mset = defaultdict(int)
    high_tids_mset = defaultdict(int)
    for tid in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if tid < 0:
            continue
        if tid < len_junk:
            low_tids_mset[tid] += 1
            low_tids_set_add(tid)
        else:
            high_tids_mset[tid] += 1
            high_tids_set_add(tid)

    # sparsify for speed
    sparsify(low_tids_mset)
    sparsify(high_tids_mset)
    return low_tids_set, high_tids_set, low_tids_mset, high_tids_mset
def index_token_bitsets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of low & high tids sets, low & high tids multisets given a
    token_ids sequence.
    """
    # For multisets, we use a defaultdict rather than a Counter. This is mildly faster
    # than a Counter for the common case of rather sparse sets.

    tids_set = bitarray([0] * (len_good + len_junk))
    low_tids_mset = defaultdict(int)
    high_tids_mset = defaultdict(int)
    for tid in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if tid < 0:
            continue
        tids_set[tid] = True
        if tid < len_junk:
            low_tids_mset[tid] += 1
        else:
            high_tids_mset[tid] += 1

    # sparsify for speed
    sparsify(low_tids_mset)
    sparsify(high_tids_mset)

    low_tids_set = tids_set[:len_junk]
    high_tids_set = tids_set[len_junk:]
    return low_tids_set, high_tids_set, low_tids_mset, high_tids_mset
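The two variants above implement the same low/high split over token ids. Below is a minimal, hedged usage sketch of that technique using plain Python sets and dicts in place of intbitset and bitarray (the token ids and the len_junk boundary are illustrative, not taken from the project):

from collections import defaultdict

def split_low_high(token_ids, len_junk):
    # token ids below len_junk are "low"/junk; the rest are "high"/good
    low_set, high_set = set(), set()
    low_mset, high_mset = defaultdict(int), defaultdict(int)
    for tid in token_ids:
        if tid < 0:
            # skip unknown or already-matched tokens marked with -1
            continue
        if tid < len_junk:
            low_set.add(tid)
            low_mset[tid] += 1
        else:
            high_set.add(tid)
            high_mset[tid] += 1
    return low_set, high_set, low_mset, high_mset

low_s, high_s, low_m, high_m = split_low_high([3, 7, 3, -1, 25, 7], len_junk=10)
assert low_s == {3, 7} and high_s == {25}
assert low_m[3] == 2 and high_m[25] == 1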
Example #4
 def loads(saved):
     """
     Return a LicenseIndex from a pickled string.
     """
     idx = cPickle.loads(saved)
     # perform some optimizations on the dictionaries
     sparsify(idx.dictionary)
     return idx
Example #5
 def loads(saved):
     """
     Return a LicenseIndex from a pickled string.
     """
     idx = cPickle.loads(saved)
     # perform some optimizations on the dictionaries
     sparsify(idx.dictionary)
     return idx
 def load(fn, fast=True):
     """
     Return a LicenseIndex loaded from the `fn` file-like object pickled index.
     """
     pickler = cPickle if fast else pickle
     idx = pickler.load(fn)
     # perform some optimizations on the dictionaries
     sparsify(idx.dictionary)
     return idx
 def loads(saved, fast=True):
     """
     Return a LicenseIndex from a pickled string.
     """
     pickler = cPickle if fast else pickle
     idx = pickler.loads(saved)
     # perform some optimizations on the dictionaries
     sparsify(idx.dictionary)
     return idx
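A short, hedged sketch of the fast/slow pickler selection used in load() and loads() above, written so it runs on both Python 2 and 3 (on Python 3, cPickle no longer exists as a separate module because pickle already uses the C implementation):

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3: pickle is already C-accelerated
import pickle

def loads_sketch(saved, fast=True):
    # pick the faster pickler when requested, mirroring the pattern above
    pickler = cPickle if fast else pickle
    return pickler.loads(saved)

saved = pickle.dumps({'mit': 0, 'apache-2.0': 1}, protocol=2)
assert loads_sketch(saved)['mit'] == 0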
Example #8
def build_set_and_tids_mset(token_ids):
    """
    Return a tuple of (tids set, multiset) given a `token_ids` tids
    sequence.
    """
    tids_mset = defaultdict(int)

    for tid in token_ids:
        # this skips already matched token ids that are -1
        if tid == -1:
            continue
        tids_mset[tid] += 1
    # OPTIMIZED: sparsify for speed
    sparsify(tids_mset)

    tids_set = intbitset(tids_mset.keys())

    return tids_set, tids_mset
Example #9
def build_set_and_bigrams_mset(token_ids):
    """
    Return a tuple of (tids set, bigrams multiset) given a `token_ids` tids
    sequence.
    """
    tids_set = intbitset()
    bigrams_mset = defaultdict(int)

    for bigram in ngrams(token_ids, 2):
        # this skips already matched token ids that are -1
        if -1 in bigram:
            continue
        bigrams_mset[bigram] += 1
        tids_set.update(bigram)

    # OPTIMIZED: sparsify for speed
    sparsify(bigrams_mset)

    return tids_set, bigrams_mset
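A hedged, self-contained sketch of the bigram multiset built above, with a simple stand-in for the project's ngrams() helper and plain containers instead of intbitset (values are illustrative):

from collections import defaultdict

def ngrams(iterable, ngram_length):
    # stand-in: yield overlapping tuples of ngram_length consecutive items
    items = list(iterable)
    for i in range(len(items) - ngram_length + 1):
        yield tuple(items[i:i + ngram_length])

token_ids = [4, 9, 4, -1, 9, 4]
tids_set = set()
bigrams_mset = defaultdict(int)
for bigram in ngrams(token_ids, 2):
    if -1 in bigram:
        # skip bigrams touching already-matched (-1) positions
        continue
    bigrams_mset[bigram] += 1
    tids_set.update(bigram)

assert bigrams_mset[(4, 9)] == 1 and bigrams_mset[(9, 4)] == 2
assert tids_set == {4, 9}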
    def dumps(self, fast=True):
        """
        Return a pickled string of self.
        """
        # here cPickle fails when we load it back. Pickle is slower to write but
        # works when we read with cPickle :|
        pickler = cPickle if fast else pickle
        pickled = pickler.dumps(self, protocol=cPickle.HIGHEST_PROTOCOL)

        # NB: this is what makes the usage of cPickle possible... as a weird workaround.
        # The gain from dumping with cPickle is not as big with this optimization,
        # but it is still much faster than using the plain pickle module.
        # TODO: revisit me after the Python3 port
        import pickletools
        pickletools.code2op = sparsify(pickletools.code2op)
        pickled = pickletools.optimize(pickled)
        return pickled
    def loads(saved):
        """
        Return a LicenseIndex from a pickled string.
        """
        idx = cPickle.loads(saved)

        # perform some optimizations on dictionaries
        sparsify(idx.dictionary)
        for post in idx.postings_by_rid:
            sparsify(post)
        for start in idx.start_ngrams_by_rid:
            sparsify(start)

        return idx
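A small, hedged illustration of the pickletools.optimize() step used in dumps() above: it eliminates unused PUT opcodes, yielding a smaller pickle that still round-trips (the payload here is a made-up stand-in for an index object):

import pickle
import pickletools

payload = {'dictionary': {'license': 0, 'mit': 1}, 'len_tokens': 2}
pickled = pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)
optimized = pickletools.optimize(pickled)  # drop unused PUT opcodes
assert pickle.loads(optimized) == payload
print(len(pickled), len(optimized))  # the optimized pickle is typically smaller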
    def _add_rules(self, rules, _legalese=common_license_words, _spdx_tokens=frozenset()):
        """
        Add a list of Rule objects to the index and constructs optimized and
        immutable index structures.

        `_legalese` is a set of common license-specific words aka. legalese
        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # initial dictionary mapping for known legalese tokens
        ########################################################################

        # FIXME: we should start at 1, so that ids become valid unichr values

        self.dictionary = dictionary = {
            ts: tid for tid, ts in enumerate(sorted(_legalese))}
        dictionary_get = dictionary.get

        self.len_legalese = len_legalese = len(dictionary)
        highest_tid = len_legalese - 1

        # Add SPDX key tokens to the dictionary
        # these are always treated as non-legalese
        ########################################################################
        for sts in _spdx_tokens:
            stid = dictionary_get(sts)
            if stid is None:
                # this token has not been seen yet, so we assign a new token id
                highest_tid += 1
                stid = highest_tid
                dictionary[sts] = stid

        # OPTIMIZED
        sparsify(dictionary)

        self.rules_by_rid = rules_by_rid = list(rules)
        len_rules = len(rules_by_rid)

        # create index data structures
        # OPTIMIZATION: bind frequently used methods to the local scope for index structures
        ########################################################################
        tids_by_rid_append = self.tids_by_rid.append

        false_positive_rids_add = self.false_positive_rids.add
        negative_rids_add = self.negative_rids.add
        regular_rids_add = self.regular_rids.add
        approx_matchable_rids_add = self.approx_matchable_rids.add

        # since we only use these for regular rules, these lists may be sparse.
        # their index is the rule rid
        self.high_postings_by_rid = high_postings_by_rid = [None] * len_rules
        self.sets_by_rid = sets_by_rid = [None] * len_rules
        self.msets_by_rid = msets_by_rid = [None] * len_rules

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build partials for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
            automaton=self.negative_automaton, with_duplicates=False)

        rules_automaton_add = partial(match_aho.add_sequence,
            automaton=self.rules_automaton, with_duplicates=False)

        if USE_AHO_FRAGMENTS:
            fragments_automaton_add = partial(match_aho.add_sequence,
                automaton=self.fragments_automaton, with_duplicates=True)

        if USE_RULE_STARTS:
            starts_automaton_add_start = partial(match_aho.add_start,
                automaton=self.starts_automaton)

        # OPTIMIZED: bind frequently used objects to local scope
        rid_by_hash = self.rid_by_hash
        match_hash_index_hash = match_hash.index_hash
        match_set_tids_set_counter = match_set.tids_set_counter
        match_set_multiset_counter = match_set.multiset_counter

        len_starts = SMALL_RULE
        min_len_starts = SMALL_RULE * 6

        ngram_len = AHO_FRAGMENTS_NGRAM_LEN

        # Index each rule
        ########################################################################
        for rid, rule in enumerate(rules_by_rid):

            # assign rid
            rule.rid = rid

            rule_token_ids = array('h', [])
            tids_by_rid_append(rule_token_ids)

            # A rule is weak if it does not contain at least one legalese word:
            # we consider all rules to be weak until proven otherwise below.
            # "weak" rules can only be matched with an automaton.
            is_weak = True

            for rts in rule.tokens():
                rtid = dictionary_get(rts)
                if rtid is None:
                    # this token has not been seen yet, so we assign a new token id
                    # note: we could use the length of the dictionary instead
                    highest_tid += 1
                    rtid = highest_tid
                    dictionary[rts] = rtid
                if is_weak and rtid < len_legalese:
                    is_weak = False

                rule_token_ids.append(rtid)

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash_index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            # classify rules and build disjoint sets of rids
            if rule.is_negative:
                # negative rules are matched early and their tokens are only
                # exactly matched. When matched as a whole, their tokens are
                # removed from the token stream
                negative_rids_add(rid)
                negative_automaton_add(tids=rule_token_ids, rid=rid)
                continue

            ####################
            # populate automaton with the whole rule tokens sequence, for all
            # RULEs, be they "standard"/regular, weak, false positive or small
            # (but not negative)
            ####################
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            if rule.is_false_positive:
                # False positive rules do not participate in the set or sequence
                # matching at all: they are used for exact matching and in post-
                # matching filtering
                false_positive_rids_add(rid)
                continue

            # from now on, we have regular rules
            rid_by_hash[rule_hash] = rid
            regular_rids_add(rid)

            # Rules that cannot be matched as a sequence are "weak" rules
            if not is_weak:
                approx_matchable_rids_add(rid)

                ####################
                # update high postings: positions by high tids used to
                # speed up sequence matching
                ####################
                # no postings for rules that cannot be matched as a sequence (too short and weak)
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid < len_legalese:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                high_postings_by_rid[rid] = postings

                ####################
                # ... and ngram fragments: compute ngrams and populate an automaton with ngrams
                ####################
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and rule.length > ngram_len:
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=ngram_len)
                    all_ngrams_with_pos = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    # all_ngrams_with_pos = enumerate(all_ngrams)
                    for pos, ngram in all_ngrams_with_pos:
                        fragments_automaton_add(tids=ngram, rid=rid, start=pos)

                ####################
                # use the start and end of this rule as a break point for query runs
                ####################
                if USE_RULE_STARTS and rule.length > min_len_starts:
                    starts_automaton_add_start(
                        tids=rule_token_ids[:len_starts],
                        rule_identifier=rule.identifier,
                        rule_length=rule.length)

            ####################
            # build sets and multisets indexes, for all regular rules as we need
            # the thresholds
            ####################
            tids_set, mset = match_set.build_set_and_mset(
                rule_token_ids, _use_bigrams=USE_BIGRAM_MULTISETS)
            sets_by_rid[rid] = tids_set
            msets_by_rid[rid] = mset

            ####################################################################
            ####################################################################
            # FIXME!!!!!!! we should store them: we need them and we recompute
            # them later at match time
            tids_set_high = match_set.high_tids_set_subset(
                tids_set, len_legalese)
            mset_high = match_set.high_multiset_subset(
                mset, len_legalese, _use_bigrams=USE_BIGRAM_MULTISETS)

            # FIXME!!!!!!!
            ####################################################################
            ####################################################################

            ####################
            # update rule thresholds
            ####################
            rule.length_unique = match_set_tids_set_counter(tids_set)
            rule.high_length_unique = match_set_tids_set_counter(tids_set_high)

            rule.high_length = match_set_multiset_counter(mset_high)
            rule.compute_thresholds()

        ########################################################################
        # Finalize index data structures
        ########################################################################

        # Create the tid -> token string lookup structure.
        ########################################################################
        self.tokens_by_tid = tokens_by_tid = [
            ts for ts, _tid in sorted(dictionary.items(), key=itemgetter(1))]
        self.len_tokens = len_tokens = len(tokens_by_tid)

        # some tokens are made entirely of digits and these can create some
        # worst case behavior when there are long runs of these
        ########################################################################
        self.digit_only_tids = intbitset([
            i for i, s in enumerate(self.tokens_by_tid) if s.isdigit()])

        # Finalize automatons
        ########################################################################
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        if USE_AHO_FRAGMENTS:
            self.fragments_automaton.make_automaton()

        if USE_RULE_STARTS:
            match_aho.finalize_starts(self.starts_automaton)

        # OPTIMIZED: sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        ########################################################################
        # Do some sanity checks
        ########################################################################

        msg = 'Inconsistent structure lengths'
        assert len_tokens == highest_tid + 1 == len(dictionary), msg

        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(
                    sorted([
                        ('file://' + rule.text_file)
                        if rule.text_file
                        else ('text: ' + rule.stored_text)
                            for rule in rules])
                    )
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
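A hedged sketch of the "high postings" step inside the indexing loop above: for each rule, map every legalese token id (tid < len_legalese) to the compact array of positions where it occurs (the ids and len_legalese below are illustrative):

from array import array
from collections import defaultdict

def build_high_postings(rule_token_ids, len_legalese):
    # positions by high (legalese) token id, stored as compact arrays
    postings = defaultdict(list)
    for pos, tid in enumerate(rule_token_ids):
        if tid < len_legalese:
            postings[tid].append(pos)
    return {tid: array('h', positions) for tid, positions in postings.items()}

print(build_high_postings([5, 120, 5, 7, 300], len_legalese=100))
# {5: array('h', [0, 2]), 7: array('h', [3])}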
Example #13
    def renumber_token_ids(self, frequencies_by_old_tid, _ranked_tokens=global_tokens_by_ranks):
        """
        Return updated index structures with new token ids such that the most
        common tokens (aka. 'junk' or 'low' tokens) have the lowest ids.

        Return a tuple of (len_junk, dictionary, tokens_by_tid, tids_by_rid)
        - len_junk: the number of junk_old_tids tokens such that all junk token
        ids are smaller than this number.
        - dictionary: mapping of token string->token id
        - tokens_by_tid: reverse mapping of token id->token string
        - tids_by_rid: mapping of rule id-> array of token ids

        The arguments all relate to old, temporary token ids and are :
        - frequencies_by_old_tid: mapping of token id-> occurrences across all rules
        - _ranked_tokens: callable returning a list of common lowercase token
        strings, ranked from most common to least common. Used only for testing;
        defaults to a global list.

        Common tokens are computed based on a curated list of frequent words and
        token frequencies across rules such that:
         - common tokens have token ids smaller than len_junk
         - no rule is composed entirely of junk tokens.
        """
        old_dictionary = self.dictionary
        tokens_by_old_tid = self.tokens_by_tid
        old_tids_by_rid = self.tids_by_rid

        # track tokens for rules with a single token: their token is never junk
        # otherwise they can never be detected
        rules_of_one = set(r.rid for r in self.rules_by_rid if r.length == 1)
        never_junk_old_tids = set(rule_tokens[0] for rid, rule_tokens
                                  in enumerate(old_tids_by_rid)
                                  if rid in rules_of_one)

        # create the initial set of junk token ids
        junk_old_tids = set()
        junk_old_tids_add = junk_old_tids.add

        # Treat very common tokens composed only of digits or single chars as junk
        very_common_tids = set(old_tid for old_tid, token in enumerate(tokens_by_old_tid)
                          if token.isdigit() or len(token) == 1)
        junk_old_tids.update(very_common_tids)

        # TODO: ensure common numbers as words are treated as very common
        # (one, two, and first, second, etc.)?

        # TODO: add and treat person and place names as always being JUNK

        # Build the candidate junk set as an approximate proportion of total tokens
        len_tokens = len(tokens_by_old_tid)
        junk_max = len_tokens // PROPORTION_OF_JUNK

        # Use a curated list of common tokens sorted by decreasing frequency as
        # the basis to determine junk status.
        old_dictionary_get = old_dictionary.get
        for token in _ranked_tokens():
            # stop when we reach the maximum junk proportion
            if len(junk_old_tids) == junk_max:
                break
            old_tid = old_dictionary_get(token)
            if old_tid is not None and old_tid not in never_junk_old_tids:
                junk_old_tids_add(old_tid)

        len_junk = len(junk_old_tids)

        # Assemble our final set of good old token ids
        good_old_tids = set(range(len_tokens)) - junk_old_tids
        assert len_tokens == len(junk_old_tids) + len(good_old_tids)

        # Sort the list of old token ids: junk before good, then by decreasing
        # frequencies, then by old id.
        # This sort does the actual renumbering from old to new token ids
        key = lambda i: (i in good_old_tids, -frequencies_by_old_tid[i], i)
        new_to_old_tids = sorted(range(len_tokens), key=key)

        # keep a mapping from old to new id used for renumbering index structures
        old_to_new_tids = [new_tid for new_tid, _old_tid in sorted(enumerate(new_to_old_tids), key=itemgetter(1))]

        # create the new ids -> tokens string mapping
        tokens_by_new_tid = [tokens_by_old_tid[old_tid]  for _new_tid, old_tid in enumerate(new_to_old_tids)]

        # create the new dictionary: token string -> new id
        new_dictionary = {token: new_tid  for new_tid, token in enumerate(tokens_by_new_tid)}
        sparsify(new_dictionary)
        old_tids_by_rid = self.tids_by_rid
        # mapping of rule_id->new token_ids array
        new_tids_by_rid = [array('h', (old_to_new_tids[tid] for tid in old_tids)) for old_tids in old_tids_by_rid]

        # Now do a few sanity checks...
        # By construction this should always be true
        assert set(tokens_by_new_tid) == set(tokens_by_old_tid)

        fatals = []
        for rid, new_tids in enumerate(new_tids_by_rid):
            # Check that no rule is all junk: this is a fatal indexing error
            if all(t < len_junk for t in new_tids):
                message = (
                    'WARNING: Weak rule, made only of frequent junk tokens. Can only be matched exactly:',
                    self.rules_by_rid[rid].identifier,
                    u' '.join(tokens_by_new_tid[t] for t in new_tids)
                )
                fatals.append(u' '.join(message))
        if TRACE and fatals:
            # raise IndexError(u'\n'.join(fatals))
            print()
            print('############################################')
            map(print, fatals)
            print('############################################')
            print()
        # TODO: Check that the junk count choice is correct: for instance using some
        # stats based on standard deviation or markov chains or similar
        # conditional probabilities such that we verify that we CANNOT create a
        # distinctive meaningful license string made entirely from junk tokens

        return len_junk, new_dictionary, tokens_by_new_tid, new_tids_by_rid
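A hedged, worked example of the renumbering sort above: the key (is_good, -frequency, old_id) places junk ids before good ids, then orders each group by decreasing frequency (all values below are illustrative):

from operator import itemgetter

frequencies_by_old_tid = {0: 5, 1: 50, 2: 7, 3: 50}
good_old_tids = {1, 3}  # assume old tids 0 and 2 were classified as junk
len_tokens = 4

key = lambda i: (i in good_old_tids, -frequencies_by_old_tid[i], i)
new_to_old_tids = sorted(range(len_tokens), key=key)
print(new_to_old_tids)  # [2, 0, 1, 3]: junk tids by decreasing frequency, then good tids

old_to_new_tids = [new_tid for new_tid, _old_tid
                   in sorted(enumerate(new_to_old_tids), key=itemgetter(1))]
print(old_to_new_tids)  # [1, 2, 0, 3]: every junk tid ends up below len_junk == 2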
def renumber_token_ids(rules_tokens_ids, dictionary, tokens_by_tid, frequencies_by_tid, length=9, with_checks=True):
    """
    Return updated index structures with new token ids such that the most common
    aka. 'junk' tokens have the lowest ids. 

    `rules_tokens_ids` is a mapping of rule_id->sequence of token ids
    
    These common tokens are based on a curated list of frequent words and
    further refined such that:
     - no rule text sequence is composed entirely of these common tokens.
     - no or only a few rule text sub-sequences of `length` tokens (aka.
     ngrams) are composed entirely of these common tokens.

    The returned structures are:
    - old_to_new: mapping of (old token id->new token id)
    - len_junk: the number of junk tokens; all junk token ids are smaller than this number
    - dictionary (token string->token id)
    - tokens_by_tid (token id->token string)
    - frequencies_by_tid (token id->frequency)
    """
    # keep track of very common junk tokens: digits and single letters
    very_common = set()
    very_common_add = very_common.add
    string_lowercase = u'abcdefghijklmnopqrstuvwxyz'
    for tid, token in enumerate(tokens_by_tid):
        # DIGIT TOKENS: Treat tokens composed only of digits as common junk
        # SINGLE ASCII LETTER TOKENS: Treat single ASCII letter tokens as common junk

        # TODO: ensure common numbers as strings are always there (one, two, and first, second, etc.)
        if token.isdigit() or (len(token) == 1 and token in string_lowercase):
            very_common_add(tid)

    # keep track of good, "not junk" tokens
    good = set()
    good_update = good.update

    # Classify rules tokens as smaller or equal to `length` or regular.
    regular_rules = []
    regular_rules_append = regular_rules.append
    small_rules = []
    small_rules_append = small_rules.append

    for rid, rule_toks_ids in enumerate(rules_tokens_ids):
        len_toks = len(rule_toks_ids)
        if len_toks == 1:
            # RULES of ONE TOKEN: their token cannot be junk
            good_update(rule_toks_ids)
        if len_toks <= length:
            small_rules_append((rid, rule_toks_ids))
        else:
            regular_rules_append((rid, rule_toks_ids))

    # Build a candidate junk set of roughly 1/10th the size of the tokens set:
    # we use a curated list of common words as a base. The final length (and
    # also biggest token id) of junk tokens set typically ~ 1200 for about 12K
    # tokens

    junk_max = abs((len(tokens_by_tid) / 11) - len(very_common))

    junk = set()
    junk_add = junk.add
    dictionary_get = dictionary.get
    junk_count = 0
    
    for token in global_tokens_by_ranks():
        tid = dictionary_get(token)
        if tid is None:
            continue

        if tid not in very_common and tid not in good:
            junk_add(tid)
            junk_count += 1

        if junk_count == junk_max:
            break

    # Assemble our final junk and not junk sets
    final_junk = (very_common | junk) - good
    good = set(range(len(tokens_by_tid))) - final_junk

    if with_checks:
        # Now do a few sanity checks...
        def tokens_str(_tks):
            return u' '.join(tokens_by_tid[_tk] for _tk in _tks)

        # Check that no small rule is made entirely of junk
        for rid, tokens in small_rules:
            try:
                assert not all([jt in final_junk for jt in tokens])
            except AssertionError:
                # this is a serious index issue
                print('!!!License Index FATAL ERROR: small rule: ', rid , 'is all made of junk:', tokens_str(tokens))
                raise

        # Check that not too many ngrams are made entirely of junk:
        # we build the set of `length`-long ngrams over the tokens of rules with
        # length equal to or bigger than `length` and check them all

        all_junk_ngrams_count = 0
        for rid, tokens in regular_rules:
            for ngram in ngrams(tokens, length):
                # skip ngrams composed only of common junk as not significant
                if all(nt in very_common for nt in ngram):
                    continue
                try:
                    # note: we check only against junk, not final_junk
                    assert not all(nt in junk for nt in ngram)
                except AssertionError:
                    all_junk_ngrams_count += 1

        # TODO: test that the junk choice is correct: for instance using some
        # stats based on standard deviation or markov chains or similar
        # conditional probabilities such that we verify that we CANNOT create a
        # distinctive meaningful license string made entirely from junk tokens


        # check that we do not have too many ngrams made entirely of junk
        assert all_junk_ngrams_count < (length * 20)

    # Sort each set of old token IDs by decreasing original frequencies
    # FIXME: should use a key function not a schwartzian sort
    decorated = ((frequencies_by_tid[old_id], old_id) for old_id in final_junk)
    final_junk = [t for _f, t in sorted(decorated, reverse=True)]

    # FIXME: should use a key function not a schwartzian sort
    decorated = ((frequencies_by_tid[old_id], old_id) for old_id in good)
    good = [t for _f, t in sorted(decorated, reverse=True)]

    # create the new ids -> tokens value mapping
    new_tokens_by_tid = [tokens_by_tid[t] for t in final_junk + good]

    # sanity check: by construction this should always be true
    assert set(new_tokens_by_tid) == set(tokens_by_tid)

    # create new structures based on new ids and a mapping from old to new id
    len_tokens = len(new_tokens_by_tid)
    old_to_new = array('h', [0] * len_tokens)
    new_frequencies_by_tid = [None] * len_tokens
    new_dictionary = {}

    # assign new ids, rebuild the dictionary and frequencies
    for new_id, token in enumerate(new_tokens_by_tid):
        old_id = dictionary[token]
        old_to_new[old_id] = new_id

        new_dictionary[token] = new_id

        old_freq = frequencies_by_tid[old_id]
        new_frequencies_by_tid[new_id] = old_freq

    sparsify(new_dictionary)
    return old_to_new, len(final_junk), new_dictionary, new_tokens_by_tid, new_frequencies_by_tid
    def _add_rules(self, rules, optimize=True, _ngram_length=NGRAM_LENGTH):
        """
        Add an iterable of Rule objects to the index as an optimized batch
        operation. This replaces any existing indexed rules previously added.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        rules = list(rules)

        # First pass: collect tokens, count frequencies and find unique tokens
        ######################################################################
        # compute the unique tokens and frequency at once
        unique_tokens = Counter()

        # accumulate all rule tokens at once. Also assign the rule ids
        tokens_by_rid = []

        regular_rids = set()
        regular_rids_add = regular_rids.add
        negative_rids = set()
        negative_rids_add = negative_rids.add

        for rid, rule in enumerate(rules):
            rule.rid = rid
            if rule.negative():
                negative_rids_add(rid)
            else:
                regular_rids_add(rid)
            rule_tokens = list(rule.tokens())
            tokens_by_rid.append(rule_tokens)
            unique_tokens.update(rule_tokens)

        # Create the tokens lookup structure at once.
        # Note that tokens ids are assigned randomly at first; by unzipping we
        # get the frequencies and tokens->id at once.
        tokens_by_tid, frequencies_by_tid = izip(*sorted(unique_tokens.most_common()))
        dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}

        # for speed
        sparsify(dictionary)

        # replace strings with token ids
        rules_tokens_ids = [[dictionary[tok] for tok in rule_tok] for rule_tok in tokens_by_rid]
        len_tokens = len(tokens_by_tid)

        # Second pass: Optimize token ids based on frequencies and common words
        #######################################################################

        # renumber tokens ids
        if optimize:
            renumbered = renumber_token_ids(rules_tokens_ids, dictionary, tokens_by_tid, frequencies_by_tid)
            old_to_new, len_junk, dictionary, tokens_by_tid, frequencies_by_tid = renumbered
        else:
            # for testing only
            len_junk = 0
            # this becomes a no-op mapping existing ids to themselves
            old_to_new = range(len_tokens)

        # mapping of rule_id->new token_ids array
        new_rules_tokens_ids = []
        # renumber old token ids to new
        for rule_token_ids in rules_tokens_ids:
            new_rules_tokens_ids.append(array('h', (old_to_new[tid] for tid in rule_token_ids)))

        # Third pass: build index structures
        ####################################
        # lists of bitvectors for high and low tokens, one per rule
        high_bitvectors_by_rid = [0 for _r in rules]
        low_bitvectors_by_rid = [0 for _r in rules]

        frequencies_by_rid = [0 for _r in rules]
        lengths_by_rid = array('h', [0 for _r in rules])

        # nested inverted index by rule_id->token_id->[postings array]
        postings_by_rid = [defaultdict(list) for _r in rules]

        # mapping of rule_id -> mapping of starter ngrams -> [(start, end,), ...]
        start_ngrams_by_rid = [defaultdict(list) for _r in rules]

        bv_template = bitarray([0 for _t in tokens_by_tid])

        # build posting lists and other index structures
        for rid, new_rule_token_ids in enumerate(new_rules_tokens_ids):
            rid_postings = postings_by_rid[rid]

            tokens_frequency = Counter()
            # rule bitvector: index is the token id, 1 means token is present, and 0 absent
            tokens_occurrence = bv_template.copy()

            # loop through the rule's (new) token ids
            for pos, new_tid in enumerate(new_rule_token_ids):
                # append posting
                rid_postings[new_tid].append(pos)
                # set bit to one in bitvector for the token id
                # TODO: optimize: slice assignments could be faster?
                tokens_frequency[new_tid] += 1
                tokens_occurrence[new_tid] = 1

            sparsify(rid_postings)

            # build the high bitvector for the rule
            high_bitvectors_by_rid[rid] = tokens_occurrence[len_junk:]
            # build the low bitvector for the rule
            low_bitvectors_by_rid[rid] = tokens_occurrence[:len_junk]

            frequencies_by_rid[rid] = tokens_frequency
            lengths_by_rid[rid] = len(new_rule_token_ids)

            # collect starters
            rid_starters = start_ngrams_by_rid[rid]
            gaps = rules[rid].gaps
            for starter_ngram, start in index_starters(new_rule_token_ids, gaps, _ngram_length):
                rid_starters[starter_ngram].append(start)

            sparsify(rid_starters)

            # OPTIMIZED: for faster access to index: convert postings to arrays
            postings_by_rid[rid] = {key: array('h', value) for key, value in rid_postings.items()}

        # assign back the created index structure to self attributes
        self.postings_by_rid = postings_by_rid
        self.len_junk = len_junk
        self.len_tokens = len_tokens
        self.tokens_by_tid = tokens_by_tid
        self.frequencies_by_tid = frequencies_by_tid
        self.lengths_by_rid = lengths_by_rid
        self.dictionary = dictionary
        self.rules_by_rid = rules
        self.high_bitvectors_by_rid = high_bitvectors_by_rid
        self.low_bitvectors_by_rid = low_bitvectors_by_rid
        self.frequencies_by_rid = frequencies_by_rid
        self.tokens_by_rid = new_rules_tokens_ids
        self.start_ngrams_by_rid = start_ngrams_by_rid
        self.negative_rids = negative_rids
        self.regular_rids = regular_rids
        if optimize:
            self.optimized = True
        else:
            # for testing
            return rules_tokens_ids
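A hedged sketch of the high/low bitvector split used in the loop above, assuming the third-party bitarray package is installed (len_junk and the set bits are illustrative):

from bitarray import bitarray

len_junk, len_tokens = 3, 8
occurrence = bitarray(len_tokens)
occurrence.setall(0)
for tid in (1, 4, 6):          # token ids present in the rule
    occurrence[tid] = 1

low = occurrence[:len_junk]    # bits for junk token ids
high = occurrence[len_junk:]   # bits for good token ids
print(low, high)               # bitarray('010') bitarray('01010')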
Example #16
    def _add_rules(self,
                   rules,
                   _ranked_tokens=global_tokens_by_ranks,
                   _spdx_tokens=None):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.

        `_spdx_tokens` if provided is a set of token strings from known SPDX
        keys: these receive a special treatment.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            if rul.is_false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.is_negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Add SPDX key tokens to the dictionary. Track which tokens come only from SPDX keys
        ########################################################################
        spdx_tokens = None
        if _spdx_tokens:
            spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
            frequencies_by_token.update(_spdx_tokens)

        # Create the tokens lookup structure at once. Note that tokens ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        ########################################################################
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        # initial dictionary mapping to old/arbitrary token ids
        ########################################################################
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) integer ids
        ########################################################################
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        # Get SPDX-only token ids
        ########################################################################
        spdx_token_ids = None
        if spdx_tokens:
            spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens,
                                             _spdx_token_ids=spdx_token_ids)
        (
            self.len_junk,
            self.dictionary,
            self.tokens_by_tid,
            self.tids_by_rid,
            self.weak_rids,
        ) = renumbered

        len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

        #######################################################################
        # build index structures
        #######################################################################
        self.len_good = len_good = len_tokens - len_junk
        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            rule_is_weak = rid in weak_rids

            if rule.is_negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by

                # FIXME: we do not want to keep small rules and rules that
                # cannot be seq-matched in the index

                # no postings for junk-only rules
                if not rule_is_weak:
                    postings = defaultdict(list)
                    for pos, tid in enumerate(rule_token_ids):
                        if tid >= len_junk:
                            postings[tid].append(pos)
                    # OPTIMIZED: for speed and memory: convert postings to arrays
                    postings = {
                        tid: array('h', value)
                        for tid, value in postings.items()
                    }
                    # OPTIMIZED: for speed, sparsify dict
                    sparsify(postings)
                    self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                    rule_token_ids, len_junk, len_good)

                # no set indexes for junk only rules
                if not rule_is_weak:
                    self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                    self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if (USE_AHO_FRAGMENTS and rule.minimum_coverage < 100
                        and len(rule_token_ids) > NGRAM_LEN):
                    all_ngrams = tokenize.ngrams(rule_token_ids,
                                                 ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams,
                                                             with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # FIXME: this may not be updated for a rule that is created at
                # match time, such as SPDX rules

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(
                    sorted([('file://' + rule.text_file) if rule.text_file else
                            ('text: ' + rule.stored_text) for rule in rules]))
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
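A hedged sketch of the threshold bookkeeping at the end of the loop above: "unique" counts come from the sets, "length" counts from the multisets. The counters below are simple stand-ins, not the actual match_set implementations:

from collections import Counter

def tids_set_counter(tids_set):
    # number of distinct token ids
    return len(tids_set)

def tids_multiset_counter(tids_mset):
    # total number of token occurrences
    return sum(tids_mset.values())

rlow_mset, rhigh_mset = Counter({2: 3, 5: 1}), Counter({40: 2, 41: 1})
low_unique, high_unique = tids_set_counter(set(rlow_mset)), tids_set_counter(set(rhigh_mset))
low_length, high_length = tids_multiset_counter(rlow_mset), tids_multiset_counter(rhigh_mset)
assert (low_unique, high_unique, low_length, high_length) == (2, 2, 4, 3)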
Example #17
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            rul_len = rul.length
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
                if rul_len > self.largest_false_positive_length:
                    self.largest_false_positive_length = rul_len
            elif rul.negative():
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that tokens ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.false_positive:
                # FP rules are not used for any matching
                # there is nothing else for these rules
                self.false_positive_rid_by_hash[rule_hash] = rid
            else:
                # negative, small and regular

                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {
                    tid: array('h', value)
                    for tid, value in postings.items()
                }
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                    rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automatons...
                if rule.negative():
                    # ... with only the whole rule tokens sequence
                    negative_automaton_add(tids=rule_token_ids, rid=rid)
                else:
                    # ... or with the whole rule tokens sequence
                    rules_automaton_add(tids=rule_token_ids, rid=rid)
                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
                    if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(
                            rule_token_ids) > NGRAM_LEN:
                        all_ngrams = ngrams(rule_token_ids,
                                            ngram_length=NGRAM_LEN)
                        selected_ngrams = select_ngrams(all_ngrams,
                                                        with_pos=True)
                        for pos, ngram in selected_ngrams:
                            rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = tids_set_counter(rlow_set)
                rule.high_unique = tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = tids_multiset_counter(rlow_mset)
                rule.high_length = tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)
        sparsify(self.false_positive_rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules]
                               for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' +
                   u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True
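A hedged sketch of the functools.partial binding used above to pre-fill the automaton argument of the "add sequence" callables (the add_sequence function and the list-based automaton below are stand-ins, not the real match_aho API):

from functools import partial

def add_sequence(tids, rid, automaton, with_duplicates=False):
    # stand-in: record what would be added to an Aho-Corasick automaton
    automaton.append((rid, tuple(tids), with_duplicates))

fake_automaton = []
rules_automaton_add = partial(add_sequence, automaton=fake_automaton)
rules_automaton_add(tids=[3, 9, 12], rid=0)
rules_automaton_add(tids=[7, 7], rid=1)
assert fake_automaton == [(0, (3, 9, 12), False), (1, (7, 7), False)]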
Example #18
    def renumber_token_ids(self,
                           frequencies_by_old_tid,
                           _ranked_tokens=global_tokens_by_ranks):
        """
        Return updated index structures with new token ids such that the most
        common tokens (aka. 'junk' or 'low' tokens) have the lowest ids.

        Return a tuple of (len_junk, dictionary, tokens_by_tid, tids_by_rid)
        - len_junk: the number of junk_old_tids tokens such that all junk token
        ids are smaller than this number.
        - dictionary: mapping of token string->token id
        - tokens_by_tid: reverse mapping of token id->token string
        - tids_by_rid: mapping of rule id-> array of token ids

        The arguments all relate to old, temporary token ids and are :
        - frequencies_by_old_tid: mapping of token id-> occurrences across all rules
        - _ranked_tokens: callable returning a list of common lowercase token
        strings, ranked from most common to least common. Used only for testing;
        defaults to a global list.

        Common tokens are computed based on a curated list of frequent words and
        token frequencies across rules such that:
         - common tokens have token ids smaller than len_junk
         - no rule is composed entirely of junk tokens.
        """
        old_dictionary = self.dictionary
        tokens_by_old_tid = self.tokens_by_tid
        old_tids_by_rid = self.tids_by_rid

        # track tokens for rules with a single token: their token is never junk
        # otherwise they can never be detected
        rules_of_one = set(r.rid for r in self.rules_by_rid if r.length == 1)
        never_junk_old_tids = set(
            rule_tokens[0] for rid, rule_tokens in enumerate(old_tids_by_rid)
            if rid in rules_of_one)

        # create the initial set of junk token ids
        junk_old_tids = set()
        junk_old_tids_add = junk_old_tids.add

        # Treat very common tokens composed only of digits or single chars as junk
        very_common_tids = set(
            old_tid for old_tid, token in enumerate(tokens_by_old_tid)
            if token.isdigit() or len(token) == 1)
        junk_old_tids.update(very_common_tids)

        # TODO: ensure that common numbers spelled out as words are treated as
        # very common (one, two, and first, second, etc.)?

        # TODO: add and treat person and place names as always being JUNK

        # Build the candidate junk set as an approximate proportion of the total tokens
        len_tokens = len(tokens_by_old_tid)
        junk_max = len_tokens // PROPORTION_OF_JUNK

        # Use a curated list of common tokens sorted by decreasing frequency as
        # the basis to determine junk status.
        old_dictionary_get = old_dictionary.get
        for token in _ranked_tokens():
            # stop once we reach the maximum junk proportion
            if len(junk_old_tids) >= junk_max:
                break
            old_tid = old_dictionary_get(token)
            if old_tid is not None and old_tid not in never_junk_old_tids:
                junk_old_tids_add(old_tid)

        len_junk = len(junk_old_tids)

        # Assemble our final set of good old token ids
        good_old_tids = set(range(len_tokens)) - junk_old_tids
        assert len_tokens == len(junk_old_tids) + len(good_old_tids)

        # Sort the list of old token ids: junk before good, then by decreasing
        # frequency, then by old id.
        # This sort performs the actual renumbering from old to new token ids
        key = lambda i: (i in good_old_tids, -frequencies_by_old_tid[i], i)
        new_to_old_tids = sorted(range(len_tokens), key=key)

        # keep a mapping from old to new id used for renumbering index structures
        old_to_new_tids = [
            new_tid for new_tid, _old_tid in sorted(enumerate(new_to_old_tids),
                                                    key=itemgetter(1))
        ]

        # create the new id -> token string mapping
        tokens_by_new_tid = [
            tokens_by_old_tid[old_tid] for old_tid in new_to_old_tids
        ]

        # create the new dictionary mapping token string -> new token id
        new_dictionary = {
            token: new_tid
            for new_tid, token in enumerate(tokens_by_new_tid)
        }
        sparsify(new_dictionary)
        old_tids_by_rid = self.tids_by_rid
        # mapping of rule_id->new token_ids array
        new_tids_by_rid = [
            array('h', (old_to_new_tids[tid] for tid in old_tids))
            for old_tids in old_tids_by_rid
        ]

        # Now do a few sanity checks...
        # By construction this should always be true
        assert set(tokens_by_new_tid) == set(tokens_by_old_tid)

        fatals = []
        for rid, new_tids in enumerate(new_tids_by_rid):
            # Check that no rule is all junk: this is a fatal indexing error
            if all(t < len_junk for t in new_tids):
                message = (
                    'WARNING: Weak rule, made only of frequent junk tokens. Can only be matched exactly:',
                    self.rules_by_rid[rid].identifier,
                    u' '.join(tokens_by_new_tid[t] for t in new_tids))
                fatals.append(u' '.join(message))
        if TRACE and fatals:
            # raise IndexError(u'\n'.join(fatals))
            print()
            print('############################################')
            for fatal in fatals:
                print(fatal)
            print('############################################')
            print()
        # TODO: Check that the junk count choice is correct: for instance using some
        # stats based on standard deviation or Markov chains or similar
        # conditional probabilities such that we verify that we CANNOT create a
        # distinctive meaningful license string made entirely from junk tokens

        return len_junk, new_dictionary, tokens_by_new_tid, new_tids_by_rid
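
A minimal, standalone sketch (all token strings and frequencies are hypothetical) of the junk-before-good sort key and the id inversion used in renumber_token_ids above:

from operator import itemgetter

tokens_by_old_tid = ['gpl', 'the', 'license', 'of']  # old/temporary ids are the list positions
frequencies_by_old_tid = [2, 9, 5, 7]
junk_old_tids = {1, 3}                               # 'the' and 'of' deemed junk
good_old_tids = {0, 2}
len_junk = len(junk_old_tids)                        # == 2

# junk first, then by decreasing frequency, then by old id: same key as above
key = lambda i: (i in good_old_tids, -frequencies_by_old_tid[i], i)
new_to_old_tids = sorted(range(len(tokens_by_old_tid)), key=key)
# -> [1, 3, 2, 0]: 'the' and 'of' get the new ids 0 and 1, both below len_junk

old_to_new_tids = [new_tid for new_tid, _old_tid
                   in sorted(enumerate(new_to_old_tids), key=itemgetter(1))]
# -> [3, 0, 2, 1]: e.g. old id 0 ('gpl') is renumbered to new id 3
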
Exemple #19
0
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized,
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # rule ids are assigned implicitly: a rule id is the rule's index in this list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule token strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency.
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjuncted sets of rids
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the token lookup structures at once. Note that token ids are
        # assigned arbitrarily here at first by unzipping: we get the frequencies
        # and the token->id mapping at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
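        # e.g. (hypothetical) for Counter({'license': 3, 'gpl': 1}), items() may
        # yield [('license', 3), ('gpl', 1)] which unzips into ('license', 'gpl')
        # and (3, 1); the item ordering itself is arbitrary at this point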
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) random integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse:
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build the hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)

            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
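                # e.g. (hypothetical) if high tid 5 occurs at positions 0 and 7
                # and high tid 9 at position 3, postings is then
                # {5: array('h', [0, 7]), 9: array('h', [3])}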
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate the automaton with the whole rule token sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and with ngrams: compute selected ngrams and add them to the automaton
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True
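
Finally, a minimal, standalone sketch of the duplicate-rule detection pattern used above: rules are grouped by a hash of their token id sequence and any group with more than one rule is reported. The Rule stand-in and the index_hash() function below are hypothetical simplifications of the real Rule class and of match_hash.index_hash().

from collections import defaultdict

class Rule(object):
    # hypothetical stand-in for the real Rule objects
    def __init__(self, text_file, token_ids):
        self.text_file = text_file
        self.token_ids = token_ids

def index_hash(token_ids):
    # stand-in for match_hash.index_hash(): any stable hash of the sequence works
    return hash(tuple(token_ids))

rules = [
    Rule('rules/mit_1.RULE', [3, 7, 2]),
    Rule('rules/mit_2.RULE', [3, 7, 2]),  # same token stream: a duplicate
    Rule('rules/gpl.RULE', [9, 1, 4]),
]

dupe_rules_by_hash = defaultdict(list)
for rule in rules:
    dupe_rules_by_hash[index_hash(rule.token_ids)].append(rule)

dupe_rules = [group for group in dupe_rules_by_hash.values() if len(group) > 1]
if dupe_rules:
    dupe_rule_paths = [['file://' + rule.text_file for rule in group] for group in dupe_rules]
    print(u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))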