def index_starters(rule_tokens, gaps, _ngram_length=NGRAM_LENGTH):
    """
    Given a sequence of rule tokens and a set of gaps for that rule, return a
    sequence of tuples of (starter ngram, start) computed from the tokens,
    gaps and ngram length. start is the starting position of the ngram.
    """
    rule_tokens = list(rule_tokens)
    len_tokens = len(rule_tokens)
    if not gaps:
        # no gaps: consider only the first ngram and the whole rule.
        if len_tokens >= _ngram_length:
            yield tuple(rule_tokens[:_ngram_length]), 0
    else:
        # T' starts at -1
        # pos:
        # 0 1 2 T 3 4 5 6 7 T 8 9 L
        # gaps + len:
        #       2           7     10
        # slices:
        #  [0:3]      [3:8]     [8:11]
        # spans:
        #  [0:2]      [3:7]     [8:10]
        # recipe:
        #  [T'+1:T+1] [T'+1:T+1] [T'+1:T+1]

        for start, ngram in enumerate(ngrams(rule_tokens, ngram_length=_ngram_length)):
            if start == 0:
                if not any(g in gaps for g in range(0, _ngram_length - 2)):
                    yield ngram, start

            elif start - 1 in gaps and not any(p in range(start, start + _ngram_length - 1) for p in gaps):
                yield ngram, start
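A quick usage sketch of the no-gaps path above; the rule tokens and ngram length below are made-up values, not taken from a real index:

# Hypothetical usage of index_starters: with no gaps, only the first ngram
# and its start position 0 are yielded (tokens and length are illustrative).
starters = list(index_starters(
    rule_tokens=['redistribution', 'and', 'use', 'in', 'source'],
    gaps=set(),
    _ngram_length=3,
))
# starters == [(('redistribution', 'and', 'use'), 0)]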
Example #2
def write_ngrams(texts, output, _seen=set(), ngram_length=6):
    """
    Write the texts list as ngrams to the output file-like object.
    """
    for text in ['\n'.join(ngs) for ngs in ngrams(texts, ngram_length=ngram_length)]:
        if text in _seen:
            continue
        _seen.add(text)
        output.write(template.format(text))
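write_ngrams relies on a module-level `template` format string that is not shown in this snippet; the driver below is only a sketch, assuming a plain '{}\n' template:

import io

template = '{}\n'  # assumption: the real template is defined elsewhere in the module

buf = io.StringIO()
texts = ['line %d' % i for i in range(8)]
# pass an explicit _seen set to avoid sharing the mutable default across calls
write_ngrams(texts, buf, _seen=set(), ngram_length=6)
# buf now holds each unique 6-line ngram, with the 6 lines joined by newlines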
 def test_ngrams_with_None_length_three(self):
     tokens = [
         'Redistribution', 'and', 'use', None, 'in', 'source', 'and',
         'binary', 'are', None
     ]
     result = list(ngrams(tokens, ngram_length=3))
     expected = [('Redistribution', 'and', 'use'), ('and', 'use', None),
                 ('use', None, 'in'), (None, 'in', 'source'),
                 ('in', 'source', 'and'), ('source', 'and', 'binary'),
                 ('and', 'binary', 'are'), ('binary', 'are', None)]
     assert result == expected
def add_ngrams(
    automaton,
    tids,
    tokens,
    rule_length,
    len_legalese,
    unknown_ngram_length=UNKNOWN_NGRAM_LENGTH,
):
    """
    Add the `tids` sequence of token ids to an unknown ngram automaton.
    """
    if rule_length >= unknown_ngram_length:
        tids_ngrams = tokenize.ngrams(tids, ngram_length=unknown_ngram_length)
        toks_ngrams = tokenize.ngrams(tokens,
                                      ngram_length=unknown_ngram_length)
        for tids_ngram, toks_ngram in zip(tids_ngrams, toks_ngrams):
            if is_good_tokens_ngram(toks_ngram, tids_ngram, len_legalese):
                # note that we do not store positions as values, only the ngram
                # since we do not keep the rule origin of an ngram
                automaton.add_word(tids_ngram)
 def test_ngrams_with_None(self):
     tokens = ['Redistribution', 'and', 'use', None, 'in', 'source', 'and', 'binary', 'are', None]
     result = list(ngrams(tokens, ngram_length=4))
     expected = [
         ('Redistribution', 'and', 'use', None),
         ('and', 'use', None, 'in'),
         ('use', None, 'in', 'source'),
         (None, 'in', 'source', 'and'),
         ('in', 'source', 'and', 'binary'),
         ('source', 'and', 'binary', 'are'),
         ('and', 'binary', 'are', None)]
     assert expected == result
    def test_ngrams(self):
        tokens = '''
            Redistribution and use in source and binary are permitted.
            '''.split()

        result = list(ngrams(tokens, ngram_length=4))
        expected = [('Redistribution', 'and', 'use', 'in'),
                    ('and', 'use', 'in', 'source'),
                    ('use', 'in', 'source', 'and'),
                    ('in', 'source', 'and', 'binary'),
                    ('source', 'and', 'binary', 'are'),
                    ('and', 'binary', 'are', 'permitted.')]
        assert result == expected
    def test_ngrams2(self):
        tokens = '''
            Redistribution and use in source and binary are permitted.
            '''.split()

        result = list(ngrams(tokens, ngram_length=4))
        expected = [
            ('Redistribution', 'and', 'use', 'in'),
            ('and', 'use', 'in', 'source'),
            ('use', 'in', 'source', 'and'),
            ('in', 'source', 'and', 'binary'),
            ('source', 'and', 'binary', 'are'),
            ('and', 'binary', 'are', 'permitted.')]

        assert expected == result
    def get_weighted_hash(self):
        """
        Return a weighted array from the word token list.
        """
        result = [0] * HASH_LENGTH
        length = len(self.tokens) - SHINGLE_LENGTH + 1
        shingles = ngrams(self.tokens, SHINGLE_LENGTH)

        if length > 0:
            for shingle in shingles:
                shingle = ''.join(shingle)
                self.process_shingles(shingle, result)
        else:
            self.process_shingles(''.join(self.tokens), result)

        return result
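get_weighted_hash follows the usual simhash pattern: each shingle is hashed and votes +1/-1 on every bit position of the result array. The `process_shingles` method is not shown here; the sketch below is an assumption of what such a step could look like, not the actual implementation.

import hashlib

HASH_LENGTH = 128  # assumed to match the class constant used above

def process_shingles_sketch(shingle, result):
    # hash the shingle to a 128-bit integer and let each bit vote on the result
    digest = int(hashlib.md5(shingle.encode('utf-8')).hexdigest(), 16)
    for bit in range(HASH_LENGTH):
        if (digest >> bit) & 1:
            result[bit] += 1
        else:
            result[bit] -= 1
    return result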
Example #9
def build_set_and_bigrams_mset(token_ids):
    """
    Return a tuple of (tids set, multiset) given a `token_ids` tids
    sequence.
    """
    tids_set = intbitset()
    bigrams_mset = defaultdict(int)

    for bigram in ngrams(token_ids, 2):
        # this skips already matched token ids that are -1
        if -1 in bigram:
            continue
        bigrams_mset[bigram] += 1
        tids_set.update(bigram)

    return tids_set, bigrams_mset
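A small usage example; it assumes the intbitset package and the ngrams function are importable as in the original module, and the token ids are made up:

token_ids = [1, 2, 3, -1, 2, 3]
tids_set, bigrams_mset = build_set_and_bigrams_mset(token_ids)
# bigrams containing -1 (already-matched positions) are skipped:
# tids_set           -> intbitset([1, 2, 3])
# dict(bigrams_mset) -> {(1, 2): 1, (2, 3): 2}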
Example #10
def filter_strings(strs, nglen=4):
    """
    Filter out clusters of short strings: if a string and its neighbors in a
    window of `nglen` strings are all short, discard that string.
    """
    from licensedcode.tokenize import ngrams
    # FIXME: the ngrams function skips things if we have less than ngram_len strings
    strs = list(strs)
    if len(strs) < nglen:
        for s in strs:
            yield s
    else:
        for ngm in ngrams(strs, nglen):
            junk = (all(len(s) <= 5 for s in ngm)
                    or sum(len(s) for s in ngm) <= nglen * 5
                    or len(set(ngm[0])) / float(len(ngm[0])) < 0.01)
            if junk:
                continue
            yield ngm[0]
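Two usage examples with made-up strings; note that only the first string of each surviving window is yielded, and inputs shorter than `nglen` pass through unchanged:

# every 4-string window is all-short, so the whole cluster is dropped
assert list(filter_strings(['ab', 'cd', 'ef', 'gh', 'ij'], nglen=4)) == []

# fewer strings than nglen: everything is passed through as-is
assert list(filter_strings(['ab', 'cd'], nglen=4)) == ['ab', 'cd']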
Example #11
    def _add_rules(
        self,
        rules,
        _legalese=common_license_words,
        _spdx_tokens=frozenset(),
        _license_tokens=frozenset(),
    ):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.

        `_legalese` is a set of common license-specific words aka. legalese
        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
        `_license_tokens` is a set of "license" tokens used as the start or end of a rule
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # initial dictionary mapping for known legalese tokens
        ########################################################################

        # FIXME: we should start enumerating at 1 below: token ids then become
        # valid "unichr" values, making it easier downstream when used in
        # automatons

        self.dictionary = dictionary = {
            ts: tid for tid, ts in enumerate(sorted(_legalese))
        }

        dictionary_get = dictionary.get

        self.len_legalese = len_legalese = len(dictionary)
        highest_tid = len_legalese - 1

        # Add SPDX key tokens to the dictionary
        # these are always treated as non-legalese. This may seem weird
        # but they are detected in expressions alright and some of their
        # tokens exist as rules too (e.g. GPL)
        ########################################################################
        for sts in sorted(_spdx_tokens):
            stid = dictionary_get(sts)
            if stid is None:
                # we have a never yet seen token, so we assign a new tokenid
                highest_tid += 1
                stid = highest_tid
                dictionary[sts] = stid

        self.rules_by_rid = rules_by_rid = list(rules)
        # ensure that rules are sorted
        rules_by_rid.sort()
        len_rules = len(rules_by_rid)

        # create index data structures
        # OPTIMIZATION: bind frequently used methods to the local scope for
        # index structures
        ########################################################################
        tids_by_rid_append = self.tids_by_rid.append

        false_positive_rids_add = self.false_positive_rids.add
        regular_rids_add = self.regular_rids.add
        approx_matchable_rids_add = self.approx_matchable_rids.add

        # since we only use these for regular rules, these lists may be sparse.
        # their index is the rule rid
        self.high_postings_by_rid = high_postings_by_rid = [None] * len_rules
        self.sets_by_rid = sets_by_rid = [None] * len_rules
        self.msets_by_rid = msets_by_rid = [None] * len_rules

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # create a set of known "license" words used to determine if a rule
        # starts or ends with a "license" word/token
        ########################################################################
        license_tokens = set()
        for t in _license_tokens:
            tid = dictionary_get(t)
            if tid is not None:
                license_tokens.add(tid)

        rules_automaton_add = partial(match_aho.add_sequence,
            automaton=self.rules_automaton, with_duplicates=False)

        if USE_AHO_FRAGMENTS:
            fragments_automaton_add = partial(
                match_aho.add_sequence,
                automaton=self.fragments_automaton,
                with_duplicates=True,
            )

        if USE_RULE_STARTS:
            starts_automaton_add_start = partial(
                match_aho.add_start,
                automaton=self.starts_automaton,
            )

        # OPTIMIZED: bind frequently used objects to local scope
        rid_by_hash = self.rid_by_hash
        match_hash_index_hash = match_hash.index_hash
        match_set_tids_set_counter = match_set.tids_set_counter
        match_set_multiset_counter = match_set.multiset_counter

        len_starts = SMALL_RULE
        min_len_starts = SMALL_RULE * 6

        ngram_len = AHO_FRAGMENTS_NGRAM_LEN

        # Index each rule
        ########################################################################
        for rid, rule in enumerate(rules_by_rid):

            # assign rid
            rule.rid = rid

            rule_token_ids = array('h', [])
            tids_by_rid_append(rule_token_ids)
            rule_token_ids_append = rule_token_ids.append

            rule_tokens = []
            rule_tokens_append = rule_tokens.append

            # A rule is weak if it does not contain at least one legalese word:
            # we consider all rules to be weak until proven otherwise below.
            # "weak" rules can only be matched with an automaton.
            is_weak = True

            for rts in rule.tokens():
                rule_tokens_append(rts)
                rtid = dictionary_get(rts)
                if rtid is None:
                    # we have a never yet seen token, so we assign a new tokenid
                    # note: we could use the length of the dictionary instead
                    highest_tid += 1
                    rtid = highest_tid
                    dictionary[rts] = rtid
                if is_weak and rtid < len_legalese:
                    is_weak = False

                rule_token_ids_append(rtid)

            rule_length = rule.length
            is_tiny = rule_length < TINY_RULE

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash_index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            ####################
            # populate automaton with the whole rule tokens sequence, for all
            # RULEs, be they "standard"/regular, weak, false positive or small
            ####################
            rules_automaton_add(tids=rule_token_ids, rid=rid)

            if rule.is_false_positive:
                # False positive rules do not participate in the set or sequence
                # matching at all: they are used for exact matching and in post-
                # matching filtering
                false_positive_rids_add(rid)
                continue

            # from now on, we have regular rules
            rid_by_hash[rule_hash] = rid
            regular_rids_add(rid)

            # Does the rule start or end with a "license" word? We track this
            # to help disambiguate some overlapping false positive short rules
            # OPTIMIZED: the last rtid above IS the last token id
            if license_tokens:
                if rtid in license_tokens:
                    rule.ends_with_license = True
                if rule_token_ids[0] in license_tokens:
                    rule.starts_with_license = True

            # populate the unknown_automaton; this only makes sense for rules
            # that are also sequence matchable.
            ####################
            match_unknown.add_ngrams(
                automaton=self.unknown_automaton,
                tids=rule_token_ids,
                tokens=rule_tokens,
                len_legalese=len_legalese,
                rule_length=rule_length,
            )

            # Some rules cannot be matched as a sequence: "weak" rules and
            # rules that must be matched only as a continuous sequence of
            # tokens. This includes tiny, is_continuous, and small
            # is_license_reference or is_license_tag rules. We skip adding
            # these to the data structures used for sequence matching.
            can_match_as_sequence = not (
                is_weak
                or is_tiny
                or rule.is_continuous
                or (rule.is_small
                    and (rule.is_license_reference or rule.is_license_tag))
            )

            if can_match_as_sequence:
                approx_matchable_rids_add(rid)

                ####################
                # update high postings: positions by high tids used to
                # speed up sequence matching
                ####################
                # no postings for rules that cannot be matched as a sequence (too short and weak)
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid < len_legalese:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                high_postings_by_rid[rid] = postings

                ####################
                # ... and ngram fragments: compute ngrams and populate an automaton with ngrams
                ####################
                if (USE_AHO_FRAGMENTS
                    and rule.minimum_coverage < 100
                    and rule_length > ngram_len
                ):
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=ngram_len)
                    all_ngrams_with_pos = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    # all_ngrams_with_pos = enumerate(all_ngrams)
                    for pos, ngram in all_ngrams_with_pos:
                        fragments_automaton_add(tids=ngram, rid=rid, start=pos)

                ####################
                # use the start and end of this rule as a break point for query runs
                ####################
                if USE_RULE_STARTS and rule_length > min_len_starts:
                    starts_automaton_add_start(
                        tids=rule_token_ids[:len_starts],
                        rule_identifier=rule.identifier,
                        rule_length=rule_length,
                    )

            ####################
            # build set and multiset indexes for all regular rules, as we need
            # the thresholds
            ####################
            tids_set, mset = match_set.build_set_and_mset(
                rule_token_ids, _use_bigrams=USE_BIGRAM_MULTISETS)
            sets_by_rid[rid] = tids_set
            msets_by_rid[rid] = mset

            ####################################################################
            ####################################################################
            # FIXME!!!!!!! we should store them: we need them and we recompute
            # them later at match time
            tids_set_high = match_set.high_tids_set_subset(
                tids_set, len_legalese)
            mset_high = match_set.high_multiset_subset(
                mset, len_legalese, _use_bigrams=USE_BIGRAM_MULTISETS)

            # FIXME!!!!!!!
            ####################################################################
            ####################################################################

            ####################
            # update rule thresholds
            ####################
            rule.length_unique = match_set_tids_set_counter(tids_set)
            rule.high_length_unique = match_set_tids_set_counter(tids_set_high)

            rule.high_length = match_set_multiset_counter(mset_high)
            rule.compute_thresholds()

        ########################################################################
        # Finalize index data structures
        ########################################################################
        # Create the tid -> token string lookup structure.
        ########################################################################
        self.tokens_by_tid = tokens_by_tid = [
            ts for ts, _tid in sorted(dictionary.items(), key=itemgetter(1))]
        self.len_tokens = len_tokens = len(tokens_by_tid)

        # some tokens are made entirely of digits and these can create some
        # worst case behavior when there are long runs of these
        ########################################################################
        self.digit_only_tids = intbitset([
            i for i, s in enumerate(self.tokens_by_tid) if s.isdigit()])

        # Finalize automatons
        ########################################################################
        self.rules_automaton.make_automaton()
        if USE_AHO_FRAGMENTS:
            self.fragments_automaton.make_automaton()
        if USE_RULE_STARTS:
            match_aho.finalize_starts(self.starts_automaton)
        self.unknown_automaton.make_automaton()

        ########################################################################
        # Do some sanity checks
        ########################################################################

        msg = 'Inconsistent structure lengths'
        assert len_tokens == highest_tid + 1 == len(dictionary), msg

        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(
                    sorted([
                        ('file://' + rule.text_file)
                        if rule.text_file
                        else ('text: ' + rule.stored_text)
                            for rule in rules])
                    )
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
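The token id scheme set up at the top of this method is the key invariant: legalese tokens get the lowest ids and every later token gets the next free id, so `tid < len_legalese` identifies a "high" (legalese) token. A self-contained sketch with made-up tokens:

# Minimal sketch of the legalese-first id assignment; tokens are illustrative.
legalese = {'license', 'copyright', 'warranty'}
dictionary = {ts: tid for tid, ts in enumerate(sorted(legalese))}
len_legalese = len(dictionary)
highest_tid = len_legalese - 1

for token in ['this', 'license', 'is', 'permissive']:
    if token not in dictionary:
        highest_tid += 1
        dictionary[token] = highest_tid

# a token id below len_legalese therefore identifies a legalese ("high") token
assert dictionary['license'] < len_legalese <= dictionary['this']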
Example #12
def renumber_token_ids(rules_tokens_ids, dictionary, tokens_by_tid, frequencies_by_tid, length=9, with_checks=True):
    """
    Return updated index structures with new token ids such that the most common
    aka. 'junk' tokens have the lowest ids. 

    `rules_tokens_ids` is a mapping of rule_id->sequence of token ids
    
    These common tokens are based on a curated list of frequent words and
    further refined such that:
     - no rule text sequence is composed entirely of these common tokens.
     - no, or only a few, rule text sub-sequences of `length` tokens (aka.
       ngrams) are composed entirely of these common tokens.

    The returned structures are:
    - old_to_new: mapping of (old token id->new token id)
    - len_junk: the highest id of a junk token
    - dictionary (token string->token id)
    - tokens_by_tid (token id->token string)
    - frequencies_by_tid (token id->frequency)
    """
    # keep track of very common junk tokens: digits and single letters
    very_common = set()
    very_common_add = very_common.add
    string_lowercase = u'abcdefghijklmnopqrstuvwxyz'
    for tid, token in enumerate(tokens_by_tid):
        # DIGIT TOKENS: Treat tokens composed only of digits as common junk
        # SINGLE ASCII LETTER TOKENS: Treat single ASCII letter tokens as common junk

        # TODO: ensure common numbers as strings are always there (one, two, and first, second, etc.)
        if token.isdigit() or (len(token) == 1 and token in string_lowercase):
            very_common_add(tid)

    # keep track of good, "not junk" tokens
    good = set()
    good_update = good.update

    # Classify rules as small (at most `length` tokens) or regular.
    regular_rules = []
    regular_rules_append = regular_rules.append
    small_rules = []
    small_rules_append = small_rules.append

    for rid, rule_toks_ids in enumerate(rules_tokens_ids):
        len_toks = len(rule_toks_ids)
        if len_toks == 1:
            # RULES of ONE TOKEN: their token cannot be junk
            good_update(rule_toks_ids)
        if len_toks <= length:
            small_rules_append((rid, rule_toks_ids))
        else:
            regular_rules_append((rid, rule_toks_ids))

    # Build a candidate junk set of roughly ~1/10th the size of the tokens set:
    # we use a curated list of common words as a base. The final length (and
    # also the biggest token id) of the junk tokens set is typically ~1200 for
    # about 12K tokens.

    # use integer division so the junk_count == junk_max check below can stop the loop
    junk_max = abs((len(tokens_by_tid) // 11) - len(very_common))

    junk = set()
    junk_add = junk.add
    dictionary_get = dictionary.get
    junk_count = 0
    
    for token in global_tokens_by_ranks():
        tid = dictionary_get(token)
        if tid is None:
            continue

        if tid not in very_common and tid not in good:
            junk_add(tid)
            junk_count += 1

        if junk_count == junk_max:
            break

    # Assemble our final junk and not junk sets
    final_junk = (very_common | junk) - good
    good = set(range(len(tokens_by_tid))) - final_junk

    if with_checks:
        # Now do a few sanity checks...
        def tokens_str(_tks):
            return u' '.join(tokens_by_tid[_tk] for _tk in _tks)

        # Check that no small rule is made entirely of junk
        for rid, tokens in small_rules:
            try:
                assert not all([jt in final_junk for jt in tokens])
            except AssertionError:
                # this is a serious index issue
                print('!!!License Index FATAL ERROR: small rule:', rid, 'is all made of junk:', tokens_str(tokens))
                raise

        # Check that not too many ngrams are made entirely of junk:
        # we build ngrams of `length` tokens over the tokens of rules that are
        # at least `length` tokens long and check them all

        all_junk_ngrams_count = 0
        for rid, tokens in regular_rules:
            for ngram in ngrams(tokens, length):
                # skip ngrams composed only of common junk as not significant
                if all(nt in very_common for nt in ngram):
                    continue
                try:
                    # note: we check only against junk, not final_junk
                    assert not all(nt in junk for nt in ngram)
                except AssertionError:
                    all_junk_ngrams_count += 1

        # TODO: test that the junk choice is correct: for instance using some
        # stats based on standard deviation or markov chains or similar
        # conditional probabilities such that we verify that we CANNOT create a
        # distinctive meaningful license string made entirely from junk tokens

        # check that we do not have too many ngrams made entirely of junk
        assert all_junk_ngrams_count < (length * 20)

    # Sort each set of old token IDs by decreasing original frequencies
    # FIXME: should use a key function not a schwartzian sort
    decorated = ((frequencies_by_tid[old_id], old_id) for old_id in final_junk)
    final_junk = [t for _f, t in sorted(decorated, reverse=True)]

    # FIXME: should use a key function not a schwartzian sort
    decorated = ((frequencies_by_tid[old_id], old_id) for old_id in good)
    good = [t for _f, t in sorted(decorated, reverse=True)]

    # create the new ids -> tokens value mapping
    new_tokens_by_tid = [tokens_by_tid[t] for t in final_junk + good]

    # sanity check: by construction this should always be true
    assert set(new_tokens_by_tid) == set(tokens_by_tid)

    # create new structures based on new ids and a mapping from old to new id
    len_tokens = len(new_tokens_by_tid)
    old_to_new = array('h', [0] * len_tokens)
    new_frequencies_by_tid = [None] * len_tokens
    new_dictionary = {}

    # assign new ids, rebuild the dictionary and frequencies
    for new_id, token in enumerate(new_tokens_by_tid):
        old_id = dictionary[token]
        old_to_new[old_id] = new_id

        new_dictionary[token] = new_id

        old_freq = frequencies_by_tid[old_id]
        new_frequencies_by_tid[new_id] = old_freq

    sparsify(new_dictionary)
    return old_to_new, len(final_junk), new_dictionary, new_tokens_by_tid, new_frequencies_by_tid
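A toy illustration of the remapping performed at the end of renumber_token_ids: junk token ids come first in the new id space, each group ordered by decreasing frequency. All values below are made up.

from array import array

tokens_by_tid = ['the', 'gpl', 'of', 'warranty']
frequencies_by_tid = [90, 5, 80, 7]
final_junk = {0, 2}   # 'the', 'of'
good = {1, 3}         # 'gpl', 'warranty'

# junk first, then good, each sorted by decreasing frequency
ordered = (sorted(final_junk, key=lambda t: -frequencies_by_tid[t])
           + sorted(good, key=lambda t: -frequencies_by_tid[t]))
# ordered == [0, 2, 3, 1] -> new ids: 'the'=0, 'of'=1, 'warranty'=2, 'gpl'=3

old_to_new = array('h', [0] * len(tokens_by_tid))
for new_id, old_id in enumerate(ordered):
    old_to_new[old_id] = new_id
# old_to_new == array('h', [0, 3, 1, 2])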
Example #13
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            rul_len = rul.length
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
                if rul_len > self.largest_false_positive_length:
                    self.largest_false_positive_length = rul_len
            elif rul.negative():
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.false_positive:
                # FP rules are not used for any matching
                # there is nothing else for these rules
                self.false_positive_rid_by_hash[rule_hash] = rid
            else:
                # negative, small and regular

                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {
                    tid: array('h', value)
                    for tid, value in postings.items()
                }
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = index_token_sets(
                    rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automatons...
                if rule.negative():
                    # ... with only the whole rule tokens sequence
                    negative_automaton_add(tids=rule_token_ids, rid=rid)
                else:
                    # ... or with the whole rule tokens sequence
                    rules_automaton_add(tids=rule_token_ids, rid=rid)
                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
                    if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(
                            rule_token_ids) > NGRAM_LEN:
                        all_ngrams = ngrams(rule_token_ids,
                                            ngram_length=NGRAM_LEN)
                        selected_ngrams = select_ngrams(all_ngrams,
                                                        with_pos=True)
                        for pos, ngram in selected_ngrams:
                            rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = tids_set_counter(rlow_set)
                rule.high_unique = tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = tids_multiset_counter(rlow_mset)
                rule.high_length = tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)
        sparsify(self.false_positive_rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules]
                               for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' +
                   u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True
Example #14
    def _add_rules(self,
                   rules,
                   _ranked_tokens=global_tokens_by_ranks,
                   _spdx_tokens=None):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.

        `_spdx_tokens` if provided is a set of token strings from known SPDX
        keys: these receive a special treatment.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            if rul.is_false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.is_negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Add SPDX key tokens to the dictionary. Track which tokens come only from SPDX keys
        ########################################################################
        spdx_tokens = None
        if _spdx_tokens:
            spdx_tokens = _spdx_tokens.difference(frequencies_by_token)
            frequencies_by_token.update(_spdx_tokens)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        ########################################################################
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
        assert len_tokens <= MAX_TOKENS, msg

        # initial dictionary mapping to old/arbitrary token ids
        ########################################################################
        self.dictionary = dictionary = {
            ts: tid
            for tid, ts in enumerate(tokens_by_tid)
        }
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) integer ids
        ########################################################################
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok]
                            for rule_tok in token_strings_by_rid]

        # Get SPDX-only token ids
        ########################################################################
        spdx_token_ids = None
        if spdx_tokens:
            spdx_token_ids = set(dictionary[tok] for tok in spdx_tokens)

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid,
                                             _ranked_tokens,
                                             _spdx_token_ids=spdx_token_ids)
        (
            self.len_junk,
            self.dictionary,
            self.tokens_by_tid,
            self.tids_by_rid,
            self.weak_rids,
        ) = renumbered

        len_junk, dictionary, tokens_by_tid, tids_by_rid, weak_rids = renumbered

        #######################################################################
        # build index structures
        #######################################################################
        self.len_good = len_good = len_tokens - len_junk
        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence,
                                         automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence,
                                      automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            rule_is_weak = rid in weak_rids

            if rule.is_negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)
            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by

                # FIXME: we do not want to keep small rules and rules that
                # cannot be sequence-matched in the index

                # no postings for junk-only rules
                if not rule_is_weak:
                    postings = defaultdict(list)
                    for pos, tid in enumerate(rule_token_ids):
                        if tid >= len_junk:
                            postings[tid].append(pos)
                    # OPTIMIZED: for speed and memory: convert postings to arrays
                    postings = {
                        tid: array('h', value)
                        for tid, value in postings.items()
                    }
                    # OPTIMIZED: for speed, sparsify dict
                    sparsify(postings)
                    self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(
                    rule_token_ids, len_junk, len_good)

                # no set indexes for junk only rules
                if not rule_is_weak:
                    self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                    self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if (USE_AHO_FRAGMENTS and rule.minimum_coverage < 100
                        and len(rule_token_ids) > NGRAM_LEN):
                    all_ngrams = tokenize.ngrams(rule_token_ids,
                                                 ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams,
                                                             with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # FIXME: this may not be updated for a rule that is created at
                # match time such as SPDX rules

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [
            rules for rules in dupe_rules_by_hash.values() if len(rules) > 1
        ]
        if dupe_rules:
            dupe_rule_paths = [
                '\n'.join(
                    sorted([('file://' + rule.text_file) if rule.text_file else
                            ('text: ' + rule.stored_text) for rule in rules]))
                for rules in dupe_rules
            ]
            msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
            raise AssertionError(msg)

        self.optimized = True
Example #15
    def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
        """
        Add a list of Rule objects to the index and construct optimized and
        immutable index structures.
        """
        if self.optimized:
            raise Exception('Index has been optimized and cannot be updated.')

        # this assigns the rule ids implicitly: this is the index in the list
        self.rules_by_rid = list(rules)

        #######################################################################
        # classify rules, collect tokens and frequencies
        #######################################################################
        # accumulate all rule tokens strings. This is used only during indexing
        token_strings_by_rid = []
        # collect the unique token strings and compute their global frequency
        # This is used only during indexing
        frequencies_by_token = Counter()

        for rid, rul in enumerate(self.rules_by_rid):
            rul_tokens = list(rul.tokens())
            token_strings_by_rid.append(rul_tokens)
            frequencies_by_token.update(rul_tokens)
            # assign the rid to the rule object for sanity
            rul.rid = rid

            # classify rules and build disjoint sets of rids
            if rul.false_positive:
                # false positive rules do not participate in the matches at all
                # they are used only in post-matching filtering
                self.false_positive_rids.add(rid)
            elif rul.negative:
                # negative rules are matched early and their exactly matched
                # tokens are removed from the token stream
                self.negative_rids.add(rid)
            elif rul.small():
                # small rules are best matched with a specialized approach
                self.small_rids.add(rid)
            else:
                # regular rules are matched using a common approach
                self.regular_rids.add(rid)

        # Create the tokens lookup structure at once. Note that token ids are
        # assigned randomly here at first by unzipping: we get the frequencies
        # and tokens->id at once this way
        tokens_by_tid, frequencies_by_tid = izip(*frequencies_by_token.items())
        self.tokens_by_tid = tokens_by_tid
        self.len_tokens = len_tokens = len(tokens_by_tid)
        assert len_tokens <= MAX_TOKENS, 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS

        # initial dictionary mapping to old/random token ids
        self.dictionary = dictionary = {ts: tid for tid, ts in enumerate(tokens_by_tid)}
        sparsify(dictionary)

        # replace token strings with arbitrary (and temporary) integer ids
        self.tids_by_rid = [[dictionary[tok] for tok in rule_tok] for rule_tok in token_strings_by_rid]

        #######################################################################
        # renumber token ids based on frequencies and common words
        #######################################################################
        renumbered = self.renumber_token_ids(frequencies_by_tid, _ranked_tokens)
        self.len_junk, self.dictionary, self.tokens_by_tid, self.tids_by_rid = renumbered
        len_junk, dictionary, tokens_by_tid, tids_by_rid = renumbered
        self.len_good = len_good = len_tokens - len_junk

        #######################################################################
        # build index structures
        #######################################################################

        len_rules = len(self.rules_by_rid)

        # since we only use these for regular rules, these lists may be sparse
        # their index is the rule rid
        self.high_postings_by_rid = [None for _ in range(len_rules)]
        self.tids_sets_by_rid = [None for _ in range(len_rules)]
        self.tids_msets_by_rid = [None for _ in range(len_rules)]

        # track all duplicate rules: fail and report dupes at once at the end
        dupe_rules_by_hash = defaultdict(list)

        # build closures for methods that populate automatons
        negative_automaton_add = partial(match_aho.add_sequence, automaton=self.negative_automaton)
        rules_automaton_add = partial(match_aho.add_sequence, automaton=self.rules_automaton)

        # build by-rule index structures over the token ids seq of each rule
        for rid, rule_token_ids in enumerate(tids_by_rid):
            rule = self.rules_by_rid[rid]

            # build hashes index and check for duplicate rule texts
            rule_hash = match_hash.index_hash(rule_token_ids)
            dupe_rules_by_hash[rule_hash].append(rule)

            if rule.negative:
                negative_automaton_add(tids=rule_token_ids, rid=rid)

            else:
                # update hashes index
                self.rid_by_hash[rule_hash] = rid

                # update high postings index: positions by high tids
                # TODO: this could be optimized with a group_by
                postings = defaultdict(list)
                for pos, tid in enumerate(rule_token_ids):
                    if tid >= len_junk:
                        postings[tid].append(pos)
                # OPTIMIZED: for speed and memory: convert postings to arrays
                postings = {tid: array('h', value) for tid, value in postings.items()}
                # OPTIMIZED: for speed, sparsify dict
                sparsify(postings)
                self.high_postings_by_rid[rid] = postings

                # build high and low tids sets and multisets
                rlow_set, rhigh_set, rlow_mset, rhigh_mset = match_set.index_token_sets(rule_token_ids, len_junk, len_good)
                self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

                # populate automaton with the whole rule tokens sequence
                rules_automaton_add(tids=rule_token_ids, rid=rid)
                # ... and ngrams: compute ngrams and populate the automaton with ngrams
                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
                    selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
                    for pos, ngram in selected_ngrams:
                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                # update rule thresholds
                rule.low_unique = match_set.tids_set_counter(rlow_set)
                rule.high_unique = match_set.tids_set_counter(rhigh_set)
                rule.length_unique = rule.high_unique + rule.low_unique
                rule.low_length = match_set.tids_multiset_counter(rlow_mset)
                rule.high_length = match_set.tids_multiset_counter(rhigh_mset)
                assert rule.length == rule.low_length + rule.high_length

        # finalize automatons
        self.negative_automaton.make_automaton()
        self.rules_automaton.make_automaton()

        # sparser dicts for faster lookup
        sparsify(self.rid_by_hash)

        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
        if dupe_rules:
            dupe_rule_paths = [['file://' + rule.text_file for rule in rules] for rules in dupe_rules]
            msg = (u'Duplicate rules: \n' + u'\n'.join(map(repr, dupe_rule_paths)))
            raise AssertionError(msg)

        self.optimized = True