Ejemplo n.º 1
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # NOTE(review): the docstring above is kept verbatim; it presumably doubles
    # as the command help text (the lone backspace-escape line looks like a
    # "do not rewrap" marker) -- confirm before rewording it.

    parsed_rules = load_data(licenses_file)
    known_token_seqs = all_rule_tokens()

    licenses_db = cache.get_licenses_db()
    licensing = Licensing(licenses_db.values())

    # Validate every parsed rule up front and abort on the first bad batch.
    print()
    problems = validate_license_rules(parsed_rules, licensing)
    if problems:
        print('Invalid rules: exiting....')
        for problem in problems:
            print(problem)
            print()

        raise Exception('Invalid rules: exiting....')

    print()
    for rule in parsed_rules:
        is_negative = rule.data.get('is_negative')
        is_false_positive = rule.data.get('is_false_positive')

        # Only non-negative rules are skipped when their text already exists.
        existing = rule_exists(rule.text)
        if existing and not is_negative:
            print('Skipping existing non-negative rule:', existing,
                  'with text:\n', rule.text[:50].strip(), '...')
            continue

        # The base file name comes from the rule kind or from its normalized
        # license expression.
        if is_negative:
            base_name = 'not-a-license'
        else:
            raw_expression = rule.data.get('license_expression')
            parsed_expression = licensing.parse(
                raw_expression, validate=True, simple=True)
            base_name = str(parsed_expression)
            if is_false_positive:
                base_name = 'false-positive_' + base_name

        base_loc = find_rule_base_loc(base_name)

        # Write the metadata file, then the text file: the Rule below is built
        # from these two on-disk locations.
        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as data_out:
            data_out.write(rule.raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as text_out:
            text_out.write(rule.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rulerec.tokens())

        if rule_tokens not in known_token_seqs:
            known_token_seqs.add(rule_tokens)
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
            continue

        # Same token sequence as a known rule: remove the files just written.
        os.remove(text_file)
        os.remove(data_file)
        print('Skipping already added rule with text for:', base_name)
Ejemplo n.º 2
0
    def test_compute_relevance_is_using_rule_length(self):
        """
        Check the relevance that set_relevance() assigns for a range of rule
        lengths when the rule has no stored relevance.
        """
        rule = models.Rule(stored_text='1', license_expression='some-license')
        rule.relevance = 13
        rule.has_stored_relevance = False
        rule.is_false_positive = False

        # Pairs of (rule length, expected relevance), checked in this order.
        expected_relevance_by_length = [
            (1000, 100),
            (21, 100),
            (20, 100),
            (18, 100),
            (17, 94),
            (16, 88),
            (15, 83),
            (14, 77),
            (13, 72),
            (12, 66),
            (11, 61),
            (10, 55),
            (8, 44),
            (5, 27),
            (2, 11),
            (1, 5),
            (0, 0),
        ]

        for length, expected in expected_relevance_by_length:
            rule.length = length
            rule.set_relevance()
            assert rule.relevance == expected
Ejemplo n.º 3
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # NOTE(review): the docstring above is kept verbatim; it presumably doubles
    # as the command help text -- confirm before rewording it.

    parsed_rules = load_data(licenses_file)
    known_token_seqs = all_rule_tokens()

    licenses_by_key = cache.get_licenses_db()
    basic_rules = []

    for parsed in parsed_rules:
        attributes = parsed.data
        # Record whether relevance / minimum coverage were given explicitly.
        attributes['has_stored_relevance'] = bool(
            attributes.get('relevance'))
        attributes['has_stored_minimum_coverage'] = bool(
            attributes.get('minimum_coverage'))

        basic_rule = models.BasicRule(**attributes)
        basic_rule.stored_text = parsed.text
        basic_rules.append(basic_rule)

    models.validate_rules(basic_rules, licenses_by_key, with_text=True)

    print()
    for rule in basic_rules:
        existing = rule_exists(rule.text())
        if existing:
            print('Skipping existing rule:', existing, 'with text:\n',
                  rule.text()[:50].strip(), '...')
            continue

        # Base file name: rule kind first, license expression otherwise.
        if rule.is_false_positive:
            base_name = 'false-positive'
        elif rule.is_license_intro:
            base_name = 'license-intro'
        else:
            base_name = rule.license_expression

        base_loc = find_rule_base_loc(base_name)

        # Build the full Rule from the basic rule's dict plus the attributes
        # that are carried over explicitly.
        rule_kwargs = rule.to_dict()
        rule_kwargs['stored_text'] = rule.stored_text
        rule_kwargs['has_stored_relevance'] = rule.has_stored_relevance
        rule_kwargs['has_stored_minimum_coverage'] = (
            rule.has_stored_minimum_coverage)

        rulerec = models.Rule(**rule_kwargs)
        rulerec.data_file = base_loc + '.yml'
        rulerec.text_file = base_loc + '.RULE'

        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in known_token_seqs:
            print('Skipping already added rule with text for:', base_name)
            continue

        known_token_seqs.add(rule_tokens)
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        print(
            'Rule added:',
            'file://' + rulerec.data_file,
            '\n',
            'file://' + rulerec.text_file,
        )
Ejemplo n.º 4
0
    def test_match_to_indexed_template_with_few_tokens_around_gaps(self):
        """
        Match a query against a single indexed template rule and check the
        matched query text, matched rule text, coverage and matcher.
        """
        # Was failing when a gap in a template starts very close to the start of
        # a rule tokens seq. We may still skip that, but we capture a large
        # match anyway.

        template_loc = self.get_test_loc('index/templates/idx.txt')
        template_rule = models.Rule(text_file=template_loc, licenses=['test'])
        idx = index.LicenseIndex([template_rule])

        query_loc = self.get_test_loc('index/templates/query.txt')
        matches = idx.match(location=query_loc)
        assert 1 == len(matches)
        match = matches[0]

        # Expected words of the matched query text, compared word by word.
        expected_query_words = u"""
            All Rights Reserved

            Redistribution and use of this software and associated documentation
            Software with or without modification are permitted provided that
            the following conditions are met

            1 Redistributions of source code must retain copyright statements
            and notices Redistributions must also contain a copy of this
            document

            2 Redistributions in binary form must reproduce the above copyright
            notice this list of conditions and the following disclaimer in the
            documentation and or other materials provided with the distribution

            3 The name [groovy] must not be used to endorse or promote
            products derived from this Software without prior written permission
            of <The> [Codehaus] For written permission please contact
            [info] [codehaus] [org]

            4 Products derived from this Software may not be called [groovy]
            nor may [groovy] appear in their names without prior written
            permission of <The> [Codehaus] [groovy] is a registered
            trademark of <The> [Codehaus]

            5 Due credit should be given to <The> [Codehaus]
            [http] [groovy] [codehaus] [org]


            <THIS> <SOFTWARE> <IS> <PROVIDED> <BY> <THE> [CODEHAUS] AND CONTRIBUTORS
            AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT NOT
            LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
            A PARTICULAR PURPOSE ARE DISCLAIMED IN NO EVENT SHALL <THE> [CODEHAUS]
            OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT INDIRECT
            INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES INCLUDING BUT
            NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF
            USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
            ANY THEORY OF LIABILITY WHETHER IN CONTRACT STRICT LIABILITY OR TORT
            INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE
            OF THIS SOFTWARE EVEN IF ADVISED OF THE [POSSIBILITY] <OF> [SUCH] DAMAGE
        """.split()

        # Expected words of the matched rule (index) text.
        expected_rule_words = u"""
            All Rights Reserved

            Redistribution and use of this software and associated documentation
            Software with or without modification are permitted provided that
            the following conditions are met

            1 Redistributions of source code must retain copyright statements
            and notices Redistributions must also contain a copy of this
            document

            2 Redistributions in binary form must reproduce the above copyright
            notice this list of conditions and the following disclaimer in the
            documentation and or other materials provided with the distribution

            3 The name must not be used to endorse or promote products
            derived from this Software without prior written permission of
            For written permission please contact

            4 Products derived from this Software may not be called nor
            may appear in their names without prior written permission of
            is a registered trademark of

            5 Due credit should be given to

            <THIS> <SOFTWARE> <IS> <PROVIDED> <BY>
            AND CONTRIBUTORS AS IS AND ANY
            EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO THE
            IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
            PURPOSE ARE DISCLAIMED IN NO EVENT SHALL OR ITS CONTRIBUTORS
            BE LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR
            CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF
            SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS
            INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER
            IN CONTRACT STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR
            OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF
            ADVISED OF THE DAMAGE
        """.split()

        matched_query_text, matched_rule_text = get_texts(
            match, location=query_loc, idx=idx)
        assert expected_query_words == matched_query_text.split()
        assert expected_rule_words == matched_rule_text.split()
        assert match.coverage() > 97
        assert match_seq.MATCH_SEQ == match.matcher
Ejemplo n.º 5
0
    def test_index_structures(self):
        """
        Index a few small rules and check the per-rule length counters, the
        token dictionary, the token-by-id list and the index dict dump.
        """
        # Each entry is the rule text followed by the expected
        # (low_unique, high_unique, low_length, high_length) values.
        test_rules = [
            (u'a one a two a three licensed.', 4, 1, 6, 1),
            (u'a four a five a six licensed.', 2, 3, 4, 3),
            (u'one two three four five gpl', 4, 2, 4, 2),
            (u'The rose is a rose mit', 3, 2, 3, 3),
            (u'The license is GPL', 3, 1, 3, 1),
            (u'The license is a GPL', 4, 1, 4, 1),
            (u'a license is a rose', 3, 1, 4, 1),
            (u'the gpl', 1, 1, 1, 1),
            (u'the mit', 1, 1, 1, 1),
            (u'the bsd', 1, 1, 1, 1),
            (u'the lgpl', 1, 1, 1, 1),
        ]
        idx = index.LicenseIndex()
        rules = [models.Rule(_text=entry[0]) for entry in test_rules]
        idx._add_rules(rules)

        assert 8 == idx.len_junk

        expected_lengths = [entry[1:] for entry in test_rules]
        actual_lengths = []
        for rule in rules:
            actual_lengths.append((
                rule.low_unique,
                rule.high_unique,
                rule.low_length,
                rule.high_length,
            ))
        assert expected_lengths == actual_lengths

        # Expected tokens listed in token id order: position == token id.
        expected_tokens_by_tid = [
            u'a',
            u'the',
            u'is',
            u'license',
            u'three',
            u'four',
            u'two',
            u'one',
            u'gpl',
            u'rose',
            u'licensed',
            u'five',
            u'mit',
            u'lgpl',
            u'six',
            u'bsd',
        ]

        # The expected token -> id dictionary is the inverse of the list above.
        expected_dictionary = {
            token: tid for tid, token in enumerate(expected_tokens_by_tid)
        }

        assert expected_dictionary == idx.dictionary
        assert expected_tokens_by_tid == idx.tokens_by_tid

        expected_index_dump = {
            '_tst_18_4': {u'gpl': [3]},
            '_tst_19_6': {u'rose': [4]},
            '_tst_20_5': {u'gpl': [4]},
            '_tst_22_3': {u'mit': [5], u'rose': [1, 4]},
            '_tst_27_2': {u'five': [4], u'gpl': [5]},
            '_tst_29_0': {u'licensed': [6]},
            '_tst_29_1': {u'five': [3], u'licensed': [6], u'six': [5]},
            '_tst_7_7': {u'gpl': [1]},
            '_tst_7_8': {u'mit': [1]},
            '_tst_7_9': {u'bsd': [1]},
            '_tst_8_10': {u'lgpl': [1]},
        }

        assert expected_index_dump == idx.to_dict()
Ejemplo n.º 6
0
    def test_compute_relevance_is_using_rule_length(self):
        """
        Check the relevance that compute_relevance() assigns for a range of
        rule lengths when the rule has no stored relevance.
        """
        rule = models.Rule(stored_text='1', license_expression='some-license')
        rule.relevance = 13
        rule.has_stored_relevance = False
        rule.is_false_positive = False

        # Pairs of (rule length, expected relevance), checked in this order.
        expected_relevance_by_length = [
            (1000, 100),
            (21, 100),
            (20, 100),
            (18, 100),
            (17, 94),
            (16, 88),
            (15, 83),
            (14, 77),
            (13, 72),
            (12, 66),
            (11, 61),
            (10, 55),
            (8, 44),
            (5, 27),
            (2, 11),
            (1, 5),
            (0, 0),
        ]

        for length, expected in expected_relevance_by_length:
            rule.length = length
            rule.compute_relevance()
            assert expected == rule.relevance
Ejemplo n.º 7
0
 def test_negative(self):
     """
     A rule built without licenses, or with an empty license list, reports
     negative(); a rule built with one license does not.
     """
     no_licenses_arg = models.Rule(_text='test_text')
     assert no_licenses_arg.negative()

     one_license = models.Rule(_text='test_text', licenses=['mylicense'])
     assert not one_license.negative()

     empty_licenses = models.Rule(_text='test_text', licenses=[])
     assert empty_licenses.negative()
Ejemplo n.º 8
0
    def test_index_structures_with__add_rules(self):
        """
        Index every rule file of the 'index/tokens_count' test directory and
        check the non-legalese tokens and the per-rule token multisets.
        """
        base = self.get_test_loc('index/tokens_count')
        file_names = sorted(os.listdir(base))
        idx = MiniLicenseIndex()
        rules = [
            models.Rule(
                text_file=os.path.join(base, file_name),
                license_expression='gpl-2.0',
            )
            for file_name in file_names
        ]

        idx._add_rules(rules, _legalese=mini_legalese)

        assert idx.len_legalese == 40

        # Tokens whose id is at or above len_legalese, per the dictionary.
        expected_tokens = {
            'all', 'allowed', 'and', 'any', 'for', 'is', 'redistribution',
            'thing', 'yes',
        }
        dictionary_tokens = set()
        for token, tid in idx.dictionary.items():
            if tid >= idx.len_legalese:
                dictionary_tokens.add(token)

        assert dictionary_tokens == expected_tokens

        # The same tokens, this time read from the token-by-id list.
        expected_sorted_tokens = sorted([
            'all', 'allowed', 'and', 'any', 'for', 'is', 'redistribution',
            'thing', 'yes',
        ])
        listed_tokens = [
            token for tid, token in enumerate(idx.tokens_by_tid)
            if tid >= idx.len_legalese
        ]

        assert sorted(listed_tokens) == expected_sorted_tokens

        # One token -> frequency mapping per rule, in rule id order.
        expected_msets_by_rid = [
            {u'redistribution': 1},
            {u'is': 1, u'redistribution': 1, u'yes': 1},
            {u'allowed': 1, u'is': 1, u'redistribution': 1, u'yes': 1},
            {u'allowed': 1, u'for': 1, u'is': 1, u'redistribution': 1,
             u'yes': 1},
            {u'all': 1, u'allowed': 1, u'for': 1, u'is': 1,
             u'redistribution': 1},
            {u'all': 1, u'allowed': 1, u'and': 1, u'any': 1, u'is': 1,
             u'redistribution': 1, u'thing': 1},
            {u'is': 1, u'redistribution': 1},
            {u'allowed': 1, u'is': 1, u'redistribution': 1},
            {u'allowed': 1, u'for': 1, u'is': 1, u'redistribution': 1},
            {u'all': 1, u'allowed': 1, u'is': 1, u'redistribution': 1,
             u'yes': 1},
            {u'all': 1, u'allowed': 1, u'and': 1, u'is': 1,
             u'redistribution': 1},
            {u'all': 1, u'allowed': 1, u'is': 1, u'redistribution': 1},
            {u'all': 1, u'allowed': 1, u'and': 1, u'any': 1, u'is': 1,
             u'redistribution': 1},
        ]

        # Translate token ids back to token strings before comparing.
        actual_msets_by_rid = []
        for tids_mset in idx.msets_by_rid:
            actual_msets_by_rid.append({
                idx.tokens_by_tid[tid]: freq
                for tid, freq in tids_mset.items()
            })
        assert actual_msets_by_rid == expected_msets_by_rid
Ejemplo n.º 9
0
 def test_create_template_rule(self):
     """
     Check the tokens and length of a rule whose stored text contains an
     empty '{{}}' marker.
     """
     rule = models.Rule(stored_text='A one. A {{}}two. A three.')
     tokens = list(rule.tokens())
     assert ['a', 'one', 'a', 'two', 'a', 'three'] == tokens
     assert 6 == rule.length
Ejemplo n.º 10
0
 def test_rule_identifier_ignores_small_text_differences(self):
     """
     Two rules whose texts differ only in case and whitespace get the same
     rule identifier.
     """
     plain_file = self.create_test_file('Some text')
     plain_rule = models.Rule(text_file=plain_file, template=False)

     spaced_file = self.create_test_file(' some  \n  text ')
     spaced_rule = models.Rule(text_file=spaced_file, template=False)

     plain_id = models.rule_identifier(plain_rule)
     spaced_id = models.rule_identifier(spaced_rule)
     assert plain_id == spaced_id
Ejemplo n.º 11
0
 def test_rule_identifier_includes_structure(self):
     """
     Two rules with the same text but a different license_choice flag get
     different rule identifiers.
     """
     no_choice_file = self.create_test_file('Some text')
     no_choice_rule = models.Rule(text_file=no_choice_file,
                                  license_choice=False)

     choice_file = self.create_test_file('Some text')
     choice_rule = models.Rule(text_file=choice_file, license_choice=True)

     no_choice_id = models.rule_identifier(no_choice_rule)
     choice_id = models.rule_identifier(choice_rule)
     assert no_choice_id != choice_id
Ejemplo n.º 12
0
 def test_rule_identifier_includes_rule_type(self):
     """
     Two rules with the same text but a different template flag get
     different rule identifiers.
     """
     template_file = self.create_test_file('Some text')
     template_rule = models.Rule(text_file=template_file, template=True)

     plain_file = self.create_test_file('Some text')
     plain_rule = models.Rule(text_file=plain_file, template=False)

     template_id = models.rule_identifier(template_rule)
     plain_id = models.rule_identifier(plain_rule)
     assert template_id != plain_id
Ejemplo n.º 13
0
    def test_index_structures(self):
        """
        Index a few small rules with the mini legalese set and check the
        per-rule length counters and the non-legalese tokens.
        """
        # Each entry is the rule text and the expected
        # (length_unique, high_length_unique, length, high_length) tuple.
        test_rules = [
            (u'a one a two a three licensed.', (4, 1, 4, 1)),
            (u'a four a five a six licensed.', (4, 1, 4, 1)),
            (u'one two three four five gpl', (6, 0, 6, 0)),
            (u'The rose is a rose mit', (4, 0, 5, 0)),
            (u'The license is GPL', (4, 1, 4, 1)),
            (u'The license is this GPL', (5, 1, 5, 1)),
            (u'a license is a rose', (3, 1, 3, 1)),
            (u'the gpl', (2, 0, 2, 0)),
            (u'the mit', (2, 0, 2, 0)),
            (u'the bsd', (2, 0, 2, 0)),
            (u'the lgpl', (2, 0, 2, 0)),
        ]
        idx = MiniLicenseIndex()
        rules = [models.Rule(stored_text=text) for text, _ in test_rules]
        idx._add_rules(rules, _legalese=mini_legalese)

        assert 40 == idx.len_legalese

        expected_lengths = [lengths for _, lengths in test_rules]
        actual_lengths = []
        for rule in rules:
            actual_lengths.append((
                rule.length_unique,
                rule.high_length_unique,
                rule.length,
                rule.high_length,
            ))
        assert expected_lengths == actual_lengths

        # Tokens whose id is at or above len_legalese, per the dictionary.
        expected_tokens = {
            'bsd',
            'five',
            'four',
            'gpl',
            'is',
            'lgpl',
            'mit',
            'one',
            'rose',
            'six',
            'the',
            'this',
            'three',
            'two',
        }
        dictionary_tokens = set()
        for token, tid in idx.dictionary.items():
            if tid >= idx.len_legalese:
                dictionary_tokens.add(token)

        assert expected_tokens == dictionary_tokens

        # The same tokens, this time read from the token-by-id list.
        expected_sorted_tokens = sorted([
            'one',
            'two',
            'three',
            'four',
            'five',
            'six',
            'gpl',
            'the',
            'rose',
            'is',
            'mit',
            'this',
            'bsd',
            'lgpl',
        ])
        listed_tokens = [
            token for tid, token in enumerate(idx.tokens_by_tid)
            if tid >= idx.len_legalese
        ]

        assert expected_sorted_tokens == sorted(listed_tokens)
Ejemplo n.º 14
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # NOTE(review): the docstring above is kept verbatim; it presumably doubles
    # as the command help text -- confirm before rewording it.

    def duplicate_message(base_name, existing_rule, skinny_text):
        """
        Return the "skipping duplicate" message for a rule.

        The message is rendered in a single f-string pass so that any "{" or
        "}" in the rule text or names (such as "{{...}}" markers) is printed
        literally and never re-parsed as a format field.
        """
        return (
            f'Skipping rule for: {base_name!r}, '
            f'dupe of: {existing_rule} '
            f'with text: {skinny_text!r}...'
        )

    rules_data = load_data(licenses_file)
    rule_by_tokens = all_rule_by_tokens()

    licenses_by_key = cache.get_licenses_db()
    skinny_rules = []

    for rdata in rules_data:
        # Record whether relevance / minimum coverage were given explicitly.
        relevance = rdata.data.get('relevance')
        rdata.data['has_stored_relevance'] = bool(relevance)

        minimum_coverage = rdata.data.get('minimum_coverage')
        rdata.data['has_stored_minimum_coverage'] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    for rule in skinny_rules:

        # Base file name: rule kind first, license expression otherwise.
        if rule.is_false_positive:
            base_name = 'false-positive'
        elif rule.is_license_intro:
            base_name = 'license-intro'
        else:
            base_name = rule.license_expression

        text = rule.text()

        # First duplicate check: the rule text itself.
        existing_rule = rule_exists(text)
        # Short, whitespace-normalized excerpt of the text used in messages.
        skinny_text = ' '.join(text[:80].split())

        if existing_rule:
            print(duplicate_message(base_name, existing_rule, skinny_text))
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd['stored_text'] = rule.stored_text
        rd['has_stored_relevance'] = rule.has_stored_relevance
        rd['has_stored_minimum_coverage'] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + '.yml'
        rulerec.text_file = base_loc + '.RULE'

        rule_tokens = tuple(rulerec.tokens())

        # Second duplicate check: the rule token sequence, against known rules
        # and the rules added earlier in this run.
        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(duplicate_message(base_name, existing_rule, skinny_text))
            continue

        print(f'Adding new rule: {base_name}')
        print('  file://' + rulerec.data_file)
        print('  file://' + rulerec.text_file)
        # NOTE(review): dumped before and after update_ignorables(); presumably
        # the first dump puts the files on disk for it and the second persists
        # its changes -- confirm before removing either call.
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        rulerec.dump()

        rule_by_tokens[rule_tokens] = base_name
Ejemplo n.º 15
0
 def test_create_rule_ignore_punctuation(self):
     """
     Check the tokens and length of a rule whose stored text contains
     punctuation and an empty '{{}}' marker.
     """
     rule = models.Rule(stored_text='A one. A {{}}two. A three.')
     tokens = list(rule.tokens())
     assert ['one', 'two', 'three'] == tokens
     assert 3 == rule.length
Ejemplo n.º 16
0
    def test_index_structures_with__add_rules(self):
        """
        Index every rule file of the 'index/tokens_count' test directory and
        check the index dump, the token dictionary, the token-by-id list and
        the low and high token multisets of each rule.
        """
        base = self.get_test_loc('index/tokens_count')
        file_names = sorted(os.listdir(base))
        idx = index.LicenseIndex()
        rules = [
            models.Rule(text_file=os.path.join(base, file_name))
            for file_name in file_names
        ]

        idx._add_rules(rules)

        assert 4 == idx.len_junk

        expected_index = {
            'plain1_0': {u'redistribution': [0]},
            'plain2_1': {u'is': [1], u'redistribution': [0], u'yes': [2]},
            'plain3_2': {u'is': [1], u'redistribution': [0], u'yes': [3]},
            'plain4_3': {u'is': [1], u'redistribution': [0], u'yes': [4]},
            'plain5_4': {u'is': [1], u'redistribution': [0]},
            'tmpl10_5': {
                u'any': [8], u'is': [1], u'redistribution': [0],
                u'thing': [9]},
            'tmpl2_6': {u'is': [1], u'redistribution': [0]},
            'tmpl3_7': {u'is': [1], u'redistribution': [0]},
            'tmpl4_8': {u'is': [1], u'redistribution': [0]},
            'tmpl5_2_10': {u'is': [1], u'redistribution': [0], u'yes': [5]},
            'tmpl5_9': {u'is': [1, 2], u'redistribution': [0]},
            'tmpl6_11': {u'is': [1], u'redistribution': [0]},
            'tmpl7_12': {u'is': [1], u'redistribution': [0]},
            'tmpl8_13': {u'is': [1], u'redistribution': [0]},
            'tmpl9_14': {u'any': [8], u'is': [1], u'redistribution': [0]},
        }
        assert expected_index == idx.to_dict()

        expected_dict = {
            u'all': 1,
            u'allowed': 0,
            u'and': 3,
            u'any': 7,
            u'for': 2,
            u'is': 4,
            u'redistribution': 5,
            u'thing': 8,
            u'yes': 6,
        }
        assert expected_dict == idx.dictionary

        expected_tids = [
            u'allowed',
            u'all',
            u'for',
            u'and',
            u'is',
            u'redistribution',
            u'yes',
            u'any',
            u'thing',
        ]
        assert expected_tids == idx.tokens_by_tid

        def with_token_keys(tids_mset):
            # Replace each token id key with its token string.
            return {
                idx.tokens_by_tid[tid]: freq
                for tid, freq in tids_mset.items()
            }

        expected_high_msets = [
            {u'redistribution': 1},
            {u'is': 1, u'redistribution': 1, u'yes': 1},
            {u'is': 1, u'redistribution': 1, u'yes': 1},
            {u'is': 1, u'redistribution': 1, u'yes': 1},
            {u'is': 1, u'redistribution': 1},
            {u'any': 1, u'is': 1, u'redistribution': 1, u'thing': 1},
            {u'is': 1, u'redistribution': 1},
            {u'is': 1, u'redistribution': 1},
            {u'is': 1, u'redistribution': 1},
            {u'is': 2, u'redistribution': 1},
            {u'is': 1, u'redistribution': 1, u'yes': 1},
            {u'is': 1, u'redistribution': 1},
            {u'is': 1, u'redistribution': 1},
            {u'is': 1, u'redistribution': 1},
            {u'any': 1, u'is': 1, u'redistribution': 1},
        ]
        # Each entry of tids_msets_by_rid is a (low, high) pair of multisets.
        low_msets, high_msets = zip(*idx.tids_msets_by_rid)
        actual_high_msets = [with_token_keys(mset) for mset in high_msets]
        assert expected_high_msets == actual_high_msets

        expected_low_msets = [
            {},
            {},
            {u'allowed': 1},
            {u'allowed': 1, u'for': 1},
            {u'all': 1, u'allowed': 1, u'for': 1},
            {u'all': 2, u'allowed': 1, u'and': 2, u'for': 1},
            {},
            {u'allowed': 1},
            {u'allowed': 1, u'for': 1},
            {u'allowed': 1, u'for': 1},
            {u'all': 1, u'allowed': 1, u'for': 1},
            {u'all': 1, u'allowed': 1, u'and': 1, u'for': 1},
            {u'all': 2, u'allowed': 1, u'and': 1, u'for': 1},
            {u'all': 2, u'allowed': 1, u'and': 2, u'for': 1},
            {u'all': 2, u'allowed': 1, u'and': 2, u'for': 1},
        ]
        actual_low_msets = [with_token_keys(mset) for mset in low_msets]
        assert expected_low_msets == actual_low_msets
Ejemplo n.º 17
0
 def test_rule_templates_are_ignored(self):
     """
     The '{{...}}' markers get no special handling: the words inside them
     are returned as regular tokens.
     """
     rule = models.Rule(stored_text='{{gap0}}zero one two three{{gap2}}')
     tokens = list(rule.tokens())
     expected = ['gap0', 'zero', 'one', 'two', 'three', 'gap2']
     assert expected == tokens
Ejemplo n.º 18
0
 def get_test_rules(self, base, subset=None):
     """
     Return a list of rules, one per file of the `base` test directory, in
     sorted file name order.

     Each rule uses the file as its text and the file name as its only
     license. When `subset` is truthy, only file names found in `subset`
     are kept.
     """
     base_dir = self.get_test_loc(base)
     file_names = sorted(os.listdir(base_dir))
     if subset:
         file_names = [name for name in file_names if name in subset]

     rules = []
     for license_key in file_names:
         text_file = os.path.join(base_dir, license_key)
         rules.append(models.Rule(text_file=text_file, licenses=[license_key]))
     return rules
Ejemplo n.º 19
0
    def test_match_to_indexed_template_with_few_tokens_around_gaps(self):
        """
        Match a query file against an index built from a single templated
        rule, then check the matched query text, the matched rule text, the
        match coverage and the matcher that was used.

        This was failing when a gap in a template starts very close to the
        start of a rule tokens sequence. We may still skip that, but we
        capture a large match anyway.
        """
        template_rule = models.Rule(
            text_file=self.get_test_loc('index/templates/idx.txt'),
            license_expression='test',
        )
        # Extra words added to the base legalese set for this test index.
        extra_legalese = {
            'permission', 'written', 'registered', 'derived', 'damage', 'due',
        }
        idx = index.LicenseIndex(
            [template_rule], _legalese=mini_legalese | extra_legalese)

        matches = idx.match(
            location=self.get_test_loc('index/templates/query.txt'))
        assert 1 == len(matches)
        match = matches[0]

        # Expected query-side text, compared word by word (whitespace is
        # not significant because both sides are split).
        expected_query_words = u"""
            All Rights Reserved.

             Redistribution and use of this software and associated documentation
             ("Software"), with or without modification, are permitted provided
             that the following conditions are met:

             1. Redistributions of source code must retain copyright
                statements and notices.  Redistributions must also contain a
                copy of this document.

             2. Redistributions in binary form must reproduce the
                above copyright notice, this list of conditions and the
                following disclaimer in the documentation and/or other
                materials provided with the distribution.

             3. The name "[groovy]" must not be used to endorse or promote
                products derived from this Software without prior written
                permission of [The] [Codehaus].  For written permission,
                please contact [info]@[codehaus].[org].

             4. Products derived from this Software may not be called "[groovy]"
                nor may "[groovy]" appear in their names without prior written
                permission of [The] [Codehaus]. "[groovy]" is a registered
                trademark of [The] [Codehaus].

             5. Due credit should be given to [The] [Codehaus] -
                [http]://[groovy].[codehaus].[org]/

             [THIS] [SOFTWARE] [IS] [PROVIDED] [BY] [THE] [CODEHAUS] AND CONTRIBUTORS
             ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT
             NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
             FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
             [THE] [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
             INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
             (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
             SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
             HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
             STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
             ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
             OF THE [POSSIBILITY] [OF] [SUCH] DAMAGE.
        """.split()

        # Expected rule-side text, lowercased before being split into words.
        expected_rule_words = u"""
            All Rights Reserved

            Redistribution and use of this software and associated documentation
            Software with or without modification are permitted provided that
            the following conditions are met

            1 Redistributions of source code must retain copyright statements
            and notices Redistributions must also contain copy of this
            document

            2 Redistributions in binary form must reproduce the above copyright
            notice this list of conditions and the following disclaimer in the
            documentation and or other materials provided with the distribution

            3 The name must not be used to endorse or promote products
            derived from this Software without prior written permission of
            For written permission please contact

            4 Products derived from this Software may not be called nor
            may appear in their names without prior written permission of
            is registered trademark of

            5 Due credit should be given to

            <THIS> <SOFTWARE> <IS> <PROVIDED> <BY>
            AND CONTRIBUTORS AS IS AND ANY
            EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO THE
            IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR PARTICULAR
            PURPOSE ARE DISCLAIMED IN NO EVENT SHALL OR ITS CONTRIBUTORS
            BE LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR
            CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF
            SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS
            INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER
            IN CONTRACT STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR
            OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF
            ADVISED OF THE DAMAGE
        """.lower().split()

        matched_qtext, matched_itext = get_texts(match)
        assert expected_query_words == matched_qtext.split()
        assert expected_rule_words == matched_itext.split()
        assert match.coverage() > 97
        assert match_seq.MATCH_SEQ == match.matcher
Ejemplo n.º 20
0
 def test_gaps_at_start_and_end_are_ignored(self):
     """
     Build a Rule from a text that starts and ends with ``{{...}}`` markers
     and check that ``tokens()`` returns only the words between these
     markers.
     """
     gapped_text = '''{{gap0}}zero one two three{{gap2}}'''
     rule = models.Rule(_text=gapped_text)
     expected_tokens = ['zero', 'one', 'two', 'three']
     assert expected_tokens == list(rule.tokens())