Example #1
    def test_LicenseMatch_small(self):
        r1_text = u'licensed under the GPL, licensed under the GPL distribute extent of law'
        small_rule = Rule(text_file='small_rule',
                          license_expression='apache-1.1',
                          stored_text=r1_text)

        r2_text = u'licensed under the GPL, licensed under the GPL re distribute extent of law' * 10
        long_rule = Rule(text_file='long_rule',
                         license_expression='apache-1.1',
                         stored_text=r2_text)

        _idx = index.LicenseIndex([small_rule, long_rule])

        test = LicenseMatch(rule=small_rule,
                            qspan=Span(0, 10),
                            ispan=Span(0, 10),
                            hispan=Span(12))
        assert test.small()
        test = LicenseMatch(rule=small_rule,
                            qspan=Span(0, 10),
                            ispan=Span(0, 10),
                            hispan=Span(11, 12))
        assert test.small()

        test = LicenseMatch(rule=small_rule,
                            qspan=Span(10, 11, 12),
                            ispan=Span(10, 11, 12),
                            hispan=Span(11, 12))
        assert test.small()

        test = LicenseMatch(rule=small_rule,
                            qspan=Span(1, 6),
                            ispan=Span(1, 6))
        assert test.small()

        test = LicenseMatch(rule=long_rule,
                            qspan=Span(0, 10),
                            ispan=Span(0, 10),
                            hispan=Span(12))
        assert test.small()

        test = LicenseMatch(rule=long_rule,
                            qspan=Span(5, 10),
                            ispan=Span(5, 10),
                            hispan=Span(5, 6))
        assert test.small()

        test = LicenseMatch(rule=small_rule,
                            qspan=Span(1, 10),
                            ispan=Span(1, 10),
                            hispan=Span(3, 6))
        assert not test.small()
Example #2
    def tokens_by_line(self):
        """
        Yield one sequence of tokens for each line in this query. Populate the
        query `line_by_pos`, `unknowns_by_pos`, `unknowns_span`,
        `shorts_and_digits_pos` and `spdx_lines` as a side effect.
        """
        from licensedcode.match_spdx_lid import split_spdx_lid

        # bind frequently called functions to local scope
        tokenizer = query_tokenizer
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos

        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # absolute position in a query, including all known and unknown tokens
        abs_pos = -1

        # absolute position in a query, including only known tokens
        known_pos = -1

        # flag set to True when we have found the first known token globally
        # across all query lines
        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids

        if TRACE:
            logger_debug('tokens_by_line: query lines')
            for line_num, line in query_lines(self.location, self.query_string):
                logger_debug(' ', line_num, ':', line)

        for line_num, line in query_lines(self.location, self.query_string):
            # keep track of tokens in a line
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_first_known_pos = None

            # FIXME: the implicit update of abs_pos is not clear
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)

                if tid is not None:
                    # this is a known token
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_first_known_pos is None:
                        line_first_known_pos = known_pos
                else:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are unknowns and we keep a count of them
                        # in the magic "-1" position.
                        self_unknowns_by_pos[-1] += 1
                    else:
                        # here we have a new unknown token positioned right after
                        # the current known_pos
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)

                line_tokens_append(tid)

            # last known token position in the current line
            line_last_known_pos = known_pos

            # ONLY collect as SPDX a line that starts with an SPDX License
            # Identifier. There are cases where this prefix does not start as
            # the first tokens, such as when one or two words (such as a
            # comment indicator DNL, REM, etc.) start the line and are then
            # followed by an SPDX license identifier.
            spdx_start_offset = None
            if line_tokens[:3] in spdx_lid_token_ids:
                spdx_start_offset = 0
            elif line_tokens[1:4] in spdx_lid_token_ids:
                spdx_start_offset = 1
            elif line_tokens[2:5] in spdx_lid_token_ids:
                spdx_start_offset = 2

            if spdx_start_offset is not None:
                # keep the line, start/end known pos for SPDX matching
                spdx_prefix, spdx_expression = split_spdx_lid(line)
                spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
                spdx_start_known_pos = line_first_known_pos + spdx_start_offset

                if spdx_start_known_pos <= line_last_known_pos:
                    self.spdx_lines.append((spdx_text, spdx_start_known_pos, line_last_known_pos))

            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)
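
The position bookkeeping in tokens_by_line above is easier to see in isolation. Below is a minimal, self-contained sketch (with a made-up token dictionary and a whitespace tokenizer, not the real licensedcode query APIs) showing how unknown tokens are counted against the last known position, and how the magic "-1" bucket collects unknowns seen before any known token:

from collections import defaultdict

# hypothetical token dictionary standing in for idx.dictionary
dictionary = {'licensed': 0, 'under': 1, 'the': 2, 'gpl': 3}
unknowns_by_pos = defaultdict(int)
known_pos = -1
started = False

for token in 'foo licensed under the gpl bar'.split():
    tid = dictionary.get(token)
    if tid is not None:
        # a known token advances the known position
        known_pos += 1
        started = True
    elif not started:
        # unknowns seen before any known token go to the magic -1 bucket
        unknowns_by_pos[-1] += 1
    else:
        # unknowns are attached to the known position they follow
        unknowns_by_pos[known_pos] += 1

print(dict(unknowns_by_pos))  # {-1: 1, 3: 1}
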
Example #3
    def test_LicenseMatch_comparisons(self):
        r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        contained1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
        contained2 = LicenseMatch(rule=r1, qspan=Span(1, 4), ispan=Span(1, 4))
        same_span1 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
        same_span2 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
        before_after = LicenseMatch(rule=r1,
                                    qspan=Span(8, 9),
                                    ispan=Span(8, 9))
        touching = LicenseMatch(rule=r1, qspan=Span(7, 7), ispan=Span(7, 7))
        overlaping = LicenseMatch(rule=r1, qspan=Span(4, 7), ispan=Span(4, 7))

        assert same_span1 == same_span2
        assert same_span1 in same_span2

        assert same_span1.overlap(same_span2)
        assert same_span2.overlap(same_span1)

        assert contained1 not in same_span1
        assert same_span1 not in contained1

        assert contained1.overlap(same_span2)
        assert contained1.surround(contained2)

        assert contained2 in same_span2
        assert contained2 in contained1

        assert contained2.overlap(overlaping)

        assert overlaping.overlap(contained2)
        assert overlaping.overlap(same_span1)
        assert not overlaping.overlap(before_after)

        assert before_after.is_after(same_span1)
        assert before_after.is_after(touching)
        assert before_after.is_after(contained1)
    def test_match_return_correct_positions_with_short_index_and_queries(self):
        idx = index.LicenseIndex(
            [Rule(stored_text='MIT License', license_expression='mit')]
        )
        assert {'_tst_11_0': {'license': [1]}} == idx.to_dict(True)

        matches = idx.match(query_string='MIT License')
        assert 1 == len(matches)

        qtext, itext = get_texts(matches[0], query_string='MIT License', idx=idx)
        assert 'MIT License' == qtext
        assert 'mit license' == itext
        assert Span(0, 1) == matches[0].qspan
        assert Span(0, 1) == matches[0].ispan

        matches = idx.match(query_string='MIT MIT License')
        assert 1 == len(matches)

        qtext, itext = get_texts(matches[0], query_string='MIT MIT License', idx=idx)
        assert 'MIT License' == qtext
        assert 'mit license' == itext
        assert Span(1, 2) == matches[0].qspan
        assert Span(0, 1) == matches[0].ispan

        query_doc1 = 'do you think I am a mit license MIT License, yes, I think so'
        # #                                  0       1   2       3
        matches = idx.match(query_string=query_doc1)
        assert 2 == len(matches)

        qtext, itext = get_texts(matches[0], query_string=query_doc1, idx=idx)
        assert 'mit license' == qtext
        assert 'mit license' == itext
        assert Span(0, 1) == matches[0].qspan
        assert Span(0, 1) == matches[0].ispan

        qtext, itext = get_texts(matches[1], query_string=query_doc1, idx=idx)
        assert 'MIT License' == qtext
        assert 'mit license' == itext
        assert Span(2, 3) == matches[1].qspan
        assert Span(0, 1) == matches[1].ispan

        query_doc2 = '''do you think I am a mit license
                        MIT License
                        yes, I think so'''
        matches = idx.match(query_string=query_doc2)
        assert 2 == len(matches)

        qtext, itext = get_texts(matches[0], query_string=query_doc2, idx=idx)
        assert 'mit license' == qtext
        assert 'mit license' == itext
        assert Span(0, 1) == matches[0].qspan
        assert Span(0, 1) == matches[0].ispan

        qtext, itext = get_texts(matches[1], query_string=query_doc2, idx=idx)
        assert 'MIT License' == qtext
        assert 'mit license' == itext
        assert Span(2, 3) == matches[1].qspan
        assert Span(0, 1) == matches[1].ispan
    def test_match_can_match_with_rule_template_with_gap_near_start_with_few_tokens_before(self):
        # failed when a gapped token starts at a beginning of rule with few tokens before
        test_file = self.get_test_loc('detect/templates/license7.txt')
        rule = Rule(text_file=test_file, license_expression='lic')
        idx = index.LicenseIndex([rule])

        qloc = self.get_test_loc('detect/templates/license8.txt')
        matches = idx.match(qloc)
        assert 1 == len(matches)

        match = matches[0]
        expected_qtokens = u"""
        All Rights Reserved Redistribution and use of this software and associated
        documentation Software with or without modification are permitted provided
        that the following conditions are met

        1 Redistributions of source code must retain copyright statements and notices
        Redistributions must also contain copy of this document

        2 Redistributions in binary form must reproduce the above copyright notice
        this of conditions and the following disclaimer in the documentation and
        or other materials provided with the distribution

        3 The name [groovy] must not be used to endorse or promote products derived
        from this Software without prior written permission of <The> [Codehaus] For
        written permission please contact [info] [codehaus] [org]

        4 Products derived from this Software may not be called [groovy] nor may
        [groovy] appear in their names without prior written permission of <The>
        [Codehaus]

        [groovy] is registered trademark of <The> [Codehaus]

        5 Due credit should be given to <The> [Codehaus]
        [http] [groovy] [codehaus] [org]

        <THIS> <SOFTWARE> <IS> <PROVIDED> <BY> <THE> [CODEHAUS] <AND> <CONTRIBUTORS>
        AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO
        THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR PARTICULAR
        PURPOSE ARE DISCLAIMED IN NO EVENT SHALL <THE> [CODEHAUS] OR ITS
        CONTRIBUTORS BE LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY
        OR CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF
        SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS
        INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN
        CONTRACT STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING
        IN ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY
        OF SUCH DAMAGE
        """.split()

        expected_itokens = u''' All Rights Reserved Redistribution and use of this
        software and associated documentation Software with or without modification
        are permitted provided that the following conditions are met

        1 Redistributions of source code must retain copyright statements and notices
        Redistributions must also contain copy of this document

        2 Redistributions in binary form must reproduce the above copyright notice
        this of conditions and the following disclaimer in the documentation and
        or other materials provided with the distribution

        3 The name must not be used to endorse or promote products derived from this
        Software without prior written permission of For written permission please
        contact

        4 Products derived from this Software may not be called nor may appear in
        their names without prior written permission of is registered trademark of

        5 Due credit should be given to


        <THIS> <SOFTWARE> <IS> <PROVIDED> <BY>

        AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO
        THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR PARTICULAR
        PURPOSE ARE DISCLAIMED IN NO EVENT SHALL OR ITS CONTRIBUTORS BE LIABLE FOR
        ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES
        INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS
        OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON ANY
        THEORY OF LIABILITY WHETHER IN CONTRACT STRICT LIABILITY OR TORT INCLUDING
        NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE
        EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
        '''.lower().split()

        qtext, itext = get_texts(match, location=qloc, idx=idx)
        assert expected_qtokens == qtext.split()
        assert expected_itokens == itext.split()

        assert 97.51 == match.coverage()
        assert 97.51 == match.score()
        expected = Span(2, 96) | Span(98, 123) | Span(125, 128) | Span(130, 136) | Span(146, 174) | Span(176, 249)
        assert expected == match.qspan
        assert Span(1, 132) | Span(138, 240) == match.ispan
Example #6
    def test_match_with_template_and_multiple_rules(self):
        test_rules = self.get_test_rules('index/bsd_templates',)
        idx = index.LicenseIndex(test_rules)
        querys = u'''


Hello, what about this

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

    * Neither the name of nexB Inc. nor the names of its contributors may be
    used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


Goodbye
No part of match        '''
        result = idx.match(query_string=querys)

        assert 1 == len(result)
        match = result[0]
        assert match_seq.MATCH_SEQ == match.matcher

        exp_qtext = u"""
            Redistribution and use in source and binary forms with or without
            modification are permitted provided that the following conditions
            are met

            Redistributions of source code must retain the above copyright
            notice this list of conditions and the following disclaimer

            Redistributions in binary form must reproduce the above copyright
            notice this list of conditions and the following disclaimer in the
            documentation and or other materials provided with the distribution

            Neither the name of [nexB] <Inc> nor the names of its
            contributors may be used to endorse or promote products derived from
            this software without specific prior written permission

            THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
            AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES INCLUDING BUT NOT
            LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
            A PARTICULAR PURPOSE ARE DISCLAIMED IN NO EVENT SHALL THE COPYRIGHT
            OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL
            SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED
            TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR
            PROFITS OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF
            LIABILITY WHETHER IN CONTRACT STRICT LIABILITY OR TORT INCLUDING
            NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
            SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
        """.split()

        exp_itext = u"""
            Redistribution and use in source and binary forms with or without
            modification are permitted provided that the following conditions
            are met

            Redistributions of source code must retain the above copyright
            notice this list of conditions and the following disclaimer

            Redistributions in binary form must reproduce the above copyright
            notice this list of conditions and the following disclaimer in the
            documentation and or other materials provided with the distribution

            Neither the name of nor the names of its contributors may be
            used to endorse or promote products derived from this software
            without specific prior written permission

            THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
            AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES INCLUDING BUT NOT
            LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
            A PARTICULAR PURPOSE ARE DISCLAIMED IN NO EVENT SHALL THE COPYRIGHT
            OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL
            SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED
            TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR
            PROFITS OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF
            LIABILITY WHETHER IN CONTRACT STRICT LIABILITY OR TORT INCLUDING
            NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
            SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
        """.split()
#         q = Query(query_string=querys, idx=idx)

#         print('######################')
#         print('######################')
#         print('q=', querys.lower().replace('*', ' ').replace('/', ' '). split())
#         print('q2=', [None if t is None else idx.tokens_by_tid[t] for t in q.tokens_with_unknowns()])
#         print('######################')


        qtext, itext = get_texts(match, query_string=querys, idx=idx)
        assert exp_qtext == qtext.split()
        assert exp_itext == itext.split()

        assert Span(Span(1, 72) | Span(74, 212)) == match.qspan

        assert Span(0, 210) == match.ispan
        assert 100 == match.coverage()
Example #7
    def tokens_by_line(
        self,
        location=None,
        query_string=None,
        start_line=1,
    ):
        """
        Yield multiple sequences of tokens, one for each line in this query.
        Line numbers start at ``start_line`` which is 1-based by default.

        SIDE EFFECT: This populates the query `line_by_pos`, `unknowns_by_pos`,
        `unknowns_span`, `stopwords_by_pos`, `stopwords_span`,
        `shorts_and_digits_pos` and `spdx_lines`.
        """
        from licensedcode.match_spdx_lid import split_spdx_lid
        from licensedcode.stopwords import STOPWORDS

        location = location or self.location
        query_string = query_string or self.query_string

        # bind frequently called functions to local scope
        line_by_pos_append = self.line_by_pos.append

        self_unknowns_by_pos = self.unknowns_by_pos
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add

        self_stopwords_by_pos = self.stopwords_by_pos
        stopwords_pos = set()
        stopwords_pos_add = stopwords_pos.add

        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # absolute position in a query, including only known tokens
        known_pos = -1

        # flag set to True when we have found the first known token globally
        # across all query lines
        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids

        qlines = query_lines(
            location=location,
            query_string=query_string,
            start_line=start_line,
        )
        if TRACE:
            logger_debug('tokens_by_line: query lines:')
            qlines = list(qlines)
            for line_num, line in qlines:
                logger_debug(' ', line_num, ':', line)

        for line_num, line in qlines:
            # keep track of tokens in a line
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_first_known_pos = None

            for token in query_tokenizer(line):
                tid = dic_get(token)
                is_stopword = token in STOPWORDS
                if tid is not None and not is_stopword:
                    # this is a known token
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_first_known_pos is None:
                        line_first_known_pos = known_pos
                else:
                    # process STOPWORDS and unknown words
                    if is_stopword:
                        if not started:
                            # If we have not yet started globally, then all tokens
                            # seen so far are stopwords and we keep a count of them
                            # in the magic "-1" position.
                            self_stopwords_by_pos[-1] += 1
                        else:
                            # here we have a new stopword positioned right after
                            # the current known_pos
                            self_stopwords_by_pos[known_pos] += 1
                            stopwords_pos_add(known_pos)
                        # we do not track stopwords, only their position
                        continue
                    else:
                        if not started:
                            # If we have not yet started globally, then all tokens
                            # seen so far are unknowns and we keep a count of them
                            # in the magic "-1" position.
                            self_unknowns_by_pos[-1] += 1
                        else:
                            # here we have a new unknown token positioned right after
                            # the current known_pos
                            self_unknowns_by_pos[known_pos] += 1
                            unknowns_pos_add(known_pos)

                line_tokens_append(tid)

            # last known token position in the current line
            line_last_known_pos = known_pos

            # ONLY collect as SPDX a line that starts with an SPDX License
            # Identifier. There are cases where this prefix does not start as
            # the first tokens, such as when one or two words (such as a
            # comment indicator DNL, REM, etc.) start the line and are then
            # followed by an SPDX license identifier.
            spdx_start_offset = None
            if line_tokens[:3] in spdx_lid_token_ids:
                spdx_start_offset = 0
            elif line_tokens[1:4] in spdx_lid_token_ids:
                spdx_start_offset = 1
            elif line_tokens[2:5] in spdx_lid_token_ids:
                spdx_start_offset = 2

            if spdx_start_offset is not None:
                # keep the line, start/end known pos for SPDX matching
                spdx_prefix, spdx_expression = split_spdx_lid(line)
                spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
                spdx_start_known_pos = line_first_known_pos + spdx_start_offset

                if spdx_start_known_pos <= line_last_known_pos:
                    self.spdx_lines.append(
                        (spdx_text, spdx_start_known_pos, line_last_known_pos))

            yield line_tokens

        # finally create a Span of positions followed by unknowns and another
        # for positions followed by stopwords, used for intersection with the
        # query span to score matches correctly
        self.unknowns_span = Span(unknowns_pos)
        self.stopwords_span = Span(stopwords_pos)
def exact_match(idx, query_run, automaton):
    """
    Return a list of exact LicenseMatch by matching the `query_run` against
    the `automaton` and `idx` index.
    """
    if TRACE: logger_debug(' #exact_AHO: start ... ')
    if TRACE_DEEP: logger_debug(' #exact_AHO: query_run:', query_run)

    len_junk = idx.len_junk
    rules_by_rid = idx.rules_by_rid

    qtokens = query_run.tokens
    qbegin = query_run.start
    query_run_matchables = query_run.matchables

    # array.tostring() was removed in Python 3.9; tobytes() is the equivalent
    qtokens_as_str = array('h', qtokens).tobytes()
    matches = []

    # iterate over matched strings: the matched value is (rule id, index start pos, index end pos)
    for qend, matched_rule_segments in automaton.iter(qtokens_as_str):

        for rid, istart, iend in matched_rule_segments:
            rule = rules_by_rid[rid]
            if TRACE_DEEP:
                logger_debug('   #exact_AHO: found match to rule:',
                             rule.identifier)

            ################################
            # FIXME: use a trie of ints or a trie of Unicode characters to avoid this shenanigan
            ################################
            # Since the trie stores bytes and we have two bytes per token id,
            # the real end must be adjusted
            real_qend = (qend - 1) / 2
            # ... and there is now a real possibility of a false match.
            # For instance, say we have these tokens:
            #   gpl encoded as 0012, lgpl encoded as 1200 and mit as 2600.
            # If we scan "mit lgpl" we get the encoding 2600 1200. The
            # automaton would falsely find a matched string 0012 (gpl)
            # straddling the middle, so we check that the corrected end
            # qposition is always an integer.
            real_qend_int = int(real_qend)
            if real_qend != real_qend_int:
                if TRACE:
                    logger_debug(
                        '   #exact_AHO: real_qend != int(real_qend), discarding rule match:',
                        rule.identifier)
                continue

            match_len = iend + 1 - istart
            matcher = match_len == rule.length and MATCH_AHO_EXACT or MATCH_AHO_FRAG

            real_qend = real_qend_int
            qposses = range(qbegin + real_qend - match_len + 1,
                            qbegin + real_qend + 1)

            if any(p not in query_run_matchables for p in qposses):
                if TRACE:
                    logger_debug(
                        '   #exact_AHO: not matchable match: any(p not in query_run_matchables for p in qposses), discarding rule:',
                        rule.identifier)
                continue

            qspan = Span(qposses)
            ispan = Span(range(istart, iend + 1))

            itokens = idx.tids_by_rid[rid]
            hispan = Span(p for p in ispan if itokens[p] >= len_junk)

            match = LicenseMatch(rule,
                                 qspan,
                                 ispan,
                                 hispan,
                                 query_run.start,
                                 matcher=matcher)
            matches.append(match)

    if TRACE and matches:
        logger_debug(' ##exact_AHO: matches found#', matches)
        for m in matches:
            print(m)

    return matches
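
The (qend - 1) / 2 adjustment and the integer check in exact_match above come from packing each token id as two bytes before feeding the Aho-Corasick automaton. This small standalone sketch (with hypothetical token ids) shows why a match ending on the wrong byte boundary maps to a fractional token position and is rejected:

from array import array

# hypothetical ids: "mit" -> 0x2600, "lgpl" -> 0x1200
tokens = [0x2600, 0x1200]
packed = array('h', tokens).tobytes()
print(len(packed))  # 4 bytes: two per token id

# suppose the automaton reports a hit ending at byte offset 2 (0-based),
# i.e. in the middle of the second packed token
qend = 2
real_qend = (qend - 1) / 2
print(real_qend)                     # 0.5: not a whole token position
print(real_qend != int(real_qend))   # True -> this candidate match is discarded
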
def match_sequence(idx, candidate, query_run, start_offset=0):
    """
    Return a list of LicenseMatch by matching the `query_run` tokens sequence
    against the `idx` index for the `candidate` rule tuple (rid, rule,
    intersection).
    """
    if not candidate:
        return []

    rid, rule, _intersection = candidate
    high_postings = idx.high_postings_by_rid[rid]
    itokens = idx.tids_by_rid[rid]

    len_junk = idx.len_junk

    qbegin = query_run.start + start_offset
    qfinish = query_run.end
    qtokens = query_run.query.tokens

    matches = []
    qstart = qbegin
    qlen = len(query_run)

    # match as long as we find alignments and have high matchable tokens;
    # this allows finding repeated instances of the same rule in the query run

    query_run_matchables = query_run.matchables
    while qstart <= qfinish:
        if not query_run_matchables:
            break
        block_matches = match_blocks(qtokens, itokens, qstart, qlen, high_postings, len_junk, query_run_matchables)
        if not block_matches:
            break
        if TRACE2:
            logger_debug('block_matches:')
            for m in block_matches:
                i, j, k = m
                print(m)
                print('qtokens:', ' '.join(idx.tokens_by_tid[t] for t in qtokens[i:i + k]))
                print('itokens:', ' '.join(idx.tokens_by_tid[t] for t in itokens[j:j + k]))

        # create one match for each matching block: this is not entirely correct
        # but it will be sorted out at LicenseMatch merging and filtering time
        for qpos, ipos, mlen in block_matches:
            qspan = Span(range(qpos, qpos + mlen))
            iposses = range(ipos, ipos + mlen)
            hispan = Span(p for p in iposses if itokens[p] >= len_junk)
            ispan = Span(iposses)
            match = LicenseMatch(rule, qspan, ispan, hispan, qbegin, MATCH_SEQ)
            if TRACE2:
                qt, it = get_texts(match, location=query_run.query.location, query_string=query_run.query.query_string, idx=idx)
                print('###########################')
                print(match)
                print('###########################')
                print(qt)
                print('###########################')
                print(it)
                print('###########################')
            matches.append(match)
            qstart = max([qstart, qspan.end + 1])

    if TRACE:
        for m in matches:
            logger_debug(m)
    return matches
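
The while loop in match_sequence advances qstart past the end of each match so that repeated instances of the same rule in one query run are all found. A toy equivalent of that "jump past the last match" pattern (not the real match_blocks alignment):

def find_all(haystack, needle):
    start, hits = 0, []
    while start <= len(haystack) - len(needle):
        if haystack[start:start + len(needle)] == needle:
            hits.append((start, start + len(needle) - 1))
            # jump past the match, like qstart = qspan.end + 1 above
            start = start + len(needle)
        else:
            start += 1
    return hits

print(find_all([5, 1, 2, 3, 1, 2, 3], [1, 2, 3]))  # [(1, 3), (4, 6)]
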
Example #10
    def test_LicenseMatch_small(self):
        r1_text = u'licensed under the GPL, licensed under the GPL'
        r1 = Rule(text_file='r1', licenses=['apache-1.1'], _text=r1_text)
        r2_text = u'licensed under the GPL, licensed under the GPL' * 10
        r2 = Rule(text_file='r1', licenses=['apache-1.1'], _text=r2_text)
        _idx = index.LicenseIndex([r1, r2])

        assert LicenseMatch(rule=r1,
                            qspan=Span(0, 10),
                            ispan=Span(0, 10),
                            hispan=Span(12)).small()
        assert LicenseMatch(rule=r1,
                            qspan=Span(0, 10),
                            ispan=Span(0, 10),
                            hispan=Span(11, 12)).small()
        assert LicenseMatch(rule=r1,
                            qspan=Span(10, 11, 12),
                            ispan=Span(10, 11, 12),
                            hispan=Span(11, 12)).small()
        assert LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6)).small()

        assert LicenseMatch(rule=r2,
                            qspan=Span(0, 10),
                            ispan=Span(0, 10),
                            hispan=Span(12)).small()
        assert LicenseMatch(rule=r2,
                            qspan=Span(5, 10),
                            ispan=Span(5, 10),
                            hispan=Span(5, 6)).small()
        assert LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6)).small()
def get_normalized_expression(query_string, try_as_expression=True):
    """
    Given a text `query_string` return a single detected license expression.
    `query_string` is typically the value of a license field as found in package
    manifests. If `try_as_expression` is True, try first to parse this as a
    license expression. Return None if the `query_string` is empty.
    Return "unknown" as a license expression if there is a `query_string` but
    nothing was detected.
    """
    if not query_string or not query_string.strip():
        return

    if TRACE:
        logger_debug('get_normalized_expression: query_string: "{}"'.format(
            query_string))

    from licensedcode.cache import get_index
    idx = get_index()
    licensing = Licensing()

    # we match twice in a cascade: as an expression, then as plain text if we
    # did not succeed.
    matches = None
    if try_as_expression:
        try:
            matched_as_expression = True
            matches = idx.match(query_string=query_string, as_expression=True)
            if matches_have_unknown(matches, licensing):
                # rematch also if we have unknowns
                matched_as_expression = False
                matches = idx.match(query_string=query_string,
                                    as_expression=False)

        except Exception:
            matched_as_expression = False
            matches = idx.match(query_string=query_string, as_expression=False)
    else:
        matched_as_expression = False
        matches = idx.match(query_string=query_string, as_expression=False)

    if not matches:
        # we have a query_string text but there was no match: return an unknown
        # key
        return 'unknown'

    if TRACE:
        logger_debug('get_normalized_expression: matches:', matches)

    # join the possible multiple detected license expressions with an AND
    expression_objects = [m.rule.license_expression_object for m in matches]
    if len(expression_objects) == 1:
        combined_expression_object = expression_objects[0]
    else:
        combined_expression_object = licensing.AND(*expression_objects)

    if matched_as_expression:
        # then just return the expression(s)
        return str(combined_expression_object)

    # Otherwise, verify that we consumed 100% of the query string, i.e. that we
    # have no unknown leftovers.

    # 1. do all matches have 100% coverage?
    all_matches_have_full_coverage = all(m.coverage() == 100 for m in matches)

    # TODO: do all matches have a high enough score?

    # 2. are all declared license tokens consumed?
    query = matches[0].query
    # the query object should be the same for all matches. Is this always true??
    for mt in matches:
        if mt.query != query:
            # FIXME: the exception may be swallowed in callers!!!
            raise Exception(
                'Inconsistent package.declared_license: text with multiple "queries". '
                'Please report this issue to the scancode-toolkit team.\n'
                '{}'.format(query_string))

    query_len = len(query.tokens)
    matched_qspans = [m.qspan for m in matches]
    matched_qpositions = Span.union(*matched_qspans)
    len_all_matches = len(matched_qpositions)
    declared_license_is_fully_matched = query_len == len_all_matches

    if not all_matches_have_full_coverage or not declared_license_is_fully_matched:
        # We inject an 'unknown' symbol in the expression
        unknown = licensing.parse('unknown', simple=True)
        combined_expression_object = licensing.AND(combined_expression_object,
                                                   unknown)

    return str(combined_expression_object)
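
The expression combination at the end of get_normalized_expression relies on the license_expression library. A short sketch of that step, using illustrative license keys:

from license_expression import Licensing

licensing = Licensing()
detected = [licensing.parse('mit'), licensing.parse('apache-2.0')]
combined = licensing.AND(*detected)
print(str(combined))  # mit AND apache-2.0

# when coverage is incomplete, an 'unknown' symbol is ANDed in, as above
unknown = licensing.parse('unknown', simple=True)
print(str(licensing.AND(combined, unknown)))  # mit AND apache-2.0 AND unknown
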
Example #12
def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
                   match_blocks=None, deadline=sys.maxsize):
    """
    Return a list of LicenseMatch by matching the `query_run` tokens sequence
    starting at `start_offset` against the `idx` index for the candidate `rule`.
    Stop processing when reaching the deadline time.
    """
    if not rule:
        return []

    if not match_blocks:
        from licensedcode.seq import match_blocks

    rid = rule.rid
    itokens = idx.tids_by_rid[rid]

    len_legalese = idx.len_legalese

    qbegin = query_run.start + start_offset
    qfinish = query_run.end
    qtokens = query_run.query.tokens
    query = query_run.query

    matches = []
    qstart = qbegin

    # match as long as we find alignments and have high matchable tokens;
    # this allows finding repeated instances of the same rule in the query run

    while qstart <= qfinish:

        if TRACE2:
            logger_debug('\n\nmatch_seq:==========================LOOP=============================')

        if not query_run.is_matchable(include_low=False):
            break

        if TRACE2:
            logger_debug('match_seq:running block_matches:', 'a_start:', qstart, 'a_end', qfinish + 1)


        block_matches = match_blocks(
            a=qtokens, b=itokens, a_start=qstart, a_end=qfinish + 1,
            b2j=high_postings, len_good=len_legalese,
            matchables=query_run.matchables)

        if not block_matches:
            break

        # create one match for each matching block: they will be further merged
        # at LicenseMatch merging and filtering time

        for qpos, ipos, mlen in block_matches:

            qspan_end = qpos + mlen
            # skip a single non-high word matched as a sequence
            if mlen > 1 or (mlen == 1 and qtokens[qpos] < len_legalese):
                qspan = Span(range(qpos, qspan_end))
                ispan = Span(range(ipos, ipos + mlen))
                hispan = Span(p for p in ispan if itokens[p] < len_legalese)
                match = LicenseMatch(
                    rule, qspan, ispan, hispan, qbegin,
                    matcher=MATCH_SEQ, query=query)
                matches.append(match)

                if TRACE2:
                    from licensedcode.tracing import get_texts
                    qt, it = get_texts(match)
                    logger_debug('###########################')
                    logger_debug(match)
                    logger_debug('###########################')
                    logger_debug(qt)
                    logger_debug('###########################')
                    logger_debug(it)
                    logger_debug('###########################')

            qstart = max([qstart, qspan_end])

            if time() > deadline:
                break

        if time() > deadline:
            break

    if TRACE:
        logger_debug('match_seq: FINAL LicenseMatch(es)')
        for m in matches:
            logger_debug(m)
        logger_debug('\n\n')

    return matches
Example #13
def match_fragments(idx, query_run):
    """
    Return a list of LicenseMatch by matching the `query_run` against the
    `idx` index.

    This uses a BLAST-like matching approach: we match ngram fragments of
    rules (e.g. a seed) and then extend them left and right.
    """
    if TRACE_FRAG:
        logger_debug('-------------->match_fragments')

    # Get matches using the AHO Fragments automaton
    matches = exact_match(idx,
                          query_run,
                          automaton=idx.fragments_automaton,
                          matcher=MATCH_AHO_FRAG)
    if TRACE_FRAG:
        logger_debug('match_fragments')
        for m in matches:
            print(m)

    # Discard fragments that have any already matched positions in previous matches
    from licensedcode.match import filter_already_matched_matches
    matches, _discarded = filter_already_matched_matches(
        matches, query_run.query)

    # Merge matches with a zero max distance, e.g. contiguous or overlapping
    # with matches to the same rule
    from licensedcode.match import merge_matches
    matches = merge_matches(matches, max_dist=0)

    # extend matched fragments left and right. We group by rule
    from licensedcode.seq import extend_match

    rules_by_rid = idx.rules_by_rid
    tids_by_rid = idx.tids_by_rid
    len_legalese = idx.len_legalese

    alo = qbegin = query_run.start
    ahi = query_run.end
    query = query_run.query
    qtokens = query.tokens
    matchables = query_run.matchables

    frag_matches = []

    keyf = lambda m: m.rule.rid
    matches.sort(key=keyf)
    matches_by_rule = groupby(matches, key=keyf)

    for rid, rule_matches in matches_by_rule:
        itokens = tids_by_rid[rid]
        blo, bhi = 0, len(itokens)
        rule = rules_by_rid[rid]

        for match in rule_matches:
            i, j, k = match.qstart, match.istart, match.len()
            # extend alignment left and right as long as we have matchables
            qpos, ipos, mlen = extend_match(i, j, k, qtokens, itokens, alo,
                                            ahi, blo, bhi, matchables)

            qspan = Span(range(qpos, qpos + mlen))
            ispan = Span(range(ipos, ipos + mlen))
            hispan = Span(p for p in ispan if itokens[p] < len_legalese)
            match = LicenseMatch(rule,
                                 qspan,
                                 ispan,
                                 hispan,
                                 qbegin,
                                 matcher=MATCH_AHO_FRAG,
                                 query=query)
            frag_matches.append(match)

    # Merge matches as usual
    matches = merge_matches(matches)

    return frag_matches
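
match_fragments follows a seed-and-extend strategy: a small exact fragment match is grown left and right while tokens keep matching. A toy version of the extension step (ignoring matchability and the alo/ahi/blo/bhi bounds used by the real extend_match):

def extend(a, b, i, j, k):
    # grow the seed a[i:i+k] == b[j:j+k] to the left...
    while i > 0 and j > 0 and a[i - 1] == b[j - 1]:
        i, j, k = i - 1, j - 1, k + 1
    # ... and to the right
    while i + k < len(a) and j + k < len(b) and a[i + k] == b[j + k]:
        k += 1
    return i, j, k

a = [9, 1, 2, 3, 4, 9]   # query tokens (hypothetical)
b = [1, 2, 3, 4]         # rule tokens (hypothetical)
print(extend(a, b, 2, 1, 2))  # (1, 0, 4): the 2-token seed grows to cover all of b
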
Example #14
def spdx_id_match(idx, query_run, text):
    """
    Return one LicenseMatch by matching the `text` as an SPDX license expression
    using the `query_run` positions and `idx` index for support.
    """
    from licensedcode.cache import get_spdx_symbols
    from licensedcode.cache import get_unknown_spdx_symbol

    if TRACE:
        logger_debug('spdx_id_match: start:', 'text:', text, 'query_run:', query_run)

    licensing = Licensing()
    symbols_by_spdx = get_spdx_symbols()
    unknown_symbol = get_unknown_spdx_symbol()

    expression = get_expression(text, licensing, symbols_by_spdx, unknown_symbol)
    expression_str = expression.render()

    if TRACE:
        logger_debug('spdx_id_match: expression:', repr(expression_str))

    # how many known or unknown SPDX symbol occurrences do we have?
    known_syms = 0
    unknown_syms = 0
    for sym in licensing.license_symbols(expression, unique=False, decompose=True):
        if sym == unknown_symbol:
            unknown_syms += 1
        else:
            known_syms += 1

    match_len = len(query_run)
    match_start = query_run.start
    matched_tokens = query_run.tokens

    if TRACE:
        logger_debug('spdx_id_match: matched_tokens: 1:',
                     matched_tokens, [idx.tokens_by_tid[tid] for tid in matched_tokens])

    cleaned = clean_text(text).lower()
    if TRACE: logger_debug('spdx_id_match: cleaned :', cleaned)

    # build synthetic rule
    # TODO: ensure that all the SPDX license keys are known symbols
    rule = SpdxRule(
        license_expression=expression_str,
        # FIXME: for now we are putting the original query text as a
        # rule text: this is likely incorrect when it comes to properly
        # computing the known and unknowns and high and lows for this rule.
        # alternatively we could use the expression string, padded with
        # spdx-license-identifier: this may be wrong too, if the line was
        # not padded originally with this tag
        stored_text=text,
        length=match_len)

    if TRACE:
        logger_debug('spdx_id_match: synthetic rule:', rule.relevance)
        logger_debug('spdx_id_match: synthetic rule:', rule)

    # build match from parsed expression
    # collect match start and end: e.g. the whole text
    qspan = Span(range(match_start, query_run.end + 1))

    # we use the query side to build the ispans
    ispan = Span(range(0, match_len))

    len_legalese = idx.len_legalese
    hispan = Span(p for p, t in enumerate(matched_tokens) if t < len_legalese)

    match = LicenseMatch(
        rule=rule, qspan=qspan, ispan=ispan, hispan=hispan,
        query_run_start=match_start,
        matcher=MATCH_SPDX_ID, query=query_run.query
    )

    if TRACE:
        logger_debug('spdx_id_match: match found:', match)
    return match
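
The spans built at the end of spdx_id_match map directly onto the query run: the query span covers the full run, the index span mirrors its length, and the high span keeps only positions whose token id falls below len_legalese. A small sketch with plain sets standing in for Span objects and hypothetical values:

match_start, match_len = 7, 4
matched_tokens = [3, 120, 8, 250]    # hypothetical token ids
len_legalese = 100                   # ids below this are "high" legalese tokens

qspan = set(range(match_start, match_start + match_len))  # {7, 8, 9, 10}
ispan = set(range(0, match_len))                           # {0, 1, 2, 3}
hispan = {p for p, t in enumerate(matched_tokens) if t < len_legalese}  # {0, 2}
print(qspan, ispan, hispan)
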
Example #15
    def test_merge_does_not_merge_overlapping_matches_in_sequence_with_assymetric_overlap(
            self):
        r1 = Rule(text_file='r1', license_expression=u'lgpl-2.0-plus')

        # ---> merge_matches: current: LicenseMatch<'3-seq', lines=(9, 28), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=87.5, qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142), qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(199, 200), ispan=Span(5, 21)|Span(23, 46)|Span(48, 77)|Span(79, 93)|Span(95, 100)|Span(108, 128)|Span(130, 142), hispan=Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
        # ---> merge_matches: next:    LicenseMatch<'2-aho', lines=(28, 44), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0, qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143), qspan=Span(198, 341), ispan=Span(0, 143), hispan=Span(1)|Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
        #     ---> ###merge_matches: next overlaps in sequence current, merged as new: LicenseMatch<'3-seq 2-aho', lines=(9, 44), 'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0, qlen=268, ilen=144, hilen=21, rlen=144, qreg=(50, 341), ireg=(0, 143), qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(198, 341), ispan=Span(0, 143), his

        # ---> merge_matches: current: qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142)
        # ---> merge_matches: next:    qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143)

        m1 = LicenseMatch(
            rule=r1,
            qspan=Span(50, 90) | Span(92, 142) | Span(151, 182)
            | Span(199, 200),
            ispan=Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93)
            | Span(95, 100) | Span(108, 128) | Span(130, 142),
            hispan=Span(10) | Span(14) | Span(18) | Span(24) | Span(27)
            | Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68)
            | Span(70) | Span(80) | Span(88) | Span(96) | Span(111) | Span(113)
            | Span(115) | Span(131) | Span(141),
        )
        m2 = LicenseMatch(rule=r1,
                          qspan=Span(198, 341),
                          ispan=Span(0, 143),
                          hispan=Span(1) | Span(10) | Span(14) | Span(18)
                          | Span(24) | Span(27) | Span(52) | Span(57)
                          | Span(61) | Span(65, 66) | Span(68) | Span(70)
                          | Span(80) | Span(88) | Span(96) | Span(111)
                          | Span(113) | Span(115) | Span(131) | Span(141))

        matches = merge_matches([m1, m2])
        assert [m1, m2] == matches
    def test_LicenseMatch_equality(self):
        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        m2 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))

        assert m1 == m2

        r2 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m3 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))

        assert m1 != m3

        r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
        r2 = Rule(text_file='r2', licenses=['gpl', 'apache-2.0'])
        m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))

        assert m1 != m2
        assert m2 != m1

        r3 = Rule(text_file='r3', licenses=['gpl', 'apache-2.0'])
        m3 = LicenseMatch(rule=r3, qspan=Span(0, 2), ispan=Span(0, 2))

        assert m2 != m3
Example #17
    def test_filter_matches_filters_non_contiguous_or_overlapping_contained_matches_with_touching_boundaries(
            self):
        r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))

        r2 = Rule(text_file='r2', license_expression='apache-2.0 OR gpl')
        m2 = LicenseMatch(rule=r2, qspan=Span(3, 7), ispan=Span(3, 7))

        r3 = Rule(text_file='r3', license_expression='apache-2.0 OR gpl')
        m3 = LicenseMatch(rule=r3, qspan=Span(0, 6), ispan=Span(0, 6))

        r6 = Rule(text_file='r6', license_expression='apache-2.0 OR gpl')
        m6 = LicenseMatch(rule=r6, qspan=Span(1, 7), ispan=Span(1, 7))

        r5 = Rule(text_file='r5', license_expression='apache-2.0 OR gpl')
        m5 = LicenseMatch(rule=r5, qspan=Span(1, 6), ispan=Span(1, 6))

        r4 = Rule(text_file='r4', license_expression='apache-2.0 OR gpl')
        m4 = LicenseMatch(rule=r4, qspan=Span(0, 7), ispan=Span(0, 7))

        result, discarded = filter_contained_matches([m1, m2, m3, m4, m5, m6])
        assert [m4] == result
        assert discarded
Example #18
def get_licenses(location,
                 min_score=0,
                 include_text=False,
                 license_text_diagnostics=False,
                 license_url_template=SCANCODE_LICENSEDB_URL,
                 deadline=sys.maxsize,
                 **kwargs):
    """
    Return a mapping of detected_licenses for licenses detected in the file at
    `location`.

    This mapping contains two keys:
     - 'licenses' with a value that is list of mappings of license information.
     - 'license_expressions' with a value that is list of license expression
       strings.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means that all license matches are returned. Otherwise, matches with a
    score below `min_score` are not returned.

    If `include_text` is True, matched text is included in the returned
    `licenses` data as well as a file-level `percentage_of_license_text` percentage to
    indicate the overall proportion of detected license text and license notice
    words in the file. This is used to determine if a file contains mostly
    licensing information.
    """
    from licensedcode import cache
    from licensedcode.spans import Span

    idx = cache.get_index()

    detected_licenses = []
    detected_expressions = []

    matches = idx.match(location=location,
                        min_score=min_score,
                        deadline=deadline,
                        **kwargs)

    qspans = []
    match = None
    for match in matches:
        qspans.append(match.qspan)

        detected_expressions.append(match.rule.license_expression)

        detected_licenses.extend(
            _licenses_data_from_match(
                match=match,
                include_text=include_text,
                license_text_diagnostics=license_text_diagnostics,
                license_url_template=license_url_template))

    percentage_of_license_text = 0
    if match:
        # we need at least one match to compute a license_coverage
        matched_tokens_length = len(Span().union(*qspans))
        query_tokens_length = match.query.tokens_length(with_unknown=True)
        percentage_of_license_text = round(
            (matched_tokens_length / query_tokens_length) * 100, 2)

    detected_spdx_expressions = []
    return dict([
        ('licenses', detected_licenses),
        ('license_expressions', detected_expressions),
        ('spdx_license_expressions', detected_spdx_expressions),
        ('percentage_of_license_text', percentage_of_license_text),
    ])
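
The percentage_of_license_text computed above is the share of query token positions covered by the union of all match qspans. A self-contained sketch with plain sets standing in for licensedcode Span objects and made-up numbers:

# hypothetical matched query positions from two matches
matched_qspans = [set(range(0, 10)), set(range(5, 25))]
query_tokens_length = 40  # hypothetical total query length, with unknowns

matched_positions = set().union(*matched_qspans)
percentage_of_license_text = round(len(matched_positions) / query_tokens_length * 100, 2)
print(percentage_of_license_text)  # 62.5
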
Example #19
    def test_get_key_phrases_ignores_stopwords_in_positions(self):
        text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.'
        key_phrase_spans = get_key_phrase_spans(text)
        assert list(key_phrase_spans) == [Span(11, 12)]
Example #20
    def tokens_by_line(self):
        """
        Yield one sequence of tokens for each line in this query. Populate the
        query `line_by_pos`, `unknowns_by_pos`, `unknowns_span`,
        `shorts_and_digits_pos` and `spdx_lines` as a side effect.
        """
        # bind frequently called functions to local scope
        tokenizer = query_tokenizer
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero

        # absolute position in a query, including all known and unknown tokens
        abs_pos = -1

        # absolute position in a query, including only known tokens
        known_pos = -1

        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids
        do_collect_spdx_lines = spdx_lid_token_ids is not None
        if TRACE:
            logger_debug('tokens_by_line: query lines')
            for line_num, line in query_lines(self.location,
                                              self.query_string):
                logger_debug(' ', line_num, ':', line)

        for line_num, line in query_lines(self.location, self.query_string):
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_start_known_pos = None

            # FIXME: the implicit update of abs_pos is not clear
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)
                if tid is not None:
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_start_known_pos is None:
                        line_start_known_pos = known_pos
                else:
                    # we have not yet started
                    if not started:
                        self_unknowns_by_pos[-1] += 1
                    else:
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)
                line_tokens_append(tid)

            line_end_known_pos = known_pos
            # this works ONLY if the line starts with an SPDX id or we have one
            # word (such as a comment indicator DNL, REM etc.) before an SPDX id
            if do_collect_spdx_lines and (line_tokens[:3] == spdx_lid_token_ids
                                          or line_tokens[1:4]
                                          == spdx_lid_token_ids):
                # keep the line, start/end  known pos for SPDX matching
                self.spdx_lines.append(
                    (line, line_start_known_pos, line_end_known_pos))

            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)