Example #1
 def test_split_spdx_lid(self):
     from licensedcode.match_spdx_lid import split_spdx_lid

     test = [
         'SPDX  License   Identifier  : BSD-3-Clause',
         'SPDX-License-Identifier  : BSD-3-Clause',
         'spdx-license- identifier  : BSD-3-Clause',
         ' SPDX License--Identifier: BSD-3-Clause',
         'SPDX-License-Identifier : BSD-3-Clause',
         'SPDx-Licence-Identifier : BSD-3-Clause',
         'SPD-Licence-Identifier : BSD-3-Clause',
     ]
     results = [split_spdx_lid(l) for l in test]
     expected = [
         ('SPDX  License   Identifier  : ', 'BSD-3-Clause'),
         ('SPDX-License-Identifier  : ', 'BSD-3-Clause'),
         ('spdx-license- identifier  : ', 'BSD-3-Clause'),
         ('SPDX License--Identifier: ', 'BSD-3-Clause'),
         ('SPDX-License-Identifier : ', 'BSD-3-Clause'),
         ('SPDx-Licence-Identifier : ', 'BSD-3-Clause'),
         (None, 'SPD-Licence-Identifier : BSD-3-Clause'),
     ]
     assert results == expected
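
The expected tuples above pin down the contract: the returned prefix keeps its original internal spacing and the whitespace after the colon, leading whitespace before the prefix is dropped, and a line without a recognizable prefix comes back as (None, line). A minimal regex-based sketch that satisfies these cases is below; the real split_spdx_lid lives in licensedcode.match_spdx_lid and may differ in detail:

import re

# Tolerate spaces or hyphens between the words, the "Licence" spelling and
# any casing; capture the whole prefix up to and including the whitespace
# after the colon.
_SPDX_LID = re.compile(
    r'(spdx[\-\s]+licen[cs]e[\-\s]*identifier\s*:\s*)',
    re.IGNORECASE,
)

def split_spdx_lid(text):
    """
    Return a (prefix, expression) tuple if ``text`` contains an SPDX
    license identifier prefix, or (None, ``text``) otherwise.
    """
    segments = _SPDX_LID.split(text, maxsplit=1)
    if len(segments) == 3:
        # segments is [text-before-prefix, prefix, expression]
        return segments[1], segments[2]
    return None, text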
Example #2
    def tokens_by_line(self):
        """
        Yield one sequence of tokens for each line in this query. Populate the
        query `line_by_pos`, `unknowns_by_pos`, `unknowns_span`,
        `shorts_and_digits_pos` and `spdx_lines` as a side effect.
        """
        from licensedcode.match_spdx_lid import split_spdx_lid

        # bind frequently called functions to local scope
        tokenizer = query_tokenizer
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos

        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # absolute position in a query, including all known and unknown tokens
        abs_pos = -1

        # absolute position in a query, including only known tokens
        known_pos = -1

        # flag set to True when we have found the first known token globally
        # across all query lines
        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids

        if TRACE:
            logger_debug('tokens_by_line: query lines')
            for line_num, line in query_lines(self.location, self.query_string):
                logger_debug(' ', line_num, ':', line)

        for line_num, line in query_lines(self.location, self.query_string):
            # keep track of tokens in a line
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_first_known_pos = None

            # FIXME: the implicit update of abs_pos is not clear
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)

                if tid is not None:
                    # this is a known token
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_first_known_pos is None:
                        line_first_known_pos = known_pos
                else:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are unknowns and we keep a count of them
                        # in the magic "-1" position.
                        self_unknowns_by_pos[-1] += 1
                    else:
                        # here we have a new unknown token positioned right after
                        # the current known_pos
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)

                line_tokens_append(tid)

            # last known token position in the current line
            line_last_known_pos = known_pos

            # ONLY collect as SPDX a line that starts with an SPDX License
            # Identifier. There are cases where this prefix is not the first
            # token, such as when one or two words (e.g. a comment indicator
            # like DNL or REM) start the line, followed by the SPDX license
            # identifier.
            spdx_start_offset = None
            if line_tokens[:3] in spdx_lid_token_ids:
                spdx_start_offset = 0
            elif line_tokens[1:4] in spdx_lid_token_ids:
                spdx_start_offset = 1
            elif line_tokens[2:5] in spdx_lid_token_ids:
                spdx_start_offset = 2

            if spdx_start_offset is not None:
                # keep the line, start/end known pos for SPDX matching
                spdx_prefix, spdx_expression = split_spdx_lid(line)
                spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
                spdx_start_known_pos = line_first_known_pos + spdx_start_offset

                if spdx_start_known_pos <= line_last_known_pos:
                    self.spdx_lines.append((spdx_text, spdx_start_known_pos, line_last_known_pos))

            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)
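
The spdx_lid_token_ids checks above look for the token-id triple of an SPDX license identifier prefix in the first three window positions of the line. A toy, self-contained illustration of that windowed check, with a made-up dictionary and token ids (the real ids come from the license index):

# hypothetical token ids; the real dictionary is built by the license index
dictionary = {'rem': 0, 'spdx': 1, 'license': 2, 'identifier': 3, 'bsd': 4}
spdx_lid_token_ids = [[1, 2, 3]]  # ids for "spdx license identifier"

# token ids for a line like "REM SPDX-License-Identifier: BSD-3-Clause"
line_tokens = [0, 1, 2, 3, 4]

spdx_start_offset = None
for offset in (0, 1, 2):
    if line_tokens[offset:offset + 3] in spdx_lid_token_ids:
        spdx_start_offset = offset
        break

# the prefix starts one token into the line, after the REM comment marker
assert spdx_start_offset == 1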
Example #3
    def tokens_by_line(
        self,
        location=None,
        query_string=None,
        start_line=1,
    ):
        """
        Yield multiple sequences of tokens, one for each line in this query.
        Line numbers start at ``start_line`` which is 1-based by default.

        SIDE EFFECT: This populates the query `line_by_pos`, `unknowns_by_pos`,
        `unknowns_span`, `stopwords_by_pos`, `shorts_and_digits_pos` and `spdx_lines`.
        """
        from licensedcode.match_spdx_lid import split_spdx_lid
        from licensedcode.stopwords import STOPWORDS

        location = location or self.location
        query_string = query_string or self.query_string

        # bind frequently called functions to local scope
        line_by_pos_append = self.line_by_pos.append

        # we use a defaultdict as a convenience at construction time
        unknowns_by_pos = defaultdict(int)
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add

        # we use a defaultdict as a convenience at construction time
        stopwords_by_pos = defaultdict(int)
        stopwords_pos = set()
        stopwords_pos_add = stopwords_pos.add

        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # absolute position in a query, including only known tokens
        known_pos = -1

        # flag set to True when we have found the first known token globally
        # across all query lines
        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids

        qlines = query_lines(
            location=location,
            query_string=query_string,
            start_line=start_line,
        )
        if TRACE or TRACE_STOP_AND_UNKNOWN:
            logger_debug('tokens_by_line: query lines:')
            qlines = list(qlines)
            for line_num, line in qlines:
                logger_debug(' ', line_num, ':', line)

        for line_num, line in qlines:
            if TRACE_STOP_AND_UNKNOWN:
                logger_debug(f'  line: {line_num}: {line!r}')

            # keep track of tokens in a line
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_first_known_pos = None

            for token in query_tokenizer(line):
                tid = dic_get(token)
                is_stopword = token in STOPWORDS

                if TRACE_STOP_AND_UNKNOWN:
                    logger_debug(
                        f'    token: {token!r}, tid: {tid}, is_stopword: {is_stopword}'
                    )

                if tid is not None and not is_stopword:
                    # this is a known token
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_first_known_pos is None:
                        line_first_known_pos = known_pos

                    if TRACE_STOP_AND_UNKNOWN:
                        logger_debug(
                            f'      KNOWN token: known_pos: {known_pos}')

                else:
                    # process STOPWORDS and unknown words
                    if is_stopword:
                        if not started:
                            # If we have not yet started globally, then all tokens
                            # seen so far are stopwords and we keep a count of them
                            # in the magic "-1" position.
                            stopwords_by_pos[-1] += 1

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    '      STOPWORD token: known_pos: -1')
                        else:
                            # here we have a new stopword positioned right after
                            # the current known_pos
                            stopwords_by_pos[known_pos] += 1
                            stopwords_pos_add(known_pos)

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    f'      STOPWORD token: known_pos: {known_pos}'
                                )

                        # we do not track stopwords, only their position
                        continue
                    else:
                        # this is an UNKNOWN word
                        if not started:
                            # If we have not yet started globally, then all tokens
                            # seen so far are unknowns and we keep a count of them
                            # in the magic "-1" position.
                            unknowns_by_pos[-1] += 1

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    '      UNKNOWN token: known_pos: -1')

                        else:
                            # here we have a new unknown token positioned right after
                            # the current known_pos
                            unknowns_by_pos[known_pos] += 1
                            unknowns_pos_add(known_pos)

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    f'      UNKNOWN token: known_pos: {known_pos}'
                                )

                line_tokens_append(tid)

            # last known token position in the current line
            line_last_known_pos = known_pos

            # ONLY collect as SPDX a line that starts with an SPDX License
            # Identifier. There are cases where this prefix is not the first
            # token, such as when one or two words (e.g. a comment indicator
            # like DNL or REM) start the line, followed by the SPDX license
            # identifier.
            spdx_start_offset = None
            if line_tokens[:3] in spdx_lid_token_ids:
                spdx_start_offset = 0
            elif line_tokens[1:4] in spdx_lid_token_ids:
                spdx_start_offset = 1
            elif line_tokens[2:5] in spdx_lid_token_ids:
                spdx_start_offset = 2

            if spdx_start_offset is not None:
                # keep the line, start/end known pos for SPDX matching
                spdx_prefix, spdx_expression = split_spdx_lid(line)
                spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
                spdx_start_known_pos = line_first_known_pos + spdx_start_offset

                if spdx_start_known_pos <= line_last_known_pos:
                    self.spdx_lines.append(
                        (spdx_text, spdx_start_known_pos, line_last_known_pos))

            yield line_tokens

        # finally update the attributes and create a Span of positions followed
        # by unknowns, and track the positions followed by stopwords, used for
        # intersection with the query span to score matches correctly
        self.unknowns_span = Span(unknowns_pos)
        # also convert the defaultdicts back to plain dicts
        self.unknowns_by_pos = dict(unknowns_by_pos)
        self.stopwords_by_pos = dict(stopwords_by_pos)

        if TRACE_STOP_AND_UNKNOWN:
            logger_debug(f'  self.unknowns_span: {self.unknowns_span}')
            logger_debug(f'  self.unknowns_by_pos: {self.unknowns_by_pos}')
            logger_debug(f'  self.stopwords_by_pos: {self.stopwords_by_pos}')
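
The "-1" bookkeeping above is easiest to see on a tiny input. A hypothetical walk-through with a made-up one-word vocabulary, mirroring the unknown-token branch (stopwords are tracked the same way in stopwords_by_pos):

from collections import defaultdict

unknowns_by_pos = defaultdict(int)
known_pos = -1
started = False
known_tokens = {'mit'}  # pretend only "mit" is in the index dictionary

for token in ['seen', 'mit', 'unseen']:
    if token in known_tokens:
        # a known token advances the known position
        known_pos += 1
        started = True
    elif not started:
        # unknowns before the first known token count at the magic -1 key
        unknowns_by_pos[-1] += 1
    else:
        # later unknowns count at the known position that precedes them
        unknowns_by_pos[known_pos] += 1

assert dict(unknowns_by_pos) == {-1: 1, 0: 1}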