Example No. 1
    def get_spdx_id_matches(self, query, from_spdx_id_lines=True, **kwargs):
        """
        Matching strategy for SPDX-License-Identifier style of expressions. If
        `from_spdx_id_lines` is True, detect only in the SPDX license identifier
        lines found in the query. Otherwise, use the whole query for detection.
        """
        matches = []

        if from_spdx_id_lines:
            qrs_and_texts = query.spdx_lid_query_runs_and_text()
        else:
            # If we are not specifically looking at a single SPDX-License-
            # Identifier line, then use the whole query run with the whole text.
            # Note that this only works for small texts; feeding large texts
            # will likely make the expression parser choke.
            query_lines = [ln for _, ln
                in tokenize.query_lines(query.location, query.query_string)]
            qrs_and_texts = query.whole_query_run(), u'\n'.join(query_lines)
            qrs_and_texts = [qrs_and_texts]

        for query_run, detectable_text in qrs_and_texts:
            if not query_run.matchables:
                # this could happen if there was some negative match applied
                continue
            spdx_match = match_spdx_lid.spdx_id_match(
                self, query_run, detectable_text)
            query_run.subtract(spdx_match.qspan)
            matches.append(spdx_match)

        return matches
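A side note on the else-branch above: it simply joins every query line into one newline-separated text before handing it to the SPDX expression matcher. A minimal standalone sketch of that step, assuming the licensedcode.tokenize.query_lines helper used throughout these examples (the function name whole_detectable_text is hypothetical):

from licensedcode import tokenize

def whole_detectable_text(location=None, query_string=None):
    # Join all query lines into a single text, dropping the line numbers,
    # mirroring what the else-branch of get_spdx_id_matches does above.
    lines = [line for _ln, line in tokenize.query_lines(
        location=location, query_string=query_string)]
    return u'\n'.join(lines)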
Example No. 2
 def test_key_phrase_tokenizer_lines_on_html_like_texts_2(
         self, regen=REGEN_TEST_FIXTURES):
     test_file = self.get_test_loc('tokenize/htmlish.html')
     expected_file = test_file + '.expected.key_phrase_tokenizer.json'
     lines = query_lines(test_file)
     result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
     check_results(result, expected_file, regen=regen)
Example No. 3
    def get_spdx_id_matches(
        self,
        query,
        from_spdx_id_lines=True,
        expression_symbols=None,
        **kwargs,
    ):
        """
        Matching strategy for SPDX-License-Identifier style of expressions. If
        `from_spdx_id_lines` is True, detect only in the SPDX license identifier
        lines found in the query. Otherwise, use the whole query for detection.

        Use the ``expression_symbols`` mapping of {lowered key: LicenseSymbol}
        if provided. Otherwise use the standard SPDX license symbols.
        """
        matches = []

        if from_spdx_id_lines:
            qrs_and_texts = query.spdx_lid_query_runs_and_text()
        else:
            # If we are not specifically looking at a single SPDX-License-
            # Identifier line, then use the whole query run with the whole text.
            # Note that this only works for small texts; feeding large texts
            # will likely make the expression parser choke.
            query_lines = tokenize.query_lines(query.location,
                                               query.query_string)
            query_lines = [ln for _, ln in query_lines]
            qrs_and_texts = query.whole_query_run(), u'\n'.join(query_lines)
            qrs_and_texts = [qrs_and_texts]

        for query_run, detectable_text in qrs_and_texts:
            if not query_run.matchables:
                continue
            if TRACE_SPDX_LID:
                logger_debug(
                    'get_spdx_id_matches:',
                    'query_run:',
                    query_run,
                    'detectable_text:',
                    detectable_text,
                )

            spdx_match = match_spdx_lid.spdx_id_match(
                idx=self,
                query_run=query_run,
                text=detectable_text,
                expression_symbols=expression_symbols,
            )

            if spdx_match:
                query_run.subtract(spdx_match.qspan)
                matches.append(spdx_match)

        return matches
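The ``expression_symbols`` argument is a plain mapping of lowercased license keys to license symbols. A small, hypothetical sketch of building such a mapping, assuming the LicenseSymbol class from the license_expression library that scancode uses for license expressions:

from license_expression import LicenseSymbol

# Hypothetical custom symbols table keyed by lowercased license key,
# matching the {lowered key: LicenseSymbol} shape from the docstring above.
expression_symbols = {
    'mit': LicenseSymbol('MIT'),
    'apache-2.0': LicenseSymbol('Apache-2.0'),
    'gpl-2.0-only': LicenseSymbol('GPL-2.0-only'),
}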
Example No. 4
 def test_query_lines_from_location(self):
     query_loc = self.get_test_loc('index/queryperfect-mini')
     expected = [
          u'',
          u'The',
          u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
          u'',
          u'Always',
     ]
     result = list(query_lines(location=query_loc))
     assert expected == result
Example No. 5
 def test_query_lines_from_location(self):
     query_loc = self.get_test_loc('index/queryperfect-mini')
     expected = [
         u'',
         u'The',
         u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
         u'',
         u'Always',
     ]
     result = [l for _, l in query_lines(location=query_loc)]
     assert result == expected
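For quick experimentation, query_lines can also be driven from a string instead of a file. A minimal sketch, assuming query_lines is imported from licensedcode.tokenize as in these tests; in this API version each yielded item is a (line_number, stripped_text) pair and blank lines come back as empty strings:

from licensedcode.tokenize import query_lines

text = '''
The
Redistribution and use in source and binary forms are permitted.
'''

for line_number, line in query_lines(query_string=text):
    print(line_number, repr(line))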
Example No. 6
 def test_query_lines_from_location_return_a_correct_number_of_lines(self):
     query_loc = self.get_test_loc('tokenize/correct_lines')
     # note that this is a single line (line number is 1)... broken in two.
     expected = [(
         1, u'Permission is hereby granted, free of charge, to any person '
         'obtaining a copy of this software and associated documentation '
         'files (the "Software"), to deal in the Software without restriction, '
         'including without limitation the rights to use, copy, modify, merge'
         ', , , sublicense, and/or  Software, ,'), (1, u'subject')]
     result = list(query_lines(location=query_loc))
     assert result == expected
Example No. 7
def matched_query_tokens_str(match,
                             location=None,
                             query_string=None,
                             idx=None,
                             stopwords=STOPWORDS):
    """
    Return an iterable of matched query token strings given a query file at
    `location` or a `query_string`, a match and an index.

    Yield None for unmatched positions. Punctuation is removed, spaces are normalized
    (new line is replaced by a space), case is preserved.
    """
    assert idx
    dictionary_get = idx.dictionary.get

    tokens = (tokenize._query_tokenizer(line, stopwords=stopwords)
              for _ln, line in tokenize.query_lines(location, query_string))
    tokens = chain.from_iterable(tokens)
    match_qspan = match.qspan
    match_qspan_start = match_qspan.start
    match_qspan_end = match_qspan.end
    known_pos = -1
    started = False
    finished = False
    for token in tokens:
        toklow = token.lower()
        if toklow in stopwords:
            continue

        token_id = dictionary_get(toklow)
        if token_id is None:
            if not started:
                continue
            if finished:
                break
        else:
            known_pos += 1

        if match_qspan_start <= known_pos <= match_qspan_end:
            started = True
            if known_pos == match_qspan_end:
                finished = True

            if known_pos in match_qspan and token_id is not None:
                yield token
            else:
                if token_id is not None:
                    yield '<%s>' % token
                else:
                    yield '[%s]' % token
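When debugging a match, the generator output above can be joined back into a single display string. A hypothetical convenience wrapper (not part of the original module) that keeps the <...> markers for known-but-unmatched tokens and the [...] markers for unknown tokens:

def matched_query_text_for_debug(match, idx, location=None, query_string=None):
    # Join the token strings yielded by matched_query_tokens_str above,
    # skipping any None placeholders for unmatched positions.
    tokens = matched_query_tokens_str(
        match, location=location, query_string=query_string, idx=idx)
    return ' '.join(tok for tok in tokens if tok)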
Example No. 8
    def tokens_by_line(self, tokenizer=query_tokenizer):
        """
        Yield one sequence of tokens for each line in this query.
        Populate the query `line_by_pos`, `unknowns_by_pos`, `unknowns_span` and
        `shorts_and_digits_pos` as a side effect.
        """
        # bind frequently called functions to local scope
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # this is the absolute position, including the unknown tokens
        abs_pos = -1
        # lines start at one
        line_start = 1

        # this is a relative position, excluding the unknown tokens
        known_pos = -1

        started = False
        for lnum, line in enumerate(
                query_lines(self.location, self.query_string), line_start):
            line_tokens = []
            line_tokens_append = line_tokens.append
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)
                if tid is not None:
                    known_pos += 1
                    started = True
                    line_by_pos_append(lnum)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                else:
                    # we have not yet started
                    if not started:
                        self_unknowns_by_pos[-1] += 1
                    else:
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)
                line_tokens_append(tid)
            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)
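The unknown-token bookkeeping above can be hard to follow inline. Below is a simplified sketch of just the counting pattern, under the assumption that known tokens resolve to integer ids and unknown tokens resolve to None; unknowns seen before the first known token are counted at the magic -1 position (the helper name is hypothetical):

from collections import defaultdict

def count_unknowns_by_known_pos(token_ids):
    # token_ids: a sequence of ints (known tokens) and None (unknown tokens).
    unknowns_by_pos = defaultdict(int)
    known_pos = -1
    for tid in token_ids:
        if tid is not None:
            known_pos += 1
        else:
            # counted against the last known position seen, or -1 if none yet
            unknowns_by_pos[known_pos] += 1
    return dict(unknowns_by_pos)

# For example: count_unknowns_by_known_pos([None, 12, None, None, 7])
# returns {-1: 1, 0: 2}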
Example No. 9
    def test_query_lines_on_html_like_texts_2(self, regen=False):
        test_file = self.get_test_loc('tokenize/htmlish.html')
        expected_file = test_file + '.expected.query_lines.json'

        # round-trip through json dumps/loads to normalize tuples, etc.
        result = json.loads(json.dumps(list(query_lines(test_file))))

        if regen:
            with open(expected_file, 'w') as exc_test:
                json.dump(result, exc_test, indent=2)

        with io.open(expected_file, encoding='utf-8') as exc_test:
            expected = json.load(exc_test)

        assert expected == result
Example No. 10
    def tokens_by_line(self, tokenizer=query_tokenizer):
        """
        Yield one sequence of tokens for each line in this query.
        Populate the query `line_by_pos`, `unknowns_by_pos`, `unknowns_span` and
        `shorts_and_digits_pos` as a side effect.
        """
        # bind frequently called functions to local scope
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # this is the absolute position, including the unknown tokens
        abs_pos = -1
        # lines start at one
        line_start = 1

        # this is a relative position, excluding the unknown tokens
        known_pos = -1

        started = False
        for lnum, line in enumerate(
                query_lines(self.location, self.query_string), line_start):
            line_tokens = []
            line_tokens_append = line_tokens.append
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)
                if tid is not None:
                    known_pos += 1
                    started = True
                    line_by_pos_append(lnum)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                else:
                    # we have not yet started
                    if not started:
                        self_unknowns_by_pos[-1] += 1
                    else:
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)
                line_tokens_append(tid)
            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)
Example No. 11
    def test_query_tokenizer_on_html_like_texts(self, regen=False):
        test_file = self.get_test_loc('tokenize/htmlish.txt')
        expected_file = test_file + '.expected.tokenized_lines.json'

        lines = query_lines(test_file)
        tokens = list(list(query_tokenizer(line)) for _ln, line in lines)

        # round-trip through json dumps/loads to normalize tuples, etc.
        result = json.loads(json.dumps(tokens))

        if regen:
            with open(expected_file, 'w') as exc_test:
                json.dump(result, exc_test, indent=2)

        with io.open(expected_file, encoding='utf-8') as exc_test:
            expected = json.load(exc_test)

        assert expected == result
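The regenerate-then-compare pattern shown inline above is what the check_results helper used by other tests on this page encapsulates. A hypothetical minimal stand-in written from that pattern (not the project's actual helper):

import io
import json

def check_results(result, expected_file, regen=False):
    # Optionally regenerate the expected JSON fixture, then compare against it.
    if regen:
        with open(expected_file, 'w') as out:
            json.dump(result, out, indent=2)
    with io.open(expected_file, encoding='utf-8') as inp:
        expected = json.load(inp)
    # round-trip the result through JSON to normalize tuples into lists
    assert json.loads(json.dumps(result)) == expected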
Example No. 12
    def test_query_lines_from_string(self):
        query_string = '''
            The
            Redistribution and use in source and binary forms, with or without modification, are permitted.

            Always
            is
 '''
        expected = [
            u'',
            u'The',
            u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
            u'',
            u'Always',
            u'is',
            u'',
        ]
        result = [l for _, l in query_lines(query_string=query_string)]
        assert result == expected
Example No. 13
    def test_query_lines_from_string(self):
        query_string = '''
            The   
            Redistribution and use in source and binary forms, with or without modification, are permitted.
            
            Always  
            is
 '''
        expected = [
             u'',
             u'The',
             u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
             u'',
             u'Always',
             u'is',
             u'',
        ]

        result = list(query_lines(query_string=query_string))
        assert expected == result
Example No. 14
 def test_query_lines_complex(self):
     query_loc = self.get_test_loc('index/querytokens')
     expected = [
         u'',
         u'',
         u'',
         u'Redistribution and use in source and binary forms,',
         u'',
         u'* Redistributions of source code must',
         u'The this that is not there',
         u'Welcom to Jamaica',
         u'* Redistributions in binary form must',
         u'',
         u'THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"',
         u'',
         u'',
         u'',
         u'Redistributions',
     ]
     result = [l for _, l in query_lines(location=query_loc)]
     assert result == expected
Example No. 15
 def test_query_lines_complex(self):
     query_loc = self.get_test_loc('index/querytokens')
     expected = [
          u'',
          u'',
          u'',
          u'Redistribution and use in source and binary forms,',
          u'',
          u'* Redistributions of source code must',
          u'The this that is not there',
          u'Welcom to Jamaica',
          u'* Redistributions in binary form must',
          u'',
          u'THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"',
          u'',
          u'',
          u'',
          u'Redistributions',
     ]
     result = list(query_lines(location=query_loc))
     assert expected == result
Example No. 16
    def tokens_by_line(self):
        """
        Yield one sequence of tokens for each line in this query. Populate the
        query `line_by_pos`, `unknowns_by_pos`, `unknowns_span`,
        `shorts_and_digits_pos` and `spdx_lines` as a side effect.
        """
        from licensedcode.match_spdx_lid import split_spdx_lid

        # bind frequently called functions to local scope
        tokenizer = query_tokenizer
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos

        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # absolute position in a query, including all known and unknown tokens
        abs_pos = -1

        # absolute position in a query, including only known tokens
        known_pos = -1

        # flag set to True when we have found the first known token globally
        # across all query lines
        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids

        if TRACE:
            logger_debug('tokens_by_line: query lines')
            for line_num, line in query_lines(self.location, self.query_string):
                logger_debug(' ', line_num, ':', line)

        for line_num, line in query_lines(self.location, self.query_string):
            # keep track of tokens in a line
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_first_known_pos = None

            # FIXME: the implicit update of abs_pos is not clear
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)

                if tid is not None:
                    # this is a known token
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_first_known_pos is None:
                        line_first_known_pos = known_pos
                else:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are unknowns and we keep a count of them
                        # in the magic "-1" position.
                        self_unknowns_by_pos[-1] += 1
                    else:
                        # here we have a new unknown token positioned right after
                        # the current known_pos
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)

                line_tokens_append(tid)

            # last known token position in the current line
            line_last_known_pos = known_pos

            # ONLY collect as SPDX a line that starts with an SPDX-License-
            # Identifier prefix. In some cases this prefix does not start at the
            # first tokens, such as when one or two words (e.g. a comment
            # indicator like DNL, REM, etc.) start the line, followed by an
            # SPDX license identifier.
            spdx_start_offset = None
            if line_tokens[:3] in spdx_lid_token_ids:
                spdx_start_offset = 0
            elif line_tokens[1:4] in spdx_lid_token_ids:
                spdx_start_offset = 1
            elif line_tokens[2:5] in spdx_lid_token_ids:
                spdx_start_offset = 2

            if spdx_start_offset is not None:
                # keep the line, start/end known pos for SPDX matching
                spdx_prefix, spdx_expression = split_spdx_lid(line)
                spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
                spdx_start_known_pos = line_first_known_pos + spdx_start_offset

                if spdx_start_known_pos <= line_last_known_pos:
                    self.spdx_lines.append((spdx_text, spdx_start_known_pos, line_last_known_pos))

            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)
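The three slice checks above amount to finding where the SPDX-License-Identifier token-id triple starts, allowing up to two leading tokens. A small refactoring sketch of just that check (not the project's code), assuming spdx_lid_token_ids is a list of token-id triples as used above:

def spdx_lid_start_offset(line_tokens, spdx_lid_token_ids):
    # Return 0, 1 or 2 if a known SPDX-License-Identifier token-id triple
    # starts at or near the beginning of the line, else None.
    for offset in (0, 1, 2):
        if line_tokens[offset:offset + 3] in spdx_lid_token_ids:
            return offset
    return None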
Example No. 17
 def test_query_lines_on_html_like_texts(self, regen=REGEN_TEST_FIXTURES):
     test_file = self.get_test_loc('tokenize/htmlish.txt')
     expected_file = test_file + '.expected.query_lines.json'
     result = list(query_lines(test_file))
     check_results(result, expected_file, regen=regen)
Example No. 18
    def tokens_by_line(
        self,
        location=None,
        query_string=None,
        start_line=1,
    ):
        """
        Yield multiple sequences of tokens, one for each line in this query.
        Line numbers start at ``start_line`` which is 1-based by default.

        SIDE EFFECT: This populates the query `line_by_pos`, `unknowns_by_pos`,
        `unknowns_span`, `stopwords_by_pos`, `shorts_and_digits_pos` and `spdx_lines`.
        """
        from licensedcode.match_spdx_lid import split_spdx_lid
        from licensedcode.stopwords import STOPWORDS

        location = location or self.location
        query_string = query_string or self.query_string

        # bind frequently called functions to local scope
        line_by_pos_append = self.line_by_pos.append

        # we use a defaultdict as a convenience at construction time
        unknowns_by_pos = defaultdict(int)
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add

        # we use a defaultdict as a convenience at construction time
        stopwords_by_pos = defaultdict(int)
        stopwords_pos = set()
        stopwords_pos_add = stopwords_pos.add

        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero
        # absolute position in a query, including only known tokens
        known_pos = -1

        # flag set to True when we have found the first known token globally
        # across all query lines
        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids

        qlines = query_lines(
            location=location,
            query_string=query_string,
            start_line=start_line,
        )
        if TRACE or TRACE_STOP_AND_UNKNOWN:
            logger_debug('tokens_by_line: query lines:')
            qlines = list(qlines)
            for line_num, line in qlines:
                logger_debug(' ', line_num, ':', line)

        for line_num, line in qlines:
            if TRACE_STOP_AND_UNKNOWN:
                logger_debug(f'  line: {line_num}: {line!r}')

            # keep track of tokens in a line
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_first_known_pos = None

            for token in query_tokenizer(line):
                tid = dic_get(token)
                is_stopword = token in STOPWORDS

                if TRACE_STOP_AND_UNKNOWN:
                    logger_debug(
                        f'    token: {token!r}, tid: {tid}, is_stopword: {is_stopword}'
                    )

                if tid is not None and not is_stopword:
                    # this is a known token
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_first_known_pos is None:
                        line_first_known_pos = known_pos

                    if TRACE_STOP_AND_UNKNOWN:
                        logger_debug(
                            f'      KNOWN token: known_pos: {known_pos}')

                else:
                    # process STOPWORDS and unknown words
                    if is_stopword:
                        if not started:
                            # If we have not yet started globally, then all tokens
                            # seen so far are stopwords and we keep a count of them
                            # in the magic "-1" position.
                            stopwords_by_pos[-1] += 1

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    f'      STOPWORD token: known_pos: -1')
                        else:
                            # here we have a new stopword positioned right after
                            # the current known_pos
                            stopwords_by_pos[known_pos] += 1
                            stopwords_pos_add(known_pos)

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    f'      STOPWORD token: known_pos: {known_pos}'
                                )

                        # we do not track stopwords, only their position
                        continue
                    else:
                        # this is an UNKNOWN word
                        if not started:
                            # If we have not yet started globally, then all tokens
                            # seen so far are unknowns and we keep a count of them
                            # in the magic "-1" position.
                            unknowns_by_pos[-1] += 1

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    f'      UNKNOWN token: known_pos: -1')

                        else:
                            # here we have a new unknown token positioned right after
                            # the current known_pos
                            unknowns_by_pos[known_pos] += 1
                            unknowns_pos_add(known_pos)

                            if TRACE_STOP_AND_UNKNOWN:
                                logger_debug(
                                    f'      UNKNOWN token: known_pos: {known_pos}'
                                )

                line_tokens_append(tid)

            # last known token position in the current line
            line_last_known_pos = known_pos

            # ONLY collect as SPDX a line that starts with an SPDX-License-
            # Identifier prefix. In some cases this prefix does not start at the
            # first tokens, such as when one or two words (e.g. a comment
            # indicator like DNL, REM, etc.) start the line, followed by an
            # SPDX license identifier.
            spdx_start_offset = None
            if line_tokens[:3] in spdx_lid_token_ids:
                spdx_start_offset = 0
            elif line_tokens[1:4] in spdx_lid_token_ids:
                spdx_start_offset = 1
            elif line_tokens[2:5] in spdx_lid_token_ids:
                spdx_start_offset = 2

            if spdx_start_offset is not None:
                # keep the line, start/end known pos for SPDX matching
                spdx_prefix, spdx_expression = split_spdx_lid(line)
                spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
                spdx_start_known_pos = line_first_known_pos + spdx_start_offset

                if spdx_start_known_pos <= line_last_known_pos:
                    self.spdx_lines.append(
                        (spdx_text, spdx_start_known_pos, line_last_known_pos))

            yield line_tokens

        # finally update the attributes and create a Span of positions followed
        # by unknowns, and track positions followed by stopwords, used for
        # intersection with the query span to score matches correctly
        self.unknowns_span = Span(unknowns_pos)
        # also convert the defaultdicts back to plain dicts
        self.unknowns_by_pos = dict(unknowns_by_pos)
        self.stopwords_by_pos = dict(stopwords_by_pos)

        if TRACE_STOP_AND_UNKNOWN:
            logger_debug(f'  self.unknowns_span: {self.unknowns_span}')
            logger_debug(f'  self.unknowns_by_pos: {self.unknowns_by_pos}')
            logger_debug(f'  self.stopwords_by_pos: {self.stopwords_by_pos}')
Example No. 19
 def test_index_tokenizer_lines_on_html_like_texts_2(self, regen=False):
     test_file = self.get_test_loc('tokenize/htmlish.html')
     expected_file = test_file + '.expected.index_tokenizer.json'
     lines = query_lines(test_file)
     result = [list(index_tokenizer(line)) for _ln, line in lines]
     check_results(result, expected_file, regen=regen)
Example No. 20
 def test_query_lines_on_html_like_texts_2(self, regen=False):
     test_file = self.get_test_loc('tokenize/htmlish.html')
     expected_file = test_file + '.expected.query_lines.json'
     result = list(query_lines(test_file))
     check_results(result, expected_file, regen=regen)
Example No. 21
 def test_key_phrase_tokenizer_on_html_like_texts(self, regen=False):
     test_file = self.get_test_loc('tokenize/htmlish.txt')
     expected_file = test_file + '.expected.key_phrase_tokenizer.json'
     lines = query_lines(test_file)
     result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
     check_results(result, expected_file, regen=regen)
Example No. 22
    def tokens_by_line(self):
        """
        Yield one sequence of tokens for each line in this query. Populate the
        query `line_by_pos`, `unknowns_by_pos`, `unknowns_span`,
        `shorts_and_digits_pos` and `spdx_lines` as a side effect.
        """
        # bind frequently called functions to local scope
        tokenizer = query_tokenizer
        line_by_pos_append = self.line_by_pos.append
        self_unknowns_by_pos = self.unknowns_by_pos
        unknowns_pos = set()
        unknowns_pos_add = unknowns_pos.add
        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
        dic_get = self.idx.dictionary.get

        # note: positions start at zero

        # absolute position in a query, including all known and unknown tokens
        abs_pos = -1

        # absolute position in a query, including only known tokens
        known_pos = -1

        started = False

        spdx_lid_token_ids = self.spdx_lid_token_ids
        do_collect_spdx_lines = spdx_lid_token_ids is not None
        if TRACE:
            logger_debug('tokens_by_line: query lines')
            for line_num, line in query_lines(self.location,
                                              self.query_string):
                logger_debug(' ', line_num, ':', line)

        for line_num, line in query_lines(self.location, self.query_string):
            line_tokens = []
            line_tokens_append = line_tokens.append
            line_start_known_pos = None

            # FIXME: the implicit update of abs_pos is not clear
            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
                tid = dic_get(token)
                if tid is not None:
                    known_pos += 1
                    started = True
                    line_by_pos_append(line_num)
                    if len(token) == 1 or token.isdigit():
                        self_shorts_and_digits_pos_add(known_pos)
                    if line_start_known_pos is None:
                        line_start_known_pos = known_pos
                else:
                    # we have not yet started
                    if not started:
                        self_unknowns_by_pos[-1] += 1
                    else:
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)
                line_tokens_append(tid)

            line_end_known_pos = known_pos
            # this works ONLY if the line starts with SPDX or we have one word
            # (such as a comment indicator DNL, REM, etc.) followed by an SPDX id
            if do_collect_spdx_lines and (line_tokens[:3] == spdx_lid_token_ids
                                          or line_tokens[1:4]
                                          == spdx_lid_token_ids):
                # keep the line, start/end known pos for SPDX matching
                self.spdx_lines.append(
                    (line, line_start_known_pos, line_end_known_pos))

            yield line_tokens

        # finally create a Span of positions followed by unknowns, used
        # for intersection with the query span for scoring matches
        self.unknowns_span = Span(unknowns_pos)