def get_spdx_id_matches(self, query, from_spdx_id_lines=True, **kwargs):
    """
    Matching strategy for SPDX-License-Identifier style of expressions. If
    `from_spdx_id_lines` is True, detect only in the SPDX license identifier
    lines found in the query. Otherwise use the whole query for detection.
    """
    matches = []

    if from_spdx_id_lines:
        qrs_and_texts = query.spdx_lid_query_runs_and_text()
    else:
        # If we are not specifically looking at a single SPDX-License-
        # Identifier line, then use the whole query run with the whole text.
        # Note this can only work for small texts or this will likely make
        # the expression parser choke if you feed it large texts.
        query_lines = [ln for _, ln in tokenize.query_lines(query.location, query.query_string)]
        qrs_and_texts = query.whole_query_run(), u'\n'.join(query_lines)
        qrs_and_texts = [qrs_and_texts]

    for query_run, detectable_text in qrs_and_texts:
        if not query_run.matchables:
            # this could happen if there was some negative match applied
            continue
        spdx_match = match_spdx_lid.spdx_id_match(
            self, query_run, detectable_text)
        query_run.subtract(spdx_match.qspan)
        matches.append(spdx_match)

    return matches
def test_key_phrase_tokenizer_lines_on_html_like_texts_2(self, regen=REGEN_TEST_FIXTURES):
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.key_phrase_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)
def get_spdx_id_matches(
    self,
    query,
    from_spdx_id_lines=True,
    expression_symbols=None,
    **kwargs,
):
    """
    Matching strategy for SPDX-License-Identifier style of expressions. If
    `from_spdx_id_lines` is True, detect only in the SPDX license identifier
    lines found in the query. Otherwise use the whole query for detection.

    Use the ``expression_symbols`` mapping of {lowered key: LicenseSymbol}
    if provided. Otherwise use the standard SPDX license symbols.
    """
    matches = []

    if from_spdx_id_lines:
        qrs_and_texts = query.spdx_lid_query_runs_and_text()
    else:
        # If we are not specifically looking at a single SPDX-License-
        # Identifier line, then use the whole query run with the whole text.
        # Note this can only work for small texts or this will likely make
        # the expression parser choke if you feed it large texts.
        query_lines = tokenize.query_lines(query.location, query.query_string)
        query_lines = [ln for _, ln in query_lines]
        qrs_and_texts = query.whole_query_run(), u'\n'.join(query_lines)
        qrs_and_texts = [qrs_and_texts]

    for query_run, detectable_text in qrs_and_texts:
        if not query_run.matchables:
            continue

        if TRACE_SPDX_LID:
            logger_debug(
                'get_spdx_id_matches:',
                'query_run:', query_run,
                'detectable_text:', detectable_text,
            )

        spdx_match = match_spdx_lid.spdx_id_match(
            idx=self,
            query_run=query_run,
            text=detectable_text,
            expression_symbols=expression_symbols,
        )

        if spdx_match:
            query_run.subtract(spdx_match.qspan)
            matches.append(spdx_match)

    return matches
def test_query_lines_from_location(self):
    query_loc = self.get_test_loc('index/queryperfect-mini')
    expected = [
        u'',
        u'The',
        u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
        u'',
        u'Always',
    ]
    result = list(query_lines(location=query_loc))
    assert expected == result
def test_query_lines_from_location(self):
    query_loc = self.get_test_loc('index/queryperfect-mini')
    expected = [
        u'',
        u'The',
        u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
        u'',
        u'Always',
    ]
    result = [l for _, l in query_lines(location=query_loc)]
    assert result == expected
def test_query_lines_from_location_return_a_correct_number_of_lines(self):
    query_loc = self.get_test_loc('tokenize/correct_lines')
    # note that this is a single line (line number is 1)... broken in two.
    expected = [
        (1,
         u'Permission is hereby granted, free of charge, to any person '
         'obtaining a copy of this software and associated documentation '
         'files (the "Software"), to deal in the Software without restriction, '
         'including without limitation the rights to use, copy, modify, merge'
         ', , , sublicense, and/or Software, ,'),
        (1, u'subject'),
    ]
    result = list(query_lines(location=query_loc))
    assert result == expected
def matched_query_tokens_str(match, location=None, query_string=None, idx=None, stopwords=STOPWORDS):
    """
    Return an iterable of matched query token strings given a query file at
    `location` or a `query_string`, a match and an index. Tokens at matched
    positions are yielded as-is; unmatched known tokens are yielded wrapped in
    angle brackets (e.g. "<token>") and unknown tokens are yielded wrapped in
    square brackets (e.g. "[token]"). Punctuation is removed, spaces are
    normalized (new line is replaced by a space), case is preserved.
    """
    assert idx
    dictionary_get = idx.dictionary.get

    tokens = (tokenize._query_tokenizer(line, stopwords=stopwords)
              for _ln, line in tokenize.query_lines(location, query_string))
    tokens = chain.from_iterable(tokens)

    match_qspan = match.qspan
    match_qspan_start = match_qspan.start
    match_qspan_end = match_qspan.end
    known_pos = -1
    started = False
    finished = False

    for token in tokens:
        toklow = token.lower()
        if toklow in stopwords:
            continue

        token_id = dictionary_get(toklow)
        if token_id is None:
            if not started:
                continue
            if finished:
                break
        else:
            known_pos += 1

        if match_qspan_start <= known_pos <= match_qspan_end:
            started = True
            if known_pos == match_qspan_end:
                finished = True

            if known_pos in match_qspan and token_id is not None:
                yield token
            else:
                if token_id is not None:
                    yield '<%s>' % token
                else:
                    yield '[%s]' % token
def tokens_by_line(self, tokenizer=query_tokenizer):
    """
    Yield one sequence of tokens for each line in this query. Populate the
    query `line_by_pos`, `unknowns_by_pos` and `shorts_and_digits_pos` as a
    side effect.
    """
    # bind frequently called functions to local scope
    line_by_pos_append = self.line_by_pos.append
    self_unknowns_by_pos = self.unknowns_by_pos
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add
    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero
    # this is the absolute position, including the unknown tokens
    abs_pos = -1
    # lines start at one
    line_start = 1
    # this is a relative position, excluding the unknown tokens
    known_pos = -1

    started = False
    for lnum, line in enumerate(
            query_lines(self.location, self.query_string), line_start):
        line_tokens = []
        line_tokens_append = line_tokens.append
        for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
            tid = dic_get(token)
            if tid is not None:
                known_pos += 1
                started = True
                line_by_pos_append(lnum)
                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)
            else:
                # we have not yet started
                if not started:
                    self_unknowns_by_pos[-1] += 1
                else:
                    self_unknowns_by_pos[known_pos] += 1
                    unknowns_pos_add(known_pos)
            line_tokens_append(tid)
        yield line_tokens

    # finally create a Span of positions followed by unknowns, used
    # for intersection with the query span for scoring matches
    self.unknowns_span = Span(unknowns_pos)
def test_query_lines_on_html_like_texts_2(self, regen=False):
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.query_lines.json'
    # we dumps/loads to normalize tuples/etc
    result = json.loads(json.dumps(list(query_lines(test_file))))
    if regen:
        with open(expected_file, 'w') as exc_test:
            json.dump(result, exc_test, indent=2)
    with io.open(expected_file, encoding='utf-8') as exc_test:
        expected = json.load(exc_test)
    assert expected == result
def test_query_tokenizer_on_html_like_texts(self, regen=False):
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.tokenized_lines.json'
    lines = query_lines(test_file)
    tokens = list(list(query_tokenizer(line)) for _ln, line in lines)
    # we dumps/loads to normalize tuples/etc
    result = json.loads(json.dumps(tokens))
    if regen:
        with open(expected_file, 'w') as exc_test:
            json.dump(result, exc_test, indent=2)
    with io.open(expected_file, encoding='utf-8') as exc_test:
        expected = json.load(exc_test)
    assert expected == result
def test_query_lines_from_string(self):
    query_string = '''
        The
        Redistribution and use in source and binary forms, with or without modification, are permitted.

        Always
        is
     '''
    expected = [
        u'',
        u'The',
        u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
        u'',
        u'Always',
        u'is',
        u'',
    ]
    result = [l for _, l in query_lines(query_string=query_string)]
    assert result == expected
def test_query_lines_from_string(self):
    query_string = '''
        The
        Redistribution and use in source and binary forms, with or without modification, are permitted.

        Always
        is
     '''
    expected = [
        u'',
        u'The',
        u'Redistribution and use in source and binary forms, with or without modification, are permitted.',
        u'',
        u'Always',
        u'is',
        u'',
    ]
    result = list(query_lines(query_string=query_string))
    assert expected == result
def test_query_lines_complex(self):
    query_loc = self.get_test_loc('index/querytokens')
    expected = [
        u'',
        u'',
        u'',
        u'Redistribution and use in source and binary forms,',
        u'',
        u'* Redistributions of source code must',
        u'The this that is not there',
        u'Welcom to Jamaica',
        u'* Redistributions in binary form must',
        u'',
        u'THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"',
        u'',
        u'',
        u'',
        u'Redistributions',
    ]
    result = [l for _, l in query_lines(location=query_loc)]
    assert result == expected
def test_query_lines_complex(self):
    query_loc = self.get_test_loc('index/querytokens')
    expected = [
        u'',
        u'',
        u'',
        u'Redistribution and use in source and binary forms,',
        u'',
        u'* Redistributions of source code must',
        u'The this that is not there',
        u'Welcom to Jamaica',
        u'* Redistributions in binary form must',
        u'',
        u'THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"',
        u'',
        u'',
        u'',
        u'Redistributions',
    ]
    result = list(query_lines(location=query_loc))
    assert expected == result
def tokens_by_line(self): """ Yield one sequence of tokens for each line in this query. Populate the query `line_by_pos`, `unknowns_by_pos`, `unknowns_by_pos`, `shorts_and_digits_pos` and `spdx_lines` as a side effect. """ from licensedcode.match_spdx_lid import split_spdx_lid # bind frequently called functions to local scope tokenizer = query_tokenizer line_by_pos_append = self.line_by_pos.append self_unknowns_by_pos = self.unknowns_by_pos unknowns_pos = set() unknowns_pos_add = unknowns_pos.add self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add dic_get = self.idx.dictionary.get # note: positions start at zero # absolute position in a query, including all known and unknown tokens abs_pos = -1 # absolute position in a query, including only known tokens known_pos = -1 # flag ifset to True when we have found the first known token globally # across all query lines started = False spdx_lid_token_ids = self.spdx_lid_token_ids if TRACE: logger_debug('tokens_by_line: query lines') for line_num, line in query_lines(self.location, self.query_string): logger_debug(' ', line_num, ':', line) for line_num, line in query_lines(self.location, self.query_string): # keep track of tokens in a line line_tokens = [] line_tokens_append = line_tokens.append line_first_known_pos = None # FIXME: the implicit update of abs_pos is not clear for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1): tid = dic_get(token) if tid is not None: # this is a known token known_pos += 1 started = True line_by_pos_append(line_num) if len(token) == 1 or token.isdigit(): self_shorts_and_digits_pos_add(known_pos) if line_first_known_pos is None: line_first_known_pos = known_pos else: if not started: # If we have not yet started globally, then all tokens # seen so far are unknowns and we keep a count of them # in the magic "-1" position. self_unknowns_by_pos[-1] += 1 else: # here we have a new unknwon token positioned right after # the current known_pos self_unknowns_by_pos[known_pos] += 1 unknowns_pos_add(known_pos) line_tokens_append(tid) # last known token position in the current line line_last_known_pos = known_pos # ONLY collect as SPDX a line that starts with SPDX License # Identifier. There are cases where this prefix does not start as # the firt tokens such as when we have one or two words (such as a # comment indicator DNL, REM etc.) that start the line and then and # an SPDX license identifier. spdx_start_offset = None if line_tokens[:3] in spdx_lid_token_ids: spdx_start_offset = 0 elif line_tokens[1:4] in spdx_lid_token_ids: spdx_start_offset = 1 elif line_tokens[2:5] in spdx_lid_token_ids: spdx_start_offset = 2 if spdx_start_offset is not None: # keep the line, start/end known pos for SPDX matching spdx_prefix, spdx_expression = split_spdx_lid(line) spdx_text = ' '.join([spdx_prefix or '', spdx_expression]) spdx_start_known_pos = line_first_known_pos + spdx_start_offset if spdx_start_known_pos <= line_last_known_pos: self.spdx_lines.append((spdx_text, spdx_start_known_pos, line_last_known_pos)) yield line_tokens # finally create a Span of positions followed by unkwnons, used # for intersection with the query span for scoring matches self.unknowns_span = Span(unknowns_pos)
def test_query_lines_on_html_like_texts(self, regen=REGEN_TEST_FIXTURES):
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.query_lines.json'
    result = list(query_lines(test_file))
    check_results(result, expected_file, regen=regen)
def tokens_by_line(
    self,
    location=None,
    query_string=None,
    start_line=1,
):
    """
    Yield multiple sequences of tokens, one for each line in this query.
    Line numbers start at ``start_line`` which is 1-based by default.

    SIDE EFFECT: This populates the query `line_by_pos`, `unknowns_by_pos`,
    `unknowns_span`, `stopwords_by_pos`, `shorts_and_digits_pos` and
    `spdx_lines`.
    """
    from licensedcode.match_spdx_lid import split_spdx_lid
    from licensedcode.stopwords import STOPWORDS

    location = location or self.location
    query_string = query_string or self.query_string

    # bind frequently called functions to local scope
    line_by_pos_append = self.line_by_pos.append

    # we use a defaultdict as a convenience at construction time
    unknowns_by_pos = defaultdict(int)
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add

    # we use a defaultdict as a convenience at construction time
    stopwords_by_pos = defaultdict(int)
    stopwords_pos = set()
    stopwords_pos_add = stopwords_pos.add

    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero

    # absolute position in a query, including only known tokens
    known_pos = -1

    # flag set to True when we have found the first known token globally
    # across all query lines
    started = False

    spdx_lid_token_ids = self.spdx_lid_token_ids

    qlines = query_lines(
        location=location,
        query_string=query_string,
        start_line=start_line,
    )
    if TRACE or TRACE_STOP_AND_UNKNOWN:
        logger_debug('tokens_by_line: query lines:')
        qlines = list(qlines)
        for line_num, line in qlines:
            logger_debug(' ', line_num, ':', line)

    for line_num, line in qlines:
        if TRACE_STOP_AND_UNKNOWN:
            logger_debug(f'  line: {line_num}: {line!r}')

        # keep track of tokens in a line
        line_tokens = []
        line_tokens_append = line_tokens.append

        line_first_known_pos = None

        for token in query_tokenizer(line):
            tid = dic_get(token)
            is_stopword = token in STOPWORDS

            if TRACE_STOP_AND_UNKNOWN:
                logger_debug(f'    token: {token!r}, tid: {tid}, is_stopword: {is_stopword}')

            if tid is not None and not is_stopword:
                # this is a known token
                known_pos += 1
                started = True
                line_by_pos_append(line_num)

                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)

                if line_first_known_pos is None:
                    line_first_known_pos = known_pos

                if TRACE_STOP_AND_UNKNOWN:
                    logger_debug(f'      KNOWN token: known_pos: {known_pos}')

            else:
                # process STOPWORDS and unknown words
                if is_stopword:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are stopwords and we keep a count of them
                        # in the magic "-1" position.
                        stopwords_by_pos[-1] += 1

                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(f'      STOPWORD token: known_pos: -1')
                    else:
                        # here we have a new stopword token positioned right
                        # after the current known_pos
                        stopwords_by_pos[known_pos] += 1
                        stopwords_pos_add(known_pos)

                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(f'      STOPWORD token: known_pos: {known_pos}')

                    # we do not track stopwords, only their position
                    continue
                else:
                    # this is an UNKNOWN word
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are unknowns and we keep a count of them
                        # in the magic "-1" position.
                        unknowns_by_pos[-1] += 1

                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(f'      UNKNOWN token: known_pos: -1')
                    else:
                        # here we have a new unknown token positioned right
                        # after the current known_pos
                        unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)

                        if TRACE_STOP_AND_UNKNOWN:
                            logger_debug(f'      UNKNOWN token: known_pos: {known_pos}')

            line_tokens_append(tid)

        # last known token position in the current line
        line_last_known_pos = known_pos

        # ONLY collect as SPDX a line that starts with an SPDX-License-
        # Identifier prefix. There are cases where this prefix does not start
        # with the first tokens, such as when one or two words (such as a
        # comment indicator: DNL, REM, etc.) start the line, followed by an
        # SPDX license identifier.
        spdx_start_offset = None
        if line_tokens[:3] in spdx_lid_token_ids:
            spdx_start_offset = 0
        elif line_tokens[1:4] in spdx_lid_token_ids:
            spdx_start_offset = 1
        elif line_tokens[2:5] in spdx_lid_token_ids:
            spdx_start_offset = 2

        if spdx_start_offset is not None:
            # keep the line, start/end known pos for SPDX matching
            spdx_prefix, spdx_expression = split_spdx_lid(line)
            spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
            spdx_start_known_pos = line_first_known_pos + spdx_start_offset

            if spdx_start_known_pos <= line_last_known_pos:
                self.spdx_lines.append(
                    (spdx_text, spdx_start_known_pos, line_last_known_pos))

        yield line_tokens

    # finally update the attributes and create a Span of positions followed
    # by unknowns and another for positions followed by stopwords, used for
    # intersection with the query span to score matches correctly
    self.unknowns_span = Span(unknowns_pos)

    # also convert the defaultdicts back to plain dicts
    self.unknowns_by_pos = dict(unknowns_by_pos)
    self.stopwords_by_pos = dict(stopwords_by_pos)

    if TRACE_STOP_AND_UNKNOWN:
        logger_debug(f'  self.unknowns_span: {self.unknowns_span}')
        logger_debug(f'  self.unknowns_by_pos: {self.unknowns_by_pos}')
        logger_debug(f'  self.stopwords_by_pos: {self.stopwords_by_pos}')
def test_index_tokenizer_lines_on_html_like_texts_2(self, regen=False):
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.index_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(index_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)
def test_query_lines_on_html_like_texts_2(self, regen=False):
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.query_lines.json'
    result = list(query_lines(test_file))
    check_results(result, expected_file, regen=regen)
def test_key_phrase_tokenizer_on_html_like_texts(self, regen=False):
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.key_phrase_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)
def tokens_by_line(self): """ Yield one sequence of tokens for each line in this query. Populate the query `line_by_pos`, `unknowns_by_pos`, `unknowns_by_pos`, `shorts_and_digits_pos` and `spdx_lines` as a side effect. """ # bind frequently called functions to local scope tokenizer = query_tokenizer line_by_pos_append = self.line_by_pos.append self_unknowns_by_pos = self.unknowns_by_pos unknowns_pos = set() unknowns_pos_add = unknowns_pos.add self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add dic_get = self.idx.dictionary.get # note: positions start at zero # absolute position in a query, including all known and unknown tokens abs_pos = -1 # absolute position in a query, including only known tokens known_pos = -1 started = False spdx_lid_token_ids = self.spdx_lid_token_ids do_collect_spdx_lines = spdx_lid_token_ids is not None if TRACE: logger_debug('tokens_by_line: query lines') for line_num, line in query_lines(self.location, self.query_string): logger_debug(' ', line_num, ':', line) for line_num, line in query_lines(self.location, self.query_string): line_tokens = [] line_tokens_append = line_tokens.append line_start_known_pos = None # FIXME: the implicit update of abs_pos is not clear for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1): tid = dic_get(token) if tid is not None: known_pos += 1 started = True line_by_pos_append(line_num) if len(token) == 1 or token.isdigit(): self_shorts_and_digits_pos_add(known_pos) if line_start_known_pos is None: line_start_known_pos = known_pos else: # we have not yet started if not started: self_unknowns_by_pos[-1] += 1 else: self_unknowns_by_pos[known_pos] += 1 unknowns_pos_add(known_pos) line_tokens_append(tid) line_end_known_pos = known_pos # this works ONLY if the line starts with SPDX or we have one word # (such as acomment indicator DNL, REM etc.) and an SPDX id) if do_collect_spdx_lines and (line_tokens[:3] == spdx_lid_token_ids or line_tokens[1:4] == spdx_lid_token_ids): # keep the line, start/end known pos for SPDX matching self.spdx_lines.append( (line, line_start_known_pos, line_end_known_pos)) yield line_tokens # finally create a Span of positions followed by unkwnons, used # for intersection with the query span for scoring matches self.unknowns_span = Span(unknowns_pos)