def test_numbered_text_lines_from_list_or_location_yield_same_results(self):
    """
    Check that numbered_text_lines() produces the same joined text when given
    a file location or a list of that file's lines, even though the two
    sequences of numbered lines themselves differ.
    """
    test_file = self.get_test_loc('analysis/bsd-new')
    with io.open(test_file, encoding='utf-8') as inf:
        content = inf.read()
    test_strings_list = content.splitlines(True)

    # test when we are passing a location or a list
    lines_from_location = list(numbered_text_lines(location=test_file))
    lines_from_list = list(numbered_text_lines(location=test_strings_list))

    # the numbered lines are not the same and the location yields more of them
    assert lines_from_location != lines_from_list
    assert len(lines_from_location) > len(lines_from_list)

    # ... yet the concatenated text is the same
    text_from_location = ''.join(text for _number, text in lines_from_location)
    text_from_list = ''.join(text for _number, text in lines_from_list)
    assert text_from_location == text_from_list
def query_lines(location=None, query_string=None, strip=True, start_line=1):
    """
    Yield tuples of (line number, text line) for the file at `location` or,
    when no location is given, for the `query_string`. Empty lines are
    included. Line numbers start at ``start_line`` which is 1-based by
    default.

    When `strip` is true each line is stripped of leading and trailing
    whitespace. Otherwise each line is yielded with its trailing newlines
    replaced by a single "\\n".
    """
    # TODO: OPTIMIZE: tokenizing line by line may be rather slow
    # we could instead get lines and tokens at once in a batch?
    if location:
        source = numbered_text_lines(
            location,
            demarkup=False,
            start_line=start_line,
        )
    elif query_string:
        # line endings are only kept when we are not stripping
        source = enumerate(query_string.splitlines(not strip), start_line)
    else:
        source = ()

    if strip:
        for number, text in source:
            yield number, text.strip()
    else:
        for number, text in source:
            yield number, text.rstrip('\n') + '\n'
def file_content(self):
    """
    Return the content of the file at ``self.location`` as a single string,
    built by joining the text lines produced by the TextCode
    ``numbered_text_lines`` utility for optimal compatibility.
    """
    from textcode.analysis import numbered_text_lines

    # only the text of each (line number, text) pair is kept
    return "".join(text for _number, text in numbered_text_lines(self.location))
def test_numbered_text_lines_return_correct_number_of_lines(self):
    """
    Check that numbered_text_lines() returns exactly the two expected
    (line number, text) entries for the 'analysis/correct_lines' test file.
    Both expected entries carry the line number 1.
    """
    test_file = self.get_test_loc('analysis/correct_lines')
    result = list(numbered_text_lines(test_file))

    first_text = (
        'Permission is hereby granted, free of charge, to any person obtaining '
        'a copy of this software and associated documentation files (the "Software"), '
        'to deal in the Software without restriction, including without limitation '
        'the rights to use, copy, modify, merge, , , sublicense, and/or Software, ,'
    )
    second_text = ' subject'
    expected = [
        (1, first_text),
        (1, second_text),
    ]

    assert expected == result
    assert 2 == len(result)
def file_content(self):
    """
    Return the content of the file at ``self.location`` as a single string,
    read through the TextCode ``numbered_text_lines`` utility for optimal
    compatibility. Every line in the returned content ends with a newline.
    """
    from textcode.analysis import numbered_text_lines

    def with_newline(text):
        # ScanCode-toolkit is not providing the "\n" suffix when reading binary files.
        # The following is a workaround until the issue is fixed in the toolkit.
        if text.endswith("\n"):
            return text
        return text + "\n"

    return "".join(
        with_newline(text) for _number, text in numbered_text_lines(self.location)
    )
def text(self):
    """
    Return the rule text loaded from its file.

    When ``self.text_file`` is set and exists, return its lines joined as
    one string. Otherwise return ``self.stored_text`` (used for non-file
    backed rules). Raise an Exception when neither source provides a text.
    """
    if self.text_file and exists(self.text_file):
        # IMPORTANT: use the same process as query text loading for symmetry
        numbered_lines = numbered_text_lines(
            self.text_file,
            demarkup=False,
            plain_text=True,
        )
        return ''.join(l for _, l in numbered_lines)

    # used for non-file backed rules
    if self.stored_text:
        return self.stored_text

    # Format the message instead of concatenating strings: this branch is
    # reached when ``text_file`` is falsy (e.g. None), and concatenating None
    # would raise a TypeError that hides the intended error message.
    raise Exception(
        'Inconsistent rule text for: %s\nfile://%s'
        % (self.identifier, self.text_file)
    )
def find(location, patterns):
    """
    Yield a tuple of (key, found text, text line, line number) for each match
    of `patterns` found in the lines of the file at `location`. The found
    text is the match passed through toascii().

    `patterns` is a list of tuples (key, compiled regex).

    Note: the location can be a list of lines for testing convenience.
    """
    if TRACE:
        from pprint import pformat
        logger_debug(
            'find(location=%(loc)r,\n patterns=%(patterns)r)'
            % {'loc': pformat(location), 'patterns': patterns}
        )

    numbered_lines = analysis.numbered_text_lines(location, demarkup=False)
    for line_number, line in numbered_lines:
        for key, pattern in patterns:
            for match in pattern.findall(line):
                if TRACE:
                    logger_debug(
                        'find: yielding match: key=%(key)r, '
                        'match=%(match)r,\n line=%(line)r'
                        % {'key': key, 'match': match, 'line': line}
                    )
                yield key, toascii(match), line, line_number
def get_scancode_compatible_content(location):
    """
    Return the content of the file at `location` as a single string, built
    from the lines returned by the ScanCode ``numbered_text_lines`` function
    to ensure compatibility and consistency between outputs.
    """
    numbered_lines = numbered_text_lines(location)
    # drop the line numbers and keep only the text of each line
    texts = (text for _number, text in numbered_lines)
    return "".join(texts)
def test_numbered_text_lines_does_not_fail_on_autocad_test_pdf(self):
    """
    Check that numbered_text_lines() runs on the AutoCad diagram test PDF and
    returns no lines for it.
    """
    pdf_location = self.get_test_loc('pdf/AutoCad_Diagram.pdf')
    numbered_lines = list(numbered_text_lines(pdf_location))
    assert [] == numbered_lines
def test_numbered_text_lines_handles_jsmap3(self):
    """
    Check the text lines extracted from the 'ar-ER.js.map' test file against
    the matching '.expected' file using check_text_lines().
    """
    test_file = self.get_test_loc('analysis/jsmap/ar-ER.js.map')
    # keep only the text of each (line number, text) pair
    result = [text for _number, text in numbered_text_lines(test_file)]
    check_text_lines(result, test_file + '.expected')
def test_numbered_text_lines_handles_sfdb(self):
    """
    Check that the text lines extracted from the 'Ambrosia.sfd' test file are
    equal to the lines of the matching '.expected' file.
    """
    test_file = self.get_test_loc('analysis/splinefonts/Ambrosia.sfd')
    # keep only the text of each (line number, text) pair
    result = [line for _, line in numbered_text_lines(test_file)]

    expected_file = test_file + '.expected'
    # use a context manager so that the expected file handle is always closed
    with open(expected_file, 'r') as expected_lines_file:
        expected = expected_lines_file.read().splitlines(True)

    assert expected == result
def test_image_media_do_not_yield_numbered_text_lines(self):
    """
    Check that numbered_text_lines() returns no lines for every file found
    under the 'media_without_text' test directory.
    """
    media_dir = self.get_test_loc('media_without_text')
    media_files = resource_iter(media_dir, with_dirs=False)
    for test_file in media_files:
        numbered_lines = list(numbered_text_lines(test_file))
        assert [] == numbered_lines, 'Should not return text lines:' + test_file
def test_mpg_media_do_not_yield_numbered_text_lines(self):
    """
    Check that numbered_text_lines() returns nothing for every file found
    under the 'media_with_text' test directory.
    """
    media_dir = self.get_test_loc('media_with_text')
    media_files = resource_iter(media_dir, with_dirs=False)
    for test_file in media_files:
        numbered_lines = list(numbered_text_lines(test_file))
        assert not numbered_lines
def test_archives_do_not_yield_numbered_text_lines(self):
    """
    Check that numbered_text_lines() returns no lines for the 'simple.jar'
    test archive.
    """
    archive_location = self.get_test_loc('archive/simple.jar')
    numbered_lines = list(numbered_text_lines(archive_location))
    assert [] == numbered_lines
def test_numbered_text_lines_return_unicode(self):
    """
    Check that every text line returned by numbered_text_lines() for the
    'verify.go' test file is exactly of type str.
    """
    go_location = self.get_test_loc('analysis/verify.go')
    numbered_lines = numbered_text_lines(go_location)
    for _number, text in numbered_lines:
        # exact type check on purpose: a str subclass would not pass
        assert type(text) == str
def test_numbered_text_lines_handles_broken_jsmap_as_plain_text(self):
    """
    Check the text lines extracted from the 'broken.js.map' test file against
    the matching '.expected' file using check_text_lines().
    """
    test_file = self.get_test_loc('analysis/jsmap/broken.js.map')
    # keep only the text of each (line number, text) pair
    result = [text for _number, text in numbered_text_lines(test_file)]
    check_text_lines(result, test_file + '.expected')