def test_extractcode_command_works_with_relative_paths_verbose(monkeypatch):
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import tempfile
    import shutil

    try:
        scancode_root = dirname(dirname(dirname(__file__)))
        scancode_tmp = join(scancode_root, 'tmp')
        fileutils.create_dir(scancode_tmp)
        scancode_root_abs = abspath(scancode_root)
        test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(scancode_root_abs, '').strip('\\/')
        test_file = test_env.get_test_loc('extract_relative_path/basic.zip')
        shutil.copy(test_file, test_src_dir)
        test_src_file = join(test_src_dir, 'basic.zip')

        runner = CliRunner()
        monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
        result = runner.invoke(extract_cli.extractcode, ['--verbose', test_src_file])
        assert result.exit_code == 0
        # extract the path from the second line of the output
        # check that the path is relative and not absolute
        lines = result.output.splitlines(False)
        line = lines[1]
        line_path = line.split(':', 1)[-1].strip()
        if on_windows:
            drive = test_file[:2]
            assert not line_path.startswith(drive)
        else:
            assert not line_path.startswith('/')
    finally:
        fileutils.delete(test_src_dir)
def test_extractcode_command_works_with_relative_paths(monkeypatch):
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import extractcode
    import tempfile
    import shutil

    try:
        scancode_root = dirname(dirname(dirname(__file__)))
        scancode_tmp = join(scancode_root, 'tmp')
        fileutils.create_dir(scancode_tmp)
        scancode_root_abs = abspath(scancode_root)
        test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(scancode_root_abs, '').strip('\\/')
        test_file = test_env.get_test_loc('extract_relative_path/basic.zip')
        shutil.copy(test_file, test_src_dir)
        test_src_file = join(test_src_dir, 'basic.zip')
        test_tgt_dir = join(scancode_root, test_src_file) + extractcode.EXTRACT_SUFFIX

        runner = CliRunner()
        monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
        result = runner.invoke(extract_cli.extractcode, [test_src_file])
        assert result.exit_code == 0
        assert 'Extracting done' in result.output
        assert not 'WARNING' in result.output
        assert not 'ERROR' in result.output
        expected = ['/c/a/a.txt', '/c/b/a.txt', '/c/c/a.txt']
        file_result = [as_posixpath(f.replace(test_tgt_dir, ''))
                       for f in fileutils.resource_iter(test_tgt_dir, with_dirs=False)]
        assert sorted(expected) == sorted(file_result)
    finally:
        fileutils.delete(test_src_dir)
def setup(self):
    """
    Setup the cache: must be called at least once globally after cache
    initialization.
    """
    fileutils.create_dir(self.cache_infos_dir)
    fileutils.create_dir(self.cache_scans_dir)
def get_temp_dir(self, sub_dir_path=None):
    """
    Create a unique new temporary directory location. Create directories
    identified by sub_dir_path if provided in this temporary directory.
    Return the location for this unique directory joined with the
    sub_dir_path if any.
    """
    # ensure that we have a new unique temp directory for each test run
    global test_run_temp_dir
    if not test_run_temp_dir:
        from scancode_config import scancode_root_dir
        test_tmp_root_dir = path.join(scancode_root_dir, 'tmp')
        # now we add a space in the path for testing path with spaces
        test_run_temp_dir = fileutils.get_temp_dir(
            base_dir=test_tmp_root_dir, prefix='scancode-tk-tests -')
    if on_linux and py2:
        test_run_temp_dir = fsencode(test_run_temp_dir)

    test_run_temp_subdir = fileutils.get_temp_dir(
        base_dir=test_run_temp_dir, prefix='')

    if sub_dir_path:
        # create a sub directory hierarchy if requested
        sub_dir_path = to_os_native_path(sub_dir_path)
        test_run_temp_subdir = path.join(test_run_temp_subdir, sub_dir_path)
        fileutils.create_dir(test_run_temp_subdir)
    return test_run_temp_subdir
def remove_backslashes_and_dotdots(directory):
    """
    Walk a directory and rename files whose names contain backslashes or
    '..' segments. Return a list of errors if any.
    """
    if on_linux:
        directory = path_to_bytes(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                new_path = fileutils.as_posixpath(filename)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                segments = new_path.split(POSIX_PATH_SEP)
                directory = os.path.join(top, *segments[:-1])
                fileutils.create_dir(directory)
                shutil.move(os.path.join(top, filename), os.path.join(top, *segments))
            except Exception:
                errors.append(os.path.join(top, filename))
    return errors
def remove_backslashes_and_dotdots(directory):
    """
    Walk a directory and rename files whose names contain backslashes or
    '..' segments. Return a list of errors if any.
    """
    if on_linux:
        directory = fsencode(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                new_path = as_posixpath(filename)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                segments = new_path.split(POSIX_PATH_SEP)
                directory = join(top, *segments[:-1])
                create_dir(directory)
                shutil.move(join(top, filename), join(top, *segments))
            except Exception:
                errors.append(join(top, filename))
    return errors
def test_extract_option_works_with_relative_paths(self):
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import extractcode
    import tempfile
    import shutil

    scancode_root = dirname(dirname(dirname(__file__)))
    scancode_tmp = join(scancode_root, 'tmp')
    fileutils.create_dir(scancode_tmp)
    scancode_root_abs = abspath(scancode_root)
    test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(scancode_root_abs, '').strip('\\/')
    test_file = self.get_test_loc('extract_relative_path/basic.zip')
    shutil.copy(test_file, test_src_dir)
    test_src_file = join(test_src_dir, 'basic.zip')
    test_tgt_dir = join(scancode_root, test_src_file) + extractcode.EXTRACT_SUFFIX

    runner = CliRunner()
    result = runner.invoke(cli.scancode, ['--extract', test_src_file])
    assert result.exit_code == 0
    assert 'Extracting done' in result.output
    assert not 'WARNING' in result.output
    assert not 'ERROR' in result.output
    expected = ['/c/a/a.txt', '/c/b/a.txt', '/c/c/a.txt']
    file_result = [as_posixpath(f.replace(test_tgt_dir, ''))
                   for f in fileutils.file_iter(test_tgt_dir)]
    assert sorted(expected) == sorted(file_result)
def remove_backslashes(directory):
    """
    Walk a directory and rename files whose names contain backslashes or
    '..' segments. Return a list of errors if any.
    """
    errors = []
    for top, _, files in os.walk(str(directory)):
        for filename in files:
            if '\\' in filename or '..' in filename:
                try:
                    new_path = fileutils.as_posixpath(filename)
                    new_path = new_path.strip('/')
                    new_path = posixpath.normpath(new_path)
                    new_path = new_path.replace('..', '/')
                    new_path = new_path.strip('/')
                    new_path = posixpath.normpath(new_path)
                    segments = new_path.split('/')
                    directory = os.path.join(top, *segments[:-1])
                    fileutils.create_dir(directory)
                    shutil.move(os.path.join(top, filename), os.path.join(top, *segments))
                except Exception:
                    errors.append(os.path.join(top, filename))
    return errors
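# A standalone, hypothetical sketch (not part of the ScanCode sources) of the
# path cleanup applied by remove_backslashes() above: a single file name that
# embeds backslashes is converted to a POSIX path and split into real
# directory segments before the file is moved into place.
import posixpath

filename = 'docs\\api\\index.html'  # one file name containing backslashes
new_path = filename.replace('\\', '/')  # roughly what fileutils.as_posixpath() does
new_path = posixpath.normpath(new_path.strip('/'))
segments = new_path.split('/')  # ['docs', 'api', 'index.html']
# the file would then be moved to <top>/docs/api/index.html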
def collect_and_rebuild_rootfs_v10(location, echo=print, layerid_len=DEFAULT_ID_LEN):
    """
    Collect all images in a directory tree. Extract and merge the layers
    side-by-side with the image directory, using an extract suffix.
    """
    import extractcode

    all_wh = {}
    # FIXME: we should instead receive a list of images....
    for loc, image in collect_images_v10(location, echo, layerid_len=layerid_len).items():
        extract_target = loc.rstrip('\\/') + extractcode.EXTRACT_SUFFIX
        fileutils.create_dir(extract_target)
        echo('Extracting/merging and building rootfs from layers for Docker image %(loc)r \n to: %(extract_target)r' % locals())
        errors, whiteouts = rebuild_rootfs(image, extract_target, layerid_len=layerid_len)
        if whiteouts:
            echo('Files deleted while extract/merging layers for Docker image %(loc)r:' % locals())
            all_wh[loc] = whiteouts
            for w in whiteouts:
                echo(' ' + w)
        if errors:
            echo('Extraction error for layers of Docker image %(loc)r:' % locals())
            for e in errors:
                echo(' ' + e)
    return all_wh
def test_tree_checksum_ignores_some_files_and_directories(self):
    test_dir = self.get_test_loc('cache/tree', copy=True)
    before = cache.tree_checksum(test_dir)
    # create some new pyc file and a dir
    with open(os.path.join(test_dir, 'some.pyc'), 'wb') as pyc:
        pyc.write('')
    fileutils.create_dir(os.path.join(test_dir, 'some dir'))
    after = cache.tree_checksum(test_dir)
    assert before == after

    with open(os.path.join(test_dir, 'some.py'), 'wb') as py:
        py.write(' ')
    after = cache.tree_checksum(test_dir)
    assert before != after

    before = after
    with open(os.path.join(test_dir, 'some.LICENSE'), 'wb') as f:
        f.write(' ')
    after = cache.tree_checksum(test_dir)
    assert before != after

    before = after
    with open(os.path.join(test_dir, 'some.LICENSE~'), 'wb') as f:
        f.write(' ')
    after = cache.tree_checksum(test_dir)
    assert before == after

    with open(os.path.join(test_dir, 'some.LICENSE.swp'), 'wb') as f:
        f.write(' ')
    after = cache.tree_checksum(test_dir)
    assert before == after
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) built from a cache keys
    triple and a base_directory. Ensure that the parent directory exists.
    """
    dir1, dir2, fname = keys
    parent = os.path.join(base_path, dir1, dir2)
    fileutils.create_dir(parent)
    return parent, fname
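# A minimal usage sketch, assuming the paths_from_keys() defined above and an
# importable commoncode.fileutils; the key triple is hypothetical, e.g. the
# pieces of a sha1 hex digest split for a two-level directory fan-out.
import os

keys = ('ab', 'cd', 'ef0123456789abcdef')
parent, fname = paths_from_keys('/tmp/scan-cache', keys)
cache_loc = os.path.join(parent, fname)
# cache_loc == '/tmp/scan-cache/ab/cd/ef0123456789abcdef'
# and the '/tmp/scan-cache/ab/cd' directory now exists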
def test_tree_checksum_does_not_ignore_the_index_cache(self):
    # this is stored in the code tree as package data and we should not
    # ignore it
    test_dir = self.get_test_loc('cache/tree', copy=True)
    before = cache.tree_checksum(test_dir)
    # create a file named like the index
    with open(os.path.join(test_dir, cache.LICENSE_INDEX_FILENAME), 'w') as pyc:
        pyc.write(' ')
    fileutils.create_dir(os.path.join(test_dir, 'some dir'))
    after = cache.tree_checksum(test_dir)
    assert after != before
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory.
    """
    # create a unique temp directory in cache_dir
    fileutils.create_dir(cache_dir)
    # ensure that the cache dir is always unicode
    cache_dir = fileutils.get_temp_dir(
        unicode(cache_dir), prefix=unicode(timeutils.time2tstamp()) + u'-')
    sc = ScanFileCache(cache_dir)
    sc.setup()
    return partial(ScanFileCache, cache_dir)
def write(self, target_dir, transform_path=lambda x: x):
    """
    Write entry to a file or directory saved relatively to the `target_dir`
    and return the path where the file or directory was written or None if
    nothing was written to disk. `transform_path` is a callable taking a path
    and returning a transformed path such as resolving relative paths,
    transliterating non-portable characters or other path transformations.
    The default is a no-op lambda.
    """
    if not self.archive.archive_struct:
        raise ArchiveErrorIllegalOperationOnClosedArchive()
    # skip links and special files
    if not (self.isfile or self.isdir):
        return

    abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))
    # TODO: return some warning when original path has been transformed
    clean_path = transform_path(self.path)

    if self.isdir:
        # TODO: also rename directories to a new name if needed segment by segment
        dir_path = os.path.join(abs_target_dir, clean_path)
        fileutils.create_dir(dir_path)
        return dir_path

    # note: here isfile=True
    try:
        # create parent directories if needed
        target_path = os.path.join(abs_target_dir, clean_path)
        parent_path = os.path.dirname(target_path)
        # TODO: also rename directories to a new name if needed segment by segment
        fileutils.create_dir(parent_path)

        # TODO: return some warning when original path has been renamed?
        unique_path = extractcode.new_name(target_path, is_dir=False)

        chunk_len = 10240
        sbuffer = create_string_buffer(chunk_len)
        with open(unique_path, 'wb') as target:
            chunk_size = 1
            while chunk_size:
                chunk_size = read_entry_data(self.archive.archive_struct,
                                             sbuffer, chunk_len)
                data = sbuffer.raw[0:chunk_size]
                target.write(data)
        os.utime(unique_path, (self.time, self.time))
        return target_path

    except ArchiveWarning, aw:
        msg = aw.args and '\n'.join(aw.args) or 'No message provided.'
        if msg not in self.warnings:
            self.warnings.append(msg)
        return target_path
def write(self, target_dir, transform_path=lambda x: x):
    """
    Write entry to a file or directory saved relatively to the `target_dir`
    and return the path where the file or directory was written or None if
    nothing was written to disk. `transform_path` is a callable taking a path
    and returning a transformed path such as resolving relative paths,
    transliterating non-portable characters or other path transformations.
    The default is a no-op lambda.
    """
    if not self.archive.archive_struct:
        raise ArchiveErrorIllegalOperationOnClosedArchive()
    # skip links and special files
    if not (self.isfile or self.isdir):
        return

    abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))
    # TODO: return some warning when original path has been transformed
    clean_path = transform_path(self.path)

    if self.isdir:
        # TODO: also rename directories to a new name if needed segment by segment
        dir_path = os.path.join(abs_target_dir, clean_path)
        fileutils.create_dir(dir_path)
        return dir_path

    # note: here isfile=True
    try:
        # create parent directories if needed
        target_path = os.path.join(abs_target_dir, clean_path)
        parent_path = os.path.dirname(target_path)
        # TODO: also rename directories to a new name if needed segment by segment
        fileutils.create_dir(parent_path)

        # TODO: return some warning when original path has been renamed?
        unique_path = extractcode.new_name(target_path, is_dir=False)

        chunk_len = 10240
        sbuffer = create_string_buffer(chunk_len)
        with open(unique_path, 'wb') as target:
            chunk_size = 1
            while chunk_size:
                chunk_size = read_entry_data(self.archive.archive_struct,
                                             sbuffer, chunk_len)
                data = sbuffer.raw[0:chunk_size]
                target.write(data)
        os.utime(unique_path, (self.time, self.time))
        return target_path

    except ArchiveWarning as aw:
        msg = aw.args and '\n'.join(aw.args) or 'No message provided.'
        if msg not in self.warnings:
            self.warnings.append(msg)
        return target_path
def get_license_cache_paths(cache_dir=scancode_cache_dir):
    """
    Return a tuple of index cache files given a master `cache_dir`.
    """
    idx_cache_dir = join(cache_dir, 'license_index')
    create_dir(idx_cache_dir)
    lock_file = join(idx_cache_dir, 'lockfile')
    checksum_file = join(idx_cache_dir, 'tree_checksums')
    cache_file = join(idx_cache_dir, 'index_cache')
    return lock_file, checksum_file, cache_file
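# A minimal usage sketch, assuming the get_license_cache_paths() variant
# defined above: all three paths live under <cache_dir>/license_index/ and
# that directory is created as a side effect of the call.
lock_file, checksum_file, cache_file = get_license_cache_paths('/tmp/scancode-cache')
# lock_file     -> /tmp/scancode-cache/license_index/lockfile
# checksum_file -> /tmp/scancode-cache/license_index/tree_checksums
# cache_file    -> /tmp/scancode-cache/license_index/index_cache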
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory.
    """
    # create a unique temp directory in cache_dir
    fileutils.create_dir(cache_dir)
    prefix = timeutils.time2tstamp() + u'-'
    cache_dir = fileutils.get_temp_dir(cache_dir, prefix=prefix)
    if on_linux:
        cache_dir = path_to_bytes(cache_dir)
    sc = ScanFileCache(cache_dir)
    sc.setup()
    return partial(ScanFileCache, cache_dir)
def extract(location, target_dir):
    """
    Extract each patch of a patch file at `location` as files in a target_dir
    directory tree mimicking the directory in which the patches would be
    applied with the patch command.

    This treats a patch file as if it were an archive containing one file for
    each patch applied to a file to be patched.

    Return a list of warning messages. Raise Exception on errors.
    """
    for source, target, text in patch_info(location):
        # prefer the target path for writing the patch text to a subfile
        # unless target is /dev/null (a deletion)
        if '/dev/null' in target:
            patch_subfile_path = source
        else:
            patch_subfile_path = target

        # make the path safe to use as a subfile path
        # ensure this is a good and clean posix relative path
        patch_subfile_path = paths.safe_path(patch_subfile_path)

        # create directories
        parent_dir = posixpath.dirname(patch_subfile_path)
        parent_target_dir = os.path.join(target_dir, parent_dir)
        fileutils.create_dir(parent_target_dir)

        # find a unique name using a simple counter
        base_subfile_path = os.path.join(target_dir, patch_subfile_path)
        counter = 0
        fp = base_subfile_path
        while os.path.exists(fp + extractcode.EXTRACT_SUFFIX):
            fp = base_subfile_path + '_%d' % counter
            counter += 1
        base_subfile_path = fp

        # write the location proper, with a suffix extension to avoid
        # recursive extraction
        if py2:
            mode = 'wb'
            eol = b'\n'
        if py3:
            mode = 'w'
            eol = u'\n'
        subfile_path = base_subfile_path + extractcode.EXTRACT_SUFFIX
        with open(subfile_path, mode) as subfile:
            subfile.write(eol.join(text))

    return []
def save_results(scanners, only_findings, files_count, results, format, options, input, output_file):
    """
    Save scan results to file or screen.
    """
    if only_findings:
        # Find all scans that are both enabled and have a valid function
        # reference. This deliberately filters out the "info" scan (which
        # always has a "None" function reference) as there is no dedicated
        # "infos" key in the results that "has_findings()" could check.
        active_scans = [k for k, v in scanners.items() if v[0] and v[1]]

        # FIXME: this is forcing all the scan results to be loaded in memory
        # and defeats lazy loading from cache
        results = [file_data for file_data in results
                   if has_findings(active_scans, file_data)]
        # FIXME: computing len beforehand requires a list and therefore
        # loading it all ahead of time
        files_count = len(results)

    # note: in tests, sys.stdout is not used, but some io wrapper with no
    # name attributes
    is_real_file = hasattr(output_file, 'name')

    if output_file != sys.stdout and is_real_file:
        parent_dir = os.path.dirname(output_file.name)
        if parent_dir:
            fileutils.create_dir(abspath(expanduser(parent_dir)))

    if format not in formats:
        # render using a user-provided custom format template
        if not os.path.isfile(format):
            echo_stderr('\nInvalid template passed.', fg='red')
        else:
            for template_chunk in as_template(results, template=format):
                try:
                    output_file.write(template_chunk)
                except Exception as e:
                    extra_context = 'ERROR: Failed to write output to HTML for: ' + repr(template_chunk)
                    echo_stderr(extra_context, fg='red')
                    e.args += (extra_context,)
                    raise e
        return

    write_formatted_output(scanners, files_count, version, notice, results,
                           format, options, input, output_file, echo_stderr)
def save_results(scanners, files_count, results, format, options, input, output_file):
    """
    Save scan results to file or screen.
    """
    # note: in tests, sys.stdout is not used, but is instead some io
    # wrapper with no name attributes. We use this to check if this is a
    # real filesystem file or not.
    # note: sys.stdout.name == '<stdout>' so it has a name.
    is_real_file = hasattr(output_file, 'name')

    if output_file != sys.stdout and is_real_file:
        # we are writing to a real filesystem file: create directories!
        parent_dir = os.path.dirname(output_file.name)
        if parent_dir:
            fileutils.create_dir(abspath(expanduser(parent_dir)))

    # Write scan results to file or screen as a formatted output ...
    # ... using a user-provided custom format template
    format_plugins = plugincode.output.get_format_plugins()
    if format not in format_plugins:
        # format may be a custom template file path
        if not os.path.isfile(format):
            # this check was done before in the CLI validation, but this
            # is done again if the function is used directly
            echo_stderr('\nInvalid template: must be a file.', fg='red')
        else:
            from formattedcode import format_templated
            # FIXME: carrying an echo function does not make sense
            format_templated.write_custom(results, output_file,
                                          _echo=echo_stderr, version=version,
                                          template_path=format)

    # ... or using the selected format plugin
    else:
        writer = format_plugins[format]
        # FIXME: carrying an echo function does not make sense
        # FIXME: do not use input as a variable name
        writer(files_count=files_count, version=version, notice=notice,
               scanned_files=results, options=options, input=input,
               output_file=output_file, _echo=echo_stderr)
def setup_vscode():
    """
    Add base settings for .vscode
    """
    from scancode_config import scancode_root_dir
    from commoncode.fileutils import create_dir
    from commoncode.fileutils import copyfile

    settings = os.path.join(scancode_root_dir, 'etc', 'vscode', 'settings.json')
    if os.path.exists(settings):
        vscode = os.path.join(scancode_root_dir, '.vscode')
        create_dir(vscode)
        copyfile(settings, vscode)
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) for a cache entry built
    from a cache keys triple and a base_directory. Ensure that the parent
    directory exists.
    """
    if on_linux:
        keys = [path_to_bytes(k) for k in keys]
        base_path = path_to_bytes(base_path)
    else:
        keys = [path_to_unicode(k) for k in keys]
        base_path = path_to_unicode(base_path)

    dir1, dir2, file_name = keys
    parent = os.path.join(base_path, dir1, dir2)
    fileutils.create_dir(parent)
    return parent, file_name
def get_license_cache_paths(
    licensedcode_cache_dir=licensedcode_cache_dir,
    scancode_cache_dir=scancode_cache_dir,
):
    """
    Return a tuple of index cache file paths given a `licensedcode_cache_dir`
    and a `scancode_cache_dir`.
    """
    idx_cache_dir = os.path.join(licensedcode_cache_dir, LICENSE_INDEX_DIR)
    create_dir(idx_cache_dir)
    cache_file = os.path.join(idx_cache_dir, LICENSE_INDEX_FILENAME)

    lock_file = os.path.join(scancode_cache_dir, 'scancode_license_index_lockfile')
    checksum_file = os.path.join(scancode_cache_dir, 'scancode_license_index_tree_checksums')

    return lock_file, checksum_file, cache_file
def extract(location, target_dir):
    """
    Extract each patch of a patch file at `location` as files in a target_dir
    directory tree mimicking the directory in which the patches would be
    applied with the patch command.

    This treats a patch file as if it were an archive containing one file for
    each patch applied to a file to be patched.

    Return a list of warning messages. Raise Exception on errors.
    """
    for source, target, text in patch_info(location):
        # prefer the target path for writing the patch text to a subfile
        # unless target is /dev/null (a deletion)
        if '/dev/null' in target:
            patch_subfile_path = source
        else:
            patch_subfile_path = target

        # make the path safe to use as a subfile path
        # ensure this is a good and clean posix relative path
        patch_subfile_path = paths.safe_path(patch_subfile_path)

        # create directories
        parent_dir = posixpath.dirname(patch_subfile_path)
        parent_target_dir = os.path.join(target_dir, parent_dir)
        fileutils.create_dir(parent_target_dir)

        # find a unique name using a simple counter
        base_subfile_path = os.path.join(target_dir, patch_subfile_path)
        counter = 0
        fp = base_subfile_path
        while os.path.exists(fp + extractcode.EXTRACT_SUFFIX):
            fp = base_subfile_path + '_%d' % counter
            counter += 1
        base_subfile_path = fp

        # write the location proper, with a suffix extension to avoid
        # recursive extraction
        subfile_path = base_subfile_path + extractcode.EXTRACT_SUFFIX
        with open(subfile_path, 'wb') as subfile:
            subfile.write(u'\n'.join(text))

    return []
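# A standalone sketch of the unique-name strategy used in extract() above:
# append a counter to the base path until no previously extracted sibling
# exists. The '-extract' suffix and the paths here are illustrative
# assumptions, not taken from extractcode.
import os

base_subfile_path = '/tmp/patches-out/feature.c'
counter = 0
fp = base_subfile_path
while os.path.exists(fp + '-extract'):
    fp = base_subfile_path + '_%d' % counter
    counter += 1
# fp + '-extract' is now a path that does not exist yet and can be written to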
def save_results(scanners, files_count, results, format, options, input, output_file):
    """
    Save scan results to file or screen.
    """
    # note: in tests, sys.stdout is not used, but is instead some io
    # wrapper with no name attributes. We use this to check if this is a
    # real filesystem file or not.
    # note: sys.stdout.name == '<stdout>' so it has a name.
    is_real_file = hasattr(output_file, 'name')

    if output_file != sys.stdout and is_real_file:
        # we are writing to a real filesystem file: create directories!
        parent_dir = os.path.dirname(output_file.name)
        if parent_dir:
            fileutils.create_dir(abspath(expanduser(parent_dir)))

    # Write scan results to file or screen as a formatted output ...
    # ... using a user-provided custom format template
    format_plugins = plugincode.output.get_format_plugins()
    if format not in format_plugins:
        # format may be a custom template file path
        if not os.path.isfile(format):
            # this check was done before in the CLI validation, but this
            # is done again if the function is used directly
            echo_stderr('\nInvalid template: must be a file.', fg='red')
        else:
            from formattedcode import format_templated
            # FIXME: carrying an echo function does not make sense
            format_templated.write_custom(
                results, output_file, _echo=echo_stderr, template_path=format)

    # ... or using the selected format plugin
    else:
        writer = format_plugins[format]
        # FIXME: carrying an echo function does not make sense
        # FIXME: do not use input as a variable name
        writer(files_count=files_count, version=version, notice=notice,
               scanned_files=results, options=options, input=input,
               output_file=output_file, _echo=echo_stderr)
def get_temp_dir(self, sub_dir_path=None):
    """
    Create a unique new temporary directory location. Create directories
    identified by sub_dir_path if provided in this temporary directory.
    Return the location for this unique directory joined with the
    sub_dir_path if any.
    """
    # ensure that we have a new unique temp directory for each test run
    global test_run_temp_dir
    if not test_run_temp_dir:
        test_run_temp_dir = fileutils.get_temp_dir(base_dir='tst', prefix=' ')

    new_temp_dir = fileutils.get_temp_dir(base_dir=test_run_temp_dir)

    if sub_dir_path:
        # create a sub directory hierarchy if requested
        sub_dir_path = to_os_native_path(sub_dir_path)
        new_temp_dir = os.path.join(new_temp_dir, sub_dir_path)
        fileutils.create_dir(new_temp_dir)
    return new_temp_dir
def test_extract_can_extract_to_relative_paths(self):
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code
    from os.path import dirname, join, abspath
    scancode_root = dirname(dirname(dirname(__file__)))
    scancode_tmp = join(scancode_root, 'tmp')
    fileutils.create_dir(scancode_tmp)
    scancode_root_abs = abspath(scancode_root)
    import tempfile
    test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(scancode_root_abs, '').strip('\\/')
    test_file = self.get_test_loc('extract/relative_path/basic.zip')
    import shutil
    shutil.copy(test_file, test_src_dir)
    test_src_file = join(test_src_dir, 'basic.zip')
    test_tgt_dir = join(scancode_root, test_src_file) + extractcode.EXTRACT_SUFFIX

    result = list(extract.extract(test_src_file))
    expected = ['c/a/a.txt', 'c/b/a.txt', 'c/c/a.txt']
    check_files(test_tgt_dir, expected)
    for r in result:
        assert [] == r.warnings
        assert [] == r.errors
def test_libarchive_extract_can_extract_to_relative_paths(self):
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    import tempfile
    import shutil
    from extractcode.libarchive2 import extract

    test_file = self.get_test_loc('archive/relative_path/basic.zip')
    scancode_root = dirname(dirname(dirname(__file__)))
    scancode_tmp = join(scancode_root, 'tmp')
    fileutils.create_dir(scancode_tmp)
    scancode_root_abs = abspath(scancode_root)
    test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(
        scancode_root_abs, '').strip('\\/')
    test_tgt_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(
        scancode_root_abs, '').strip('\\/')
    shutil.copy(test_file, test_src_dir)
    test_src_file = join(test_src_dir, 'basic.zip')
    result = list(extract(test_src_file, test_tgt_dir))
    assert [] == result
    expected = ['c/a/a.txt', 'c/b/a.txt', 'c/c/a.txt']
    check_files(test_tgt_dir, expected)
def extract_file_by_file(location, target_dir, arch_type='*', skip_symlinks=True):
    """
    Extract all files using a one-by-one process from a 7zip-supported archive
    file at location in the `target_dir` directory.

    Return a list of warning messages if any or an empty list.
    Raise exception on errors.

    `arch_type` is the type of 7zip archive passed to the -t 7zip option.
    Can be None.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))

    entries, errors_msgs = list_entries(location, arch_type)
    entries = list(entries)

    # Determine if we need a one-by-one approach: technically the approach is
    # to check if we have files that are in the same dir and have the same
    # name when the case is ignored. We take a simpler approach: we check if
    # all paths are unique when we ignore the case: for that we only check
    # that the length of two paths sets are the same: one set as-is and the
    # other lowercased.
    paths_as_is = set(e.path for e in entries)
    paths_no_case = set(p.lower() for p in paths_as_is)
    need_by_file = len(paths_as_is) != len(paths_no_case)

    if not need_by_file:
        # use regular extract
        return extract_all_files_at_once(location=location, target_dir=target_dir, arch_type=arch_type)

    # now we are extracting one file at a time. this is a tad painful because
    # we are dealing with a full command execution each time.
    errors = {}
    warnings = {}
    tmp_dir = fileutils.get_temp_dir(prefix='extractcode-extract-')
    for i, entry in enumerate(entries):
        if not entry.is_file:
            continue

        tmp_extract_dir = os.path.join(tmp_dir, str(i))
        fileutils.create_dir(tmp_extract_dir)

        ex_args = build_7z_extract_command(
            location=location,
            target_dir=tmp_extract_dir,
            single_entry=entry,
            arch_type=arch_type,
        )
        rc, stdout, stderr = command.execute2(**ex_args)

        error = get_7z_errors(stdout, stderr)
        if error or rc != 0:
            error = error or UNKNOWN_ERROR
            if TRACE:
                logger.debug(
                    'extract: failure: {rc}\n'
                    'stderr: {stderr}\nstdout: {stdout}'.format(**locals()))
            errors[entry.path] = error
            continue

        # these are all for a single file path
        warns = get_7z_warnings(stdout) or {}
        wmsg = '\n'.join(warns.values())
        if wmsg:
            if entry.path in warnings:
                warnings[entry.path] += '\n' + wmsg
            else:
                warnings[entry.path] = wmsg

        # finally move that extracted file to its target location, possibly renamed
        source_file_name = fileutils.file_name(entry.path)
        source_file_loc = os.path.join(tmp_extract_dir, source_file_name)
        if not os.path.exists(source_file_loc):
            if entry.path in errors:
                errors[entry.path] += '\nNo file name extracted.'
            else:
                errors[entry.path] = 'No file name extracted.'
            continue

        safe_path = paths.safe_path(entry.path, posix=True)
        target_file_loc = os.path.join(target_dir, safe_path)
        target_file_dir = os.path.dirname(target_file_loc)
        fileutils.create_dir(target_file_dir)

        unique_target_file_loc = extractcode.new_name(target_file_loc, is_dir=False)

        if TRACE:
            logger.debug(
                'extract: unique_target_file_loc: from {} to {}'.format(
                    target_file_loc, unique_target_file_loc))

        if os.path.isfile(source_file_loc):
            fileutils.copyfile(source_file_loc, unique_target_file_loc)
        else:
            fileutils.copytree(source_file_loc, unique_target_file_loc)

    extractcode.remove_backslashes_and_dotdots(abs_target_dir)
    if errors:
        raise ExtractErrorFailedToExtract(errors)

    return convert_warnings_to_list(warnings)
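# A standalone sketch of the collision test used by extract_file_by_file()
# above, on plain path strings: one-by-one extraction is only needed when two
# archive paths collide once case is ignored (as they would on a
# case-insensitive filesystem). The paths are illustrative.
paths_as_is = {'docs/README', 'docs/readme', 'src/main.c'}
paths_no_case = {p.lower() for p in paths_as_is}
need_by_file = len(paths_as_is) != len(paths_no_case)
assert need_by_file  # 'docs/README' and 'docs/readme' collide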
from os.path import dirname
from os.path import abspath
from os.path import getsize
from os.path import getmtime
from os.path import join
from os.path import exists

from commoncode import fileutils

lic_src_dir = abspath(dirname(__file__))
src_dir = dirname(lic_src_dir)
data_dir = join(lic_src_dir, 'data')
licenses_data_dir = join(data_dir, 'licenses')
rules_data_dir = join(data_dir, 'rules')
root_dir = dirname(src_dir)
cache_dir = join(root_dir, '.cache')
license_index_cache_dir = join(cache_dir, 'license_index')

if not exists(license_index_cache_dir):
    fileutils.create_dir(license_index_cache_dir)

# minimum number of tokens a match should have to be considered as worthy keeping
MIN_MATCH_LENGTH = 4
MIN_MATCH_HIGH_LENGTH = 3

# maximum distance between two matches to merge
MAX_DIST = 120
from __future__ import print_function
from __future__ import absolute_import

from os.path import dirname
from os.path import abspath
from os.path import getsize
from os.path import getmtime
from os.path import join
from os.path import exists

from commoncode import fileutils

scan_src_dir = abspath(dirname(__file__))
src_dir = dirname(scan_src_dir)
root_dir = dirname(src_dir)
cache_dir = join(root_dir, '.cache')
scans_cache_dir = join(cache_dir, 'scan_results_caches')

if not exists(scans_cache_dir):
    fileutils.create_dir(scans_cache_dir)

from pkg_resources import get_distribution, DistributionNotFound
try:
    __version__ = get_distribution('scancode-toolkit').version
except DistributionNotFound:
    # package is not installed ??
    __version__ = '2.0.0'
from __future__ import print_function
from __future__ import absolute_import

from os.path import dirname
from os.path import abspath
from os.path import getsize
from os.path import getmtime
from os.path import join
from os.path import exists

from commoncode import fileutils

scan_src_dir = abspath(dirname(__file__))
src_dir = dirname(scan_src_dir)
root_dir = dirname(src_dir)
cache_dir = join(root_dir, '.cache')
scans_cache_dir = join(cache_dir, 'scan_results_caches')

if not exists(scans_cache_dir):
    fileutils.create_dir(scans_cache_dir)

from pkg_resources import get_distribution, DistributionNotFound
try:
    __version__ = get_distribution('scancode-toolkit').version
except DistributionNotFound:
    # package is not installed ??
    __version__ = '2.2.1'
def load_or_build(
    licensedcode_cache_dir=licensedcode_cache_dir,
    scancode_cache_dir=scancode_cache_dir,
    check_consistency=SCANCODE_DEV_MODE,
    # used for testing only
    timeout=LICENSE_INDEX_LOCK_TIMEOUT,
    tree_base_dir=scancode_src_dir,
    licenses_data_dir=None,
    rules_data_dir=None,
):
    """
    Load or build and save and return a LicenseCache object.

    We either load a cached LicenseIndex or build and cache the index. On the
    side, we load cached or build license db, SPDX symbols and other
    license-related data structures.

    - If the cache does not exist, a new index is built and cached.
    - If `check_consistency` is True, the cache is checked for consistency
      and rebuilt if inconsistent or stale.
    - If `check_consistency` is False, the cache is NOT checked for
      consistency and if the cache files exist but ARE stale, the cache WILL
      NOT be rebuilt.
    """
    idx_cache_dir = os.path.join(licensedcode_cache_dir, LICENSE_INDEX_DIR)
    create_dir(idx_cache_dir)
    cache_file = os.path.join(idx_cache_dir, LICENSE_INDEX_FILENAME)

    has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file)
    # bypass check if no consistency check is needed
    if has_cache and not check_consistency:
        try:
            return load_cache_file(cache_file)
        except Exception as e:
            # work around some rare Windows quirks
            import traceback
            print('Inconsistent License cache: checking and rebuilding index.')
            print(str(e))
            print(traceback.format_exc())

    from licensedcode.models import licenses_data_dir as ldd
    from licensedcode.models import rules_data_dir as rdd
    from licensedcode.models import load_licenses
    from scancode import lockfile

    licenses_data_dir = licenses_data_dir or ldd
    rules_data_dir = rules_data_dir or rdd

    lock_file = os.path.join(scancode_cache_dir, LICENSE_LOCKFILE_NAME)
    checksum_file = os.path.join(scancode_cache_dir, LICENSE_CHECKSUM_FILE)

    has_tree_checksum = os.path.exists(checksum_file)

    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with lockfile.FileLock(lock_file).locked(timeout=timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                with open(checksum_file) as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data:
                    # load and return
                    return load_cache_file(cache_file)

            # Here, the cache is not consistent with the latest code and
            # data: it is either stale or non-existing. We need to rebuild
            # all cached data (mostly the index) and cache it.
            licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)

            index = build_index(
                licenses_db=licenses_db,
                licenses_data_dir=licenses_data_dir,
                rules_data_dir=rules_data_dir,
            )

            spdx_symbols = build_spdx_symbols(licenses_db=licenses_db)
            unknown_spdx_symbol = build_unknown_spdx_symbol(licenses_db=licenses_db)
            licensing = build_licensing(licenses_db=licenses_db)

            license_cache = LicenseCache(
                db=licenses_db,
                index=index,
                licensing=licensing,
                spdx_symbols=spdx_symbols,
                unknown_spdx_symbol=unknown_spdx_symbol,
            )

            # save the cache as a pickle and save the new tree checksum
            with open(cache_file, 'wb') as fn:
                pickle.dump(license_cache, fn, protocol=PICKLE_PROTOCOL)

            current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
            with open(checksum_file, 'w') as ctcs:
                ctcs.write(current_checksum)

            return license_cache

    except lockfile.LockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
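# A hypothetical helper (not part of licensedcode) sketching the staleness
# check performed by load_or_build() above: the cached index is reused only
# when the stored tree checksum matches the checksum of the current code and
# data tree.
def is_cache_fresh(checksum_file, current_checksum):
    try:
        with open(checksum_file) as f:
            return f.read() == current_checksum
    except IOError:
        # no stored checksum means the cache cannot be trusted
        return False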
            )
        if spdx_text:
            yield Rule(
                text_file=join(license_obj.src_dir, license_obj.spdx_file),
                licenses=[license_key],
            )


text_tknzr, template_tknzr, _ = index.tokenizers()

# token caching
cache_dir = join(dirname(dirname(src_dir)), '.cache', 'license_tokens')
if not os.path.exists(cache_dir):
    fileutils.create_dir(cache_dir)


class RuleWithNoTokenError(Exception):
    pass


def get_tokens(location, template, use_cache=False):
    """
    Return a list of tokens from a file at location using the tokenizer
    function.
    """
    location = os.path.abspath(location)
    if not exists(location):
        raise RuleWithNoTokenError('Rule text location does not exist: %(location)r' % locals())
        # return []
def dump(self):
    parent = fileutils.parent_directory(self.data_file)
    if not exists(parent):
        fileutils.create_dir(parent)
    with open(self.data_file, 'w') as df:
        df.write(saneyaml.dump(self.to_dict()))
def write(self, target_dir, transform_path=lambda x: x, skip_links=True):
    """
    Write entry to a file or directory saved relatively to the `target_dir`
    and return the path where the file or directory was written or None if
    nothing was written to disk. `transform_path` is a callable taking a path
    and returning a transformed path such as resolving relative paths,
    transliterating non-portable characters or other path transformations.
    The default is a no-op lambda.
    """
    if TRACE:
        logger.debug('writing entry: {}'.format(self))

    if not self.archive.archive_struct:
        raise ArchiveErrorIllegalOperationOnClosedArchive()

    # skip links and special files
    if not (self.isfile or self.isdir):
        return

    if skip_links and self.issym:
        return

    if not skip_links and self.issym:
        raise NotImplementedError(
            'extraction of sym links with libarchive is not yet implemented.')

    abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))
    # TODO: return some warning when original path has been transformed
    clean_path = transform_path(self.path)

    if self.isdir:
        # TODO: also rename directories to a new name if needed segment by segment
        dir_path = os.path.join(abs_target_dir, clean_path)
        fileutils.create_dir(dir_path)
        return dir_path

    # note: here isfile=True
    # create parent directories if needed
    target_path = os.path.join(abs_target_dir, clean_path)
    parent_path = os.path.dirname(target_path)
    # TODO: also rename directories to a new name if needed segment by segment
    fileutils.create_dir(parent_path)

    # TODO: return some warning when original path has been renamed?
    unique_path = extractcode.new_name(target_path, is_dir=False)
    if TRACE:
        logger.debug('path: \ntarget_path: {}\nunique_path: {}'.format(
            target_path, unique_path))

    with open(unique_path, 'wb') as target:
        for content in self.get_content():
            if TRACE_DEEP:
                logger.debug('  chunk: {}'.format(repr(content)))
            target.write(content)
    os.utime(unique_path, (self.time, self.time))
    return target_path
from os.path import abspath
from os.path import dirname
from os.path import getsize
from os.path import getmtime
from os.path import join
from os.path import exists

from commoncode import fileutils

lic_src_dir = abspath(dirname(__file__))
src_dir = dirname(lic_src_dir)
data_dir = join(lic_src_dir, 'data')
licenses_data_dir = join(data_dir, 'licenses')
rules_data_dir = join(data_dir, 'rules')
root_dir = dirname(src_dir)
cache_dir = join(root_dir, '.cache')
license_index_cache_dir = join(cache_dir, 'license_index')

if not exists(license_index_cache_dir):
    fileutils.create_dir(license_index_cache_dir)

# minimum number of tokens a match should have to be considered as worthy keeping
MIN_MATCH_LENGTH = 4
MIN_MATCH_HIGH_LENGTH = 3

# FIXME: we should consider the length of two rules and two matches when
# considering MAX_DIST; eventually this should be skipped early right during
# the matching too
# maximum distance between two matches to merge
MAX_DIST = 120