def test_path_to_unicode_and_path_to_bytes_are_idempotent(self):
    """
    Round-tripping a path through path_to_unicode/path_to_bytes in either
    order must always yield the same canonical bytes and unicode values.
    """
    as_bytes = b'foo\xb1bar'
    as_unicode = u'foo\udcb1bar'
    for original in (as_bytes, as_unicode):
        assert as_bytes == path_to_bytes(path_to_unicode(original))
        assert as_unicode == path_to_unicode(path_to_bytes(original))
def test_scan_does_not_fail_when_scanning_unicode_files_and_paths():
    """
    Scan a tree that contains unicode file names and check the JSON output
    against per-OS expected results.
    """
    scan_dir = test_env.get_test_loc(u'unicodepath/uc')
    json_result = test_env.get_temp_file('json')
    if on_linux:
        scan_dir = path_to_bytes(scan_dir)
        json_result = path_to_bytes(json_result)

    options = [
        '--info', '--license', '--copyright', '--package',
        '--email', '--url', '--strip-root',
        scan_dir, json_result,
    ]
    result = run_scan_click(options)
    if result.exit_code != 0:
        raise Exception(result.output, options)
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'unicodepath/unicodepath.expected-linux.json'
    elif on_mac:
        expected = 'unicodepath/unicodepath.expected-mac.json'
    elif on_windows:
        expected = 'unicodepath/unicodepath.expected-win.json'
    check_json_scan(test_env.get_test_loc(expected), json_result,
                    strip_dates=True, regen=False)
def extract_tar(location, target_dir, verbatim=False, *args, **kwargs):
    """
    Extract a tar archive at `location` in the `target_dir` directory.
    If `verbatim` is True preserve the permissions; otherwise force a safe
    owner-only 0o700 mode on every extracted entry.
    """
    # always use bytes for paths on all OSes... tar seems to use bytes
    # internally and gets confused otherwise
    location = path_to_bytes(location)
    target_dir = path_to_bytes(target_dir)

    with open(location, 'rb') as input_tar:
        tar = None
        try:
            tar = tarfile.open(fileobj=input_tar)
            tarinfos = tar.getmembers()
            to_extract = []
            for tarinfo in tarinfos:
                if tar_can_extract(tarinfo, verbatim):
                    if not verbatim:
                        # FIX: the old-style `0700` octal literal is a
                        # SyntaxError on Python 3; 0o700 is the same value
                        # on Python 2.6+ and Python 3
                        tarinfo.mode = 0o700
                    to_extract.append(tarinfo)
            tar.extractall(target_dir, members=to_extract)
        finally:
            if tar:
                tar.close()
def test_scan_does_not_fail_when_scanning_unicode_files_and_paths():
    """
    Scan a tree that contains unicode file names and check the JSON output
    against per-OS expected results.
    """
    scan_dir = test_env.get_test_loc(u'unicodepath/uc')
    json_result = test_env.get_temp_file('json')
    if on_linux:
        scan_dir = path_to_bytes(scan_dir)
        json_result = path_to_bytes(json_result)

    options = [
        '--info', '--license', '--copyright', '--package',
        '--email', '--url', '--strip-root',
        scan_dir, json_result,
    ]
    result = run_scan_click(options)
    if result.exit_code != 0:
        raise Exception(result.output, options)
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'unicodepath/unicodepath.expected-linux.json'
    elif on_mac:
        expected = 'unicodepath/unicodepath.expected-mac.json'
    elif on_windows:
        expected = 'unicodepath/unicodepath.expected-win.json'
    check_json_scan(test_env.get_test_loc(expected), json_result,
                    strip_dates=True, regen=False)
def get_test_loc(self, test_path, copy=False, debug=False):
    """
    Given a `test_path` relative to the self.test_data_dir directory, return
    the location to a test file or directory for this path. Copy to a temp
    test location if `copy` is True.
    """
    test_data_dir = self.test_data_dir
    if on_linux:
        test_path = path_to_bytes(test_path)
        test_data_dir = path_to_bytes(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

    test_loc = get_test_loc(test_path, test_data_dir, debug=debug)
    if not copy:
        return test_loc

    base_name = os.path.basename(test_loc)
    if filetype.is_file(test_loc):
        # a file is copied into an existing temp dir
        target_dir = self.get_temp_dir()
        fileutils.copyfile(test_loc, target_dir)
        return os.path.join(target_dir, base_name)

    # a directory tree is copied to a NON existing target dir
    target_dir = os.path.join(self.get_temp_dir(), base_name)
    fileutils.copytree(test_loc, target_dir)
    # cleanup of VCS that could be left over from checkouts
    self.remove_vcs(target_dir)
    return target_dir
def get_test_loc(test_path, test_data_dir, debug=False, exists=True):
    """
    Given a `test_path` relative to the `test_data_dir` directory, return the
    location to a test file or directory for this path. No copy is done.
    """
    if on_linux:
        test_path = path_to_bytes(test_path)
        test_data_dir = path_to_bytes(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nget_test_loc,%(caller)s,"%(test_path)s","%(test_data_dir)s"' % locals())

    assert test_path
    assert test_data_dir

    if not os.path.exists(test_data_dir):
        raise IOError("[Errno 2] No such directory: test_data_dir not found:"
                      " '%(test_data_dir)s'" % locals())

    tpath = to_os_native_path(test_path)
    test_loc = os.path.abspath(os.path.join(test_data_dir, tpath))
    if exists and not os.path.exists(test_loc):
        raise IOError("[Errno 2] No such file or directory: "
                      "test_path not found: '%(test_loc)s'" % locals())
    return test_loc
def test_scan_can_handle_non_utf8_file_names_on_posix():
    """
    Scan a tree with non-UTF8-encoded file names and compare the JSON output
    against per-OS expected results.
    """
    scan_dir = test_env.extract_test_tar_raw('non_utf8/non_unicode.tgz')
    json_result = test_env.get_temp_file('json')
    if on_linux:
        scan_dir = path_to_bytes(scan_dir)
        json_result = path_to_bytes(json_result)

    result = run_scan_click(['-i', '--strip-root', scan_dir, json_result])
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'non_utf8/expected-linux.json'
    elif on_mac:
        expected = 'non_utf8/expected-mac.json'
    elif on_windows:
        expected = 'non_utf8/expected-win.json'
    check_json_scan(test_env.get_test_loc(expected), json_result, regen=False)
def test_scan_can_handle_non_utf8_file_names_on_posix():
    """
    Scan a tree with non-UTF8-encoded file names and compare the JSON output
    against per-OS expected results.
    """
    scan_dir = test_env.extract_test_tar_raw('non_utf8/non_unicode.tgz')
    json_result = test_env.get_temp_file('json')
    if on_linux:
        scan_dir = path_to_bytes(scan_dir)
        json_result = path_to_bytes(json_result)

    result = run_scan_click(['-i', '--strip-root', scan_dir, json_result])
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'non_utf8/expected-linux.json'
    elif on_mac:
        expected = 'non_utf8/expected-mac.json'
    elif on_windows:
        expected = 'non_utf8/expected-win.json'
    check_json_scan(test_env.get_test_loc(expected), json_result, regen=False)
def get_test_loc(test_path, test_data_dir, debug=False, exists=True):
    """
    Given a `test_path` relative to the `test_data_dir` directory, return the
    location to a test file or directory for this path. No copy is done.
    """
    if on_linux:
        test_path = path_to_bytes(test_path)
        test_data_dir = path_to_bytes(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nget_test_loc,%(caller)s,"%(test_path)s","%(test_data_dir)s"' % locals())

    assert test_path
    assert test_data_dir

    if not os.path.exists(test_data_dir):
        raise IOError("[Errno 2] No such directory: test_data_dir not found:"
                      " '%(test_data_dir)s'" % locals())

    tpath = to_os_native_path(test_path)
    test_loc = os.path.abspath(os.path.join(test_data_dir, tpath))
    if exists and not os.path.exists(test_loc):
        raise IOError("[Errno 2] No such file or directory: "
                      "test_path not found: '%(test_loc)s'" % locals())
    return test_loc
def extract_tar(location, target_dir, verbatim=False, *args, **kwargs):
    """
    Extract a tar archive at `location` in the `target_dir` directory.
    If `verbatim` is True preserve the permissions; otherwise force a safe
    owner-only 0o700 mode on every extracted entry.
    """
    # always use bytes for paths on all OSes... tar seems to use bytes
    # internally and gets confused otherwise
    location = path_to_bytes(location)
    target_dir = path_to_bytes(target_dir)

    with open(location, 'rb') as input_tar:
        tar = None
        try:
            tar = tarfile.open(fileobj=input_tar)
            tarinfos = tar.getmembers()
            to_extract = []
            for tarinfo in tarinfos:
                if tar_can_extract(tarinfo, verbatim):
                    if not verbatim:
                        # FIX: the old-style `0700` octal literal is a
                        # SyntaxError on Python 3; 0o700 is the same value
                        # on Python 2.6+ and Python 3
                        tarinfo.mode = 0o700
                    to_extract.append(tarinfo)
            tar.extractall(target_dir, members=to_extract)
        finally:
            if tar:
                tar.close()
def get_test_loc(self, test_path, copy=False, debug=False):
    """
    Given a `test_path` relative to the self.test_data_dir directory, return
    the location to a test file or directory for this path. Copy to a temp
    test location if `copy` is True.
    """
    test_data_dir = self.test_data_dir
    if on_linux:
        test_path = path_to_bytes(test_path)
        test_data_dir = path_to_bytes(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

    test_loc = get_test_loc(test_path, test_data_dir, debug=debug)
    if not copy:
        return test_loc

    base_name = os.path.basename(test_loc)
    if filetype.is_file(test_loc):
        # a file is copied into an existing temp dir
        target_dir = self.get_temp_dir()
        fileutils.copyfile(test_loc, target_dir)
        return os.path.join(target_dir, base_name)

    # a directory tree is copied to a NON existing target dir
    target_dir = os.path.join(self.get_temp_dir(), base_name)
    fileutils.copytree(test_loc, target_dir)
    # cleanup of VCS that could be left over from checkouts
    self.remove_vcs(target_dir)
    return target_dir
def _extract_tar_raw(test_path, target_dir, to_bytes, *args, **kwargs): """ Raw simplified extract for certain really weird paths and file names. """ if to_bytes: # use bytes for paths on ALL OSes (though this may fail on macOS) target_dir = path_to_bytes(target_dir) test_path = path_to_bytes(test_path) tar = tarfile.open(test_path) tar.extractall(path=target_dir) tar.close()
def _extract_tar_raw(test_path, target_dir, to_bytes, *args, **kwargs): """ Raw simplified extract for certain really weird paths and file names. """ if to_bytes: # use bytes for paths on ALL OSes (though this may fail on macOS) target_dir = path_to_bytes(target_dir) test_path = path_to_bytes(test_path) tar = tarfile.open(test_path) tar.extractall(path=target_dir) tar.close()
def build_ignorer(ignores, unignores):
    """
    Return a callable suitable for path ignores with OS-specific encoding
    preset.
    """
    # paths are bytes on Linux and unicode elsewhere
    convert = path_to_bytes if on_linux else path_to_unicode
    ignores = {convert(k): v for k, v in (ignores or {}).items()}
    unignores = {convert(k): v for k, v in (unignores or {}).items()}
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
def build_ignorer(ignores, unignores):
    """
    Return a callable suitable for path ignores with OS-specific encoding
    preset.
    """
    # paths are bytes on Linux and unicode elsewhere
    convert = path_to_bytes if on_linux else path_to_unicode
    ignores = {convert(k): v for k, v in (ignores or {}).items()}
    unignores = {convert(k): v for k, v in (unignores or {}).items()}
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
def remove_backslashes_and_dotdots(directory):
    """
    Walk `directory` and rename the files if their names contain backslashes
    or dot-dot segments, recreating the implied sub-directories.
    Return a list of error paths, if any.
    """
    if on_linux:
        directory = path_to_bytes(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                new_path = fileutils.as_posixpath(filename)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                segments = new_path.split(POSIX_PATH_SEP)
                # FIX: do not clobber the `directory` argument while walking;
                # use a distinct name for the target parent dir
                parent_dir = os.path.join(top, *segments[:-1])
                fileutils.create_dir(parent_dir)
                shutil.move(os.path.join(top, filename),
                            os.path.join(top, *segments))
            except Exception:
                errors.append(os.path.join(top, filename))
    return errors
def remove_archive_suffix(path):
    """
    Return `path` with all the extracted suffixes removed.
    """
    if on_linux:
        path = path_to_bytes(path)
    cleaned = re.sub(EXTRACT_SUFFIX, EMPTY_STRING, path)
    return cleaned
def get_extraction_path(path):
    """
    Return a path where to extract.
    """
    if on_linux:
        path = path_to_bytes(path)
    trimmed = path.rstrip(PATHS_SEPS)
    return trimmed + EXTRACT_SUFFIX
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) for a cache entry built
    from a cache keys triple and a base_directory. Ensure that the parent
    directory exists.
    """
    # paths are bytes on Linux and unicode elsewhere
    convert = path_to_bytes if on_linux else path_to_unicode
    base_path = convert(base_path)
    dir1, dir2, file_name = [convert(k) for k in keys]
    parent = os.path.join(base_path, dir1, dir2)
    fileutils.create_dir(parent)
    return parent, file_name
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) for a cache entry built
    from a cache keys triple and a base_directory. Ensure that the parent
    directory exists.
    """
    # paths are bytes on Linux and unicode elsewhere
    convert = path_to_bytes if on_linux else path_to_unicode
    base_path = convert(base_path)
    dir1, dir2, file_name = [convert(k) for k in keys]
    parent = os.path.join(base_path, dir1, dir2)
    fileutils.create_dir(parent)
    return parent, file_name
def get_extraction_path(path):
    """
    Return a path where to extract.
    """
    if on_linux:
        path = path_to_bytes(path)
    trimmed = path.rstrip(PATHS_SEPS)
    return trimmed + EXTRACT_SUFFIX
def remove_archive_suffix(path):
    """
    Return `path` with all the extracted suffixes removed.
    """
    if on_linux:
        path = path_to_bytes(path)
    cleaned = re.sub(EXTRACT_SUFFIX, EMPTY_STRING, path)
    return cleaned
def resource_paths(base_path, diag, scans_cache_class, pre_scan_plugins=()):
    """
    Yield `Resource` objects for all the files found at base_path (either a
    directory or file) given an absolute base_path. Only yield Files, not
    directories.
    absolute path is a native OS path.
    base_path-relative path is a POSIX path.

    The relative path is guaranted to be unicode and may be URL-encoded and may
    not be suitable to address an actual file.
    """
    if base_path:
        # paths are bytes on Linux and unicode elsewhere
        base_path = path_to_bytes(base_path) if on_linux else path_to_unicode(base_path)
        base_path = os.path.abspath(os.path.normpath(os.path.expanduser(base_path)))
        base_is_dir = filetype.is_dir(base_path)
        len_base_path = len(base_path)

    ignores = {}
    for plugin in (pre_scan_plugins or ()):
        ignores.update(plugin.get_ignores())
    ignores.update(ignore.ignores_VCS)
    ignorer = build_ignorer(ignores, unignores={})

    for abs_path in fileutils.resource_iter(base_path, ignored=ignorer):
        resource = Resource(scans_cache_class, abs_path, base_is_dir, len_base_path)
        # always fetch infos and cache.
        resource.put_info(scan_infos(abs_path, diag=diag))
        yield resource
def test_extractcode_command_can_extract_archive_with_unicode_names(
        monkeypatch):
    """
    extractcode must extract an archive whose inner paths contain unicode
    file names.
    """
    monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
    test_dir = test_env.get_test_loc('unicodearch', copy=True)
    if on_linux:
        test_dir = path_to_bytes(test_dir)
    runner = CliRunner()
    result = runner.invoke(extract_cli.extractcode, [test_dir],
                           catch_exceptions=False)
    assert result.exit_code == 0

    uni_arch = b'unicodepath.tgz' if on_linux else 'unicodepath.tgz'
    uni_path = b'/unicodepath/' if on_linux else '/unicodepath/'

    extracted = []
    for posix_loc in map(as_posixpath, file_iter(test_dir)):
        if posix_loc.endswith(uni_arch):
            continue
        tail = EMPTY_STRING.join(posix_loc.partition(uni_path)[1:])
        if tail:
            extracted.append(tail)

    expected = [
        '/unicodepath/Ho_',
        '/unicodepath/Ho_a',
        '/unicodepath/koristenjem_Karkkainen_-_Sander.pdf'
    ]
    assert sorted(expected) == sorted(extracted)
def remove_backslashes_and_dotdots(directory):
    """
    Walk `directory` and rename the files if their names contain backslashes
    or dot-dot segments, recreating the implied sub-directories.
    Return a list of error paths, if any.
    """
    if on_linux:
        directory = path_to_bytes(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                new_path = fileutils.as_posixpath(filename)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                segments = new_path.split(POSIX_PATH_SEP)
                # FIX: do not clobber the `directory` argument while walking;
                # use a distinct name for the target parent dir
                parent_dir = os.path.join(top, *segments[:-1])
                fileutils.create_dir(parent_dir)
                shutil.move(os.path.join(top, filename),
                            os.path.join(top, *segments))
            except Exception:
                errors.append(os.path.join(top, filename))
    return errors
def recognize_package(location):
    """
    Return a Package object if one was recognized or None for this `location`.

    Matching is attempted for each type in PACKAGE_TYPES in order:
    first on metafile names, then on the conjunction of filetype, mimetype
    and extension matches. The first match wins.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    for package_type in PACKAGE_TYPES:
        # Note: default to True if there is nothing to match against
        metafiles = package_type.metafiles
        if on_linux:
            # location is assumed to be bytes on Linux — TODO confirm callers
            metafiles = (path_to_bytes(m) for m in metafiles)
        if location.endswith(tuple(metafiles)):
            logger_debug('metafile matching: package_type is of type:', package_type)
            return package_type.recognize(location)

        if package_type.filetypes:
            type_matched = any(t in ftype for t in package_type.filetypes)
        else:
            type_matched = False
        if package_type.mimetypes:
            mime_matched = any(m in mtype for m in package_type.mimetypes)
        else:
            mime_matched = False

        extensions = package_type.extensions
        if extensions:
            if on_linux:
                extensions = tuple(path_to_bytes(e) for e in extensions)
            extension_matched = location.lower().endswith(extensions)
        else:
            extension_matched = False

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            logger_debug('all matching: package is of type:', package_type)
            recognized = package_type.recognize(location)
            logger_debug('all matching: recognized as:', repr(recognized))
            return recognized

        logger_debug('no match: package is not of known type:', package_type)
def is_extraction_path(path):
    """
    Return True if the path points to an extraction path.
    """
    if on_linux:
        path = path_to_bytes(path)
    if not path:
        # preserve falsy input (None or empty) as-is
        return path
    return path.rstrip(PATHS_SEPS).endswith(EXTRACT_SUFFIX)
def is_extracted(location):
    """
    Return True if the location is already extracted to the corresponding
    extraction location.
    """
    if on_linux:
        location = path_to_bytes(location)
    if not location:
        # preserve falsy input (None or empty) as-is
        return location
    return os.path.exists(get_extraction_path(location))
def is_extraction_path(path):
    """
    Return True if the path points to an extraction path.
    """
    if on_linux:
        path = path_to_bytes(path)
    if not path:
        # preserve falsy input (None or empty) as-is
        return path
    return path.rstrip(PATHS_SEPS).endswith(EXTRACT_SUFFIX)
def update_path_environment(new_path, _os_module=os): """ Update the PATH environment variable by adding `new_path` to the front of PATH if `new_path` is not alreday in the PATH. """ # note: _os_module is used to facilitate mock testing using an # object with a sep string attribute and an environ mapping # attribute if not new_path: return new_path = new_path.strip() if not new_path: return path_env = _os_module.environ.get(b'PATH') if not path_env: # this is quite unlikely to ever happen, but here for safety path_env = '' # ensure we use unicode or bytes depending on OSes if on_linux: new_path = path_to_bytes(new_path) path_env = path_to_bytes(path_env) sep = _os_module.pathsep else: new_path = path_to_unicode(new_path) path_env = path_to_unicode(path_env) sep = unicode(_os_module.pathsep) path_segments = path_env.split(sep) # add lib path to the front of the PATH env var # this will use bytes on Linux and unicode elsewhere if new_path not in path_segments: if not path_env: new_path_env = new_path else: new_path_env = sep.join([new_path, path_env]) if not on_linux: # recode to bytes using FS encoding new_path_env = path_to_bytes(new_path_env) # ... and set the variable back as bytes _os_module.environ[b'PATH'] = new_path_env
def is_extracted(location):
    """
    Return True if the location is already extracted to the corresponding
    extraction location.
    """
    if on_linux:
        location = path_to_bytes(location)
    if not location:
        # preserve falsy input (None or empty) as-is
        return location
    return os.path.exists(get_extraction_path(location))
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`. Keys are emitted in a fixed order
    (type, name, base_name, extension, date, size, checksums, ...).
    """
    # local imports to avoid circular imports — TODO confirm
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # paths are bytes on Linux and unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)
    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        # decode name parts back to unicode for serialization-safe output
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    # `x and y or None` yields None for directories/files respectively
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, (
        'sha1', 'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)
    return infos
def new_name(location, is_dir=False):
    """
    Return a new non-existing location from a `location` usable to write a
    file or create directory without overwriting existing files or
    directories in the same parent directory, ignoring the case of the
    filename.

    The case of the filename is ignored to ensure that similar results are
    returned across case sensitive (*nix) and case insensitive file systems.

    To find a new unique filename, this tries new names this way:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and
       keep the extension unchanged.
    """
    assert location
    if on_linux:
        location = path_to_bytes(location)
    location = location.rstrip(PATHS_SEPS)
    assert location
    parent = fileutils.parent_directory(location)
    # all existing files or directory as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))
    filename = fileutils.file_name(location)
    # corner case: never return a bare '.' or '..' name
    # FIX: the original tested `in (DOT, DOT)` so '..' was never caught
    if filename in (DOT, DOTDOT):
        filename = UNDERSCORE
    # if unique, return this
    if filename.lower() not in siblings_lower:
        return os.path.join(parent, filename)
    # otherwise seek a unique name
    if is_dir:
        # directories do not have an "extension"
        base_name = filename
        ext = EMPTY_STRING
    else:
        base_name, dot, ext = filename.partition(DOT)
        if dot:
            ext = dot + ext
        else:
            base_name = filename
            ext = EMPTY_STRING
    # find a unique filename, adding a counter int to the base_name
    counter = 1
    while 1:
        filename = base_name + UNDERSCORE + str(counter) + ext
        if filename.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, filename)
def new_name(location, is_dir=False):
    """
    Return a new non-existing location from a `location` usable to write a
    file or create directory without overwriting existing files or
    directories in the same parent directory, ignoring the case of the
    filename.

    The case of the filename is ignored to ensure that similar results are
    returned across case sensitive (*nix) and case insensitive file systems.

    To find a new unique filename, this tries new names this way:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and
       keep the extension unchanged.
    """
    assert location
    if on_linux:
        location = path_to_bytes(location)
    location = location.rstrip(PATHS_SEPS)
    assert location
    parent = fileutils.parent_directory(location)
    # all existing files or directory as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))
    filename = fileutils.file_name(location)
    # corner case: never return a bare '.' or '..' name
    # FIX: the original tested `in (DOT, DOT)` so '..' was never caught
    if filename in (DOT, DOTDOT):
        filename = UNDERSCORE
    # if unique, return this
    if filename.lower() not in siblings_lower:
        return os.path.join(parent, filename)
    # otherwise seek a unique name
    if is_dir:
        # directories do not have an "extension"
        base_name = filename
        ext = EMPTY_STRING
    else:
        base_name, dot, ext = filename.partition(DOT)
        if dot:
            ext = dot + ext
        else:
            base_name = filename
            ext = EMPTY_STRING
    # find a unique filename, adding a counter int to the base_name
    counter = 1
    while 1:
        filename = base_name + UNDERSCORE + str(counter) + ext
        if filename.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, filename)
def info_keys(path, seed=None):
    """
    Return a file info cache "keys" tripple for a path composed of three
    paths segments derived from a checksum.

    For example:
    >>> expected = 'fb87db2bb28e9501ac7fdc4812782118f4c94a0f'
    >>> assert expected == sha1('/w421/scancode-toolkit2').hexdigest()
    >>> expected = ('f', 'b', '87db2bb28e9501ac7fdc4812782118f4c94a0f')
    >>> assert expected == info_keys('/w421/scancode-toolkit2')
    """
    # the hash function only accepts bytes: coerce unicode inputs first
    if isinstance(path, unicode):
        path = path_to_bytes(path)
    if seed:
        if isinstance(seed, unicode):
            seed = path_to_bytes(seed)
        path = seed + path
    digest = sha1(path).hexdigest()
    return keys_from_hash(digest)
def log_file_path(cls, logfile_fd, path):
    """
    Log file path in the cache logfile_fd **opened** file descriptor.
    """
    # one path per line, written as bytes on Linux and unicode elsewhere
    if on_linux:
        line = path_to_bytes(path) + b'\n'
    else:
        line = path_to_unicode(path) + '\n'
    logfile_fd.write(line)
def get_handlers(location):
    """
    Yield tuples of (handler, type_matched, mime_matched, extension_matched)
    for this `location` for every handler whose filetype, mimetype or
    extension criteria match.
    """
    if on_linux:
        location = path_to_bytes(location)

    if filetype.is_file(location):
        T = typecode.contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
            # FIX: always bind extension_matched; the original left it
            # undefined (NameError) for handlers with no extensions
            extension_matched = False
            exts = handler.extensions
            if exts:
                if on_linux:
                    exts = tuple(path_to_bytes(e) for e in exts)
                extension_matched = exts and location.lower().endswith(exts)

            if TRACE_DEEP:
                handler_name = handler.name
                logger.debug('get_handlers: considering %(handler_name)r handler for %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())
                logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())

            if handler.strict and not all([type_matched, mime_matched, extension_matched]):
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())
                    logger.debug('get_handlers: %(location)s: handler: %(handler)r' % locals())
                yield handler, type_matched, mime_matched, extension_matched
def log_file_path(cls, logfile_fd, path):
    """
    Log file path in the cache logfile_fd **opened** file descriptor.
    """
    # one path per line, written as bytes on Linux and unicode elsewhere
    if on_linux:
        line = path_to_bytes(path) + b'\n'
    else:
        line = path_to_unicode(path) + '\n'
    logfile_fd.write(line)
def info_keys(path, seed=None):
    """
    Return a file info cache "keys" tripple for a path composed of three
    paths segments derived from a checksum.

    For example:
    >>> expected = 'fb87db2bb28e9501ac7fdc4812782118f4c94a0f'
    >>> assert expected == sha1('/w421/scancode-toolkit2').hexdigest()
    >>> expected = ('f', 'b', '87db2bb28e9501ac7fdc4812782118f4c94a0f')
    >>> assert expected == info_keys('/w421/scancode-toolkit2')
    """
    # the hash function only accepts bytes: coerce unicode inputs first
    if isinstance(path, unicode):
        path = path_to_bytes(path)
    if seed:
        if isinstance(seed, unicode):
            seed = path_to_bytes(seed)
        path = seed + path
    digest = sha1(path).hexdigest()
    return keys_from_hash(digest)
def __extract(self, test_path, extract_func=None, verbatim=False):
    """
    Given an archive file identified by test_path relative to a test files
    directory, return a new temp directory where the archive file has been
    extracted using extract_func. If `verbatim` is True preserve the
    permissions.
    """
    assert test_path and test_path != ''
    if on_linux:
        test_path = path_to_bytes(test_path)
    test_path = to_os_native_path(test_path)
    target_dir = self.get_temp_dir(os.path.basename(test_path))
    original_archive = self.get_test_loc(test_path)
    if on_linux:
        target_dir = path_to_bytes(target_dir)
        original_archive = path_to_bytes(original_archive)
    extract_func(original_archive, target_dir, verbatim=verbatim)
    return target_dir
def to_os_native_path(path):
    """
    Normalize a path to use the native OS path separator.
    """
    if on_linux:
        path = path_to_bytes(path)
    # map both POSIX and Windows separators to the native one
    normalized = path.replace(POSIX_PATH_SEP, OS_PATH_SEP)
    normalized = normalized.replace(WIN_PATH_SEP, OS_PATH_SEP)
    return normalized.rstrip(OS_PATH_SEP)
def to_os_native_path(path):
    """
    Normalize a path to use the native OS path separator.
    """
    if on_linux:
        path = path_to_bytes(path)
    # map both POSIX and Windows separators to the native one
    normalized = path.replace(POSIX_PATH_SEP, OS_PATH_SEP)
    normalized = normalized.replace(WIN_PATH_SEP, OS_PATH_SEP)
    return normalized.rstrip(OS_PATH_SEP)
def __extract(self, test_path, extract_func=None, verbatim=False):
    """
    Given an archive file identified by test_path relative to a test files
    directory, return a new temp directory where the archive file has been
    extracted using extract_func. If `verbatim` is True preserve the
    permissions.
    """
    assert test_path and test_path != ''
    if on_linux:
        test_path = path_to_bytes(test_path)
    test_path = to_os_native_path(test_path)
    target_dir = self.get_temp_dir(os.path.basename(test_path))
    original_archive = self.get_test_loc(test_path)
    if on_linux:
        target_dir = path_to_bytes(target_dir)
        original_archive = path_to_bytes(original_archive)
    extract_func(original_archive, target_dir, verbatim=verbatim)
    return target_dir
def get_temp_file(self, extension=None, dir_name='td', file_name='tf'):
    """
    Return a unique new temporary file location to a non-existing temporary
    file that can safely be created without a risk of name collision.
    """
    if extension is None:
        extension = '.txt'
    if on_linux:
        extension = path_to_bytes(extension)
        dir_name = path_to_bytes(dir_name)
        file_name = path_to_bytes(file_name)
    if extension and not extension.startswith(DOT):
        extension = DOT + extension
    temp_dir = self.get_temp_dir(dir_name)
    return os.path.join(temp_dir, file_name + extension)
def get_temp_file(self, extension=None, dir_name='td', file_name='tf'):
    """
    Return a unique new temporary file location to a non-existing temporary
    file that can safely be created without a risk of name collision.
    """
    if extension is None:
        extension = '.txt'
    if on_linux:
        extension = path_to_bytes(extension)
        dir_name = path_to_bytes(dir_name)
        file_name = path_to_bytes(file_name)
    if extension and not extension.startswith(DOT):
        extension = DOT + extension
    temp_dir = self.get_temp_dir(dir_name)
    return os.path.join(temp_dir, file_name + extension)
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`. Keys are emitted in a fixed order
    (type, name, base_name, extension, date, size, checksums, ...).
    """
    # local imports to avoid circular imports — TODO confirm
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # paths are bytes on Linux and unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)
    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        # decode name parts back to unicode for serialization-safe output
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    # `x and y or None` yields None for directories/files respectively
    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)
    return infos
def extract_twice(location, target_dir, extractor1, extractor2):
    """
    Extract a nested compressed archive at `location` to `target_dir` using
    the `extractor1` function to a temporary directory then the `extractor2`
    function on the extracted payload of `extractor1`.

    Return a list of warning messages. Raise exceptions on errors.

    Typical nested archives include compressed tarballs and RPMs (containing
    a compressed cpio).

    Note: it would be easy to support deeper extractor chains, but this gets
    hard to trace and debug very quickly. A depth of two is simple and sane
    and covers most common cases.
    """
    if on_linux:
        # extraction code works on byte paths on Linux
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)
    abs_location = os.path.abspath(os.path.expanduser(location))
    # NOTE: unicode() is the Python 2 builtin; this code is Python 2 only
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # extract first the intermediate payload to a temp dir
    temp_target = unicode(fileutils.get_temp_dir('extract'))
    warnings = extractor1(abs_location, temp_target)
    if TRACE:
        logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())

    # extract this intermediate payload to the final target_dir
    try:
        inner_archives = list(fileutils.file_iter(temp_target))
        if not inner_archives:
            warnings.append(location + ': No files found in archive.')
        else:
            for extracted1_loc in inner_archives:
                if TRACE:
                    logger.debug('extract_twice: extractor2: %(extracted1_loc)r' % locals())
                # warnings from both extraction passes are accumulated
                warnings.extend(extractor2(extracted1_loc, abs_target_dir))
    finally:
        # cleanup the temporary output from extractor1, even on errors
        fileutils.delete(temp_target)
    return warnings
def remove_vcs(self, test_dir):
    """
    Remove some version control directories and some temp editor files
    found under `test_dir`.
    """
    vcses = ('CVS', '.svn', '.git', '.hg')
    if on_linux:
        # byte paths on Linux, matching os.walk results
        vcses = tuple(path_to_bytes(p) for p in vcses)
        test_dir = path_to_bytes(test_dir)

    for root, dirs, files in os.walk(test_dir):
        for vcs_dir in vcses:
            if vcs_dir in dirs:
                vcs_loc = os.path.join(root, vcs_dir)
                # BUGFIX: walk only the found VCS dir, not the whole
                # test_dir again for every match (was quadratic and
                # chmod-ed unrelated files).
                # chmod first: some VCSes create read-only files that
                # shutil.rmtree cannot delete otherwise.
                for vcsroot, vcsdirs, vcsfiles in os.walk(vcs_loc):
                    for vcsfile in vcsdirs + vcsfiles:
                        vfile = os.path.join(vcsroot, vcsfile)
                        fileutils.chmod(vfile, fileutils.RW, recurse=False)
                shutil.rmtree(vcs_loc, False)

        # editors temp file leftovers (e.g. emacs backup files)
        tilde = b'~' if on_linux else '~'
        # BUGFIX: use an explicit loop instead of map() for side effects;
        # map() is lazy in Python 3 and would silently remove nothing.
        for file_loc in files:
            if file_loc.endswith(tilde):
                os.remove(os.path.join(root, file_loc))
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory.
    """
    # create a unique temp directory in cache_dir
    fileutils.create_dir(cache_dir)
    unique_prefix = timeutils.time2tstamp() + u'-'
    storage_dir = fileutils.get_temp_dir(cache_dir, prefix=unique_prefix)
    if on_linux:
        storage_dir = path_to_bytes(storage_dir)
    # instantiate once up-front so the on-disk layout is initialized
    ScanFileCache(storage_dir).setup()
    return partial(ScanFileCache, storage_dir)
def get_best_handler(location, kinds=all_kinds):
    """
    Return the best handler for the file at `location`, or None if there
    is no such file or no handler applies.
    """
    if on_linux:
        location = path_to_bytes(location)
    location = os.path.abspath(os.path.expanduser(location))
    if not filetype.is_file(location):
        return None
    handlers = list(get_handlers(location))
    if not handlers:
        return None
    candidates = score_handlers(handlers)
    # candidates may be empty/falsy, in which case it is returned as-is
    return candidates and pick_best_handler(candidates, kinds)
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory.
    """
    # ensure the parent cache dir exists, then carve out a unique subdir
    fileutils.create_dir(cache_dir)
    timestamped = timeutils.time2tstamp() + u'-'
    cache_dir = fileutils.get_temp_dir(cache_dir, prefix=timestamped)
    if on_linux:
        cache_dir = path_to_bytes(cache_dir)
    # set up the cache storage once before handing out the factory
    cache = ScanFileCache(cache_dir)
    cache.setup()
    return partial(ScanFileCache, cache_dir)
def remove_vcs(self, test_dir):
    """
    Remove some version control directories and some temp editor files
    found under `test_dir`.
    """
    vcses = ('CVS', '.svn', '.git', '.hg')
    if on_linux:
        # byte paths on Linux, matching os.walk results
        vcses = tuple(path_to_bytes(p) for p in vcses)
        test_dir = path_to_bytes(test_dir)

    for root, dirs, files in os.walk(test_dir):
        for vcs_dir in vcses:
            if vcs_dir in dirs:
                vcs_loc = os.path.join(root, vcs_dir)
                # BUGFIX: walk only the found VCS dir, not the whole
                # test_dir again for every match (was quadratic and
                # chmod-ed unrelated files).
                # chmod first: some VCSes create read-only files that
                # shutil.rmtree cannot delete otherwise.
                for vcsroot, vcsdirs, vcsfiles in os.walk(vcs_loc):
                    for vcsfile in vcsdirs + vcsfiles:
                        vfile = os.path.join(vcsroot, vcsfile)
                        fileutils.chmod(vfile, fileutils.RW, recurse=False)
                shutil.rmtree(vcs_loc, False)

        # editors temp file leftovers (e.g. emacs backup files)
        tilde = b'~' if on_linux else '~'
        # BUGFIX: use an explicit loop instead of map() for side effects;
        # map() is lazy in Python 3 and would silently remove nothing.
        for file_loc in files:
            if file_loc.endswith(tilde):
                os.remove(os.path.join(root, file_loc))
def extract_zip(location, target_dir, *args, **kwargs):
    """
    Extract a zip archive file at `location` in the `target_dir` directory.

    Raise an Exception if `location` is not an existing, valid zip file.
    """
    # BUGFIX: the original condition `not isfile(...) and is_zipfile(...)`
    # raised only for a NON-file that IS a zip (i.e. almost never); the
    # intent is to reject anything that is not an existing valid zip file.
    if not os.path.isfile(location) or not zipfile.is_zipfile(location):
        raise Exception('Incorrect zip file %(location)r' % locals())

    if on_linux:
        # extraction code works on byte paths on Linux
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)

    with zipfile.ZipFile(location) as zipf:
        for info in zipf.infolist():
            # NOTE(review): archive member names are used as-is to build
            # target paths: a crafted name with '..' could escape
            # target_dir (zip-slip). Callers must not pass untrusted
            # archives here, or names must be sanitized upstream.
            name = info.filename
            content = zipf.read(name)
            target = os.path.join(target_dir, name)
            parent = os.path.dirname(target)
            if not os.path.exists(parent):
                os.makedirs(parent)
            # a directory entry has empty content and a trailing separator
            if not content and target.endswith(os.path.sep):
                if not os.path.exists(target):
                    os.makedirs(target)
            # never overwrite an existing file
            if not os.path.exists(target):
                with open(target, 'wb') as f:
                    f.write(content)
def extract_zip(location, target_dir, *args, **kwargs):
    """
    Extract a zip archive file at `location` in the `target_dir` directory.

    Raise an Exception if `location` is not an existing, valid zip file.
    """
    # BUGFIX: the original condition `not isfile(...) and is_zipfile(...)`
    # raised only for a NON-file that IS a zip (i.e. almost never); the
    # intent is to reject anything that is not an existing valid zip file.
    if not os.path.isfile(location) or not zipfile.is_zipfile(location):
        raise Exception('Incorrect zip file %(location)r' % locals())

    if on_linux:
        # extraction code works on byte paths on Linux
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)

    with zipfile.ZipFile(location) as zipf:
        for info in zipf.infolist():
            # NOTE(review): archive member names are used as-is to build
            # target paths: a crafted name with '..' could escape
            # target_dir (zip-slip). Callers must not pass untrusted
            # archives here, or names must be sanitized upstream.
            name = info.filename
            content = zipf.read(name)
            target = os.path.join(target_dir, name)
            parent = os.path.dirname(target)
            if not os.path.exists(parent):
                os.makedirs(parent)
            # a directory entry has empty content and a trailing separator
            if not content and target.endswith(os.path.sep):
                if not os.path.exists(target):
                    os.makedirs(target)
            # never overwrite an existing file
            if not os.path.exists(target):
                with open(target, 'wb') as f:
                    f.write(content)
def __init__(self, cache_dir):
    """
    Initialize the cache layout under `cache_dir` with subdirs for the
    infos and scans caches and a log file of processed files.
    """
    # byte path literals on Linux, unicode elsewhere
    if on_linux:
        infos_dir = b'infos_dir/'
        scans_dir = b'scans_dir/'
        files_log = b'files_log'
        base_dir = path_to_bytes(cache_dir)
    else:
        infos_dir = u'infos_dir/'
        scans_dir = u'scans_dir/'
        files_log = u'files_log'
        base_dir = cache_dir
    self.cache_base_dir = base_dir
    join = os.path.join
    self.cache_infos_dir = as_posixpath(join(base_dir, infos_dir))
    self.cache_scans_dir = as_posixpath(join(base_dir, scans_dir))
    self.cache_files_log = as_posixpath(join(base_dir, files_log))
def test_scan_does_not_fail_when_scanning_unicode_test_files_from_express():
    # On Windows, Python tar cannot extract these files. Other
    # extractors either fail or change the file name, making the test
    # moot. Git cannot check these files. So for now it makes no sense
    # to test this on Windows at all. Extractcode works fine, but does
    # rename the problematic files.

    # NOTE(review): unlike sibling tests, this byte conversion is not
    # guarded by `if on_linux:` -- presumably intentional given Windows
    # is excluded, but confirm the behavior on macOS.
    test_dir = test_env.extract_test_tar_raw(b'unicode_fixtures.tar.gz')
    test_dir = path_to_bytes(test_dir)

    args = ['-n0', '--info', '--license', '--copyright', '--package',
            '--email', '--url', '--strip-root', test_dir]
    result = run_scan_click(args, catch_exceptions=False)
    if result.exit_code != 0:
        # surface the full scan output in the failure for easier debugging
        raise Exception(result.output, args)
    assert 'Scanning done' in result.output