def test_path_to_unicode_and_path_to_bytes_are_idempotent(self):
    a = b'foo\xb1bar'
    b = u'foo\udcb1bar'
    assert a == path_to_bytes(path_to_unicode(a))
    assert a == path_to_bytes(path_to_unicode(b))
    assert b == path_to_unicode(path_to_bytes(a))
    assert b == path_to_unicode(path_to_bytes(b))
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)
    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)
    return infos
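# Usage sketch (illustrative, not part of the original module): collect and
# print a few entries for one file. Assumes `get_file_infos` above is in
# scope and `location` points to an existing file; the keys named here are
# the ones built in the OrderedDict above.
def print_file_infos_example(location):
    infos = get_file_infos(location)
    # 'sha1' and 'md5' come from multi_checksums(); 'size' from the type info
    for key in ('type', 'name', 'size', 'sha1', 'md5', 'mime_type'):
        print('{}: {}'.format(key, infos.get(key)))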
def build_ignorer(ignores, unignores):
    """
    Return a callable suitable for path ignores with OS-specific encoding
    preset.
    """
    ignores = ignores or {}
    unignores = unignores or {}
    if on_linux:
        ignores = {path_to_bytes(k): v for k, v in ignores.items()}
        unignores = {path_to_bytes(k): v for k, v in unignores.items()}
    else:
        ignores = {path_to_unicode(k): v for k, v in ignores.items()}
        unignores = {path_to_unicode(k): v for k, v in unignores.items()}
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
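# Usage sketch: build an ignorer preset with the VCS ignores referenced
# elsewhere in this module and test one path. The returned partial keeps the
# positional signature of `ignore.is_ignored` with the ignores/unignores
# keyword arguments already bound; the sample path is hypothetical.
def is_vcs_ignored_example(path):
    ignorer = build_ignorer(ignores=ignore.ignores_VCS, unignores={})
    return ignorer(path)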
def fixed_width_file_name(path, max_length=25):
    """
    Return a fixed width file name of at most `max_length` characters
    extracted from the `path` string and usable for fixed width display.
    If the file_name is longer than `max_length`, it is truncated in the
    middle using three dots "..." as an ellipsis and the extension is kept.

    For example:
    >>> short = fixed_width_file_name('0123456789012345678901234.c')
    >>> assert '0123456789...5678901234.c' == short
    """
    if not path:
        return ''

    # get the path as unicode for display!
    path = path_to_unicode(path)
    filename = fileutils.file_name(path)
    if len(filename) <= max_length:
        return filename
    base_name, extension = fileutils.splitext(filename)
    number_of_dots = 3
    len_extension = len(extension)
    remaining_length = max_length - len_extension - number_of_dots

    if remaining_length < (len_extension + number_of_dots) or remaining_length < 5:
        return ''

    prefix_and_suffix_length = abs(remaining_length // 2)
    prefix = base_name[:prefix_and_suffix_length]
    ellipsis = number_of_dots * '.'
    suffix = base_name[-prefix_and_suffix_length:]
    return '{prefix}{ellipsis}{suffix}{extension}'.format(**locals())
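# Behavior sketch complementing the doctest above, derived from the logic of
# `fixed_width_file_name`: names at or under `max_length` pass through
# unchanged, and an empty path yields an empty string. Paths are hypothetical.
def fixed_width_file_name_examples():
    # only the file name is considered, and it is short enough here
    assert 'short.c' == fixed_width_file_name('some/dir/short.c')
    assert '' == fixed_width_file_name('')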
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) for a cache entry built
    from a cache key triple and a base_directory. Ensure that the parent
    directory exists.
    """
    if on_linux:
        keys = [path_to_bytes(k) for k in keys]
        base_path = path_to_bytes(base_path)
    else:
        keys = [path_to_unicode(k) for k in keys]
        base_path = path_to_unicode(base_path)

    dir1, dir2, file_name = keys
    parent = os.path.join(base_path, dir1, dir2)
    fileutils.create_dir(parent)
    return parent, file_name
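# Usage sketch: map a cache key triple to an on-disk location. The triple
# values here are hypothetical; real keys come from whatever keying scheme
# the cache uses. Note the call creates the parent directories as a side
# effect, per `fileutils.create_dir` above.
def cache_entry_path_example(base_path):
    parent, file_name = paths_from_keys(base_path, ('ab', 'cd', 'ef123456'))
    return os.path.join(parent, file_name)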
def update_path_environment(new_path, _os_module=os):
    """
    Update the PATH environment variable by adding `new_path` to the front
    of PATH if `new_path` is not already in the PATH.
    """
    # note: _os_module is used to facilitate mock testing using an
    # object with a sep string attribute and an environ mapping
    # attribute
    if not new_path:
        return

    new_path = new_path.strip()
    if not new_path:
        return

    path_env = _os_module.environ.get(b'PATH')
    if not path_env:
        # this is quite unlikely to ever happen, but here for safety
        path_env = ''

    # ensure we use unicode or bytes depending on OSes
    if on_linux:
        new_path = path_to_bytes(new_path)
        path_env = path_to_bytes(path_env)
        sep = _os_module.pathsep
    else:
        new_path = path_to_unicode(new_path)
        path_env = path_to_unicode(path_env)
        sep = unicode(_os_module.pathsep)

    path_segments = path_env.split(sep)

    # add lib path to the front of the PATH env var
    # this will use bytes on Linux and unicode elsewhere
    if new_path not in path_segments:
        if not path_env:
            new_path_env = new_path
        else:
            new_path_env = sep.join([new_path, path_env])

        if not on_linux:
            # recode to bytes using FS encoding
            new_path_env = path_to_bytes(new_path_env)

        # ... and set the variable back as bytes
        _os_module.environ[b'PATH'] = new_path_env
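# Testing sketch for `update_path_environment`: the `_os_module` hook lets a
# test pass a stand-in object instead of the real `os` module, as the note in
# the function above suggests. `FakeOsModule` is a hypothetical helper for
# illustration only.
class FakeOsModule(object):
    sep = '/'
    pathsep = ':'

    def __init__(self):
        self.environ = {}

def update_path_environment_test_example():
    fake_os = FakeOsModule()
    update_path_environment('/opt/lib', _os_module=fake_os)
    # the variable is always set back as bytes under the b'PATH' key
    assert b'PATH' in fake_os.environ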
def log_file_path(cls, logfile_fd, path):
    """
    Log the file `path` to the cache using the `logfile_fd` **opened** file
    descriptor.
    """
    # we dump one path per line written as bytes or unicode
    if on_linux:
        path = path_to_bytes(path) + b'\n'
    else:
        path = path_to_unicode(path) + '\n'
    logfile_fd.write(path)
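# Usage sketch: paths are written one per line, so the log can be replayed
# with the matching EOL by iterate() below. The opener mirrors the
# `log_opener` logic in iterate(); the log file name is hypothetical, and
# None stands in for the `cls` argument (presumably bound as a classmethod
# in the original class).
def log_paths_example(paths):
    if on_linux:
        opener = partial(open, 'scan_files.log', 'wb')
    else:
        opener = partial(codecs.open, 'scan_files.log', 'wb', encoding='utf-8')
    with opener() as logfile_fd:
        for p in paths:
            log_file_path(None, logfile_fd, p)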
def get_relative_path(path, len_base_path, base_is_dir):
    """
    Return a posix relative path from the posix `path` relative to a
    base path of `len_base_path` length where the base is a directory if
    `base_is_dir` is True or a file otherwise.
    """
    path = path_to_unicode(path)
    if base_is_dir:
        rel_path = path[len_base_path:]
    else:
        rel_path = fileutils.file_name(path)
    return rel_path.lstrip('/')
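# Behavior sketch for `get_relative_path`: with a directory base, the base
# prefix is sliced off and the leading slash stripped; with a file base,
# only the file name is kept. The paths are hypothetical POSIX paths.
def get_relative_path_examples():
    base = u'/scans/project'
    path = base + u'/src/main.c'
    assert u'src/main.c' == get_relative_path(path, len(base), base_is_dir=True)
    assert u'main.c' == get_relative_path(path, len(base), base_is_dir=False)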
def scan_one(location, scanners, diag=False):
    """
    Scan one file or directory at `location` and return a scan result
    mapping, calling every scanner callable in the `scanners` mapping of
    (scan name -> scan function).

    The scan result mapping contains a 'scan_errors' key with a list of
    error messages. If `diag` is True, 'scan_errors' error messages also
    contain detailed diagnostic information such as a traceback if
    available.
    """
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    scan_result = OrderedDict()
    scan_errors = []
    for scan_name, scanner in scanners.items():
        if not scanner:
            continue
        try:
            scan_details = scanner(location)
            # consume generators
            if isinstance(scan_details, GeneratorType):
                scan_details = list(scan_details)
            scan_result[scan_name] = scan_details
        except TimeoutError:
            raise
        except Exception as e:
            # never fail but instead add an error message and keep an empty scan:
            scan_result[scan_name] = []
            messages = ['ERROR: ' + scan_name + ': ' + e.message]
            if diag:
                messages.append('ERROR: ' + scan_name + ': ' + traceback.format_exc())
            scan_errors.extend(messages)

    # put errors last, after scans proper
    scan_result['scan_errors'] = scan_errors
    return scan_result
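# Usage sketch for `scan_one`: `scanners` maps a scan name to a callable
# taking a location. `license_scanner` is a hypothetical stand-in for a real
# scanner function; real scanners would return or yield detection mappings.
def scan_one_example(location):
    def license_scanner(loc):
        # a real scanner would return detections; this stand-in returns none
        return []
    scanners = OrderedDict([('licenses', license_scanner)])
    # the result mapping always carries a 'scan_errors' list, added last
    return scan_one(location, scanners, diag=True)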
def resource_paths(base_path, diag, scans_cache_class, pre_scan_plugins=None):
    """
    Yield `Resource` objects for all the files found at base_path
    (either a directory or file) given an absolute base_path. Only yield
    Files, not directories.

    absolute path is a native OS path.
    base_path-relative path is a POSIX path.

    The relative path is guaranteed to be unicode and may be URL-encoded and
    may not be suitable to address an actual file.
    """
    if base_path:
        if on_linux:
            base_path = path_to_bytes(base_path)
        else:
            base_path = path_to_unicode(base_path)

    base_path = os.path.abspath(os.path.normpath(os.path.expanduser(base_path)))
    base_is_dir = filetype.is_dir(base_path)

    len_base_path = len(base_path)
    ignores = {}
    if pre_scan_plugins:
        for plugin in pre_scan_plugins:
            ignores.update(plugin.get_ignores())
    ignores.update(ignore.ignores_VCS)

    ignorer = build_ignorer(ignores, unignores={})
    resources = fileutils.resource_iter(base_path, ignored=ignorer)

    for abs_path in resources:
        resource = Resource(scans_cache_class, abs_path, base_is_dir, len_base_path)
        # always fetch infos and cache.
        resource.put_info(scan_infos(abs_path, diag=diag))
        if pre_scan_plugins:
            for plugin in pre_scan_plugins:
                resource = plugin.process_resource(resource)
        if resource:
            yield resource
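# Usage sketch: walk a codebase and collect the yielded Resource objects.
# `scans_cache_class` and `diag` are the same parameters as above; no
# attribute of Resource is assumed here beyond what the generator yields.
def list_resources_example(base_path, scans_cache_class, diag=False):
    return list(resource_paths(base_path, diag, scans_cache_class))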
def iterate(self, scan_names, root_dir=None, paths_subset=tuple()):
    """
    Yield scan data for all cached scans (i.e. the whole cache) given a
    list of scan names.
    If a `paths_subset` sequence of paths is provided, then only these
    paths are iterated.

    The logfile MUST have been closed before calling this method.
    """
    if on_linux:
        paths_subset = set(path_to_bytes(p) for p in paths_subset)
    else:
        paths_subset = set(path_to_unicode(p) for p in paths_subset)

    if on_linux:
        log_opener = partial(open, self.cache_files_log, 'rb')
    else:
        log_opener = partial(codecs.open, self.cache_files_log, 'rb', encoding='utf-8')
    EOL = b'\n' if on_linux else '\n'

    with log_opener() as cached_files:
        # iterate paths, one per line
        for file_log in cached_files:
            # must be unicode
            path = file_log.rstrip(EOL)
            if paths_subset and path not in paths_subset:
                continue
            file_info = self.get_info(path)

            if on_linux:
                unicode_path = path_to_unicode(path)
            else:
                unicode_path = path

            if root_dir:
                # must be unicode
                if on_linux:
                    root_dir = path_to_unicode(root_dir)
                rooted_path = posixpath.join(root_dir, unicode_path)
            else:
                rooted_path = unicode_path
            rooted_path = fileutils.as_posixpath(rooted_path)
            logger_debug('iterate:', 'rooted_path:', rooted_path)

            # rare but possible corner case
            if file_info is None:
                no_info = ('ERROR: file info unavailable in cache: '
                           'This is either a bug or processing was aborted with CTRL-C.')
                scan_result = OrderedDict(path=rooted_path)
                scan_result['scan_errors'] = [no_info]
                if TRACE:
                    logger_debug('iterate:', 'scan_result:', scan_result,
                                 'for path:', rooted_path, '\n')
                yield scan_result
                continue

            _unicode_path_from_file_info = file_info.pop('path')
            scan_result = OrderedDict(path=rooted_path)

            if 'infos' in scan_names:
                # infos are always collected but only returned if requested
                # we flatten these as direct attributes of a file object
                scan_result.update(file_info.items())

            if not scan_result.get('scan_errors'):
                scan_result['scan_errors'] = []

            # check if we have more than just infos
            if ['infos'] != scan_names:
                errors = scan_result['scan_errors']
                scan_details = self.get_scan(path, file_info)
                if scan_details is None:
                    no_scan_details = (
                        'ERROR: scan details unavailable in cache: '
                        'This is either a bug or processing was aborted with CTRL-C.')
                    errors.append(no_scan_details)
                else:
                    # append errors to other top level errors if any
                    scan_errors = scan_details.pop('scan_errors', [])
                    errors.extend(scan_errors)
                    scan_result.update(scan_details)

            if TRACE:
                logger_debug('iterate:', 'scan_result:', scan_result,
                             'for path:', rooted_path, '\n')
            yield scan_result
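# Usage sketch for `iterate`: stream cached scan results back out, optionally
# rerooted under `root_dir`. `cache` stands in for an instance of the cache
# class this method belongs to; passing ['infos'] mirrors the infos-only
# branch checked above.
def dump_cached_scans_example(cache, root_dir=None):
    return [scan_result for scan_result in cache.iterate(['infos'], root_dir=root_dir)]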