예제 #1
0
 def test_path_to_unicode_and_path_to_bytes_are_idempotent(self):
     # Round-tripping through both converters must land back on the
     # canonical form: the raw byte 0xb1 is not valid UTF-8 and is
     # expected to map to the surrogate-escape codepoint u'\udcb1'
     # in the unicode form (and back to 0xb1 when re-encoded).
     a = b'foo\xb1bar'
     b = u'foo\udcb1bar'
     assert a == path_to_bytes(path_to_unicode(a))
     assert a == path_to_bytes(path_to_unicode(b))
     assert b == path_to_unicode(path_to_bytes(a))
     assert b == path_to_unicode(path_to_bytes(b))
예제 #2
0
def get_file_infos(location):
    """
    Return an ordered mapping of file information collected from the
    file or directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # normalize the location encoding per-OS: bytes on Linux, unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    ctype = contenttype.get_type(location)

    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        # directories carry no extension
        base_name, extension = name, ''

    if on_linux:
        # decode back to unicode for display/serialization
        name = path_to_unicode(name)
        base_name = path_to_unicode(base_name)
        extension = path_to_unicode(extension)

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = name
    infos['base_name'] = base_name
    infos['extension'] = extension
    # empty/falsy values are normalized to None throughout, as before
    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = ctype.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (ctype.mimetype_file or None) if is_file else None
    infos['file_type'] = (ctype.filetype_file or None) if is_file else None
    infos['programming_language'] = (ctype.programming_language or None) if is_file else None
    infos['is_binary'] = bool(is_file and ctype.is_binary)
    infos['is_text'] = bool(is_file and ctype.is_text)
    infos['is_archive'] = bool(is_file and ctype.is_archive)
    infos['is_media'] = bool(is_file and ctype.is_media)
    infos['is_source'] = bool(is_file and ctype.is_source)
    infos['is_script'] = bool(is_file and ctype.is_script)
    return infos
예제 #3
0
def get_file_infos(location):
    """
    Return an ordered mapping of file information collected from the
    file or directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # normalize the location encoding per-OS: bytes on Linux, unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    ctype = contenttype.get_type(location)

    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        # directories carry no extension
        base_name, extension = name, ''

    if on_linux:
        # decode back to unicode for display/serialization
        name = path_to_unicode(name)
        base_name = path_to_unicode(base_name)
        extension = path_to_unicode(extension)

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = name
    infos['base_name'] = base_name
    infos['extension'] = extension
    # empty/falsy values are normalized to None throughout, as before
    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = ctype.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (ctype.mimetype_file or None) if is_file else None
    infos['file_type'] = (ctype.filetype_file or None) if is_file else None
    infos['programming_language'] = (ctype.programming_language or None) if is_file else None
    infos['is_binary'] = bool(is_file and ctype.is_binary)
    infos['is_text'] = bool(is_file and ctype.is_text)
    infos['is_archive'] = bool(is_file and ctype.is_archive)
    infos['is_media'] = bool(is_file and ctype.is_media)
    infos['is_source'] = bool(is_file and ctype.is_source)
    infos['is_script'] = bool(is_file and ctype.is_script)
    return infos
예제 #4
0
def build_ignorer(ignores, unignores):
    """
    Return a callable suitable for path ignores with OS-specific encoding
    preset.
    """
    # pick the path encoder once: bytes on Linux, unicode everywhere else,
    # then re-key both mappings with it
    encode = path_to_bytes if on_linux else path_to_unicode
    ignores = dict((encode(k), v) for k, v in (ignores or {}).items())
    unignores = dict((encode(k), v) for k, v in (unignores or {}).items())
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
예제 #5
0
def build_ignorer(ignores, unignores):
    """
    Return a callable suitable for path ignores with OS-specific encoding
    preset.
    """
    # select the OS-appropriate path encoder (bytes on Linux, unicode
    # elsewhere) and normalize the keys of both mappings with it
    encode = path_to_bytes if on_linux else path_to_unicode
    ignores = {encode(k): v for k, v in (ignores or {}).items()}
    unignores = {encode(k): v for k, v in (unignores or {}).items()}
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
예제 #6
0
def fixed_width_file_name(path, max_length=25):
    """
    Return a fixed width file name of at most `max_length` characters
    extracted from the `path` string and usable for fixed width display.
    If the file_name is longer than `max_length`, it is truncated in the
    middle with using three dots "..." as an ellipsis and the extension
    is kept.

    For example:
    >>> short = fixed_width_file_name('0123456789012345678901234.c')
    >>> assert '0123456789...5678901234.c' == short
    """
    if not path:
        return ''

    # the display form must be unicode
    filename = fileutils.file_name(path_to_unicode(path))
    if len(filename) <= max_length:
        return filename

    base_name, extension = fileutils.splitext(filename)
    dots = 3
    # room left for the base name once the extension and the "..."
    # ellipsis are accounted for
    remaining_length = max_length - len(extension) - dots
    # refuse to truncate when there is not enough room left for a
    # meaningful name
    if remaining_length < (len(extension) + dots) or remaining_length < 5:
        return ''

    half = abs(remaining_length // 2)
    return base_name[:half] + ('.' * dots) + base_name[-half:] + extension
예제 #7
0
def fixed_width_file_name(path, max_length=25):
    """
    Return a fixed width file name of at most `max_length` characters
    extracted from the `path` string and usable for fixed width display.
    If the file_name is longer than `max_length`, it is truncated in the
    middle with using three dots "..." as an ellipsis and the extension
    is kept.

    For example:
    >>> short = fixed_width_file_name('0123456789012345678901234.c')
    >>> assert '0123456789...5678901234.c' == short
    """
    if not path:
        return ''

    # display requires the unicode form of the path
    filename = fileutils.file_name(path_to_unicode(path))
    if len(filename) <= max_length:
        return filename

    base_name, extension = fileutils.splitext(filename)
    dots = 3
    # budget for the base name after reserving the extension and ellipsis
    remaining_length = max_length - len(extension) - dots
    # bail out rather than produce an unreadable sliver of a name
    if remaining_length < (len(extension) + dots) or remaining_length < 5:
        return ''

    half = abs(remaining_length // 2)
    ellipsis = '.' * dots
    return base_name[:half] + ellipsis + base_name[-half:] + extension
예제 #8
0
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) for a cache entry built from a cache
    keys triple and a base_directory. Ensure that the parent directory exist.
    """
    # encode every path component with the OS convention: bytes on
    # Linux, unicode elsewhere
    encode = path_to_bytes if on_linux else path_to_unicode
    dir1, dir2, file_name = (encode(k) for k in keys)
    parent = os.path.join(encode(base_path), dir1, dir2)
    # create the parent directory as needed
    fileutils.create_dir(parent)
    return parent, file_name
예제 #9
0
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, filename) for a cache entry built from a cache
    keys triple and a base_directory. Ensure that the parent directory exist.
    """
    # all components must share one encoding: bytes on Linux, unicode elsewhere
    encode = path_to_bytes if on_linux else path_to_unicode
    encoded_keys = [encode(k) for k in keys]
    dir1, dir2, file_name = encoded_keys
    parent = os.path.join(encode(base_path), dir1, dir2)
    # make sure the parent directory exists before returning
    fileutils.create_dir(parent)
    return parent, file_name
예제 #10
0
def resource_paths(base_path, diag, scans_cache_class, pre_scan_plugins=()):
    """
    Yield `Resource` objects for all the files found at base_path
    (either a directory or file) given an absolute base_path. Only yield
    Files, not directories.
    absolute path is a native OS path.
    base_path-relative path is a POSIX path.

    The relative path is guaranted to be unicode and may be URL-encoded and may not
    be suitable to address an actual file.
    """
    if base_path:
        # OS path encoding convention: bytes on Linux, unicode elsewhere
        base_path = path_to_bytes(base_path) if on_linux else path_to_unicode(base_path)

    base_path = os.path.abspath(os.path.normpath(os.path.expanduser(base_path)))
    base_is_dir = filetype.is_dir(base_path)
    len_base_path = len(base_path)

    # merge ignore patterns from pre-scan plugins with the stock VCS ignores
    ignores = {}
    for plugin in (pre_scan_plugins or ()):
        ignores.update(plugin.get_ignores())
    ignores.update(ignore.ignores_VCS)

    ignorer = build_ignorer(ignores, unignores={})
    for abs_path in fileutils.resource_iter(base_path, ignored=ignorer):
        resource = Resource(scans_cache_class, abs_path, base_is_dir, len_base_path)
        # always fetch infos and cache.
        resource.put_info(scan_infos(abs_path, diag=diag))
        yield resource
예제 #11
0
def update_path_environment(new_path, _os_module=os):
    """
    Update the PATH environment variable by adding `new_path` to the front
    of PATH if `new_path` is not already in the PATH.
    """
    # note: _os_module is used to facilitate mock testing using an
    # object with a sep string attribute and an environ mapping
    # attribute

    if not new_path:
        return

    # ignore whitespace-only paths too
    new_path = new_path.strip()
    if not new_path:
        return

    # NOTE(review): the environ key is bytes (b'PATH') — presumably this
    # targets Python 2 where str is bytes; confirm before porting.
    path_env = _os_module.environ.get(b'PATH')
    if not path_env:
        # this is quite unlikely to ever happen, but here for safety
        path_env = ''

    # ensure we use unicode or bytes depending on OSes
    if on_linux:
        new_path = path_to_bytes(new_path)
        path_env = path_to_bytes(path_env)
        sep = _os_module.pathsep
    else:
        new_path = path_to_unicode(new_path)
        path_env = path_to_unicode(path_env)
        # pathsep coerced to unicode so split/join stay homogeneous
        sep = unicode(_os_module.pathsep)

    path_segments = path_env.split(sep)

    # add lib path to the front of the PATH env var
    # this will use bytes on Linux and unicode elsewhere
    if new_path not in path_segments:
        if not path_env:
            # PATH was empty: the new path is the whole value
            new_path_env = new_path
        else:
            new_path_env = sep.join([new_path, path_env])

        if not on_linux:
            # recode to bytes using FS encoding
            new_path_env = path_to_bytes(new_path_env)
        # ... and set the variable back as bytes
        _os_module.environ[b'PATH'] = new_path_env
예제 #12
0
 def log_file_path(cls, logfile_fd, path):
     """
     Log file path in the cache logfile_fd **opened** file descriptor.
     """
     # one path per line, encoded per the OS convention: bytes on
     # Linux, unicode elsewhere
     if on_linux:
         line = path_to_bytes(path) + b'\n'
     else:
         line = path_to_unicode(path) + '\n'
     logfile_fd.write(line)
예제 #13
0
 def log_file_path(cls, logfile_fd, path):
     """
     Log file path in the cache logfile_fd **opened** file descriptor.
     """
     # write one newline-terminated path per call, bytes on Linux and
     # unicode on other OSes
     newline = b'\n' if on_linux else '\n'
     encode = path_to_bytes if on_linux else path_to_unicode
     logfile_fd.write(encode(path) + newline)
예제 #14
0
def get_relative_path(path, len_base_path, base_is_dir):
    """
    Return a posix relative path from the posix 'path' relative to a
    base path of `len_base_path` length where the base is a directory if
    `base_is_dir` True or a file otherwise.
    """
    upath = path_to_unicode(path)
    # strip the base prefix for a directory base; otherwise only the
    # file name itself is relative to a file base
    rel = upath[len_base_path:] if base_is_dir else fileutils.file_name(upath)
    return rel.lstrip('/')
예제 #15
0
def get_relative_path(path, len_base_path, base_is_dir):
    """
    Return a posix relative path from the posix 'path' relative to a
    base path of `len_base_path` length where the base is a directory if
    `base_is_dir` True or a file otherwise.
    """
    unicode_path = path_to_unicode(path)
    if not base_is_dir:
        # a file base: the relative path is just the file name
        return fileutils.file_name(unicode_path).lstrip('/')
    # a directory base: drop the base prefix by length
    return unicode_path[len_base_path:].lstrip('/')
예제 #16
0
def scan_one(location, scanners, diag=False):
    """
    Scan one file or directory at `location` and return a scan result
    mapping, calling every scanner callable in the `scanners` mapping of
    (scan name -> scan function).

    The scan result mapping contain a 'scan_errors' key with a list of
    error messages. If `diag` is True, 'scan_errors' error messages also
    contain detailed diagnostic information such as a traceback if
    available.
    """
    # normalize the location encoding per-OS: bytes on Linux, unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    scan_result = OrderedDict()
    scan_errors = []
    for scan_name, scanner in scanners.items():
        if not scanner:
            continue
        try:
            scan_details = scanner(location)
            # consume generators
            if isinstance(scan_details, GeneratorType):
                scan_details = list(scan_details)
            scan_result[scan_name] = scan_details
        except TimeoutError:
            # timeouts are handled by the caller: propagate
            raise
        except Exception as e:
            # never fail but instead add an error message and keep an empty scan:
            scan_result[scan_name] = []
            # Exception.message is deprecated (PEP 352) and is empty or
            # missing for many exceptions: fall back to str() so the
            # recorded error is never silently blank.
            error_message = getattr(e, 'message', '') or str(e)
            messages = ['ERROR: ' + scan_name + ': ' + error_message]
            if diag:
                messages.append('ERROR: ' + scan_name + ': ' + traceback.format_exc())
            scan_errors.extend(messages)

    # put errors last, after scans proper
    scan_result['scan_errors'] = scan_errors
    return scan_result
예제 #17
0
def scan_one(location, scanners, diag=False):
    """
    Scan one file or directory at `location` and return a scan result
    mapping, calling every scanner callable in the `scanners` mapping of
    (scan name -> scan function).

    The scan result mapping contain a 'scan_errors' key with a list of
    error messages. If `diag` is True, 'scan_errors' error messages also
    contain detailed diagnostic information such as a traceback if
    available.
    """
    # normalize the location encoding per-OS: bytes on Linux, unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    scan_result = OrderedDict()
    scan_errors = []
    for scan_name, scanner in scanners.items():
        if not scanner:
            continue
        try:
            scan_details = scanner(location)
            # consume generators
            if isinstance(scan_details, GeneratorType):
                scan_details = list(scan_details)
            scan_result[scan_name] = scan_details
        except TimeoutError:
            # timeouts are handled by the caller: propagate
            raise
        except Exception as e:
            # never fail but instead add an error message and keep an empty scan:
            scan_result[scan_name] = []
            # Exception.message is deprecated (PEP 352) and is empty or
            # missing for many exceptions: fall back to str() so the
            # recorded error is never silently blank.
            error_message = getattr(e, 'message', '') or str(e)
            messages = ['ERROR: ' + scan_name + ': ' + error_message]
            if diag:
                messages.append('ERROR: ' + scan_name + ': ' + traceback.format_exc())
            scan_errors.extend(messages)

    # put errors last, after scans proper
    scan_result['scan_errors'] = scan_errors
    return scan_result
예제 #18
0
def resource_paths(base_path, diag, scans_cache_class, pre_scan_plugins=None):
    """
    Yield `Resource` objects for all the files found at base_path
    (either a directory or file) given an absolute base_path. Only yield
    Files, not directories.
    absolute path is a native OS path.
    base_path-relative path is a POSIX path.

    The relative path is guaranted to be unicode and may be URL-encoded and may not
    be suitable to address an actual file.
    """
    if base_path:
        # OS path encoding convention: bytes on Linux, unicode elsewhere
        base_path = path_to_bytes(base_path) if on_linux else path_to_unicode(base_path)

    base_path = os.path.abspath(os.path.normpath(os.path.expanduser(base_path)))
    base_is_dir = filetype.is_dir(base_path)
    len_base_path = len(base_path)

    plugins = pre_scan_plugins or ()
    # merge ignore patterns from pre-scan plugins with the stock VCS ignores
    ignores = {}
    for plugin in plugins:
        ignores.update(plugin.get_ignores())
    ignores.update(ignore.ignores_VCS)

    ignorer = build_ignorer(ignores, unignores={})
    for abs_path in fileutils.resource_iter(base_path, ignored=ignorer):
        resource = Resource(scans_cache_class, abs_path, base_is_dir, len_base_path)
        # always fetch infos and cache.
        resource.put_info(scan_infos(abs_path, diag=diag))
        # let each pre-scan plugin rewrite (or drop) the resource
        for plugin in plugins:
            resource = plugin.process_resource(resource)
        if resource:
            yield resource
예제 #19
0
    def iterate(self, scan_names, root_dir=None, paths_subset=tuple()):
        """
        Yield scan data for all cached scans e.g. the whole cache given
        a list of scan names.
        If a `paths_subset` sequence of paths is provided, then only
        these paths are iterated.

        The logfile MUST have been closed before calling this method.
        """
        # normalize the subset to the same encoding as the logged paths
        # (bytes on Linux, unicode elsewhere) so membership tests match
        if on_linux:
            paths_subset = set(path_to_bytes(p) for p in paths_subset)
        else:
            paths_subset = set(path_to_unicode(p) for p in paths_subset)

        # on Linux the log holds raw bytes; elsewhere it is UTF-8 text
        if on_linux:
            log_opener = partial(open, self.cache_files_log, 'rb')
        else:
            log_opener = partial(codecs.open,
                                 self.cache_files_log,
                                 'rb',
                                 encoding='utf-8')
        EOL = b'\n' if on_linux else '\n'

        with log_opener() as cached_files:
            # iterate paths, one by line
            for file_log in cached_files:
                # one path per line; bytes on Linux, unicode elsewhere
                path = file_log.rstrip(EOL)
                if paths_subset and path not in paths_subset:
                    continue
                file_info = self.get_info(path)

                # build the unicode form for display/serialization
                if on_linux:
                    unicode_path = path_to_unicode(path)
                else:
                    unicode_path = path

                if root_dir:
                    # must be unicode
                    if on_linux:
                        root_dir = path_to_unicode(root_dir)
                    rooted_path = posixpath.join(root_dir, unicode_path)
                else:
                    rooted_path = unicode_path
                rooted_path = fileutils.as_posixpath(rooted_path)
                logger_debug('iterate:', 'rooted_path:', rooted_path)

                # rare but possible corner case
                if file_info is None:
                    no_info = (
                        'ERROR: file info unavailable in cache: '
                        'This is either a bug or processing was aborted with CTRL-C.'
                    )
                    scan_result = OrderedDict(path=rooted_path)
                    scan_result['scan_errors'] = [no_info]
                    if TRACE:
                        logger_debug('iterate:', 'scan_result:', scan_result,
                                     'for path:', rooted_path, '\n')
                    yield scan_result
                    continue

                # discard the cached path: rooted_path is used instead
                _unicode_path_from_file_info = file_info.pop('path')
                scan_result = OrderedDict(path=rooted_path)

                if 'infos' in scan_names:
                    # info are always collected but only returned if requested
                    # we flatten these as direct attributes of a file object
                    scan_result.update(file_info.items())

                if not scan_result.get('scan_errors'):
                    scan_result['scan_errors'] = []

                # check if we have more than just infos
                if ['infos'] != scan_names:
                    errors = scan_result['scan_errors']
                    scan_details = self.get_scan(path, file_info)
                    if scan_details is None:
                        no_scan_details = (
                            'ERROR: scan details unavailable in cache: '
                            'This is either a bug or processing was aborted with CTRL-C.'
                        )
                        errors.append(no_scan_details)
                    else:
                        # append errors to other top level errors if any
                        scan_errors = scan_details.pop('scan_errors', [])
                        errors.extend(scan_errors)
                        scan_result.update(scan_details)

                if TRACE:
                    logger_debug('iterate:', 'scan_result:', scan_result,
                                 'for path:', rooted_path, '\n')
                yield scan_result
예제 #20
0
    def iterate(self, scan_names, root_dir=None, paths_subset=tuple()):
        """
        Yield scan data for all cached scans e.g. the whole cache given
        a list of scan names.
        If a `paths_subset` sequence of paths is provided, then only
        these paths are iterated.

        The logfile MUST have been closed before calling this method.
        """
        # normalize the subset to the same encoding as the logged paths
        # (bytes on Linux, unicode elsewhere) so membership tests match
        if on_linux:
            paths_subset = set(path_to_bytes(p) for p in paths_subset)
        else:
            paths_subset = set(path_to_unicode(p) for p in paths_subset)

        # on Linux the log holds raw bytes; elsewhere it is UTF-8 text
        if on_linux:
            log_opener = partial(open, self.cache_files_log, 'rb')
        else:
            log_opener = partial(codecs.open, self.cache_files_log, 'rb', encoding='utf-8')
        EOL = b'\n' if on_linux else '\n'

        with log_opener() as cached_files:
            # iterate paths, one by line
            for file_log in cached_files:
                # one path per line; bytes on Linux, unicode elsewhere
                path = file_log.rstrip(EOL)
                if paths_subset and path not in paths_subset:
                    continue
                file_info = self.get_info(path)

                # build the unicode form for display/serialization
                if on_linux:
                    unicode_path = path_to_unicode(path)
                else:
                    unicode_path = path

                if root_dir:
                    # must be unicode
                    if on_linux:
                        root_dir = path_to_unicode(root_dir)
                    rooted_path = posixpath.join(root_dir, unicode_path)
                else:
                    rooted_path = unicode_path
                rooted_path = fileutils.as_posixpath(rooted_path)
                logger_debug('iterate:', 'rooted_path:', rooted_path)

                # rare but possible corner case
                if file_info is None:
                    no_info = ('ERROR: file info unavailable in cache: '
                               'This is either a bug or processing was aborted with CTRL-C.')
                    scan_result = OrderedDict(path=rooted_path)
                    scan_result['scan_errors'] = [no_info]
                    if TRACE:
                        logger_debug('iterate:', 'scan_result:', scan_result, 'for path:', rooted_path, '\n')
                    yield scan_result
                    continue

                # discard the cached path: rooted_path is used instead
                _unicode_path_from_file_info = file_info.pop('path')
                scan_result = OrderedDict(path=rooted_path)

                if 'infos' in scan_names:
                    # info are always collected but only returned if requested
                    # we flatten these as direct attributes of a file object
                    scan_result.update(file_info.items())

                if not scan_result.get('scan_errors'):
                    scan_result['scan_errors'] = []

                # check if we have more than just infos
                if ['infos'] != scan_names:
                    errors = scan_result['scan_errors']
                    scan_details = self.get_scan(path, file_info)
                    if scan_details is None:
                        no_scan_details = (
                            'ERROR: scan details unavailable in cache: '
                            'This is either a bug or processing was aborted with CTRL-C.')
                        errors.append(no_scan_details)
                    else:
                        # append errors to other top level errors if any
                        scan_errors = scan_details.pop('scan_errors', [])
                        errors.extend(scan_errors)
                        scan_result.update(scan_details)

                if TRACE:
                    logger_debug('iterate:', 'scan_result:', scan_result, 'for path:', rooted_path, '\n')
                yield scan_result