예제 #1
0
 def test_path_to_unicode_and_path_to_bytes_are_idempotent(self):
     """Round-tripping a path through both codecs yields the original value."""
     as_bytes = b'foo\xb1bar'
     as_unicode = u'foo\udcb1bar'
     assert path_to_bytes(path_to_unicode(as_bytes)) == as_bytes
     assert path_to_bytes(path_to_unicode(as_unicode)) == as_bytes
     assert path_to_unicode(path_to_bytes(as_bytes)) == as_unicode
     assert path_to_unicode(path_to_bytes(as_unicode)) == as_unicode
예제 #2
0
def test_scan_does_not_fail_when_scanning_unicode_files_and_paths():
    """Scan a tree with unicode file names and check the JSON results
    against the per-OS expected output."""
    scan_target = test_env.get_test_loc(u'unicodepath/uc')
    json_result = test_env.get_temp_file('json')

    if on_linux:
        scan_target = path_to_bytes(scan_target)
        json_result = path_to_bytes(json_result)

    args = ['--info', '--license', '--copyright', '--package',
            '--email', '--url', '--strip-root', scan_target, json_result]
    result = run_scan_click(args)
    if result.exit_code != 0:
        raise Exception(result.output, args)
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'unicodepath/unicodepath.expected-linux.json'
    elif on_mac:
        expected = 'unicodepath/unicodepath.expected-mac.json'
    elif on_windows:
        expected = 'unicodepath/unicodepath.expected-win.json'

    check_json_scan(test_env.get_test_loc(expected), json_result,
                    strip_dates=True, regen=False)
예제 #3
0
def extract_tar(location, target_dir, verbatim=False, *args, **kwargs):
    """
    Extract the tar archive file at `location` in the `target_dir` directory.
    If `verbatim` is True preserve the permissions stored in the archive;
    otherwise force each extracted member's mode to 0o700 (owner-only rwx).
    Extra positional and keyword arguments are accepted and ignored.
    """
    # always use bytes for paths on all OSes: tar seems to use bytes
    # internally and gets confused otherwise
    location = path_to_bytes(location)
    target_dir = path_to_bytes(target_dir)

    with open(location, 'rb') as input_tar:
        tar = None
        try:
            tar = tarfile.open(fileobj=input_tar)
            tarinfos = tar.getmembers()
            to_extract = []
            for tarinfo in tarinfos:
                # only keep members that are safe/supported to extract
                if tar_can_extract(tarinfo, verbatim):
                    if not verbatim:
                        # 0o700: valid spelling on both Python 2.6+ and 3,
                        # unlike the legacy 0700 literal (SyntaxError on Py3)
                        tarinfo.mode = 0o700
                    to_extract.append(tarinfo)
            tar.extractall(target_dir, members=to_extract)
        finally:
            if tar:
                tar.close()
예제 #4
0
def test_scan_does_not_fail_when_scanning_unicode_files_and_paths():
    """Run a full scan over a tree with unicode paths and verify the JSON
    output matches the expected results for the current OS."""
    target = test_env.get_test_loc(u'unicodepath/uc')
    out_file = test_env.get_temp_file('json')

    if on_linux:
        target = path_to_bytes(target)
        out_file = path_to_bytes(out_file)

    args = [
        '--info', '--license', '--copyright', '--package', '--email',
        '--url', '--strip-root', target, out_file
    ]
    result = run_scan_click(args)
    if result.exit_code != 0:
        raise Exception(result.output, args)
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'unicodepath/unicodepath.expected-linux.json'
    elif on_mac:
        expected = 'unicodepath/unicodepath.expected-mac.json'
    elif on_windows:
        expected = 'unicodepath/unicodepath.expected-win.json'

    check_json_scan(
        test_env.get_test_loc(expected), out_file,
        strip_dates=True, regen=False)
예제 #5
0
    def get_test_loc(self, test_path, copy=False, debug=False):
        """
        Return the location of a test file or directory for `test_path`,
        resolved relative to self.test_data_dir. When `copy` is True the
        test data is first copied to a temporary location and that copy
        is returned. When `debug` is True, print the calling test name.
        """
        data_dir = self.test_data_dir
        if on_linux:
            test_path = path_to_bytes(test_path)
            data_dir = path_to_bytes(data_dir)

        if debug:
            import inspect
            caller = inspect.stack()[1][3]
            print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

        test_loc = get_test_loc(test_path, data_dir, debug=debug)
        if not copy:
            return test_loc

        base_name = os.path.basename(test_loc)
        if filetype.is_file(test_loc):
            # copying a single file: the target must be an existing dir
            temp_dir = self.get_temp_dir()
            fileutils.copyfile(test_loc, temp_dir)
            return os.path.join(temp_dir, base_name)

        # copying a tree: the target must be a NON existing dir
        temp_target = os.path.join(self.get_temp_dir(), base_name)
        fileutils.copytree(test_loc, temp_target)
        # cleanup of VCS dirs that could be left over from checkouts
        self.remove_vcs(temp_target)
        return temp_target
예제 #6
0
def get_test_loc(test_path, test_data_dir, debug=False, exists=True):
    """
    Return the absolute location of the test file or directory at
    `test_path`, resolved relative to the `test_data_dir` directory. No
    copy is done. Raise IOError when `test_data_dir` does not exist or,
    when `exists` is True, when the resolved location does not exist.
    """
    if on_linux:
        test_path = path_to_bytes(test_path)
        test_data_dir = path_to_bytes(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nget_test_loc,%(caller)s,"%(test_path)s","%(test_data_dir)s"'
              % locals())

    assert test_path
    assert test_data_dir

    if not os.path.exists(test_data_dir):
        raise IOError("[Errno 2] No such directory: test_data_dir not found:"
                      " '%(test_data_dir)s'" % locals())

    native_path = to_os_native_path(test_path)
    test_loc = os.path.abspath(os.path.join(test_data_dir, native_path))

    if exists and not os.path.exists(test_loc):
        raise IOError("[Errno 2] No such file or directory: "
                      "test_path not found: '%(test_loc)s'" % locals())

    return test_loc
예제 #7
0
def test_scan_can_handle_non_utf8_file_names_on_posix():
    """Scan a tree holding non-UTF-8 file names and compare with the
    per-OS expected JSON results."""
    scan_dir = test_env.extract_test_tar_raw('non_utf8/non_unicode.tgz')
    json_out = test_env.get_temp_file('json')

    if on_linux:
        scan_dir = path_to_bytes(scan_dir)
        json_out = path_to_bytes(json_out)

    result = run_scan_click(['-i', '--strip-root', scan_dir, json_out])
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'non_utf8/expected-linux.json'
    elif on_mac:
        expected = 'non_utf8/expected-mac.json'
    elif on_windows:
        expected = 'non_utf8/expected-win.json'

    check_json_scan(test_env.get_test_loc(expected), json_out, regen=False)
예제 #8
0
def test_scan_can_handle_non_utf8_file_names_on_posix():
    """Verify that scanning file names that are not valid UTF-8 works and
    that the output matches the per-OS expected JSON."""
    target = test_env.extract_test_tar_raw('non_utf8/non_unicode.tgz')
    out_file = test_env.get_temp_file('json')

    if on_linux:
        target = path_to_bytes(target)
        out_file = path_to_bytes(out_file)

    result = run_scan_click(['-i', '--strip-root', target, out_file])
    assert result.exit_code == 0
    assert 'Scanning done' in result.output

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'non_utf8/expected-linux.json'
    elif on_mac:
        expected = 'non_utf8/expected-mac.json'
    elif on_windows:
        expected = 'non_utf8/expected-win.json'

    check_json_scan(test_env.get_test_loc(expected), out_file, regen=False)
예제 #9
0
def get_test_loc(test_path, test_data_dir, debug=False, exists=True):
    """
    Return the location of a test file or directory given a `test_path`
    relative to `test_data_dir`. No copy is done. Raise IOError if the
    data directory is missing or, when `exists` is True, if the resolved
    location is missing.
    """
    if on_linux:
        test_path = path_to_bytes(test_path)
        test_data_dir = path_to_bytes(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nget_test_loc,%(caller)s,"%(test_path)s","%(test_data_dir)s"' % locals())

    assert test_path
    assert test_data_dir

    if not os.path.exists(test_data_dir):
        raise IOError("[Errno 2] No such directory: test_data_dir not found:"
                      " '%(test_data_dir)s'" % locals())

    test_loc = os.path.abspath(
        os.path.join(test_data_dir, to_os_native_path(test_path)))

    if exists and not os.path.exists(test_loc):
        raise IOError("[Errno 2] No such file or directory: "
                      "test_path not found: '%(test_loc)s'" % locals())

    return test_loc
예제 #10
0
def extract_tar(location, target_dir, verbatim=False, *args, **kwargs):
    """
    Extract the tar archive file at `location` in the `target_dir` directory.
    If `verbatim` is True preserve the permissions stored in the archive;
    otherwise force each extracted member's mode to 0o700 (owner-only rwx).
    Extra positional and keyword arguments are accepted and ignored.
    """
    # always use bytes for paths on all OSes: tar seems to use bytes
    # internally and gets confused otherwise
    location = path_to_bytes(location)
    target_dir = path_to_bytes(target_dir)

    with open(location, 'rb') as input_tar:
        tar = None
        try:
            tar = tarfile.open(fileobj=input_tar)
            tarinfos = tar.getmembers()
            to_extract = []
            for tarinfo in tarinfos:
                # only keep members that are safe/supported to extract
                if tar_can_extract(tarinfo, verbatim):
                    if not verbatim:
                        # 0o700: valid spelling on both Python 2.6+ and 3,
                        # unlike the legacy 0700 literal (SyntaxError on Py3)
                        tarinfo.mode = 0o700
                    to_extract.append(tarinfo)
            tar.extractall(target_dir, members=to_extract)
        finally:
            if tar:
                tar.close()
예제 #11
0
    def get_test_loc(self, test_path, copy=False, debug=False):
        """
        Return the location of the test file or directory at `test_path`,
        relative to self.test_data_dir. Copy the test data to a temp
        location and return that copy when `copy` is True. Print the
        caller's name when `debug` is True.
        """
        data_dir = self.test_data_dir
        if on_linux:
            test_path = path_to_bytes(test_path)
            data_dir = path_to_bytes(data_dir)

        if debug:
            import inspect
            caller = inspect.stack()[1][3]
            print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

        test_loc = get_test_loc(test_path, data_dir, debug=debug)
        if not copy:
            return test_loc

        base_name = os.path.basename(test_loc)
        if filetype.is_file(test_loc):
            # file copy: the target must be an existing directory
            temp_dir = self.get_temp_dir()
            fileutils.copyfile(test_loc, temp_dir)
            test_loc = os.path.join(temp_dir, base_name)
        else:
            # tree copy: the target must be a NON existing directory
            temp_target = os.path.join(self.get_temp_dir(), base_name)
            fileutils.copytree(test_loc, temp_target)
            # cleanup of VCS dirs that could be left over from checkouts
            self.remove_vcs(temp_target)
            test_loc = temp_target
        return test_loc
예제 #12
0
def _extract_tar_raw(test_path, target_dir, to_bytes, *args, **kwargs):
    """
    Raw simplified extract for certain really weird paths and file
    names.
    """
    if to_bytes:
        # use bytes for paths on ALL OSes (though this may fail on macOS)
        target_dir = path_to_bytes(target_dir)
        test_path = path_to_bytes(test_path)
    tar = tarfile.open(test_path)
    tar.extractall(path=target_dir)
    tar.close()
예제 #13
0
def _extract_tar_raw(test_path, target_dir, to_bytes, *args, **kwargs):
    """
    Raw simplified extract for certain really weird paths and file
    names.
    """
    if to_bytes:
        # use bytes for paths on ALL OSes (though this may fail on macOS)
        target_dir = path_to_bytes(target_dir)
        test_path = path_to_bytes(test_path)
    tar = tarfile.open(test_path)
    tar.extractall(path=target_dir)
    tar.close()
예제 #14
0
def build_ignorer(ignores, unignores):
    """
    Return a callable for path ignore checks where every ignore and
    unignore key has been pre-encoded for the current OS (bytes on
    Linux, unicode elsewhere).
    """
    recode = path_to_bytes if on_linux else path_to_unicode
    ignores = {recode(key): msg for key, msg in (ignores or {}).items()}
    unignores = {recode(key): msg for key, msg in (unignores or {}).items()}
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
예제 #15
0
def build_ignorer(ignores, unignores):
    """
    Return a callable suitable for path ignore checks with all the
    ignore/unignore keys re-encoded per the current OS convention.
    """
    ignores = ignores or {}
    unignores = unignores or {}
    if on_linux:
        encoder = path_to_bytes
    else:
        encoder = path_to_unicode
    ignores = dict((encoder(k), v) for k, v in ignores.items())
    unignores = dict((encoder(k), v) for k, v in unignores.items())
    return partial(ignore.is_ignored, ignores=ignores, unignores=unignores)
예제 #16
0
def remove_backslashes_and_dotdots(directory):
    """
    Walk the `directory` tree and rename any file whose name contains
    backslashes or dot-dot sequences, normalizing the name to a safe
    relative POSIX path rooted at its current directory.
    Return a list of the original paths that could not be renamed, if any.
    """
    if on_linux:
        directory = path_to_bytes(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                new_path = fileutils.as_posixpath(filename)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                segments = new_path.split(POSIX_PATH_SEP)
                # FIX: use a dedicated name for the target parent dir; the
                # original rebound the `directory` parameter here, shadowing
                # the walked root and making the code misleading
                parent_dir = os.path.join(top, *segments[:-1])
                fileutils.create_dir(parent_dir)
                shutil.move(os.path.join(top, filename),
                            os.path.join(top, *segments))
            except Exception:
                errors.append(os.path.join(top, filename))
    return errors
예제 #17
0
def remove_archive_suffix(path):
    """
    Return `path` with every occurrence of the extraction suffix removed.
    """
    target = path_to_bytes(path) if on_linux else path
    return re.sub(EXTRACT_SUFFIX, EMPTY_STRING, target)
예제 #18
0
def get_extraction_path(path):
    """
    Return the path where to extract: `path` stripped of any trailing
    separators with the extraction suffix appended.
    """
    base = path_to_bytes(path) if on_linux else path
    return base.rstrip(PATHS_SEPS) + EXTRACT_SUFFIX
예제 #19
0
def paths_from_keys(base_path, keys):
    """
    Return a (parent directory path, file name) tuple for a cache entry
    built from a cache `keys` triple and a `base_path` directory,
    creating the parent directory if needed.
    """
    recode = path_to_bytes if on_linux else path_to_unicode
    dir1, dir2, file_name = [recode(k) for k in keys]
    parent = os.path.join(recode(base_path), dir1, dir2)
    fileutils.create_dir(parent)
    return parent, file_name
예제 #20
0
def paths_from_keys(base_path, keys):
    """
    Return a tuple of (parent dir path, file name) for a cache entry
    built from a cache `keys` triple and a `base_path` directory,
    ensuring that the parent directory exists.
    """
    if on_linux:
        encode = path_to_bytes
    else:
        encode = path_to_unicode
    base_path = encode(base_path)
    dir1, dir2, file_name = (encode(k) for k in keys)
    parent = os.path.join(base_path, dir1, dir2)
    fileutils.create_dir(parent)
    return parent, file_name
def get_extraction_path(path):
    """
    Return the location where to extract `path`: the path without
    trailing separators, suffixed with the extraction suffix.
    """
    if on_linux:
        path = path_to_bytes(path)
    stripped = path.rstrip(PATHS_SEPS)
    return stripped + EXTRACT_SUFFIX
def remove_archive_suffix(path):
    """
    Remove all extraction suffix occurrences from `path` and return it.
    """
    if on_linux:
        path = path_to_bytes(path)
    cleaned = re.sub(EXTRACT_SUFFIX, EMPTY_STRING, path)
    return cleaned
예제 #23
0
def resource_paths(base_path, diag, scans_cache_class, pre_scan_plugins=()):
    """
    Yield a `Resource` for every file found under `base_path` (either a
    directory or a single file) given an absolute base_path. Only yield
    files, never directories.
    The absolute path is a native OS path; the base_path-relative path is
    a POSIX path guaranteed to be unicode, and may be URL-encoded and may
    not be suitable to address an actual file.
    """
    if base_path:
        if on_linux:
            base_path = path_to_bytes(base_path)
        else:
            base_path = path_to_unicode(base_path)

    base_path = os.path.abspath(os.path.normpath(os.path.expanduser(base_path)))
    base_is_dir = filetype.is_dir(base_path)
    len_base_path = len(base_path)

    # collect ignores from the pre-scan plugins, then always add VCS ignores
    ignores = {}
    for plugin in pre_scan_plugins or ():
        ignores.update(plugin.get_ignores())
    ignores.update(ignore.ignores_VCS)

    ignorer = build_ignorer(ignores, unignores={})
    for abs_path in fileutils.resource_iter(base_path, ignored=ignorer):
        resource = Resource(scans_cache_class, abs_path, base_is_dir, len_base_path)
        # always fetch infos and cache them
        resource.put_info(scan_infos(abs_path, diag=diag))
        yield resource
예제 #24
0
def test_extractcode_command_can_extract_archive_with_unicode_names(
        monkeypatch):
    """Check that extractcode can extract an archive holding unicode
    file names and produces the expected extracted paths."""
    monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
    test_dir = test_env.get_test_loc('unicodearch', copy=True)
    if on_linux:
        test_dir = path_to_bytes(test_dir)
    runner = CliRunner()
    result = runner.invoke(extract_cli.extractcode, [test_dir],
                           catch_exceptions=False)
    assert result.exit_code == 0

    uni_arch = b'unicodepath.tgz' if on_linux else 'unicodepath.tgz'
    uni_path = b'/unicodepath/' if on_linux else '/unicodepath/'

    extracted = []
    for posix_loc in map(as_posixpath, file_iter(test_dir)):
        if posix_loc.endswith(uni_arch):
            continue
        tail = EMPTY_STRING.join(posix_loc.partition(uni_path)[1:])
        if tail:
            extracted.append(tail)

    expected = [
        '/unicodepath/Ho_', '/unicodepath/Ho_a',
        '/unicodepath/koristenjem_Karkkainen_-_Sander.pdf'
    ]
    assert sorted(expected) == sorted(extracted)
def remove_backslashes_and_dotdots(directory):
    """
    Walk the `directory` tree and rename any file whose name contains
    backslashes or dot-dot sequences, normalizing the name to a safe
    relative POSIX path rooted at its current directory.
    Return a list of the original paths that could not be renamed, if any.
    """
    if on_linux:
        directory = path_to_bytes(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                new_path = fileutils.as_posixpath(filename)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                new_path = new_path.replace(DOTDOT, POSIX_PATH_SEP)
                new_path = new_path.strip(POSIX_PATH_SEP)
                new_path = posixpath.normpath(new_path)
                segments = new_path.split(POSIX_PATH_SEP)
                # FIX: use a dedicated name for the target parent dir; the
                # original rebound the `directory` parameter here, shadowing
                # the walked root and making the code misleading
                parent_dir = os.path.join(top, *segments[:-1])
                fileutils.create_dir(parent_dir)
                shutil.move(os.path.join(top, filename),
                            os.path.join(top, *segments))
            except Exception:
                errors.append(os.path.join(top, filename))
    return errors
예제 #26
0
def recognize_package(location):
    """
    Return a Package object if one was recognized for this `location`, or
    None otherwise.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    for package_type in PACKAGE_TYPES:
        # Note: default to True if there is nothing to match against
        metafiles = package_type.metafiles
        if on_linux:
            metafiles = (path_to_bytes(m) for m in metafiles)
        if location.endswith(tuple(metafiles)):
            logger_debug('metafile matching: package_type is of type:', package_type)
            return package_type.recognize(location)

        filetypes = package_type.filetypes
        type_matched = bool(filetypes) and any(t in ftype for t in filetypes)

        mimetypes = package_type.mimetypes
        mime_matched = bool(mimetypes) and any(m in mtype for m in mimetypes)

        extensions = package_type.extensions
        extension_matched = False
        if extensions:
            if on_linux:
                extensions = tuple(path_to_bytes(e) for e in extensions)
            extension_matched = location.lower().endswith(extensions)

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            logger_debug('all matching: package is of type:', package_type)
            recognized = package_type.recognize(location)
            logger_debug('all matching: recognized as:', repr(recognized))
            return recognized

        logger_debug('no match: package is not of known type:', package_type)
예제 #27
0
def is_extraction_path(path):
    """
    Return True if `path` points to an extraction path, i.e. it ends with
    the extraction suffix once trailing separators are stripped.
    """
    target = path_to_bytes(path) if on_linux else path
    return target and target.rstrip(PATHS_SEPS).endswith(EXTRACT_SUFFIX)
예제 #28
0
def is_extracted(location):
    """
    Return True if `location` has already been extracted to its
    corresponding extraction location.
    """
    loc = path_to_bytes(location) if on_linux else location
    return loc and os.path.exists(get_extraction_path(loc))
def is_extraction_path(path):
    """
    Return True if `path` points to an extraction path.
    """
    if on_linux:
        path = path_to_bytes(path)
    if not path:
        # preserve the original short-circuit: a falsy path is returned as-is
        return path
    return path.rstrip(PATHS_SEPS).endswith(EXTRACT_SUFFIX)
예제 #30
0
def update_path_environment(new_path, _os_module=os):
    """
    Update the PATH environment variable by prepending `new_path` to PATH
    when `new_path` is not already present in PATH.
    """
    # note: _os_module is used to facilitate mock testing using an
    # object with a sep string attribute and an environ mapping attribute
    if not new_path:
        return

    new_path = new_path.strip()
    if not new_path:
        return

    path_env = _os_module.environ.get(b'PATH')
    if not path_env:
        # quite unlikely to ever happen, but here for safety
        path_env = ''

    # ensure we use unicode or bytes depending on OSes
    if on_linux:
        new_path = path_to_bytes(new_path)
        path_env = path_to_bytes(path_env)
        sep = _os_module.pathsep
    else:
        new_path = path_to_unicode(new_path)
        path_env = path_to_unicode(path_env)
        sep = unicode(_os_module.pathsep)

    path_segments = path_env.split(sep)
    if new_path in path_segments:
        # nothing to do: already on the PATH
        return

    # prepend the new path; this uses bytes on Linux and unicode elsewhere
    new_path_env = sep.join([new_path, path_env]) if path_env else new_path

    if not on_linux:
        # recode to bytes using the FS encoding
        new_path_env = path_to_bytes(new_path_env)
    # ... and set the variable back as bytes
    _os_module.environ[b'PATH'] = new_path_env
def is_extracted(location):
    """
    Return True if the `location` is already extracted to its
    corresponding extraction location.
    """
    if on_linux:
        location = path_to_bytes(location)
    if not location:
        # preserve the original short-circuit: a falsy location is
        # returned as-is
        return location
    return os.path.exists(get_extraction_path(location))
예제 #32
0
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # paths are handled as bytes on Linux and unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        # directories get no extension
        base_name = name
        extension = ''

    # emitted name values are always unicode: decode back on Linux where
    # the path was converted to bytes above
    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    # NOTE: the `x and y or None` idiom below maps ANY falsy value of y
    # (not just the is_file=False case) to None; a plain conditional
    # expression would not be equivalent
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    # boolean flags are only meaningful for files; forced False otherwise
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
def new_name(location, is_dir=False):
    """
    Return a new non-existing location from a `location` usable to write a file
    or create directory without overwriting existing files or directories in the same
    parent directory, ignoring the case of the filename.

    The case of the filename is ignored to ensure that similar results are returned
    across case sensitive (*nix) and case insensitive file systems.

    To find a new unique filename, this tries new names this way:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and keep
       the extension unchanged.
    """
    assert location
    if on_linux:
        location = path_to_bytes(location)
    location = location.rstrip(PATHS_SEPS)
    assert location

    parent = fileutils.parent_directory(location)

    # all existing files or directories as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))

    filename = fileutils.file_name(location)

    # corner case: "." and ".." are not usable file names.
    # FIX: the original tested `filename in (DOT, DOT)` which checked "."
    # twice and never caught ".."
    if filename in (DOT, DOTDOT):
        filename = UNDERSCORE

    # if unique, return this
    if filename.lower() not in siblings_lower:
        return os.path.join(parent, filename)

    # otherwise seek a unique name
    if is_dir:
        # directories do not have an "extension"
        base_name = filename
        ext = EMPTY_STRING
    else:
        base_name, dot, ext = filename.partition(DOT)
        if dot:
            ext = dot + ext
        else:
            base_name = filename
            ext = EMPTY_STRING

    # find a unique filename, adding a counter int to the base_name
    counter = 1
    while True:
        filename = base_name + UNDERSCORE + str(counter) + ext
        if filename.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, filename)
예제 #34
0
def new_name(location, is_dir=False):
    """
    Return a new non-existing location from a `location` usable to write a file
    or create directory without overwriting existing files or directories in the same
    parent directory, ignoring the case of the filename.

    The case of the filename is ignored to ensure that similar results are returned
    across case sensitive (*nix) and case insensitive file systems.

    To find a new unique filename, this tries new names this way:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and keep
       the extension unchanged.
    """
    assert location
    if on_linux:
        location = path_to_bytes(location)
    location = location.rstrip(PATHS_SEPS)
    assert location

    parent = fileutils.parent_directory(location)

    # all existing files or directories as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))

    filename = fileutils.file_name(location)

    # corner case: "." and ".." are not usable file names.
    # FIX: the original tested `filename in (DOT, DOT)` which checked "."
    # twice and never caught ".."
    if filename in (DOT, DOTDOT):
        filename = UNDERSCORE

    # if unique, return this
    if filename.lower() not in siblings_lower:
        return os.path.join(parent, filename)

    # otherwise seek a unique name
    if is_dir:
        # directories do not have an "extension"
        base_name = filename
        ext = EMPTY_STRING
    else:
        base_name, dot, ext = filename.partition(DOT)
        if dot:
            ext = dot + ext
        else:
            base_name = filename
            ext = EMPTY_STRING

    # find a unique filename, adding a counter int to the base_name
    counter = 1
    while True:
        filename = base_name + UNDERSCORE + str(counter) + ext
        if filename.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, filename)
예제 #35
0
def info_keys(path, seed=None):
    """
    Return a file info cache "keys" triple for a `path`, composed of three
    path segments derived from a checksum.

    For example:
    >>> expected = 'fb87db2bb28e9501ac7fdc4812782118f4c94a0f'
    >>> assert expected == sha1('/w421/scancode-toolkit2').hexdigest()
    >>> expected = ('f', 'b', '87db2bb28e9501ac7fdc4812782118f4c94a0f')
    >>> assert expected == info_keys('/w421/scancode-toolkit2')
    """
    # the hash function must always receive bytes
    hashable = path_to_bytes(path) if isinstance(path, unicode) else path
    if seed:
        prefix = path_to_bytes(seed) if isinstance(seed, unicode) else seed
        hashable = prefix + hashable
    return keys_from_hash(sha1(hashable).hexdigest())
예제 #36
0
 def log_file_path(cls, logfile_fd, path):
     """
     Write `path` as one line to the cache `logfile_fd` **opened** file
     descriptor, encoded as bytes on Linux and unicode elsewhere.
     """
     if on_linux:
         line = path_to_bytes(path) + b'\n'
     else:
         line = path_to_unicode(path) + '\n'
     logfile_fd.write(line)
예제 #37
0
def get_handlers(location):
    """
    Yield tuples of (handler, type_matched, mime_matched,
    extension_matched,) for each archive handler that may apply to the
    file at `location`. Yield nothing if `location` is not a file.
    """
    if on_linux:
        location = path_to_bytes(location)

    if filetype.is_file(location):
        T = typecode.contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default all three flags to False: in the original code
            # extension_matched was only assigned inside `if exts:` so a
            # handler without extensions either raised a NameError or
            # reused a stale value leaked from the previous iteration
            type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
            extension_matched = False
            exts = handler.extensions
            if exts:
                if on_linux:
                    exts = tuple(path_to_bytes(e) for e in exts)
                extension_matched = location.lower().endswith(exts)

            if TRACE_DEEP:
                handler_name = handler.name
                logger.debug('get_handlers: considering %(handler_name)r  handler for %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())
                logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())

            # a strict handler must match on all three criteria
            if handler.strict and not all([type_matched, mime_matched, extension_matched]):
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())
                    logger.debug('get_handlers: %(location)s: handler: %(handler)r' % locals())
                yield handler, type_matched, mime_matched, extension_matched
예제 #38
0
 def log_file_path(cls, logfile_fd, path):
     """
     Log file path in the cache logfile_fd **opened** file descriptor.
     Paths are dumped one per line: bytes on Linux, unicode elsewhere.
     """
     if on_linux:
         line = path_to_bytes(path) + b'\n'
     else:
         line = path_to_unicode(path) + '\n'
     logfile_fd.write(line)
예제 #39
0
def info_keys(path, seed=None):
    """
    Return a file info cache "keys" triple for a path composed of three
    path segments derived from a checksum.

    For example:
    >>> expected = 'fb87db2bb28e9501ac7fdc4812782118f4c94a0f'
    >>> assert expected == sha1('/w421/scancode-toolkit2').hexdigest()
    >>> expected = ('f', 'b', '87db2bb28e9501ac7fdc4812782118f4c94a0f')
    >>> assert expected == info_keys('/w421/scancode-toolkit2')
    """
    # hashing works on bytes only: coerce any unicode input first
    hashable = path_to_bytes(path) if isinstance(path, unicode) else path
    if seed:
        prefix = path_to_bytes(seed) if isinstance(seed, unicode) else seed
        hashable = prefix + hashable
    return keys_from_hash(sha1(hashable).hexdigest())
예제 #40
0
 def __extract(self, test_path, extract_func=None, verbatim=False):
     """
     Extract the archive identified by `test_path` (relative to a test
     files directory) into a new temporary directory using
     `extract_func` and return that directory.
     If `verbatim` is True preserve the permissions.
     """
     assert test_path
     if on_linux:
         test_path = path_to_bytes(test_path)
     native_path = to_os_native_path(test_path)
     temp_target = self.get_temp_dir(os.path.basename(native_path))
     archive = self.get_test_loc(native_path)
     if on_linux:
         temp_target = path_to_bytes(temp_target)
         archive = path_to_bytes(archive)
     extract_func(archive, temp_target, verbatim=verbatim)
     return temp_target
예제 #41
0
def to_os_native_path(path):
    """
    Return `path` normalized to the native OS path separator, without any
    trailing separator.
    """
    if on_linux:
        path = path_to_bytes(path)
    # fold both POSIX and Windows separators into the native one
    for sep in (POSIX_PATH_SEP, WIN_PATH_SEP):
        path = path.replace(sep, OS_PATH_SEP)
    return path.rstrip(OS_PATH_SEP)
예제 #42
0
def to_os_native_path(path):
    """
    Return `path` rewritten to use the native OS path separator and
    stripped of any trailing separator.
    """
    if on_linux:
        path = path_to_bytes(path)
    native = path.replace(POSIX_PATH_SEP, OS_PATH_SEP)
    native = native.replace(WIN_PATH_SEP, OS_PATH_SEP)
    return native.rstrip(OS_PATH_SEP)
예제 #43
0
 def __extract(self, test_path, extract_func=None, verbatim=False):
     """
     Extract the archive at `test_path`, a path relative to a test files
     directory, with `extract_func` into a fresh temporary directory and
     return that directory.
     If `verbatim` is True preserve the permissions.
     """
     assert test_path
     if on_linux:
         test_path = path_to_bytes(test_path)
     native_path = to_os_native_path(test_path)
     target_dir = self.get_temp_dir(os.path.basename(native_path))
     original_archive = self.get_test_loc(native_path)
     if on_linux:
         target_dir = path_to_bytes(target_dir)
         original_archive = path_to_bytes(original_archive)
     extract_func(original_archive, target_dir, verbatim=verbatim)
     return target_dir
예제 #44
0
    def get_temp_file(self, extension=None, dir_name='td', file_name='tf'):
        """
        Return a unique new temporary file location pointing to a
        non-existing temporary file that can safely be created without a
        risk of name collision.
        """
        extension = '.txt' if extension is None else extension

        if on_linux:
            extension, dir_name, file_name = (
                path_to_bytes(s) for s in (extension, dir_name, file_name))

        # make sure the extension is dotted
        if extension and not extension.startswith(DOT):
            extension = DOT + extension

        temp_dir = self.get_temp_dir(dir_name)
        return os.path.join(temp_dir, file_name + extension)
예제 #45
0
    def get_temp_file(self, extension=None, dir_name='td', file_name='tf'):
        """
        Return the location of a new unique temporary file. The file is
        not created; the returned location can safely be used without a
        risk of name collision.
        """
        if extension is None:
            extension = '.txt'

        if on_linux:
            # bytes paths are used throughout on Linux
            extension = path_to_bytes(extension)
            dir_name = path_to_bytes(dir_name)
            file_name = path_to_bytes(file_name)

        # ensure a dotted extension
        if extension and not extension.startswith(DOT):
            extension = DOT + extension

        return os.path.join(self.get_temp_dir(dir_name), file_name + extension)
예제 #46
0
def get_file_infos(location):
    """
    Return an ordered mapping of file information collected from the file
    or directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # paths are handled as bytes on Linux and unicode elsewhere
    location = path_to_bytes(location) if on_linux else path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    ctype = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        # directories have no extension
        base_name, extension = name, ''

    if on_linux:
        # decode path-derived values so results are always unicode
        name = path_to_unicode(name)
        base_name = path_to_unicode(base_name)
        extension = path_to_unicode(extension)
    infos['name'] = name
    infos['base_name'] = base_name
    infos['extension'] = extension

    # file-only and dir-only values default to None when not applicable
    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = ctype.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (ctype.mimetype_file or None) if is_file else None
    infos['file_type'] = (ctype.filetype_file or None) if is_file else None
    infos['programming_language'] = (ctype.programming_language or None) if is_file else None
    infos['is_binary'] = bool(is_file and ctype.is_binary)
    infos['is_text'] = bool(is_file and ctype.is_text)
    infos['is_archive'] = bool(is_file and ctype.is_archive)
    infos['is_media'] = bool(is_file and ctype.is_media)
    infos['is_source'] = bool(is_file and ctype.is_source)
    infos['is_script'] = bool(is_file and ctype.is_script)

    return infos
예제 #47
0
def extract_twice(location, target_dir, extractor1, extractor2):
    """
    Extract a nested compressed archive at `location` to `target_dir`:
    first run `extractor1` into a temporary directory, then run
    `extractor2` on each file of that intermediate payload.

    Return a list of warning messages. Raise exceptions on errors.

    Typical nested archives include compressed tarballs and RPMs
    (containing a compressed cpio).

    Note: deeper extractor chains would be easy to support but quickly
    become hard to trace and debug. A depth of two is simple and sane and
    covers most common cases.
    """
    if on_linux:
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))

    # step 1: extract the intermediate payload to a temp dir
    temp_target = unicode(fileutils.get_temp_dir('extract'))
    warnings = extractor1(abs_location, temp_target)
    if TRACE:
        logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())

    # step 2: extract every inner archive to the final target_dir
    try:
        inner_archives = list(fileutils.file_iter(temp_target))
        if not inner_archives:
            warnings.append(location + ': No files found in archive.')
        for extracted1_loc in inner_archives:
            if TRACE:
                logger.debug('extract_twice: extractor2: %(extracted1_loc)r' % locals())
            warnings.extend(extractor2(extracted1_loc, abs_target_dir))
    finally:
        # always clean up the temporary output of extractor1
        fileutils.delete(temp_target)
    return warnings
예제 #48
0
    def remove_vcs(self, test_dir):
        """
        Remove version control directories (CVS, .svn, .git, .hg) and
        editor backup files (names ending in ~) found under `test_dir`.
        """
        vcses = ('CVS', '.svn', '.git', '.hg')
        if on_linux:
            vcses = tuple(path_to_bytes(p) for p in vcses)
            test_dir = path_to_bytes(test_dir)

        for root, dirs, files in os.walk(test_dir):
            for vcs_dir in vcses:
                if vcs_dir not in dirs:
                    continue
                vcs_path = os.path.join(root, vcs_dir)
                # make everything under the VCS dir writable so rmtree does
                # not fail on read-only entries; the original code re-walked
                # the whole test_dir here, chmod-ing unrelated files
                for vcsroot, vcsdirs, vcsfiles in os.walk(vcs_path):
                    for vcsfile in vcsdirs + vcsfiles:
                        vfile = os.path.join(vcsroot, vcsfile)
                        fileutils.chmod(vfile, fileutils.RW, recurse=False)
                shutil.rmtree(vcs_path, False)

            # editors temp file leftovers: use an explicit loop rather than
            # map() so the removals also happen under Python 3's lazy map
            tilde = b'~' if on_linux else '~'
            for file_loc in files:
                if file_loc.endswith(tilde):
                    os.remove(os.path.join(root, file_loc))
예제 #49
0
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory created under `cache_dir`.
    """
    # create a unique, timestamp-prefixed temp directory in cache_dir
    fileutils.create_dir(cache_dir)
    prefix = timeutils.time2tstamp() + u'-'
    unique_dir = fileutils.get_temp_dir(cache_dir, prefix=prefix)
    if on_linux:
        unique_dir = path_to_bytes(unique_dir)
    ScanFileCache(unique_dir).setup()
    return partial(ScanFileCache, unique_dir)
예제 #50
0
def get_best_handler(location, kinds=all_kinds):
    """
    Return the best handler, or None, for the file at `location`.
    """
    if on_linux:
        location = path_to_bytes(location)
    location = os.path.abspath(os.path.expanduser(location))
    if not filetype.is_file(location):
        return
    matched = list(get_handlers(location))
    if not matched:
        return
    candidates = score_handlers(matched)
    return candidates and pick_best_handler(candidates, kinds)
예제 #51
0
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class bound to a unique storage
    directory created under `cache_dir`.
    """
    fileutils.create_dir(cache_dir)
    # a timestamp prefix keeps each cache directory unique
    storage_dir = fileutils.get_temp_dir(
        cache_dir, prefix=timeutils.time2tstamp() + u'-')
    if on_linux:
        storage_dir = path_to_bytes(storage_dir)
    sc = ScanFileCache(storage_dir)
    sc.setup()
    return partial(ScanFileCache, storage_dir)
예제 #52
0
    def remove_vcs(self, test_dir):
        """
        Remove version control directories (CVS, .svn, .git, .hg) and
        editor backup files (names ending in ~) found under `test_dir`.
        """
        vcses = ('CVS', '.svn', '.git', '.hg')
        if on_linux:
            vcses = tuple(path_to_bytes(p) for p in vcses)
            test_dir = path_to_bytes(test_dir)

        for root, dirs, files in os.walk(test_dir):
            for vcs_dir in vcses:
                if vcs_dir not in dirs:
                    continue
                vcs_path = os.path.join(root, vcs_dir)
                # chmod only the VCS dir being removed so rmtree does not
                # fail on read-only entries; the original code re-walked all
                # of test_dir for each match, touching unrelated files
                for vcsroot, vcsdirs, vcsfiles in os.walk(vcs_path):
                    for vcsfile in vcsdirs + vcsfiles:
                        vfile = os.path.join(vcsroot, vcsfile)
                        fileutils.chmod(vfile, fileutils.RW, recurse=False)
                shutil.rmtree(vcs_path, False)

            # editors temp file leftovers: an explicit loop (not map()) so
            # the removals actually run under Python 3's lazy map as well
            tilde = b'~' if on_linux else '~'
            for file_loc in files:
                if file_loc.endswith(tilde):
                    os.remove(os.path.join(root, file_loc))
예제 #53
0
def extract_zip(location, target_dir, *args, **kwargs):
    """
    Extract a zip archive file at `location` in the `target_dir`
    directory. Raise an Exception if `location` is not an existing,
    valid zip file.
    """
    # reject anything that is not an existing, valid zip. The original
    # check used `and` where `or` was intended, so it raised only for a
    # non-file that was somehow a zipfile and never actually validated.
    if not os.path.isfile(location) or not zipfile.is_zipfile(location):
        raise Exception('Incorrect zip file %(location)r' % locals())

    if on_linux:
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)

    with zipfile.ZipFile(location) as zipf:
        for info in zipf.infolist():
            name = info.filename
            content = zipf.read(name)
            target = os.path.join(target_dir, name)
            # create parent directories as needed
            if not os.path.exists(os.path.dirname(target)):
                os.makedirs(os.path.dirname(target))
            # an empty entry ending with a separator is a directory
            if not content and target.endswith(os.path.sep):
                if not os.path.exists(target):
                    os.makedirs(target)
            if not os.path.exists(target):
                with open(target, 'wb') as f:
                    f.write(content)
예제 #54
0
def extract_zip(location, target_dir, *args, **kwargs):
    """
    Extract a zip archive file at `location` in the `target_dir`
    directory. Raise an Exception if `location` is not an existing,
    valid zip file.
    """
    # the original condition used `and` where `or` semantics were meant,
    # so invalid inputs were silently accepted; validate properly here
    if not (os.path.isfile(location) and zipfile.is_zipfile(location)):
        raise Exception('Incorrect zip file %(location)r' % locals())

    if on_linux:
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)

    with zipfile.ZipFile(location) as zipf:
        for info in zipf.infolist():
            name = info.filename
            content = zipf.read(name)
            target = os.path.join(target_dir, name)
            # ensure the parent directory chain exists
            if not os.path.exists(os.path.dirname(target)):
                os.makedirs(os.path.dirname(target))
            # an empty, separator-terminated entry denotes a directory
            if not content and target.endswith(os.path.sep):
                if not os.path.exists(target):
                    os.makedirs(target)
            if not os.path.exists(target):
                with open(target, 'wb') as f:
                    f.write(content)
예제 #55
0
    def __init__(self, cache_dir):
        """
        Set up the cache sub-directory locations (infos, scans and the
        files log) under `cache_dir`.
        """
        # bytes names on Linux, unicode elsewhere, to match the platform
        # path conventions used everywhere in this module
        if on_linux:
            infos_dir = b'infos_dir/'
            scans_dir = b'scans_dir/'
            files_log = b'files_log'
            self.cache_base_dir = path_to_bytes(cache_dir)
        else:
            infos_dir = u'infos_dir/'
            scans_dir = u'scans_dir/'
            files_log = u'files_log'
            self.cache_base_dir = cache_dir

        base = self.cache_base_dir
        self.cache_infos_dir = as_posixpath(os.path.join(base, infos_dir))
        self.cache_scans_dir = as_posixpath(os.path.join(base, scans_dir))
        self.cache_files_log = as_posixpath(os.path.join(base, files_log))
예제 #56
0
def test_scan_does_not_fail_when_scanning_unicode_test_files_from_express():
    # On Windows, Python tar cannot extract these files. Other extractors
    # either fail or change the file name, making the test moot. Git
    # cannot check these files. So for now it makes no sense to test this
    # on Windows at all. Extractcode works fine, but does rename the
    # problematic files.
    extracted = test_env.extract_test_tar_raw(b'unicode_fixtures.tar.gz')
    test_dir = path_to_bytes(extracted)

    args = ['-n0', '--info', '--license', '--copyright', '--package',
            '--email', '--url', '--strip-root', test_dir]
    result = run_scan_click(args, catch_exceptions=False)
    if result.exit_code != 0:
        raise Exception(result.output, args)
    assert 'Scanning done' in result.output