def check_files(test_dir, expected):
    """
    Walk test_dir.
    Check that all dirs are readable.
    Check that all files are:
     * non-special,
     * readable,
     * have a posix path that ends with one of the expected tuple paths.
    """
    result = []
    locs = []
    if filetype.is_file(test_dir):
        test_dir = fileutils.parent_directory(test_dir)

    test_dir_path = fileutils.as_posixpath(test_dir)
    for top, _, files in os.walk(test_dir):
        for f in files:
            location = os.path.join(top, f)
            locs.append(location)
            path = fileutils.as_posixpath(location)
            path = path.replace(test_dir_path, '').strip('/')
            result.append(path)

    assert sorted(expected) == sorted(result)

    for location in locs:
        assert filetype.is_file(location)
        assert not filetype.is_special(location)
        assert filetype.is_readable(location)
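A minimal usage sketch of this test helper, with hypothetical paths: the expected entries are POSIX-style paths relative to the walked directory.

# hypothetical: /tmp/test_data/mytest contains docs/readme.txt and setup.py
check_files('/tmp/test_data/mytest', ['docs/readme.txt', 'setup.py'])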
Example #2
def is_pom(location):
    """
    Return True if the file at location is highly likely to be a POM.
    """
    if (not filetype.is_file(location)
    or not location.endswith(('.pom', 'pom.xml', 'project.xml',))):
        if TRACE: logger.debug('is_pom: not a POM on name: {}'.format(location))
        return

    T = contenttype.get_type(location)
    if T.is_text:

        # check the POM version in the first 150 lines
        with codecs.open(location, encoding='utf-8') as pom:
            for n, line in enumerate(pom):
                if n > 150:
                    break
                if any(x in line for x in
                       ('http://maven.apache.org/POM/4.0.0',
                        'http://maven.apache.org/xsd/maven-4.0.0.xsd',
                        '<modelVersion>',
                        # somehow we can still parse version 3 poms too
                        '<pomVersion>',)
                       ):
                    return True

    if TRACE: logger.debug('is_pom: not a POM based on type: {}: {}'.format(T, location))
def recognize_packaged_archives(location):
    """
    Return a Package object if one was recognized for this `location`, or None otherwise.
    """
    if not filetype.is_file(location):
        return

    T = typecode.contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    for package in PACKAGE_TYPES:
        if package.packaging != models.Package.as_archive:
            continue

        # Note: default to True if there is nothing to match against

        if package.filetypes:
            type_matched = any(t in ftype for t in package.filetypes)
        else:
            type_matched = True
        if package.mimetypes:
            mime_matched = any(m in mtype for m in package.mimetypes)
        else:
            mime_matched = True
        if package.extensions:
            extension_matched = location.lower().endswith(package.extensions)
        else:
            extension_matched = True

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            return package(location=location)
    def __init__(self, location):
        if (not location
            or (not os.path.exists(location)
                and not filetype.is_broken_link(location))):
            raise IOError("[Errno 2] No such file or directory: "
                          "'%(location)r'" % locals())
        self.location = location
        # flags and values
        self.is_file = filetype.is_file(location)
        self.is_dir = filetype.is_dir(location)
        self.is_regular = filetype.is_regular(location)
        self.is_special = filetype.is_special(location)

        self.date = filetype.get_last_modified_date(location)

        self.is_link = filetype.is_link(location)
        self.is_broken_link = filetype.is_broken_link(location)

        # computed on demand
        self._size = None
        self._link_target = None

        self._mimetype_python = None
        self._filetype_file = None
        self._mimetype_file = None
        self._filetype_pygments = None
        self._is_pdf_with_text = None
        self._is_text = None
        self._is_binary = None
Example #5
    def __init__(self, location):
        if not location or (not os.path.exists(location) and not filetype.is_broken_link(location)):
            raise IOError("[Errno 2] No such file or directory: " "'%(location)r'" % locals())
        self.location = location
        # flags and values
        self.is_file = filetype.is_file(location)
        self.is_dir = filetype.is_dir(location)
        self.is_regular = filetype.is_regular(location)
        self.is_special = filetype.is_special(location)

        self.date = filetype.get_last_modified_date(location)

        self.is_link = filetype.is_link(location)
        self.is_broken_link = filetype.is_broken_link(location)

        # FIXME: the way the True and False values are checked in properties is verbose and contrived at best
        # and is due to use None/True/False as different values
        # computed on demand
        self._size = None
        self._link_target = None

        self._mimetype_python = None
        self._filetype_file = None
        self._mimetype_file = None
        self._filetype_pygments = None
        self._is_pdf_with_text = None
        self._is_text = None
        self._is_binary = None
    def get_test_loc(self, test_path, copy=False, debug=False):
        """
        Given a `test_path` relative to the self.test_data_dir directory, return the
        location to a test file or directory for this path. Copy to a temp
        test location if `copy` is True.
        """
        if debug:
            import inspect
            caller = inspect.stack()[1][3]
            print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

        test_loc = get_test_loc(test_path, self.test_data_dir, debug=debug)
        if copy:
            base_name = os.path.basename(test_loc)
            if filetype.is_file(test_loc):
                # target must be an existing dir
                target_dir = self.get_temp_dir()
                fileutils.copyfile(test_loc, target_dir)
                test_loc = os.path.join(target_dir, base_name)
            else:
                # target must be a NON existing dir
                target_dir = os.path.join(self.get_temp_dir(), base_name)
                fileutils.copytree(test_loc, target_dir)
                # cleanup of VCS that could be left over from checkouts
                self.remove_vcs(target_dir)
                test_loc = target_dir
        return test_loc
def walk(location, ignored=ignore_nothing):
    """
    Walk location returning the same tuples as os.walk but with a different
    behavior:
     - always walk top-down, breadth-first.
     - always ignore and never follow symlinks.
     - always ignore special files (FIFOs, etc.)
     - optionally ignore files and directories by invoking the `ignored`
       callable on files and directories returning True if it should be ignored.
     - location is a directory or a file: for a file, the file is returned.
    """
    if on_linux:
        location = path_to_bytes(location)

    # TODO: consider using the new "scandir" module for some speed-up.
    if TRACE:
        ign = ignored(location)
        logger_debug('walk: ignored:', location, ign)
    if ignored(location):
        return

    if filetype.is_file(location):
        yield parent_directory(location), [], [file_name(location)]

    elif filetype.is_dir(location):
        dirs = []
        files = []
        # TODO: consider using scandir
        for name in os.listdir(location):
            loc = os.path.join(location, name)
            if filetype.is_special(loc) or ignored(loc):
                if TRACE:
                    ign = ignored(loc)
                    logger_debug('walk: ignored:', loc, ign)
                continue
            # special files and symlinks are always ignored
            if filetype.is_dir(loc):
                dirs.append(name)
            elif filetype.is_file(loc):
                files.append(name)
        yield location, dirs, files

        for dr in dirs:
            for triple in walk(os.path.join(location, dr), ignored):
                yield triple
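A short usage sketch, assuming walk is imported from this module; the codebase path and the ignore callable are hypothetical.

import os

def ignore_dot_git(location):
    # hypothetical ignore callable: skip anything under a .git directory
    return '.git' in location.split(os.path.sep)

for top, dirs, files in walk('/some/codebase', ignored=ignore_dot_git):
    for name in files:
        print(os.path.join(top, name))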
Example #8
def is_metadata_json(location):
    """
    Return True if `location` path is for a Chef metadata.json file.
    The metadata.json is also used in Python installed packages in a 'dist-info'
    directory.
    """
    return (filetype.is_file(location)
            and fileutils.file_name(location).lower() == 'metadata.json'
            and not fileutils.file_name(fileutils.parent_directory(
                location)).lower().endswith('dist-info'))
Example #9
def walk(location, ignored=ignore_nothing):
    """
    Walk location returning the same tuples as os.walk but with a different
    behavior:
     - always walk top-down, breadth-first.
     - always ignore and never follow symlinks.
     - always ignore special files (FIFOs, etc.)
     - optionally ignore files and directories by invoking the `ignored`
       callable on files and directories returning True if it should be ignored.
     - location is a directory or a file: for a file, the file is returned.
    """
    # TODO: consider using the new "scandir" module for some speed-up.
    if DEBUG:
        ign = ignored(location)
        logger.debug('walk: ignored:', location, ign)
    if ignored(location):
        return

    if filetype.is_file(location):
        yield parent_directory(location), [], [file_name(location)]

    elif filetype.is_dir(location):
        dirs = []
        files = []
        # TODO: consider using scandir
        for name in os.listdir(location):
            loc = os.path.join(location, name)
            if filetype.is_special(loc) or ignored(loc):
                if DEBUG:
                    ign = ignored(loc)
                    logger.debug('walk: ignored:', loc, ign)
                continue
            # special files and symlinks are always ignored
            if filetype.is_dir(loc):
                dirs.append(name)
            elif filetype.is_file(loc):
                files.append(name)
        yield location, dirs, files

        for dr in dirs:
            for triple in walk(os.path.join(location, dr), ignored):
                yield triple
Example #10
def get_best_handler(location, kinds=all_kinds):
    """
    Return the best handler or None for the file at location.
    """
    location = os.path.abspath(os.path.expanduser(location))
    if not filetype.is_file(location):
        return
    handlers = list(get_handlers(location))
    if handlers:
        candidates = score_handlers(handlers)
        return candidates and pick_best_handler(candidates, kinds)
Example #11
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)

    if filetype.is_file(location):

        T = contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        if TRACE_DEEP:
            logger.debug(
                'get_handlers: processing %(location)s: ftype: %(ftype)s, mtype: %(mtype)s '
                % locals())
        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(
                t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(
                m in mtype for m in handler.mimetypes)
            exts = handler.extensions
            extension_matched = False  # ensure the name is bound even with no extensions
            if exts:
                if on_linux and py2:
                    exts = tuple(fileutils.fsencode(e) for e in exts)
                extension_matched = location.lower().endswith(exts)

            if TRACE_DEEP:
                logger.debug(
                    '  get_handlers: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s'
                    % locals())

            if handler.strict and not all(
                [type_matched, mime_matched, extension_matched]):
                logger.debug('  get_handlers: skip strict')
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    handler_name = handler.name
                    logger.debug(
                        '     get_handlers: yielding handler: %(handler_name)r'
                        % locals())
                yield handler, type_matched, mime_matched, extension_matched
Example #12
def recognize_package_data(location):
    """
    Return a list of Package objects recognized for this `location`, or an
    empty list if no packages were found. Raise Exceptions on errors.
    """

    if not filetype.is_file(location):
        return []

    return list(_parse(location))
Example #13
def parse_status_file(location, distro='debian'):
    """
    Yield Debian Package objects parsed from the dpkg `status` file at `location`.
    """
    if not os.path.exists(location):
        raise FileNotFoundError(
            '[Errno 2] No such file or directory: {}'.format(repr(location)))
    if not filetype.is_file(location):
        raise Exception(f'Location is not a file: {location}')
    for debian_pkg_data in debcon.get_paragraphs_data_from_file(location):
        yield build_package(debian_pkg_data, distro)
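A hedged usage sketch: iterate the packages parsed from an installed dpkg database. The path shown is the conventional Debian location, not something this function mandates.

for package in parse_status_file('/var/lib/dpkg/status'):
    print(package)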
Example #15
def copytree(src, dst):
    """
    Copy recursively the `src` directory to the `dst` directory. If `dst` is an
    existing directory, files in `dst` may be overwritten during the copy.
    Preserve timestamps.
    Ignores:
     - `src` permissions: `dst` files are created with the default permissions.
     - all special files such as FIFO or character devices and symlinks.

    Raise a shutil.Error with a list of reasons.

    This function is similar to and derived from the Python shutil.copytree
    function. See fileutils.py.ABOUT for details.
    """
    if on_linux and py2:
        src = fsencode(src)
        dst = fsencode(dst)

    if not filetype.is_readable(src):
        chmod(src, R, recurse=False)

    names = os.listdir(src)

    if not os.path.exists(dst):
        os.makedirs(dst)

    errors = []
    errors.extend(copytime(src, dst))

    for name in names:
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)

        # skip anything that is not a regular file, dir or link
        if not filetype.is_regular(srcname):
            continue

        if not filetype.is_readable(srcname):
            chmod(srcname, R, recurse=False)
        try:
            if os.path.isdir(srcname):
                copytree(srcname, dstname)
            elif filetype.is_file(srcname):
                copyfile(srcname, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except shutil.Error as err:
            errors.extend(err.args[0])
        except EnvironmentError as why:
            errors.append((srcname, dstname, str(why)))

    if errors:
        raise shutil.Error(errors)
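A usage sketch with hypothetical paths: per-file failures are collected and raised at the end as a single shutil.Error whose first argument is a list of (src, dst, reason) tuples.

import shutil

try:
    copytree('/tmp/src_tree', '/tmp/dst_tree')
except shutil.Error as e:
    for src_name, dst_name, reason in e.args[0]:
        print('copy failed:', src_name, '->', dst_name, reason)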
Example #16
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
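A minimal usage sketch with a hypothetical file path; the keys are the ones populated in the mapping above.

infos = get_file_infos('/tmp/sample.py')
print(infos['name'], infos['size'], infos['sha1'], infos['programming_language'])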
Example #17
def parse(location):
    """
    Return a Package object from a Cargo.toml/Cargo.lock file.
    """
    handlers = {
        'cargo.toml': build_cargo_toml_package,
        'cargo.lock': build_cargo_lock_package
    }
    filename = filetype.is_file(location) and fileutils.file_name(
        location).lower()
    handler = handlers.get(filename)
    if handler:
        return handler and handler(toml.load(location, _dict=OrderedDict))
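A hedged usage sketch with a hypothetical manifest path; any file that is not named Cargo.toml or Cargo.lock returns None because no handler matches.

package = parse('/projects/demo/Cargo.toml')
if package:
    print(package)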
def copytree(src, dst):
    """
    Copy recursively the `src` directory to the `dst` directory. If `dst` is an
    existing directory, files in `dst` may be overwritten during the copy.
    Preserve timestamps.
    Ignores:
     - `src` permissions: `dst` files are created with the default permissions.
     - all special files such as FIFO or character devices and symlinks.

    Raise a shutil.Error with a list of reasons.

    This function is similar to and derived from the Python shutil.copytree
    function. See fileutils.py.ABOUT for details.
    """
    if on_linux:
        src = path_to_bytes(src)
        dst = path_to_bytes(dst)

    if not filetype.is_readable(src):
        chmod(src, R, recurse=False)

    names = os.listdir(src)

    if not os.path.exists(dst):
        os.makedirs(dst)

    errors = []
    errors.extend(copytime(src, dst))

    for name in names:
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)

        # skip anything that is not a regular file, dir or link
        if not filetype.is_regular(srcname):
            continue

        if not filetype.is_readable(srcname):
            chmod(srcname, R, recurse=False)
        try:
            if os.path.isdir(srcname):
                copytree(srcname, dstname)
            elif filetype.is_file(srcname):
                copyfile(srcname, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except shutil.Error as err:
            errors.extend(err.args[0])
        except EnvironmentError as why:
            errors.append((srcname, dstname, str(why)))

    if errors:
        raise shutil.Error(errors)
Example #19
def get_extractors(location, kinds=all_kinds):
    """
    Return a list of extractors that can extract the file at
    location or an empty list.
    """
    location = os.path.abspath(os.path.expanduser(location))
    if filetype.is_file(location):
        handlers = list(get_handlers(location))
        if handlers:
            candidates = score_handlers(handlers)
            if candidates:
                best = pick_best_handler(candidates, kinds)
                if best:
                    return best.extractors
    return []
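A usage sketch with a hypothetical archive path: the returned list is empty when no extractor applies, so the result can be tested for truthiness directly.

extractors = get_extractors('/downloads/sample.tar.gz')
if extractors:
    print('can extract with:', extractors)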
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if filetype.is_file(location):
        T = typecode.contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(
                t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(
                m in mtype for m in handler.mimetypes)
            extension_matched = handler.extensions and location.lower().endswith(
                handler.extensions)

            if TRACE_DEEP:
                handler_name = handler.name
                logger.debug(
                    'get_handlers: considering %(handler_name)r  handler for %(location)s: ftype: %(ftype)s, mtype: %(mtype)s '
                    % locals())
                logger.debug(
                    'get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s'
                    % locals())

            if handler.strict and not all(
                [type_matched, mime_matched, extension_matched]):
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    logger.debug(
                        'get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s'
                        % locals())
                    logger.debug(
                        'get_handlers: %(location)s: handler: %(handler)r' %
                        locals())
                yield handler, type_matched, mime_matched, extension_matched
Example #21
def get_best_handler(location, kinds=all_kinds):
    """
    Return the best handler or None for the file at location.
    """
    if on_linux:
        location = fileutils.fsencode(location)
    location = os.path.abspath(os.path.expanduser(location))
    if not filetype.is_file(location):
        return
    handlers = list(get_handlers(location))
    if TRACE_DEEP:
        logger.debug('get_best_handler: handlers: %(handlers)r ' % locals())

    if handlers:
        candidates = score_handlers(handlers)
        return candidates and pick_best_handler(candidates, kinds)
Example #22
def recognize_package(location):
    """
    Return a Package object if one was recognized for this `location`, or None otherwise.
    """

    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file


    for package_type in PACKAGE_TYPES:
        # Note: default to True if there is nothing to match against
        metafiles = package_type.metafiles
        if on_linux:
            metafiles = (path_to_bytes(m) for m in metafiles)
        if location.endswith(tuple(metafiles)):
            logger_debug('metafile matching: package_type is of type:', package_type)
            return package_type.recognize(location)

        if package_type.filetypes:
            type_matched = any(t in ftype for t in package_type.filetypes)
        else:
            type_matched = False
        if package_type.mimetypes:
            mime_matched = any(m in mtype for m in package_type.mimetypes)
        else:
            mime_matched = False

        extensions = package_type.extensions
        if extensions:
            if on_linux:
                extensions = tuple(path_to_bytes(e) for e in extensions)
            extension_matched = location.lower().endswith(extensions)
        else:
            extension_matched = False

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            logger_debug('all matching: package is of type:', package_type)
            recognized = package_type.recognize(location)
            logger_debug('all matching: recognized as:', repr(recognized))
            return recognized

        logger_debug('no match: package is not of known type:', package_type)
Example #23
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    infos['name'] = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''
    infos['base_name'] = base_name
    infos['extension'] = extension
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
Example #24
def multi_checksums(location, checksum_names=('md5', 'sha1', 'sha256', 'sha512', 'sha1_git')):
    """
    Return a mapping of hexdigest checksums keyed by checksum name from the content
    of the file at `location`. Use the `checksum_names` list of checksum names.
    The mapping is guaranteed to contain all the requested names as keys.
    If the location is not a file, the values are None.
    """
    results = OrderedDict([(name, None) for name in checksum_names])
    if not filetype.is_file(location):
        return results

    # fixme: we should read in chunks?
    with open(location, 'rb') as f:
        hashable = f.read()

    for name in checksum_names:
        results[name] = _hashmodules_by_name[name](hashable).hexdigest()
    return results
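A minimal usage sketch with a hypothetical path: only the requested names appear as keys, and every value is None when the location is not a file.

sums = multi_checksums('/tmp/sample.bin', ('md5', 'sha1',))
print(sums['md5'], sums['sha1'])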
Example #26
def is_data(location, definitions=DATA_TYPE_DEFINITIONS):
    """
    Return True if the file at `location` is a data file.
    """
    if on_linux:
        location = fileutils.fsencode(location)

    if not filetype.is_file(location):
        return False

    T = get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file.lower()

    for ddef in definitions:
        type_matched = ddef.filetypes and any(t in ftype
                                              for t in ddef.filetypes)
        mime_matched = ddef.mimetypes and any(m in mtype
                                              for m in ddef.mimetypes)

        exts = ddef.extensions
        extension_matched = False  # ensure the name is bound even with no extensions
        if exts:
            if on_linux:
                exts = tuple(fileutils.fsencode(e) for e in exts)
            extension_matched = location.lower().endswith(exts)

        if TRACE:
            logger_debug(
                'is_data: considering def: %(ddef)r for %(location)s' %
                locals())
            logger_debug(
                'matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s'
                % locals())

        if ddef.strict and not all(
            [type_matched, mime_matched, extension_matched]):
            continue

        if type_matched or mime_matched or extension_matched:
            if TRACE:
                logger_debug('is_data: True: %(location)s: ' % locals())
            return True

    return False
def checksum(location, name, base64=False):
    """
    Return a checksum computed from the content of the file at `location`
    using the `name` hashing algorithm. The checksum is a hexdigest, or
    base64-encoded if `base64` is True.
    """
    if not filetype.is_file(location):
        return
    hasher = _hashmodules_by_name[name]

    # fixme: we should read in chunks?
    with open(location, 'rb') as f:
        hashable = f.read()

    hashed = hasher(hashable)
    if base64:
        return hashed.b64digest()

    return hashed.hexdigest()
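A usage sketch with a hypothetical path, assuming 'sha1' and 'md5' are registered in _hashmodules_by_name as the multi_checksums examples above suggest.

sha1_hex = checksum('/tmp/sample.bin', 'sha1')
md5_b64 = checksum('/tmp/sample.bin', 'md5', base64=True)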
Example #28
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if on_linux:
        location = path_to_bytes(location)

    if filetype.is_file(location):
        T = typecode.contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
            exts = handler.extensions
            extension_matched = False  # ensure the name is bound even with no extensions
            if exts:
                if on_linux:
                    exts = tuple(path_to_bytes(e) for e in exts)
                extension_matched = location.lower().endswith(exts)

            if TRACE_DEEP:
                handler_name = handler.name
                logger.debug('get_handlers: considering %(handler_name)r  handler for %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())
                logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())

            if handler.strict and not all([type_matched, mime_matched, extension_matched]):
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())
                    logger.debug('get_handlers: %(location)s: handler: %(handler)r' % locals())
                yield handler, type_matched, mime_matched, extension_matched
Example #30
    def recon(self, location):
        for f in os.listdir(location):
            loc = join(location, f)
            if not filetype.is_file(loc):
                continue
            # a pom is an xml doc
            pom_ver = pom_version(loc)
            if not pom_ver:
                continue

            if f == 'pom.xml':
                # first case: a maven pom.xml inside a META-INF directory
                # such as in META-INF/maven/log4j/log4j/pom.xml
                # the directory tree has a fixed depth
                # as is: META-INF/maven/groupid/artifactid/pom.xml
                # this will typically be inside a binary jar, so we should find
                # a typical structure above
                try:
                    gggp = dirname(dirname(dirname(dirname(loc))))
                    if fileutils.file_name(gggp) == 'META-INF':
                        # recon here: the root of the component is the parent of
                        # META-INF, return that, with a type and the POM
                        # metafile to parse.
                        pass
                except Exception:
                    pass

                # second case: a maven pom.xml at the root of component
                # development tree we should find a few extra clues in the
                # conventional directory structure below for now we take this as
                # being the component root. return that, with a type and the POM
                # metafile to parse.

                pass
            elif f.endswith('.pom'):
                # first case: a maven repo layout
                # the jars are side-by-side with the pom
                # check if there are side-by-side artifacts
                jar = loc.replace('.pom', '.jar')
                if os.path.exists(jar):
                    # return that, with a type and the POM metafile to parse.
                    pass
Example #31
def get_file_infos(location, as_list=True):
    """
    Return a list of dictionaries of informations collected from the file or
    directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from scancode import utils
    from typecode import contenttype

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    if as_list:
        return [infos]
    else:
        return infos
Example #32
    def recon(self, location):
        for f in os.listdir(location):
            loc = join(location, f)
            if not filetype.is_file(loc):
                continue
            # a pom is an xml doc
            if not is_pom(loc):
                continue

            if f == 'pom.xml':
                # first case: a maven pom.xml inside a META-INF directory
                # such as in META-INF/maven/log4j/log4j/pom.xml
                # the directory tree has a fixed depth
                # as is: META-INF/maven/groupid/artifactid/pom.xml
                # this will typically be inside a binary jar, so we should find
                # a typical structure above
                try:
                    gggp = dirname(dirname(dirname(dirname(loc))))
                    if fileutils.file_name(gggp) == 'META-INF':
                        # recon here: the root of the component is the parent of
                        # META-INF, return that, with a type and the POM
                        # metafile to parse.
                        pass
                except Exception:
                    pass

                # second case: a maven pom.xml at the root of component
                # development tree we should find a few extra clues in the
                # conventional directory structure below for now we take this as
                # being the component root. return that, with a type and the POM
                # metafile to parse.

                pass
            elif f.endswith('.pom'):
                # first case: a maven repo layout
                # the jars are side-by-side with the pom
                # check if there are side-by-side artifacts
                jar = loc.replace('.pom', '.jar')
                if os.path.exists(jar):
                    # return that, with a type and the POM metafile to parse.
                    pass
Example #33
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if filetype.is_file(location):
        T = typecode.contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception("Maximum level of archive nesting is two.")

            # default to False
            type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
            extension_matched = handler.extensions and location.lower().endswith(handler.extensions)

            if DEBUG_DEEP:
                logger.debug("get_handlers: %(location)s: ftype: %(ftype)s, mtype: %(mtype)s " % locals())
                logger.debug(
                    "get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s"
                    % locals()
                )

            if handler.strict and not all([type_matched, mime_matched, extension_matched]):
                continue

            if type_matched or mime_matched or extension_matched:
                if DEBUG_DEEP:
                    logger.debug(
                        "get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s"
                        % locals()
                    )
                    logger.debug("get_handlers: %(location)s: handler: %(handler)r" % locals())
                yield handler, type_matched, mime_matched, extension_matched
Example #34
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if filetype.is_file(location):
        T = typecode.contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            type_matched = None
            if handler.types:
                type_matched = any(t in ftype for t in handler.types)

            mime_matched = None
            if handler.mimes:
                mime_matched = any(m in mtype for m in handler.mimes)

            extension_matched = None
            if handler.exts:
                extension_matched = location.lower().endswith(handler.exts)

            if DEBUG_DEEP:
                logger.debug('get_handlers: %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())
                logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())


            if type_matched or mime_matched or extension_matched:
                if DEBUG_DEEP:
                    logger.debug('get_handlers: %(location)s: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())
                    logger.debug('get_handlers: %(location)s: handler: %(handler)r' % locals())
                yield handler, type_matched, mime_matched, extension_matched
    def get_test_loc(self, test_path, copy=False):
        """
        Given a `test_path` relative to the self.test_data_dir directory, return the
        location to a test file or directory for this path. Copy to a temp
        test location if `copy` is True.
        """
        test_loc = get_test_loc(test_path, self.test_data_dir)
        if copy:
            base_name = os.path.basename(test_loc)
            if filetype.is_file(test_loc):
                # target must be an existing dir
                target_dir = self.get_temp_dir()
                fileutils.copyfile(test_loc, target_dir)
                test_loc = os.path.join(target_dir, base_name)
            else:
                # target must be a NON existing dir
                target_dir = os.path.join(self.get_temp_dir(), base_name)
                fileutils.copytree(test_loc, target_dir)
                # cleanup of VCS that could be left over from checkouts
                self.remove_vcs(target_dir)
                test_loc = target_dir
        return test_loc
    def __init__(self, location):
        if (not location
            or (not os.path.exists(location)
                and not filetype.is_broken_link(location))):
            raise IOError("[Errno 2] No such file or directory: "
                          "'%(location)r'" % locals())
        self.location = location
        # flags and values
        self.is_file = filetype.is_file(location)
        self.is_dir = filetype.is_dir(location)
        self.is_regular = filetype.is_regular(location)
        self.is_special = filetype.is_special(location)

        self.date = filetype.get_last_modified_date(location)

        self.is_link = filetype.is_link(location)
        self.is_broken_link = filetype.is_broken_link(location)

        # FIXME: the way the True and False values are checked in properties is verbose and contrived at best
        # and is due to use None/True/False as different values
        # computed on demand
        self._size = None
        self._link_target = None

        self._mimetype_python = None
        self._filetype_file = None
        self._mimetype_file = None
        self._filetype_pygments = None
        self._is_pdf_with_text = None
        self._is_text = None
        self._is_text_with_long_lines = None
        self._is_compact_js = None
        self._is_js_map = None
        self._is_binary = None
        self._is_data = None
        self._is_archive = None
        self._contains_text = None
Example #38
    def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True):
        """
        Given a `test_path` relative to the self.test_data_dir directory, return the
        location to a test file or directory for this path. Copy to a temp
        test location if `copy` is True.

        Raise an IOError if `must_exist` is True and the `test_path` does not
        exist.
        """
        test_data_dir = self.test_data_dir
        if debug:
            import inspect
            caller = inspect.stack()[1][3]
            print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

        test_loc = get_test_loc(
            test_path,
            test_data_dir,
            debug=debug,
            must_exist=must_exist,
        )
        if copy:
            base_name = path.basename(test_loc)
            if filetype.is_file(test_loc):
                # target must be an existing dir
                target_dir = self.get_temp_dir()
                fileutils.copyfile(test_loc, target_dir)
                test_loc = path.join(target_dir, base_name)
            else:
                # target must be a NON existing dir
                target_dir = path.join(self.get_temp_dir(), base_name)
                fileutils.copytree(test_loc, target_dir)
                # cleanup of VCS that could be left over from checkouts
                self.remove_vcs(target_dir)
                test_loc = target_dir
        return test_loc
Example #39
def is_podspec_json(location):
    """
    Checks if the file is actually a podspec.json metadata file
    """
    return (filetype.is_file(location) and location.endswith('.podspec.json'))
Example #40
def is_podfile_lock(location):
    """
    Checks if the file is actually a podfile.lock file
    """
    return (filetype.is_file(location) and location.endswith(
        ('podfile.lock', 'Podfile.lock')))
Example #41
def is_podspec(location):
    """
    Checks if the file is actually a podspec file
    """
    return (filetype.is_file(location) and location.endswith('.podspec'))
Example #42
def file_endswith(location, endswith):
    """
    Check if the file at ``location`` ends with ``endswith`` string or tuple.
    """
    return filetype.is_file(location) and location.endswith(endswith)
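Like str.endswith, the endswith argument accepts a single string or a tuple of suffixes; a small hypothetical example:

is_podfile_lock = file_endswith('/repo/Podfile.lock', ('podfile.lock', 'Podfile.lock'))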
    @classmethod
    def is_manifest(cls, location):
        """
        Return True if the file at ``location`` is likely a manifest of this type.
        """
        return filetype.is_file(location) and location.endswith('Gemfile.lock')
Example #44
def is_about_file(location):
    return (filetype.is_file(location) and location.lower().endswith(
        ('.about', )))
Example #45
def is_package_json(location):
    return (filetype.is_file(location)
            and fileutils.file_name(location).lower() == 'package.json')
Example #46
def is_ignore_file(location):
    """
    Return True if the location is an ignore file.
    """
    return (filetype.is_file(location)
            and fileutils.file_name(location) == '.scancodeignore')
Example #47
    @classmethod
    def _is_build_manifest(cls, location):
        if not filetype.is_file(location):
            return False
        fn = fileutils.file_name(location)
        return any(fn == mf for mf in cls.metafiles)
def is_haxelib_json(location):
    return (filetype.is_file(location)
            and fileutils.file_name(location).lower() == 'haxelib.json')
def is_phpcomposer_json(location):
    return (filetype.is_file(location)
            and fileutils.file_name(location).lower() == 'composer.json')
    def test_get_file_count_with_single_file(self):
        test_file = self.get_temp_file()
        with open(test_file, 'wb') as f:
            f.write(b'')
        assert filetype.is_file(test_file)
        assert 1 == filetype.get_file_count(test_file)