Example #1
import os
import logging

import scandir  # third-party scandir package providing scandir.walk()

logger = logging.getLogger(__name__)  # module-level logger assumed by this snippet


def makePdbxPathList(fp, cachePath=".", skipFile=None):
    """Write the list of PDBx file paths found under cachePath to the file fp.

    Returns True on success and False on failure.
    """

    try:
        skipD = {}
        if skipFile and os.access(skipFile, os.R_OK):
            with open(skipFile, "r", encoding="utf-8") as ifh:
                for line in ifh:
                    idcode = line.strip().lower() + ".cif"
                    skipD[idcode] = idcode
            logger.info("Skip list length %d", len(skipD))
        #
        with open(fp, "w", encoding="utf-8") as ofh:
            for root, _, files in scandir.walk(cachePath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.endswith(".cif") and len(
                            name) == 8 and name not in skipD:
                        ofh.write("%s\n" % os.path.join(root, name))
            #
            # logger.info("\nFound %d files in %s\n" % (len(pathList), cachePath))
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))

    return False
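A minimal invocation sketch for the function above; the output file and cache directory names below are hypothetical placeholders, not paths from the original project:

if __name__ == "__main__":
    # Hypothetical output file and cache directory, for illustration only.
    ok = makePdbxPathList("PATHLIST.txt", cachePath="./mmcif-cache", skipFile=None)
    print("path list written:", ok)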
Example #2
    def enumerate_tasks(self):
        # Beware of an infinite loop when followlinks is True.
        for dirpath, dirnames, filenames in walk(self.directory, followlinks=False):
            for filename in filenames:
                filepath = join(dirpath, filename)
                root, ext = splitext(filepath)
                # Filtering by extension here is easier on the remaining-time estimator and memory.
                if ext in ('.mp3', '.aac'):
                    yield filepath
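The comment above warns about infinite loops when followlinks is enabled. As a minimal sketch of one common guard (the helper name is hypothetical and not part of the class above), directories can be pruned once their canonical path has already been visited:

import os

def walk_skipping_symlink_cycles(top):
    # Hypothetical helper: behaves like os.walk(top, followlinks=True), but prunes
    # directories whose canonical path has already been seen, so symlink cycles
    # are traversed at most once.
    seen = set()
    for dirpath, dirnames, filenames in os.walk(top, followlinks=True):
        real = os.path.realpath(dirpath)
        if real in seen:
            dirnames[:] = []  # do not descend into this directory again
            continue
        seen.add(real)
        yield dirpath, dirnames, filenames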
Example #3
import scandir


def list_files(dir, name, level=0, indentation=0, followlinks=True):
    str_ind = ' ' * indentation
    print(str_ind + 'list_files in "%s" with name "%s"' % (dir, name))
    r = []
    walk = scandir.walk(dir, followlinks=followlinks)
    print()
    for root, dirs, files in walk:
        if '/root' in root:
            continue
        if '/eps' in root:
            continue
        if len(files) > 0:
            for file in files:
                # Stop scanning this directory as soon as a .png file is found.
                if file.endswith('png'):
                    print('continue in', root)
                    break
                if name in file:
                    r.append(root + "/" + file)
    print(str_ind + 'found %d files' % len(r))
    return r
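A hypothetical call to list_files; the directory and search substring below are placeholders for illustration only:

matches = list_files("/data/plots", "trace", indentation=2)
for path in matches:
    print(path)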
Example #5
    def _process_work_queue(self):
        """
    Fill in docstring
    """
        self.log.log(9, "_process_work_queue invoked")
        start_time = time.time()
        temp_work = []
        try:
            work_item = self.work_queue.popleft()
        except IndexError:
            # deque.popleft() raises IndexError when no work items remain.
            end_time = time.time()
            self._set_state('idle')
            return False
        work_type = work_item.get('type', None)
        self.log.log(9, "Work type: %s" % work_type)
        self._set_state('processing')
        if work_type == 'dir':
            work_dir = work_item.get('path')
            self.log.log(9, "Processing directory: %s" % work_dir)
            handled = self.handle_directory_pre(work_dir)
            if handled:
                self.stats['filtered_dirs'] += 1
            else:
                temp_work = []
                for root, dirs, files in scandir.walk(work_dir):
                    # Filter subdirectories and files by calling the method in the derived class
                    before_filter_dirs = len(dirs)
                    before_filter_files = len(files)
                    dirs[:], files[:] = self.filter_subdirectories(work_dir, dirs, files)
                    after_filter_dirs = len(dirs)
                    after_filter_files = len(files)
                    if before_filter_dirs != after_filter_dirs:
                        self.stats['filtered_dirs'] += after_filter_dirs - before_filter_dirs
                    if before_filter_files != after_filter_files:
                        self.stats['filtered_files'] += after_filter_files - before_filter_files

                    # We queue up any new directories to the left in our work queue.
                    # This leaves the directories closer to the initial ones on the right
                    # side of the work queue.
                    for dir in reversed(dirs):
                        self.work_queue.appendleft({
                            'type': 'dir',
                            'path': os.path.join(root, dir)
                        })
                    self.stats['queued_dirs'] += len(dirs)
                    for file in files:
                        # Keep track of how long we have been processing this
                        # directory.  If the time exceeds LONG_PROCESSING_THRESHOLD,
                        # we will queue up the remaining files, push them back onto
                        # the work queue, and let the main processing loop have a
                        # chance to pick up new commands.
                        proc_time = time.time()
                        if proc_time - start_time > HydraUtils.LONG_PROCESSING_THRESHOLD:
                            temp_work.append(file)
                            continue
                        if (self.handle_file(work_dir, file)):
                            self.stats['processed_files'] += 1
                        else:
                            self.stats['skipped_files'] += 1
                    # Abort the tree walk here: the directory structure is handled
                    # one directory at a time.
                    dirs[:] = []
            if temp_work:
                self.work_queue.appendleft({
                    'type': 'partial_dir',
                    'path': work_dir,
                    'files': temp_work
                })
            else:
                self.stats['queued_dirs'] -= 1
                self.stats['processed_dirs'] += 1
                handled = self.handle_directory_post(work_dir)
        elif work_type == 'partial_dir':
            # For a partial directory, we need to continue processing all the files
            # remaining in the directory.
            work_dir = work_item.get('path')
            self.log.log(logging.DEBUG,
                         "Processing directory (continued): %s" % work_dir)
            for file in work_item.get('files'):
                # Keep track of how long we have been processing this directory.
                # If the time exceeds LONG_PROCESSING_THRESHOLD, we will queue up
                # the files and push them onto the processing queue and let the
                # main processing loop have a chance to pick up new commands
                proc_time = time.time()
                if proc_time - start_time > HydraUtils.LONG_PROCESSING_THRESHOLD:
                    temp_work.append(file)
                    continue
                if (self.handle_file(work_dir, file)):
                    self.stats['processed_files'] += 1
            # If temp_work is empty, we finished the remainder of the directory
            # so we will do the post directory processing
            # If not then we will re-queue the work and continue processing after
            # checking the command queue
            if temp_work:
                self.work_queue.appendleft({
                    'type': 'partial_dir',
                    'path': work_dir,
                    'files': temp_work
                })
            else:
                self.stats['queued_dirs'] -= 1
                self.stats['processed_dirs'] += 1
                handled = self.handle_directory_post(work_dir)
        #elif work_type == 'file':
        #  # TODO: NOT YET IMPLEMENTED
        #  self.handle_file(work_dir, file)
        #  self.stats['processed_files'] += 1
        else:
            self.log.error(
                "Unknown work type found in work queue. Queued work item: %r" % work_item)
        end_time = time.time()
        return True
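The comments in Example #5 describe two techniques: pushing newly discovered directories onto the left of a deque so traversal stays roughly depth-first, and re-queueing unfinished files as a 'partial_dir' item once a time budget is exceeded. The standalone sketch below illustrates both ideas under assumed names; TIME_BUDGET and process_file are placeholders, not part of the original HydraUtils API:

import os
import time
from collections import deque

TIME_BUDGET = 2.0  # seconds; a stand-in for HydraUtils.LONG_PROCESSING_THRESHOLD


def process_file(path):
    # Placeholder for real per-file work.
    return os.path.isfile(path)


def drain_one(work_queue):
    """Process one queued item; return False when the queue is empty."""
    try:
        item = work_queue.popleft()
    except IndexError:
        return False
    start = time.time()
    pending = []
    if item['type'] == 'dir':
        for root, dirs, files in os.walk(item['path']):
            # Newly found directories go to the left so deeper paths are handled first,
            # keeping directories closer to the starting point on the right of the queue.
            for d in reversed(dirs):
                work_queue.appendleft({'type': 'dir', 'path': os.path.join(root, d)})
            for name in files:
                if time.time() - start > TIME_BUDGET:
                    pending.append(name)  # over budget: defer the rest of this directory
                    continue
                process_file(os.path.join(root, name))
            dirs[:] = []  # handle exactly one directory per queued item
    elif item['type'] == 'partial_dir':
        for name in item['files']:
            if time.time() - start > TIME_BUDGET:
                pending.append(name)
                continue
            process_file(os.path.join(item['path'], name))
    if pending:
        work_queue.appendleft({'type': 'partial_dir', 'path': item['path'], 'files': pending})
    return True


if __name__ == "__main__":
    queue = deque([{'type': 'dir', 'path': '.'}])  # '.' is a placeholder start directory
    while drain_one(queue):
        pass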