def test_rmtree_basic(tmpdir, use_shutil):
    path1 = tmpdir.join('a/b/c/d')
    os.makedirs(os.path.dirname(str(path1)))
    path1.write('test')
    path2 = tmpdir.join('a/c/d')
    os.makedirs(os.path.dirname(str(path2)))
    path2.write('test')

    observed = set()
    for root, dir_names, file_names in os.walk(str(tmpdir)):
        for fname in file_names:
            fpath = os.path.join(root, fname)
            observed.add(fpath)
            ## set the permissions so that nobody can even read it,
            ## and it should still get deleted.
            os.chmod(fpath, 0000)
    assert len(observed) == 2

    logger.info(os.listdir(str(tmpdir)))
    rmtree(str(tmpdir.join('a')), use_shutil=use_shutil)
    logger.info(os.listdir(str(tmpdir)))
    assert len(os.listdir(str(tmpdir))) == 0
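# The rmtree tests in this file take a ``use_shutil`` fixture that is
# defined elsewhere and not shown here.  A minimal sketch of what such a
# fixture could look like, assuming rmtree() accepts use_shutil as a
# boolean keyword argument; the parametrization below is an assumption
# for illustration, not the project's actual conftest:
import pytest

@pytest.fixture(params=[True, False])
def use_shutil(request):
    ## run each rmtree test twice: once through the shutil-based code
    ## path and once through the custom fallback implementation
    return request.param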
def cleanup(self):
    '''shutdown all the stages, terminate the work_unit, remove tmp dir

    This is idempotent.  Pipeline users should call this explicitly
    when done with the pipeline, but this is also registered to be
    called at shutdown.

    '''
    if self._cleanup_done:
        return
    #from streamcorpus_pipeline._rmtree import get_open_fds
    #logger.critical(get_open_fds(verbose=True))
    if self.t_chunk:
        self.t_chunk.close()
    if self.work_unit:
        self.work_unit.terminate()
    for transform in self.batch_transforms:
        transform.shutdown()
    if not self.cleanup_tmp_files:
        logger.info('skipping cleanup due to config.cleanup_tmp_files=False')
    else:
        logger.debug('attempting rm -rf %s', self.tmp_dir_path)
        rmtree(self.tmp_dir_path)
        logger.info('finished rm -rf %s', self.tmp_dir_path)
    self._cleanup_done = True
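# The docstring above notes that cleanup() is "also registered to be
# called at shutdown."  A minimal sketch of one common way to do that
# registration with the standard library; the real hook used by
# streamcorpus_pipeline may differ, so treat this as an assumption:
import atexit

class _PipelineCleanupSketch(object):
    def __init__(self, tmp_dir_path):
        self.tmp_dir_path = tmp_dir_path
        self._cleanup_done = False
        ## run cleanup() at interpreter exit even if the caller forgets
        atexit.register(self.cleanup)

    def cleanup(self):
        ## idempotent, so an explicit call plus the atexit call is safe
        if self._cleanup_done:
            return
        self._cleanup_done = True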
def test_rmtree_followlinks_True(tmpdir):
    path_to_delete = make_tree_with_symlink(tmpdir)
    rmtree(path_to_delete, followlinks=True)

    observed = set()
    for root, dir_names, file_names in os.walk(str(tmpdir.join('c')),
                                               followlinks=True):
        for fname in dir_names + file_names:
            fpath = os.path.join(root, fname)
            observed.add(fpath)
    logger.info('\n'.join(sorted(observed)))
    assert len(observed) == 0
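# make_tree_with_symlink() is a helper defined elsewhere in the test
# module and is not shown here.  For illustration only, a plausible
# minimal version that would satisfy this test: it builds a real tree
# under tmpdir/c, points a symlink at it from the directory that gets
# deleted, and returns that directory.  The exact layout the real
# helper uses is an assumption.
def make_tree_with_symlink_sketch(tmpdir):
    target = tmpdir.join('c/d')
    os.makedirs(os.path.dirname(str(target)))
    target.write('test')
    to_delete = tmpdir.join('a')
    os.makedirs(str(to_delete))
    ## a symlink inside the tree being removed, pointing at tmpdir/c,
    ## so followlinks=True should also empty tmpdir/c
    os.symlink(str(tmpdir.join('c')), str(to_delete.join('link-to-c')))
    return str(to_delete)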
def test_rmtree_single_file(tmpdir, use_shutil):
    path1 = tmpdir.join('b/c')
    dirname = os.path.dirname(str(path1))
    os.makedirs(dirname)
    path1.write('test')
    os.chmod(str(path1), 0000)
    assert len(os.listdir(dirname)) == 1

    logger.info(os.listdir(dirname))
    rmtree(str(tmpdir.join('b/c')), use_shutil=use_shutil)
    logger.info(os.listdir(dirname))
    assert len(os.listdir(dirname)) == 0
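# Both tests above chmod paths to mode 0000 and still expect rmtree()
# to delete them.  A sketch of the usual standard-library technique for
# that: an onerror handler that restores permissions and retries.  This
# is illustrative only, not the project's actual
# streamcorpus_pipeline._rmtree implementation:
import os
import shutil
import stat

def _rmtree_force_sketch(path):
    '''Delete ``path`` whether it is a file or a directory tree.'''
    def _onerror(func, failed_path, exc_info):
        ## restore owner permissions on whatever could not be removed,
        ## then retry the failed operation once
        os.chmod(failed_path, stat.S_IRWXU)
        func(failed_path)

    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path, onerror=_onerror)
    else:
        os.remove(path)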
def cleanup(self): """shutdown all the stages, terminate the work_unit, remove tmp dir This is idempotent. Pipeline users should call this explicitly when done with the pipeline, but this is also registered to be called at shutdown. """ if self._cleanup_done: return # from streamcorpus_pipeline._rmtree import get_open_fds # logger.critical(get_open_fds(verbose=True)) if self.t_chunk: self.t_chunk.close() if self.work_unit: self.work_unit.terminate() for transform in self.batch_transforms: transform.shutdown() if not self.cleanup_tmp_files: logger.info("skipping cleanup due to config.cleanup_tmp_files=False") else: logger.debug("attempting rm -rf %s", self.tmp_dir_path) rmtree(self.tmp_dir_path) logger.info("finished rm -rf %s", self.tmp_dir_path) self._cleanup_done = True
def run(self, i_str, start_count=0, start_chunk_time=None):
    '''Run the pipeline.

    This runs all of the steps described in the pipeline constructor,
    reading from some input and writing to some output.

    :param str i_str: name of the input file, or other reader-specific
      description of where to get input
    :param int start_count: index of the first stream item
    :param int start_chunk_time: timestamp for the first stream item

    '''
    try:
        if not os.path.exists(self.tmp_dir_path):
            os.makedirs(self.tmp_dir_path)
        if start_chunk_time is None:
            start_chunk_time = time.time()

        ## the reader returns generators of StreamItems
        i_chunk = self.reader(i_str)

        ## t_path points to the currently in-progress temp chunk
        t_path = None

        ## loop over all docs in the chunk processing and cutting
        ## smaller chunks if needed
        len_clean_visible = 0
        sources = set()
        next_idx = 0

        ## how many have we input and actually done processing on?
        input_item_count = 0

        for si in i_chunk:
            # TODO: break out a _process_stream_item function?
            next_idx += 1

            ## yield to the gevent hub to allow other things to run
            if gevent:
                gevent.sleep(0)

            ## skip forward until we reach start_count
            if next_idx <= start_count:
                continue

            if next_idx % self.rate_log_interval == 0:
                ## indexing is zero-based, so next_idx corresponds
                ## to length of list of SIs processed so far
                elapsed = time.time() - start_chunk_time
                if elapsed > 0:
                    rate = float(next_idx) / elapsed
                    logger.info('%d in %.1f --> %.1f per sec on '
                                '(pre-partial_commit) %s',
                                next_idx - start_count, elapsed, rate, i_str)

            if not self.t_chunk:
                ## make a temporary chunk at a temporary path
                ## (lazy allocation after we've read an item that might
                ## get processed out to the new chunk file)
                # TODO: make this EVEN LAZIER by not opening the t_chunk
                # until inside _run_incremental_transforms when the first
                # output si is ready
                t_path = os.path.join(self.tmp_dir_path,
                                      't_chunk-%s' % uuid.uuid4().hex)
                self.t_chunk = streamcorpus.Chunk(path=t_path, mode='wb')
                assert self.t_chunk.message == streamcorpus.StreamItem_v0_3_0, \
                    self.t_chunk.message

            # TODO: a set of incremental transforms is equivalent
            # to a batch transform.  Make the pipeline explicitly
            # configurable as such:
            #
            #   batch_transforms: [[incr set 1], batch op, [incr set 2], ...]
            #
            # OR: for some list of transforms (mixed incremental
            # and batch) pipeline can detect and batchify as needed

            ## incremental transforms populate t_chunk
            ## let the incremental transforms destroy the si by
            ## returning None
            si = self._run_incremental_transforms(
                si, self.incremental_transforms)

            ## insist that every chunk has only one source string
            if si:
                sources.add(si.source)
                if self.assert_single_source and len(sources) != 1:
                    raise InvalidStreamItem(
                        'stream item %r had source %r, not %r '
                        '(set assert_single_source: false to suppress)'
                        % (si.stream_id, si.source, sources))

            if si and si.body and si.body.clean_visible:
                len_clean_visible += len(si.body.clean_visible)
                ## log binned clean_visible lengths, for quick stats estimates
                #logger.debug('len(si.body.clean_visible)=%d' % int(10 * int(math.floor(float(len(si.body.clean_visible)) / 2**10)/10)))
                #logger.debug('len(si.body.clean_visible)=%d' % len(si.body.clean_visible))

            if (self.output_chunk_max_count is not None and
                    len(self.t_chunk) == self.output_chunk_max_count):
                logger.info('reached output_chunk_max_count (%d) at: %d',
                            len(self.t_chunk), next_idx)
                self.t_chunk.close()
                self._intermediate_output_chunk(
                    start_count, next_idx, sources, i_str, t_path)
                start_count = next_idx

            elif (self.output_chunk_max_clean_visible_bytes is not None and
                    len_clean_visible >= self.output_chunk_max_clean_visible_bytes):
                logger.info(
                    'reached output_chunk_max_clean_visible_bytes '
                    '(%d) at: %d',
                    self.output_chunk_max_clean_visible_bytes,
                    len_clean_visible)
                len_clean_visible = 0
                self.t_chunk.close()
                self._intermediate_output_chunk(
                    start_count, next_idx, sources, i_str, t_path)
                start_count = next_idx

            input_item_count += 1
            if ((self.input_item_limit is not None) and
                    (input_item_count > self.input_item_limit)):
                break

        ## bool(t_chunk) is False if t_chunk has no data, but we still
        ## want to make sure it gets closed.
        if self.t_chunk is not None:
            self.t_chunk.close()
            o_paths = self._process_output_chunk(
                start_count, next_idx, sources, i_str, t_path)
            self.t_chunk = None
        else:
            o_paths = None

        ## set start_count and o_paths in work_unit and update it
        data = dict(start_count=next_idx, o_paths=o_paths)
        logger.debug('WorkUnit.update() data=%r', data)
        if self.work_unit is not None:
            self.work_unit.data.update(data)
            self.work_unit.update()

        ## return how many stream items we processed
        return next_idx

    finally:
        if self.t_chunk is not None:
            self.t_chunk.close()
        for transform in self.batch_transforms:
            transform.shutdown()
        if self.cleanup_tmp_files:
            rmtree(self.tmp_dir_path)
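# A hedged sketch of the calling pattern for the two Pipeline methods
# above.  The constructor arguments are configured elsewhere in
# streamcorpus_pipeline and are elided here; only the run()/cleanup()
# usage is drawn from this file:
#
#   pipeline = Pipeline(...)  # construction not shown in this section
#   try:
#       processed = pipeline.run('/path/to/input.sc')
#       logger.info('processed %d stream items', processed)
#   finally:
#       ## cleanup() is idempotent, so calling it explicitly here is
#       ## safe even though it is also registered to run at shutdown
#       pipeline.cleanup()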