Example #1
import logging
import os

## imports reconstructed for this excerpt; rmtree is assumed to live in
## streamcorpus_pipeline._rmtree, the module named in the commented-out
## debug lines inside cleanup() below
from streamcorpus_pipeline._rmtree import rmtree

logger = logging.getLogger(__name__)


def test_rmtree_basic(tmpdir, use_shutil):
    path1 = tmpdir.join('a/b/c/d')
    os.makedirs(os.path.dirname(str(path1)))
    path1.write('test')

    path2 = tmpdir.join('a/c/d')
    os.makedirs(os.path.dirname(str(path2)))
    path2.write('test')

    observed = set()
    for root, dir_names, file_names in os.walk(str(tmpdir)):
        for fname in file_names:
            fpath = os.path.join(root, fname)
            observed.add(fpath)

            ## set the permissions so that nobody can even read it,
            ## and it should still get deleted.
            os.chmod(fpath, 0o000)

    assert len(observed) == 2

    logger.info(os.listdir(str(tmpdir)))

    rmtree(str(tmpdir.join('a')), use_shutil=use_shutil)

    logger.info(os.listdir(str(tmpdir)))

    assert len(os.listdir(str(tmpdir))) == 0
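
The chmod to 0o000 above is the point of the test: rmtree must succeed even on entries the caller can no longer read. A minimal sketch of the technique, assuming Unix semantics (removing an entry needs write permission on the parent directory, not on the entry itself); the real rmtree in streamcorpus_pipeline._rmtree likely does more, including the use_shutil switch exercised here:

import os
import shutil
import stat

def forgiving_rmtree(path):
    ## make the whole tree traversable/writable before deleting, so the
    ## removal pass below cannot fail on permissions (sketch only)
    os.chmod(path, stat.S_IRWXU)
    for root, dir_names, file_names in os.walk(path):
        for d in dir_names:
            os.chmod(os.path.join(root, d), stat.S_IRWXU)
    shutil.rmtree(path)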
Example #3
    def cleanup(self):
        '''Shut down all the stages, terminate the work_unit, and remove the tmp dir.

        This is idempotent.  Pipeline users should call this explicitly
        when done with the pipeline, but this is also registered to
        be called at shutdown.
        '''
        if self._cleanup_done:
            return
        # from streamcorpus_pipeline._rmtree import get_open_fds
        # logger.critical(get_open_fds(verbose=True))
        if self.t_chunk:
            self.t_chunk.close()
        if self.work_unit:
            self.work_unit.terminate()
        for transform in self.batch_transforms:
            transform.shutdown()
        if not self.cleanup_tmp_files:
            logger.info(
                'skipping cleanup due to config.cleanup_tmp_files=False')
        else:
            logger.debug('attempting rm -rf %s', self.tmp_dir_path)
            rmtree(self.tmp_dir_path)
            logger.info('finished rm -rf %s', self.tmp_dir_path)
        self._cleanup_done = True
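
The docstring's "registered to be called at shutdown" suggests an atexit hook, and the _cleanup_done flag is what makes the double call safe. A sketch of that pattern with hypothetical names (the real Pipeline wires cleanup() to its stages, work_unit, and tmp dir):

import atexit

class CleanupDemo(object):
    def __init__(self):
        self._cleanup_done = False
        atexit.register(self.cleanup)   # also runs at interpreter exit

    def cleanup(self):
        if self._cleanup_done:          # idempotent: a second call no-ops
            return
        ## ... shut down stages, remove tmp files ...
        self._cleanup_done = True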
def test_rmtree_followlinks_True(tmpdir):
    path_to_delete = make_tree_with_symlink(tmpdir)
    rmtree(path_to_delete, followlinks=True)

    observed = set()
    for root, dir_names, file_names in os.walk(str(tmpdir.join('c')),
                                               followlinks=True):
        for fname in dir_names + file_names:
            fpath = os.path.join(root, fname)
            observed.add(fpath)

    logger.info('\n'.join(sorted(observed)))
    assert len(observed) == 0
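
make_tree_with_symlink is not shown in these excerpts. A plausible reconstruction so the test reads end to end; the actual helper may differ, but the shape is what matters: the files under c/ are reachable from the deleted path only through a symlink, so they disappear only if rmtree follows links:

def make_tree_with_symlink(tmpdir):
    ## hypothetical helper: real files live under c/, and the returned
    ## path reaches them only via a symlink
    target = tmpdir.join('c/d')
    os.makedirs(os.path.dirname(str(target)))
    target.write('test')
    root = tmpdir.mkdir('a')
    os.symlink(str(tmpdir.join('c')), str(root.join('link-to-c')))
    return str(root)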
Example #5
def test_rmtree_single_file(tmpdir, use_shutil):
    path1 = tmpdir.join('b/c')
    dirname = os.path.dirname(str(path1))
    os.makedirs(dirname)
    path1.write('test')

    os.chmod(str(path1), 0o000)

    assert len(os.listdir(dirname)) == 1

    logger.info(os.listdir(dirname))

    rmtree(str(tmpdir.join('b/c')), use_shutil=use_shutil)

    logger.info(os.listdir(dirname))

    assert len(os.listdir(dirname)) == 0
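
use_shutil is a pytest fixture that does not appear in these excerpts; presumably it parametrizes each test over both deletion back ends. A minimal sketch of such a fixture (assumed, not from the source):

import pytest

@pytest.fixture(params=[True, False], ids=['shutil', 'custom'])
def use_shutil(request):
    ## run each rmtree test once delegating to shutil.rmtree and once
    ## using the library's own deletion code
    return request.param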
Example #7
    def run(self, i_str, start_count=0, start_chunk_time=None):
        '''Run the pipeline.

        This runs all of the steps described in the pipeline constructor,
        reading from some input and writing to some output.

        :param str i_str: name of the input file, or other reader-specific
          description of where to get input
        :param int start_count: index of the first stream item
        :param int start_chunk_time: timestamp for the first stream item

        '''
        try:
            if not os.path.exists(self.tmp_dir_path):
                os.makedirs(self.tmp_dir_path)

            if start_chunk_time is None:
                start_chunk_time = time.time()

            ## the reader returns a generator of StreamItems
            i_chunk = self.reader(i_str)

            ## t_path points to the currently in-progress temp chunk
            t_path = None

            ## loop over all docs in the chunk processing and cutting
            ## smaller chunks if needed

            len_clean_visible = 0
            sources = set()
            next_idx = 0

            ## how many items have we read in and actually processed?
            input_item_count = 0

            for si in i_chunk:
                # TODO: break out a _process_stream_item function?
                next_idx += 1

                ## yield to the gevent hub to allow other things to run
                if gevent:
                    gevent.sleep(0)

                ## skip forward until we reach start_count
                if next_idx <= start_count:
                    continue

                if next_idx % self.rate_log_interval == 0:
                    ## next_idx has already been incremented, so it equals
                    ## the number of SIs read so far
                    elapsed = time.time() - start_chunk_time
                    if elapsed > 0:
                        rate = float(next_idx) / elapsed
                        logger.info('%d in %.1f --> %.1f per sec on '
                                    '(pre-partial_commit) %s',
                                    next_idx - start_count, elapsed, rate,
                                    i_str)

                if not self.t_chunk:
                    ## make a temporary chunk at a temporary path
                    # (lazy allocation: wait until we have read an item
                    # that might get written out to the new chunk file)
                    # TODO: make this EVEN LAZIER by not opening the t_chunk
                    # until inside _run_incremental_transforms, when the
                    # first output si is ready
                    t_path = os.path.join(self.tmp_dir_path,
                                          't_chunk-%s' % uuid.uuid4().hex)
                    self.t_chunk = streamcorpus.Chunk(path=t_path, mode='wb')
                    assert self.t_chunk.message == streamcorpus.StreamItem_v0_3_0, self.t_chunk.message

                # TODO: a set of incremental transforms is equivalent
                # to a batch transform.  Make the pipeline explicitly
                # configurable as such:
                #
                # batch_transforms: [[incr set 1], batch op, [incr set 2], ...]
                #
                # OR: for some list of transforms (mixed incremental
                # and batch) pipeline can detect and batchify as needed

                ## incremental transforms populate t_chunk
                ## let the incremental transforms destroy the si by
                ## returning None
                si = self._run_incremental_transforms(
                    si, self.incremental_transforms)

                ## insist that every chunk has only one source string
                if si:
                    sources.add(si.source)
                    if self.assert_single_source and len(sources) != 1:
                        raise InvalidStreamItem(
                            'stream item %r had source %r, not %r '
                            '(set assert_single_source: false to suppress)' %
                            (si.stream_id, si.source, sources))

                if si and si.body and si.body.clean_visible:
                    len_clean_visible += len(si.body.clean_visible)
                    ## log binned clean_visible lengths, for quick stats estimates
                    #logger.debug('len(si.body.clean_visible)=%d' % int(10 * int(math.floor(float(len(si.body.clean_visible)) / 2**10)/10)))
                    #logger.debug('len(si.body.clean_visible)=%d' % len(si.body.clean_visible))


                if (self.output_chunk_max_count is not None and
                    len(self.t_chunk) == self.output_chunk_max_count):
                    logger.info('reached output_chunk_max_count (%d) at: %d',
                                len(self.t_chunk), next_idx)
                    self.t_chunk.close()
                    self._intermediate_output_chunk(
                        start_count, next_idx, sources, i_str, t_path)
                    start_count = next_idx

                elif (self.output_chunk_max_clean_visible_bytes is not None and
                      len_clean_visible >=
                      self.output_chunk_max_clean_visible_bytes):
                    logger.info(
                        'reached output_chunk_max_clean_visible_bytes '
                        '(%d) at: %d',
                        self.output_chunk_max_clean_visible_bytes,
                        len_clean_visible)
                    len_clean_visible = 0
                    self.t_chunk.close()
                    self._intermediate_output_chunk(
                        start_count, next_idx, sources, i_str, t_path)
                    start_count = next_idx

                input_item_count += 1
                if ((self.input_item_limit is not None) and
                    (input_item_count > self.input_item_limit)):
                    break

            ## bool(t_chunk) is False if t_chunk has no data, but we still
            ## want to make sure it gets closed.
            if self.t_chunk is not None:
                self.t_chunk.close()
                o_paths = self._process_output_chunk(
                    start_count, next_idx, sources, i_str, t_path)
                self.t_chunk = None
            else:
                o_paths = None

            ## set start_count and o_paths in the work_unit and update it
            data = dict(start_count=next_idx, o_paths=o_paths)
            logger.debug('WorkUnit.update() data=%r', data)
            if self.work_unit is not None:
                self.work_unit.data.update(data)
                self.work_unit.update()

            ## return how many stream items we processed
            return next_idx

        finally:
            if self.t_chunk is not None:
                self.t_chunk.close()
            for transform in self.batch_transforms:
                transform.shutdown()
            if self.cleanup_tmp_files:
                rmtree(self.tmp_dir_path)
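
Underneath the bookkeeping, run() is a count-bounded chunk rotation: fill t_chunk, close and ship it whenever output_chunk_max_count (or the clean_visible byte budget) is hit, then flush the remainder. The same control flow reduced to a standalone sketch, not the library's API; list(rotate_chunks(range(5), 2)) yields [[0, 1], [2, 3], [4]]:

def rotate_chunks(items, max_count):
    ## yield batches of at most max_count items, mirroring how run()
    ## closes and ships t_chunk when the per-chunk limit is reached
    chunk = []
    for item in items:
        chunk.append(item)
        if len(chunk) == max_count:
            yield chunk
            chunk = []
    if chunk:                 # the final, possibly short, batch
        yield chunk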