Example #1
0
    def _execute_to_output(self):
        """Run the task through an intermediate ``<output>_wip`` path.

        Returns immediately if ``self.output`` already exists. Otherwise any
        leftovers matching ``<output>_wip*`` are removed, the program is
        executed with its output pointed at the intermediate path, and the
        result is checked with ``job_succeeded`` before ``hdfs.mv`` is called.

        Raises:
            AssertionError: if ``job_succeeded`` rejects the intermediate
                output. Raised explicitly (not via ``assert``) so the check
                still runs when Python is started with ``-O``.
        """
        # Skip task if output already exists
        if hdfs.path_exists(self.output):
            self.log('Output path already exists %s', self.output)
            return

        # Define an intermediate output dir (trailing slashes stripped so the
        # suffix is appended to the directory name itself)
        wip = '%s_wip' % self.output.rstrip('/')
        if hdfs.dus(wip + '*'):
            self.log('Removing intermediate outputs found under %s*', wip)
            hdfs.rmr(wip + '*')
            # Give hdfs a chance to remove the dir before the job recreates it
            sleep(3)

        # Compute dumbo args and execute dumbo program
        self.execargs = self._execargs(output=wip)
        PythonTask.execute(self)

        # Check dumbo program output before touching the final path. An
        # explicit raise is used because ``assert`` is stripped under -O,
        # which would let invalid output be moved unchecked.
        if not job_succeeded(wip):
            raise AssertionError(
                'Intermediate output is invalid, check %s' % wip)
        self.log('wip dir output is valid, moving to %s', self.output)
        # NOTE(review): arguments are passed as (final path, wip path);
        # confirm this matches the parameter order of the hdfs.mv helper.
        hdfs.mv(self.output, wip)