Example 1
    def _after_run(self):
        super(SparkPipeline, self)._after_run()
        # Copy job output from HDFS back to every requested local directory.
        for local_uri_info in self._local_uri_infos:
            local_uri = local_uri_info['local_uri']
            hdfs_uri = local_uri_info['hdfs_uri']
            if local_uri_info['overwrite']:
                logger.info("Preparing local directory: %s" % local_uri)
                if not self._force_delete_file(local_uri):
                    raise error.BigflowHDFSException(
                        "Failed to remove target path: %s" % local_uri)
            else:
                if self._path_exists(local_uri):
                    raise error.BigflowHDFSException(
                        "Failed to output target path: %s, target path already exists"
                        % local_uri)
            os.makedirs(local_uri)
            self._client.fs_get(hdfs_uri + "/*", local_uri,
                                self._hadoop_config)
        self._local_uri_infos = []
        if SparkPipeline.output_dir_conf_key in self._config["spark_conf"]:
            del self._config["spark_conf"][SparkPipeline.output_dir_conf_key]
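For context, each entry consumed by the loop in _after_run is a plain dict. A minimal sketch of the expected shape, where the keys come from the code above and the values are purely hypothetical:

    # Hypothetical _local_uri_infos entry; the keys ('local_uri',
    # 'hdfs_uri', 'overwrite') are exactly the ones read by _after_run.
    local_uri_info = {
        'local_uri': '/tmp/job_output',       # illustrative local path
        'hdfs_uri': 'hdfs://ns/app/output',   # illustrative HDFS path
        'overwrite': True,                    # force-delete the local path first
    }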
Example 2
    def fs_mkdir(self, path, args=None):
        """
        Wraps console command 'hadoop fs -mkdir -p <path>'

        Args:
          path (str):  path to be created
        """
        if not self.fs_test(path, args):
            commit_args = ["fs"]
            commit_args.extend(self.__build_args(path, args))
            commit_args.extend(["-mkdir", "-p", path])
            if not self.__commit(commit_args):
                raise error.BigflowHDFSException("Error create HDFS path %s" %
                                                 path)
            return self.fs_test(path, args)
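A minimal usage sketch, assuming client is an instance of the wrapper class that defines fs_mkdir above and that the hadoop binary is on the PATH; the path itself is illustrative:

    # Hypothetical usage; idempotent thanks to the initial fs_test guard.
    try:
        client.fs_mkdir("/user/demo/output")  # runs 'hadoop fs -mkdir -p /user/demo/output'
    except error.BigflowHDFSException as e:
        logger.warning("mkdir failed: %s" % e)

Note that fs_mkdir only returns the result of the trailing fs_test when it actually had to create the path; when the path already exists it falls through and implicitly returns None, so callers should not rely on the return value.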
Example 3
    def fs_mv(self, source, target, args=None):
        """
        Wraps console command 'hadoop fs -mv <source> <target>'

        Args:
          source (str):  path of source
          target (str):  path of target
        """
        commit_args = ["fs"]
        commit_args.extend(self.__build_args(source, args))
        commit_args.extend(["-mv", source, target])

        if not self.__commit(commit_args):
            msg = "Error moving HDFS path ['%s'] to ['%s']" % (source, target)
            raise error.BigflowHDFSException(msg)
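Since an HDFS rename within one namespace is essentially a metadata operation, fs_mv is what lets Example 5 publish the prepared archive without readers ever seeing a half-uploaded file. A sketch of that pattern, assuming the same hypothetical client as above:

    # Hypothetical publish pattern (mirrors _prepare_cache_archive below):
    # upload to a unique temporary path, then rename into the final location.
    import uuid
    final_path = "/app/archive.tar.gz"              # illustrative target
    tmp_path = final_path + '-' + str(uuid.uuid4())
    client.fs_put("archive.tar.gz", tmp_path)
    client.fs_mv(tmp_path, final_path)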
Example 4
    def fs_rmr(self, path, args=None):
        """
        Wraps console command 'hadoop fs -rmr <path>'

        Args:
          path (str):  path to be removed
        """
        if self.fs_test(path, args):
            commit_args = ["fs"]
            commit_args.extend(self.__build_args(path, args))
            commit_args.extend(["-rmr", path])
            if not self.__commit(commit_args):
                raise error.BigflowHDFSException(
                    "Error removing HDFS path %s" % path)
            return not self.fs_test(path, args)
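A usage sketch with the same hypothetical client. fs_rmr is a no-op (implicitly returning None) when the path does not exist, and returns the negated result of a final fs_test when it did remove something:

    # Hypothetical usage; the path is illustrative.
    if client.fs_rmr("/user/demo/stale_output"):
        logger.info("stale output removed")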
Example 5
    def _prepare_cache_archive(self):
        logger.info("Checking PreparedArchive for Spark Pipeline...")
        existed = self._client.fs_test(self.prepared_archive_path,
                                       self._hadoop_config)
        # Upload to a unique temporary path first, then rename into place.
        tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
        self._job_config.prepared_archive_path = self.prepared_archive_path
        self._job_config.tmp_data_path = tmp_path

        if self._config['reprepare_cache_archive'] or not existed:
            if self._config['reprepare_cache_archive']:
                if not existed:
                    logger.info("Bigflow PreparedArchive does not exist")
                else:
                    logger.info("Re-preparing Bigflow PreparedArchive")
                    self._client.fs_rmr(self.prepared_archive_path,
                                        self._hadoop_config)
            import subprocess

            # Pack the Python runtime and flume binaries into a local tarball.
            bigflow_home = self._get_bigflow_python_home()
            local_cache_archive = "bigflow_python_%s.tar.gz" % (str(
                uuid.uuid4()))
            cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (
                local_cache_archive, bigflow_home)
            ret = subprocess.call(cmd, shell=True)
            if ret != 0:
                raise error.BigflowPlanningException(
                    "Cannot make PreparedArchive file")
            try:
                self._client.fs_put(local_cache_archive, tmp_path,
                                    self._hadoop_config)
                self._client.fs_mv(tmp_path, self.prepared_archive_path,
                                   self._hadoop_config)
            except error.BigflowHDFSException:
                # The archive path only needs to be deleted when an exception
                # occurs; raise only if the final archive is still missing.
                self._remote_temp_files.append(tmp_path)
                if not self._client.fs_test(self.prepared_archive_path,
                                            self._hadoop_config):
                    msg = "Unable to upload Bigflow PreparedArchive, please " \
                          "make sure you have write permission to " \
                          "tmp_data_path['%s']" % self._config['tmp_data_path']
                    raise error.BigflowHDFSException(msg)
            finally:
                # Clean up the local tarball and the temporary HDFS path.
                ret = subprocess.call("rm %s" % local_cache_archive,
                                      shell=True)
                self._client.fs_rmr(tmp_path, self._hadoop_config)
        else:
            logger.info("Bigflow PreparedArchive exists already")
Example 6
    def fs_put(self, source, target, args=None, need_mkdir=True):
        """
        Wraps console command 'hadoop fs -put <source> <target>'

        Args:
          source (str):  path of source
          target (str):  path of target
          need_mkdir (bool):  create the parent directory of target before
            uploading (defaults to True)
        """
        if need_mkdir:
            import os
            mk_path = os.path.dirname(target)
            self.fs_mkdir(mk_path)

        commit_args = ["fs"]
        commit_args.extend(self.__build_args(target, args))
        commit_args.extend(["-put", source, target])
        if not self.__commit(commit_args):
            msg = "Error uploading temp file from [%s] to [%s],"\
                  " please make sure source file exists on local filesystem" \
                  " and you have the write permission to the target hdfs directory," \
                  " or may be you can change your 'tmp_data_path'" % (source, target)
            raise error.BigflowHDFSException(msg)
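A final usage sketch, again assuming the hypothetical client instance from the earlier examples:

    # Hypothetical usage; fs_put first creates the parent directory of the
    # target (need_mkdir defaults to True), then runs 'hadoop fs -put'.
    client.fs_put("report.csv", "/user/demo/reports/report.csv")
    # Skip the extra mkdir round trip when the parent is known to exist:
    client.fs_put("report.csv", "/user/demo/reports/report.csv", need_mkdir=False)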