Example #1
0
        def wrapper(test_class_obj):
            """ inner """
            _first_run = True
            if test_class_obj.pipeline_type in modes:
                for filesystem in expect_filesystems:
                    if filesystem in test_class_obj.support_file_system:
                        if _skip_filesystem_test(filesystem):
                            continue
                        test_class_obj.root_path = test_class_obj.root_path_dict[
                            filesystem]
                        test_class_obj.running_on_filesystem = filesystem
                        logger.info(
                            "running case [%s.%s] root_path=[%s], filesystem=[%s]"
                            % (type(test_class_obj).__name__, fn.__name__,
                               test_class_obj.root_path,
                               test_class_obj.running_on_filesystem))

                        if not _first_run:
                            test_class_obj.tearDown()
                        test_class_obj.setUp()
                        fn(test_class_obj)
                        _first_run = False
                    else:
                        logger.warn('\033[01;31mWarning!!! %s not executed '
                                    'because filesystem [%s] is not supported.\033[00m'
                                    % (fn.__name__, filesystem))
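
The wrapper above is the inner function of a decorator factory that re-runs a test body once per supported filesystem. As a point of reference, here is a minimal, self-contained sketch of that pattern; the factory name run_on_filesystems, the _FakeCase class, and its attributes are hypothetical stand-ins used only to illustrate the control flow, not the real Bigflow test framework.

# A minimal sketch, not the actual framework: run_on_filesystems and _FakeCase
# are hypothetical names; only the re-run-per-filesystem control flow matches.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def run_on_filesystems(modes, expect_filesystems):
    """Return a decorator that re-runs a test method once per filesystem."""
    def decorator(fn):
        def wrapper(test_class_obj):
            first_run = True
            if test_class_obj.pipeline_type not in modes:
                return
            for filesystem in expect_filesystems:
                if filesystem not in test_class_obj.support_file_system:
                    logger.warning("%s not executed: filesystem %s is not supported",
                                   fn.__name__, filesystem)
                    continue
                test_class_obj.root_path = test_class_obj.root_path_dict[filesystem]
                test_class_obj.running_on_filesystem = filesystem
                if not first_run:
                    # reset the fixture before re-running on the next filesystem
                    test_class_obj.tearDown()
                    test_class_obj.setUp()
                fn(test_class_obj)
                first_run = False
        return wrapper
    return decorator


class _FakeCase(object):
    """Hypothetical stand-in for the real test base class."""
    pipeline_type = "local"
    support_file_system = ["local"]
    root_path_dict = {"local": "."}

    def setUp(self):
        pass

    def tearDown(self):
        pass


@run_on_filesystems(modes=["local"], expect_filesystems=["local", "hdfs"])
def test_something(case):
    logger.info("running with root_path=%s on filesystem=%s",
                case.root_path, case.running_on_filesystem)


if __name__ == "__main__":
    test_something(_FakeCase())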
Example #2
0
    def setUp(self):
        """ no comments """
        if self.shortDescription() is None:
            case_msg = self.id()
        else:
            case_msg = "%s - %s" % (self.shortDescription(), self.id())

        logger.info('setUp for case: %s' % case_msg)
        self._tmp_path = []

        self.pipeline_type = os.environ.get('PIPELINE_TYPE', 'local').lower()
        if self.pipeline_type == 'hadoop':
            self.pipeline_type = 'dagmr'

        if not hasattr(self, "running_on_filesystem"):
            if self.pipeline_type == "local":
                self.running_on_filesystem = "local"
            else:
                self.running_on_filesystem = "hdfs"

        if self.pipeline_type not in self._supported_pipeline_type():
            self.skipTest("pipeline type is not supported in this case")

        hdfs_root_path = os.environ.get('HDFS_ROOT_PATH', '').strip()

        self.support_file_system = ['local']
        if hdfs_root_path:
            self.support_file_system.append("hdfs")

        self.root_path_dict = {'local': '.', 'hdfs': hdfs_root_path}
        self.root_path = self.root_path_dict[self.running_on_filesystem]

        self.setConfig()

        self._conditions = []
Example #3
0
    def _upload_file(self, local_path):
        user_provided_config = self._hadoop_config
        hdfs_path = []
        tmp_data_path = self._job_config.tmp_data_path

        for path in glob.glob(local_path):

            if os.path.isdir(path):
                target = os.path.join(tmp_data_path, "local_input",
                                      str(uuid.uuid4()))
                logger.info("Uploading input directory [%s] to [%s]" %
                            (path, target))
            elif os.path.isfile(path):
                # keep the basename of the input file unchanged so its suffix is preserved.
                file_name = os.path.basename(path)
                target = os.path.join(tmp_data_path, "local_input",
                                      str(uuid.uuid4()), file_name)
                logger.info("Uploading input file [%s] to [%s]" %
                            (path, target))
            else:
                raise error.BigflowRuntimeException(
                    "file [%s] (matched by pattern [%s]) "
                    "is neither a dir nor a regular file" % (path, local_path))

            hdfs_path.append(target)
            self._client.fs_put(path, target, user_provided_config)
            self._remote_temp_files.append(target)

        return hdfs_path
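
The method above globs a local pattern, then dispatches on whether each match is a directory, a regular file, or neither before uploading it to a uuid-named target under tmp_data_path. A stripped-down, runnable sketch of that dispatch follows; _StubClient and upload_matching are invented names, and the stub only prints what the real HDFS client would transfer.

# Runnable sketch of the glob-then-dispatch upload pattern; _StubClient is a
# hypothetical stand-in for the HDFS client and only records the transfer.
import glob
import os
import uuid


class _StubClient(object):
    def fs_put(self, local, remote):
        print("would upload %s -> %s" % (local, remote))


def upload_matching(local_pattern, tmp_data_path, client):
    """Upload every path matched by local_pattern under tmp_data_path."""
    targets = []
    for path in glob.glob(local_pattern):
        if os.path.isdir(path):
            # directories get a fresh uuid-named target directory
            target = os.path.join(tmp_data_path, "local_input", str(uuid.uuid4()))
        elif os.path.isfile(path):
            # keep the file's basename so its suffix is preserved
            target = os.path.join(tmp_data_path, "local_input",
                                  str(uuid.uuid4()), os.path.basename(path))
        else:
            raise ValueError("file [%s] (matched by pattern [%s]) is neither "
                             "a dir nor a regular file" % (path, local_pattern))
        targets.append(target)
        client.fs_put(path, target)
    return targets


if __name__ == "__main__":
    print(upload_matching("./*.py", "/tmp/tmp_data", _StubClient()))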
Example #4
0
    def setUp(self):
        """ no comments """
        if self.shortDescription() is None:
            case_msg = self.id()
        else:
            case_msg = "%s - %s" % (self.shortDescription(), self.id())

        logger.info('setUp for case: %s' % case_msg)
        self._tmp_path = []

        self.pipeline_type = os.environ.get('PIPELINE_TYPE', 'local').lower()
        if self.pipeline_type == 'hadoop':
            self.pipeline_type = 'dagmr'

        if not hasattr(self, "running_on_filesystem"):
            if self.pipeline_type == "local":
                self.running_on_filesystem = "local"
            else:
                self.running_on_filesystem = "hdfs"

        if self.pipeline_type not in self._supported_pipeline_type():
            self.skipTest("pipeline type is not supported in this case")

        hdfs_root_path = os.environ.get('HDFS_ROOT_PATH', '').strip()

        self.support_file_system = ['local']
        if hdfs_root_path:
            self.support_file_system.append("hdfs")

        self.root_path_dict = {'local': '.', 'hdfs': hdfs_root_path}
        self.root_path = self.root_path_dict[self.running_on_filesystem]

        self.setConfig()

        self._conditions = []
Example #5
0
 def act(self, line):
     pos = 0
     for level in _SparkDriverParser.spark_log_levels:
         pos = line.find(level)
         if pos != -1:
             break
     self._con_lines_with_space.reset_mismatch_line()
     logger.info(line[pos+1:])
Example #6
0
 def act(self, line):
     pos = 0
     for level in _SparkDriverParser.spark_log_levels:
         pos = line.find(level)
         if pos != -1:
             break
     self._con_lines_with_space.reset_mismatch_line()
     logger.info(line[pos + 1:])
Example #7
0
 def accept(self, line):
     # line starts with at least `space_num` white spaces or starts with '\t'
     matched = len(line[:self.space_num].strip()) == 0 or \
               line.startswith(self.leading_w_space)
     if not matched:
         if not self.empty():  # buffered log should be flushed
             logger.info(self.msg())
             self.reset()
         self.pre_mismatched_line = line
     return matched
Example #8
0
 def accept(self, line):
     # line starts with at least `space_num` white spaces or starts with '\t'
     matched = len(line[:self.space_num].strip()) == 0 or \
               line.startswith(self.leading_w_space)
     if not matched:
         if not self.empty():  # buffered log should be flushed
             logger.info(self.msg())
             self.reset()
         self.pre_mismatched_line = line
     return matched
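
The accept method above implements a small continuation-line buffer: a line counts as part of the current multi-line message if its first space_num characters are whitespace or it starts with a tab, and the buffered message is flushed as soon as a non-matching line arrives. A self-contained sketch of that idea, with invented class and method names, is shown below.

# Minimal sketch of a continuation-line buffer; ContinuationBuffer is a
# hypothetical name and the flush just prints, rather than going to a logger.
class ContinuationBuffer(object):
    def __init__(self, space_num=4):
        self.space_num = space_num
        self.leading_w_space = "\t"
        self._lines = []
        self.pre_mismatched_line = None

    def empty(self):
        return not self._lines

    def msg(self):
        return "\n".join(self._lines)

    def reset(self):
        self._lines = []

    def accept(self, line):
        # matched: starts with at least `space_num` spaces or with a tab
        matched = (len(line[:self.space_num].strip()) == 0 or
                   line.startswith(self.leading_w_space))
        if matched:
            self._lines.append(line)
        else:
            if not self.empty():  # flush the buffered multi-line message
                print(self.msg())
                self.reset()
            self.pre_mismatched_line = line
        return matched


if __name__ == "__main__":
    buf = ContinuationBuffer()
    for raw in ['Traceback (most recent call last):',
                '  File "x.py", line 1, in <module>',
                '    raise ValueError("boom")',
                'INFO an unrelated, non-indented log line']:
        buf.accept(raw)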
Example #9
0
 def run_tests(self):
     """execute tests"""
     for _, value in self.tests.iteritems():
         lhs, rhs, expr, need_sorted = value
         logger.info("I am testing: %s" % str(expr))
         if isinstance(lhs, list):
             if need_sorted:
                 self.assertItemsEqual(lhs, rhs.get(), expr)
             else:
                 self.assertListEqual(lhs, rhs.get(), expr)
         elif isinstance(lhs, dict):
             self.assertDictEqual(lhs, rhs.get(), expr)
         else:
             self.assertEqual(lhs, rhs.get(), expr)
     # clear all the tests
     self.tests.clear()
Example #10
0
 def run_tests(self):
     """execute tests"""
     for _, value in self.tests.iteritems():
         lhs, rhs, expr, need_sorted = value
         logger.info("I am testing: %s" % str(expr))
         if isinstance(lhs, list):
             if need_sorted:
                 self.assertItemsEqual(lhs, rhs.get(), expr)
             else:
                 self.assertListEqual(lhs, rhs.get(), expr)
         elif isinstance(lhs, dict):
             self.assertDictEqual(lhs, rhs.get(), expr)
         else:
             self.assertEqual(lhs, rhs.get(), expr)
     # clear all the tests
     self.tests.clear()
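
run_tests above drains a registry of deferred checks, each stored as a (lhs, rhs, expr, need_sorted) tuple where rhs.get() yields the actual value only after the pipeline has run. A self-contained sketch of that register-then-verify split is below; DeferredChecksCase, _Lazy, and add_check are hypothetical names, and assertItemsEqual is approximated with a sorted comparison.

# Sketch of deferred test checks: register expected/lazy-actual pairs, then
# verify them all in one pass.  _Lazy stands in for a result object whose
# value is only available via .get() after a run.
import unittest


class _Lazy(object):
    def __init__(self, value):
        self._value = value

    def get(self):
        return self._value


class DeferredChecksCase(unittest.TestCase):
    def setUp(self):
        self.tests = {}

    def add_check(self, expected, lazy_actual, expr, need_sorted=False):
        self.tests[len(self.tests)] = (expected, lazy_actual, expr, need_sorted)

    def run_checks(self):
        for _, (lhs, rhs, expr, need_sorted) in sorted(self.tests.items()):
            if isinstance(lhs, list) and need_sorted:
                # order-insensitive comparison (approximates assertItemsEqual)
                self.assertEqual(sorted(lhs), sorted(rhs.get()), expr)
            else:
                self.assertEqual(lhs, rhs.get(), expr)
        self.tests.clear()

    def test_demo(self):
        self.add_check([1, 2, 3], _Lazy([3, 2, 1]), "list compare", need_sorted=True)
        self.add_check({"a": 1}, _Lazy({"a": 1}), "dict compare")
        self.run_checks()


if __name__ == "__main__":
    unittest.main()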
Example #11
0
 def _print_counters(self):
     # print counters after run
     c_dict = counter._get_all(grouped=True)
     if len(c_dict) > 0:
         logger.info("=========================================================")
         logger.info("all counters:")
         for group in sorted(c_dict.iterkeys()):
             logger.info("\t%s:" % group)
             for k, v in c_dict[group].iteritems():
                 logger.info("\t\t%s=%d" % (k, v))
Example #12
0
 def _after_run(self):
     super(SparkPipeline, self)._after_run()
     for local_uri_info in self._local_uri_infos:
         local_uri = local_uri_info['local_uri']
         hdfs_uri = local_uri_info['hdfs_uri']
         if local_uri_info['overwrite']:
             logger.info("Preparing local directory: %s" % local_uri)
             if not self._force_delete_file(local_uri):
                 raise error.BigflowHDFSException("Failed to remove target path: %s" % local_uri)
         else:
             if self._path_exists(local_uri):
                 raise error.BigflowHDFSException(
                     "Failed to output to target path: %s, target path already exists" % local_uri)
         os.makedirs(local_uri)
         self._client.fs_get(hdfs_uri + "/*", local_uri, self._hadoop_config)
     self._local_uri_infos = []
     if SparkPipeline.output_dir_conf_key in self._config["spark_conf"]:
         del self._config["spark_conf"][SparkPipeline.output_dir_conf_key]
Example #13
0
    def run(self):
        """
        Run the Pipeline immediately and wait for it to finish.

        Raises:
          BigflowRuntimeException: raised if an error occurs during the run.
        """
        self._before_run()
        try:
            commit_args = []
            for key, value in self._hadoop_config.iteritems():
                commit_args.extend(["-D", key + "=" + value])
            requests.launch(self._id, self._plan_message, self._resource_message, commit_args)
            logger.info("Job ran successfully")
        except Exception:
            self._handle_serialized_exception()
            raise
        self._after_run()
Example #14
0
 def _print_counters(self):
     # print counters after run
     c_dict = counter._get_all(grouped=True)
     if len(c_dict) > 0:
         logger.info(
             "=========================================================")
         logger.info("all counters:")
         for group in sorted(c_dict.iterkeys()):
             logger.info("\t%s:" % group)
             for k, v in c_dict[group].iteritems():
                 logger.info("\t\t%s=%d" % (k, v))
Example #15
0
    def run(self):
        """
        Run the Pipeline immediately and wait for it to finish.

        Raises:
          BigflowRuntimeException: raised if an error occurs during the run.
        """
        self._before_run()
        try:
            commit_args = []
            for key, value in self._hadoop_config.iteritems():
                commit_args.extend(["-D", key + "=" + value])
            requests.launch(self._id, self._plan_message,
                            self._resource_message, commit_args)
            logger.info("Job ran successfully")
        except Exception:
            self._handle_serialized_exception()
            raise
        self._after_run()
Example #16
0
    def end_serde_test(self):
        """ test """
        import sys
        from bigflow.core import entity
        logger.info(str(self._checking_condition))
        values = map(lambda condition: condition[1], self._checking_condition)
        # avoid creating more than 32 map nodes (a Hadoop limitation)
        p_values = self._pipeline.parallelize([values])
        p_value_list = []

        out = []
        for (i, (sd, value)) in enumerate(self._checking_condition):
            sd1 = serde.of(int)
            sd2 = sd

            cpp_deserialize_fn = entity.KVDeserializeFn(sd1, sd2)
            cpp_serialize_fn = entity.KVSerializeFn(sd1, sd2)

            python_deserialize_fn = lambda kv: (sd1.deserialize(kv[0]),
                                                sd2.deserialize(kv[1]))
            python_serialize_fn = lambda kv: (sd1.serialize(kv[0]),
                                              sd2.serialize(kv[1]))

            serialize_fns = [cpp_serialize_fn, python_serialize_fn]
            deserialize_fns = [cpp_deserialize_fn, python_deserialize_fn]

            kv_val = (1, value)

            def _assert_eq_val(v):
                assert v == kv_val

            for serialize_fn in serialize_fns:
                for deserialize_fn in deserialize_fns:
                    out.append(
                        p_values.map(lambda x: (1, x[i])).map(serialize_fn).
                        map(deserialize_fn).map(_assert_eq_val))
        if out:
            transforms.union(*out).cache()
        else:
            print >> sys.stderr, "SKIP a test!!!"
        self._pipeline.run()
Example #17
0
 def _after_run(self):
     super(SparkPipeline, self)._after_run()
     for local_uri_info in self._local_uri_infos:
         local_uri = local_uri_info['local_uri']
         hdfs_uri = local_uri_info['hdfs_uri']
         if local_uri_info['overwrite']:
             logger.info("Preparing local directory: %s" % local_uri)
             if not self._force_delete_file(local_uri):
                 raise error.BigflowHDFSException(
                     "Failed to remove target path: %s" % local_uri)
         else:
             if self._path_exists(local_uri):
                 raise error.BigflowHDFSException(
                     "Failed to output to target path: %s, target path already exists"
                     % local_uri)
         os.makedirs(local_uri)
         self._client.fs_get(hdfs_uri + "/*", local_uri,
                             self._hadoop_config)
     self._local_uri_infos = []
     if SparkPipeline.output_dir_conf_key in self._config["spark_conf"]:
         del self._config["spark_conf"][SparkPipeline.output_dir_conf_key]
Example #18
0
 def _handle_new_writtens(self):
     if len(self._uri_to_write) > 0:
         logger.info("=========================================================")
         logger.info("all outputs:")
         for uri in self._uri_to_write:
             logger.info("\t%s" % uri)
         self._uri_to_write[:] = []
Example #19
0
 def _handle_new_writtens(self):
     if len(self._uri_to_write) > 0:
         logger.info(
             "=========================================================")
         logger.info("all outputs:")
         for uri in self._uri_to_write:
             logger.info("\t%s" % uri)
         self._uri_to_write[:] = []
Example #20
0
    def end_serde_test(self):
        """ test """
        import sys
        from bigflow.core import entity
        logger.info(str(self._checking_condition))
        values = map(lambda condition: condition[1], self._checking_condition)
        p_values = self._pipeline.parallelize([values])  # avoid creating more than 32 map nodes (a Hadoop limitation)
        p_value_list = []


        out = []
        for (i, (sd, value)) in enumerate(self._checking_condition):
            sd1 = serde.of(int)
            sd2 = sd

            cpp_deserialize_fn = entity.KVDeserializeFn(sd1, sd2)
            cpp_serialize_fn = entity.KVSerializeFn(sd1, sd2)

            python_deserialize_fn = lambda kv: (sd1.deserialize(kv[0]), sd2.deserialize(kv[1]))
            python_serialize_fn = lambda kv: (sd1.serialize(kv[0]), sd2.serialize(kv[1]))

            serialize_fns = [cpp_serialize_fn, python_serialize_fn]
            deserialize_fns = [cpp_deserialize_fn, python_deserialize_fn]

            kv_val = (1, value)
            def _assert_eq_val(v):
                assert v == kv_val
            for serialize_fn in serialize_fns:
                for deserialize_fn in deserialize_fns:
                    out.append(p_values.map(lambda x: (1, x[i]))
                            .map(serialize_fn)
                            .map(deserialize_fn)
                            .map(_assert_eq_val))
        if out:
            transforms.union(*out).cache()
        else:
            print >> sys.stderr, "SKIP a test!!!"
        self._pipeline.run()
Example #21
0
        def wrapper(test_class_obj):
            """ inner """
            _first_run = True
            if test_class_obj.pipeline_type in modes:
                for filesystem in expect_filesystems:
                    if filesystem in test_class_obj.support_file_system:
                        if _skip_filesystem_test(filesystem):
                            continue
                        test_class_obj.root_path = test_class_obj.root_path_dict[filesystem]
                        test_class_obj.running_on_filesystem = filesystem
                        logger.info("running case [%s.%s] root_path=[%s], filesystem=[%s]" %
                                    (type(test_class_obj).__name__, fn.__name__, test_class_obj.root_path,
                                     test_class_obj.running_on_filesystem))

                        if not _first_run:
                            test_class_obj.tearDown()
                        test_class_obj.setUp()
                        fn(test_class_obj)
                        _first_run = False
                    else:
                        logger.warn('\033[01;31mWarning!!! %s not executed '
                                    'because filesystem [%s] is not supported.\033[00m'
                                    % (fn.__name__, filesystem))
Example #22
0
    def _prepare_cache_archive(self):
        logger.info("Checking PreparedArchive for Spark Pipeline...")
        existed = self._client.fs_test(self.prepared_archive_path,
                                       self._hadoop_config)
        tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
        self._job_config.prepared_archive_path = self.prepared_archive_path
        self._job_config.tmp_data_path = tmp_path

        if self._config['reprepare_cache_archive'] or not existed:
            if self._config['reprepare_cache_archive']:
                if not existed:
                    logger.info("Bigflow PreparedArchive does not exist")
                else:
                    logger.info("Re-prepare Bigflow PreparedArchive")
                    self._client.fs_rmr(self.prepared_archive_path,
                                        self._hadoop_config)
            import subprocess

            bigflow_home = self._get_bigflow_python_home()
            local_cache_archive = "bigflow_python_%s.tar.gz" % (str(
                uuid.uuid4()))
            cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (
                local_cache_archive, bigflow_home)
            ret = subprocess.call(cmd, shell=True)
            if ret != 0:
                raise error.BigflowPlanningException(
                    "Cannot make PreparedArchive file")
            try:
                self._client.fs_put(local_cache_archive, tmp_path,
                                    self._hadoop_config)
                self._client.fs_mv(tmp_path, self.prepared_archive_path,
                                   self._hadoop_config)
            except error.BigflowHDFSException:
                # only need to delete archive path when exception occurs.
                self._remote_temp_files.append(tmp_path)
                if not self._client.fs_test(self.prepared_archive_path,
                                            self._hadoop_config):
                    msg = "Unable to upload Bigflow PreparedArchive, please " \
                          "make sure you have write permission to " \
                          "tmp_data_path['%s']" % self._config['tmp_data_path']
                    raise error.BigflowHDFSException(msg)
            finally:
                ret = subprocess.call("rm %s" % local_cache_archive,
                                      shell=True)
                self._client.fs_rmr(tmp_path, self._hadoop_config)
        else:
            logger.info("Bigflow PreparedArchive exists already")
Example #23
0
    def _upload_file(self, local_path):
        user_provided_config = self._hadoop_config
        hdfs_path = []
        tmp_data_path = self._job_config.tmp_data_path

        for path in glob.glob(local_path):

            if os.path.isdir(path):
                target = os.path.join(tmp_data_path, "local_input", str(uuid.uuid4()))
                logger.info("Uploading input directory [%s] to [%s]" % (path, target))
            elif os.path.isfile(path):
                # keep the basename of the input file unchanged so its suffix is preserved.
                file_name = os.path.basename(path)
                target = os.path.join(tmp_data_path, "local_input", str(uuid.uuid4()), file_name)
                logger.info("Uploading input file [%s] to [%s]" % (path, target))
            else:
                raise error.BigflowRuntimeException("file [%s] (matched by pattern [%s]) "
                                                    "is neither a dir nor a regular file" % (path, local_path))

            hdfs_path.append(target)
            self._client.fs_put(path, target, user_provided_config)
            self._remote_temp_files.append(target)

        return hdfs_path
Example #24
0
    def _prepare_cache_archive(self):
        logger.info("Checking PreparedArchive for Spark Pipeline...")
        existed = self._client.fs_test(self.prepared_archive_path, self._hadoop_config)
        tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
        self._job_config.prepared_archive_path = self.prepared_archive_path
        self._job_config.tmp_data_path = tmp_path

        if self._config['reprepare_cache_archive'] or not existed:
            if self._config['reprepare_cache_archive']:
                if not existed:
                    logger.info("Bigflow PreparedArchive does not exist")
                else:
                    logger.info("Re-prepare Bigflow PreparedArchive")
                    self._client.fs_rmr(self.prepared_archive_path, self._hadoop_config)
            import subprocess

            bigflow_home = self._get_bigflow_python_home()
            local_cache_archive = "bigflow_python_%s.tar.gz" % (str(uuid.uuid4()))
            cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (local_cache_archive, bigflow_home)
            ret = subprocess.call(cmd, shell=True)
            if ret != 0:
                raise error.BigflowPlanningException("Cannot make PreparedArchive file")
            try:
                self._client.fs_put(
                        local_cache_archive, tmp_path, self._hadoop_config)
                self._client.fs_mv(
                        tmp_path, self.prepared_archive_path, self._hadoop_config)
            except error.BigflowHDFSException:
                # only need to delete archive path when exception occurs.
                self._remote_temp_files.append(tmp_path)
                if not self._client.fs_test(self.prepared_archive_path, self._hadoop_config):
                    msg = "Unable to upload Bigflow PreparedArchive, please " \
                          "make sure you have write permission to " \
                          "tmp_data_path['%s']" % self._config['tmp_data_path']
                    raise error.BigflowHDFSException(msg)
            finally:
                ret = subprocess.call("rm %s" % local_cache_archive, shell=True)
                self._client.fs_rmr(tmp_path, self._hadoop_config)
        else:
            logger.info("Bigflow PreparedArchive exists already")
Example #25
0
 def act(self, line):
     pos = line.find("split uri : ") + len("split uri : ")
     logger.info("Reading input: %s" % line[pos:])
Example #26
0
 def act(self, line):
     pos = line.find("split uri : ") + len("split uri : ")
     logger.info("Reading input: %s" % line[pos:])
Example #27
0
 def act(self, line):
     if isinstance(self.replace, list):
         for r in self.replace:
             logger.info(r)
     else:
         logger.info(self.replace)
Example #28
0
 def act(self, line):
     pos = line.find(_HadoopLogParser.hadoop_stderr_msg)
     logger.info(line[pos:])
Example #29
0
 def act(self, line):
     if isinstance(self.replace, list):
         for r in self.replace:
             logger.info(r)
     else:
         logger.info(self.replace)
Example #30
0
 def act(self, line):
     pos = line.find(_HadoopLogParser.hadoop_stderr_msg)
     logger.info(line[pos:])