def _check_dependencies(python_exec, modules_to_test):
    """Probe optional test dependencies (PyArrow, Pandas) for a Python executable.

    When 'pyspark-sql' is among the modules to test, checks whether PyArrow and
    Pandas are importable in `python_exec` and meet the minimum versions, and
    logs whether the related tests will run or be skipped. Never raises or
    exits — a missing/old package only downgrades coverage. See SPARK-23300.

    :param python_exec: path to the Python executable under test.
    :param modules_to_test: collection of module objects selected for testing.
    """
    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
    # explicitly prints out. See SPARK-23300.
    if pyspark_sql in modules_to_test:
        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
        minimum_pyarrow_version = '0.8.0'
        minimum_pandas_version = '0.19.2'

        try:
            # 'with' ensures the devnull handle is closed; the original leaked
            # one open file descriptor per probe.
            with open(os.devnull, 'w') as devnull:
                pyarrow_version = subprocess_check_output(
                    [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
                    universal_newlines=True, stderr=devnull).strip()
            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
                LOGGER.info("Will test PyArrow related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip PyArrow related features against Python executable "
                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
        # Was a bare 'except:', which would also swallow KeyboardInterrupt/SystemExit.
        except Exception:
            LOGGER.warning(
                "Will skip PyArrow related features against Python executable "
                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))

        try:
            with open(os.devnull, 'w') as devnull:
                pandas_version = subprocess_check_output(
                    [python_exec, "-c", "import pandas; print(pandas.__version__)"],
                    universal_newlines=True, stderr=devnull).strip()
            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
                LOGGER.info("Will test Pandas related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip Pandas related features against Python executable "
                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
        except Exception:
            LOGGER.warning(
                "Will skip Pandas related features against Python executable "
                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
def _check_dependencies(python_exec, modules_to_test):
    """Validate test dependencies for a Python executable before running tests.

    Exits with -1 if 'COVERAGE_PROCESS_START' is set but the `coverage` package
    is not installed in `python_exec`. When 'pyspark-sql' is among the modules
    to test, also probes PyArrow and Pandas versions and logs whether the
    related tests will run or be skipped (never fails on those). See SPARK-23300.

    :param python_exec: path to the Python executable under test.
    :param modules_to_test: collection of module objects selected for testing.
    """
    if "COVERAGE_PROCESS_START" in os.environ:
        # Make sure if coverage is installed.
        try:
            # 'with' ensures the devnull handle is closed; the original leaked it.
            with open(os.devnull, 'w') as devnull:
                subprocess_check_output(
                    [python_exec, "-c", "import coverage"], stderr=devnull)
        # Was a bare 'except:', which would also swallow KeyboardInterrupt/SystemExit.
        except Exception:
            print_red("Coverage is not installed in Python executable '%s' "
                      "but 'COVERAGE_PROCESS_START' environment variable is set, "
                      "exiting." % python_exec)
            sys.exit(-1)

    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
    # explicitly prints out. See SPARK-23300.
    if pyspark_sql in modules_to_test:
        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
        minimum_pyarrow_version = '0.8.0'
        minimum_pandas_version = '0.19.2'

        try:
            with open(os.devnull, 'w') as devnull:
                pyarrow_version = subprocess_check_output(
                    [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
                    universal_newlines=True, stderr=devnull).strip()
            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
                LOGGER.info("Will test PyArrow related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip PyArrow related features against Python executable "
                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
        except Exception:
            LOGGER.warning(
                "Will skip PyArrow related features against Python executable "
                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))

        try:
            with open(os.devnull, 'w') as devnull:
                pandas_version = subprocess_check_output(
                    [python_exec, "-c", "import pandas; print(pandas.__version__)"],
                    universal_newlines=True, stderr=devnull).strip()
            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
                LOGGER.info("Will test Pandas related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip Pandas related features against Python executable "
                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
        except Exception:
            LOGGER.warning(
                "Will skip Pandas related features against Python executable "
                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
def main():
    """Entry point: run the selected PySpark test goals in parallel.

    Parses CLI options, resolves the requested modules against `python_modules`,
    fans (python_exec, test_goal) pairs out to `opts.parallelism` worker
    threads via a FIFO queue, and exits non-zero on an unknown module name or
    on interrupt.
    """
    opts = parse_opts()
    if (opts.verbose):
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    # Start from a clean log file for this run.
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            # Unknown module names are fatal: fail fast before starting workers.
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s",
                [x.name for x in modules_to_test])

    # NOTE(review): 'Queue' is presumably the Py2 Queue module or an alias
    # ('import queue as Queue') — confirm against the file's imports.
    task_queue = Queue.Queue()
    for python_exec in python_execs:
        python_implementation = subprocess_check_output(
            [python_exec, "-c",
             "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"],
            stderr=subprocess.STDOUT, universal_newlines=True).strip())
        for module in modules_to_test:
            # Skip goals whose module blacklists this interpreter (e.g. PyPy).
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    task_queue.put((python_exec, test_goal))

    def process_queue(task_queue):
        # Worker loop: drain the queue until empty; task_done() is always
        # called so task_queue.join() below can terminate.
        while True:
            try:
                (python_exec, test_goal) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        # Daemon threads so a main-thread exit doesn't hang on workers.
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
def main():
    """Entry point: run the configured snappy Python test goals in parallel.

    Like the module-object based variant, but resolves names against
    `snappy_python_modules` / `python_test_goals` and uses a priority queue
    (all entries enqueued at priority 0, so ordering is effectively FIFO).
    """
    opts = parse_opts()
    if (opts.verbose):
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    # Start from a clean log file for this run.
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in snappy_python_modules:
            # This variant keeps plain module-name strings, not module objects.
            modules_to_test.append(module_name)
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(snappy_python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s", [x for x in modules_to_test])

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        python_implementation = subprocess_check_output(
            [python_exec, "-c",
             "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.info("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.info("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"],
            stderr=subprocess.STDOUT, universal_newlines=True).strip())
        for module in modules_to_test:
            # One test goal per module name; all share priority 0.
            test_goal = python_test_goals[module]
            task_queue.put((0, (python_exec, test_goal)))

    def process_queue(task_queue):
        # Worker loop: drain the queue until empty; task_done() is always
        # called so task_queue.join() below can terminate.
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        # Daemon threads so a main-thread exit doesn't hang on workers.
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
def _check_coverage(python_exec):
    """Exit with -1 unless the `coverage` package is importable in `python_exec`.

    Intended to be called only when 'COVERAGE_PROCESS_START' is set (see the
    error message); on failure it prints in red and terminates the process.
    """
    # Make sure if coverage is installed.
    try:
        # 'with' ensures the devnull handle is closed; the original leaked it.
        with open(os.devnull, 'w') as devnull:
            subprocess_check_output([python_exec, "-c", "import coverage"],
                                    stderr=devnull)
    # Was a bare 'except:', which would also swallow KeyboardInterrupt/SystemExit.
    except Exception:
        print_red("Coverage is not installed in Python executable '%s' "
                  "but 'COVERAGE_PROCESS_START' environment variable is set, "
                  "exiting." % python_exec)
        sys.exit(-1)
def _check_coverage(python_exec):
    """Exit with -1 unless the `coverage` package is importable in `python_exec`.

    Intended to be called only when 'COVERAGE_PROCESS_START' is set (see the
    error message); on failure it prints in red and terminates the process.
    """
    # Make sure if coverage is installed.
    try:
        # 'with' ensures the devnull handle is closed; the original leaked it.
        with open(os.devnull, 'w') as devnull:
            subprocess_check_output(
                [python_exec, "-c", "import coverage"], stderr=devnull)
    # Was a bare 'except:', which would also swallow KeyboardInterrupt/SystemExit.
    except Exception:
        print_red("Coverage is not installed in Python executable '%s' "
                  "but 'COVERAGE_PROCESS_START' environment variable is set, "
                  "exiting." % python_exec)
        sys.exit(-1)
def main():
    """Entry point: run PySpark test goals in parallel with prioritization.

    Known-slow suites are enqueued at priority 0 so they start first; other
    goals run at priority 100. Also checks coverage availability per
    executable, pre-creates the shared 'target' directory, and prints a
    summary of skipped tests at the end.
    """
    opts = parse_opts()
    if (opts.verbose):
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    # Start from a clean log file for this run.
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            # Unknown module names are fatal: fail fast before starting workers.
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s",
                [x.name for x in modules_to_test])

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START'
        # environmental variable is set.
        if "COVERAGE_PROCESS_START" in os.environ:
            _check_coverage(python_exec)
        python_implementation = subprocess_check_output(
            [
                python_exec, "-c",
                "import platform; print(platform.python_implementation())"
            ],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug(
            "%s version is: %s", python_exec,
            subprocess_check_output([python_exec, "--version"],
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True).strip())
        for module in modules_to_test:
            # Skip goals whose module blacklists this interpreter (e.g. PyPy).
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    # Long-running suites go first (priority 0) so the overall
                    # wall-clock time of the parallel run is minimized.
                    if test_goal in ('pyspark.streaming.tests', 'pyspark.mllib.tests',
                                     'pyspark.tests', 'pyspark.sql.tests'):
                        priority = 0
                    else:
                        priority = 100
                    task_queue.put((priority, (python_exec, test_goal)))

    # Create the target directory before starting tasks to avoid races.
    target_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'target'))
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    def process_queue(task_queue):
        # Worker loop: drain the queue until empty; task_done() is always
        # called so task_queue.join() below can terminate.
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(target_dir, test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue, ))
        # Daemon threads so a main-thread exit doesn't hang on workers.
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)

    # Summarize tests recorded as skipped, grouped per (executable, test) pair.
    for key, lines in sorted(SKIPPED_TESTS.items()):
        pyspark_python, test_name = key
        LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python))
        for line in lines:
            LOGGER.info("    %s" % line.rstrip())
def main():
    """Entry point: run PySpark tests by module or by explicit test names.

    If --testnames is given, runs exactly those tests (all at priority 0);
    otherwise resolves --modules against `python_modules` and prioritizes
    known-heavy suites (priority 0) ahead of the rest (priority 100).
    Checks coverage availability per executable, pre-creates the shared
    'target' directory, and prints a skipped-test summary at the end.
    """
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    # Explicit test names take precedence over module selection.
    should_test_modules = opts.testnames is None
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    # Start from a clean log file for this run.
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    LOGGER.info("Will test against the following Python executables: %s", python_execs)

    if should_test_modules:
        modules_to_test = []
        for module_name in opts.modules.split(','):
            if module_name in python_modules:
                modules_to_test.append(python_modules[module_name])
            else:
                # Unknown module names are fatal: fail fast before starting workers.
                print("Error: unrecognized module '%s'. Supported modules: %s" %
                      (module_name, ", ".join(python_modules)))
                sys.exit(-1)
        LOGGER.info("Will test the following Python modules: %s",
                    [x.name for x in modules_to_test])
    else:
        testnames_to_test = opts.testnames.split(',')
        LOGGER.info("Will test the following Python tests: %s", testnames_to_test)

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START'
        # environmental variable is set.
        if "COVERAGE_PROCESS_START" in os.environ:
            _check_coverage(python_exec)
        python_implementation = subprocess_check_output(
            [python_exec, "-c",
             "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"],
            stderr=subprocess.STDOUT, universal_newlines=True).strip())
        if should_test_modules:
            for module in modules_to_test:
                # Skip goals whose module blacklists this interpreter (e.g. PyPy).
                if python_implementation not in module.blacklisted_python_implementations:
                    for test_goal in module.python_test_goals:
                        # Heavy suites (prefix match) run first to minimize the
                        # overall wall-clock time of the parallel run.
                        heavy_tests = ['pyspark.streaming.tests', 'pyspark.mllib.tests',
                                       'pyspark.tests', 'pyspark.sql.tests',
                                       'pyspark.ml.tests']
                        if any(map(lambda prefix: test_goal.startswith(prefix),
                                   heavy_tests)):
                            priority = 0
                        else:
                            priority = 100
                        task_queue.put((priority, (python_exec, test_goal)))
        else:
            # Explicitly requested tests all run at top priority.
            for test_goal in testnames_to_test:
                task_queue.put((0, (python_exec, test_goal)))

    # Create the target directory before starting tasks to avoid races.
    target_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'target'))
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    def process_queue(task_queue):
        # Worker loop: drain the queue until empty; task_done() is always
        # called so task_queue.join() below can terminate.
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(target_dir, test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        # Daemon threads so a main-thread exit doesn't hang on workers.
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)

    # Summarize tests recorded as skipped, grouped per (executable, test) pair.
    for key, lines in sorted(SKIPPED_TESTS.items()):
        pyspark_python, test_name = key
        LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python))
        for line in lines:
            LOGGER.info("    %s" % line.rstrip())
# Top-level CI driver: choose the Python executables for this CircleCI shard,
# run the Python test suites against them, then run conda packaging tests for
# the exact interpreter versions used.
env = get_build_environment()
mtt = modules_to_test(env)

circleNodeIndex = os.getenv("CIRCLE_NODE_INDEX")
circleNodeTotal = os.getenv("CIRCLE_NODE_TOTAL")
if circleNodeTotal is not None:
    # Shard the executables evenly: node i gets slice
    # [i*length//total, (i+1)*length//total).
    length = len(all_python_executables)
    # BUGFIX: use floor division. Plain '/' yields a float on Python 3 and
    # list slicing with float indices raises TypeError; '//' behaves
    # identically for non-negative ints on Python 2 as well.
    fromExec = int(circleNodeIndex) * length // int(circleNodeTotal)
    toExec = (int(circleNodeIndex) + 1) * length // int(circleNodeTotal)
    python_executables_for_run = all_python_executables[fromExec:toExec]
else:
    # Not running under CircleCI sharding: test every executable.
    python_executables_for_run = all_python_executables
LOGGER.info("Testing following python executables in this run: %s",
            python_executables_for_run)

modules_with_python_tests = [m for m in mtt.test_modules if m.python_test_goals]
if modules_with_python_tests:
    run_python_tests(modules_with_python_tests, 8, python_executables_for_run)
# Packaging tests create a conda environment for each python version
# We'd like to use the same version that our executables above use
python_exact_versions = [
    subprocess_check_output(
        [python_exec, "-c", "import platform; print(platform.python_version())"],
        universal_newlines=True).strip()
    for python_exec in python_executables_for_run]
LOGGER.info("Running python packaging tests for following python versions using conda: %s",
            python_exact_versions)
run_python_packaging_tests(use_conda=True, python_versions=python_exact_versions)