Beispiel #1
0
def main():
    """Initialize logging and run the PostgreSQL build.

    The build is not started when get_bool_env_var('YB_SKIP_POSTGRES_BUILD') is true; in that
    case only an informational message is logged.
    """
    init_logging()
    if not get_bool_env_var('YB_SKIP_POSTGRES_BUILD'):
        PostgresBuilder().run()
        return
    logging.info("Skipping PostgreSQL build (YB_SKIP_POSTGRES_BUILD is set)")
Beispiel #2
0
    def run(self):
        """Parse the arguments and build postgres, unless the build is being skipped.

        When get_bool_env_var('YB_SKIP_POSTGRES_BUILD') is true, this only logs a message and
        returns without parsing arguments or building anything.
        """
        skip_build = get_bool_env_var('YB_SKIP_POSTGRES_BUILD')
        if not skip_build:
            self.parse_args()
            self.build_postgres()
            return

        logging.info("Skipping PostgreSQL build (YB_SKIP_POSTGRES_BUILD is set)")
Beispiel #3
0
    def run(self):
        """Run the postgres build: parse_args() followed by build_postgres().

        Both steps are skipped, and a message is logged instead, when
        get_bool_env_var('YB_SKIP_POSTGRES_BUILD') is true.
        """
        if get_bool_env_var('YB_SKIP_POSTGRES_BUILD'):
            skip_message = "Skipping PostgreSQL build (YB_SKIP_POSTGRES_BUILD is set)"
            logging.info(skip_message)
        else:
            self.parse_args()
            self.build_postgres()
Beispiel #4
0
    def configure_postgres(self):
        """Run ./configure for postgres, retrying once if config.cache is stale.

        The command line is assembled from self.pg_prefix, self.get_yb_version(), the OpenSSL
        include/library directories and self.build_type. If the first attempt fails and a line
        of its stderr matches REMOVE_CONFIG_CACHE_MSG_RE, config.cache is removed and configure
        is run a second time.

        Raises:
            RuntimeError: if the first attempt fails and no stderr line matches
                REMOVE_CONFIG_CACHE_MSG_RE.
        """
        if is_verbose_mode():
            logging.info("Running configure in the postgres build directory")

        # -Werror is deliberately not enabled while running configure, because that can affect
        # the resulting configuration.
        configure_args = ['./configure', '--prefix', self.pg_prefix]
        configure_args.append('--with-extra-version=-YB-' + self.get_yb_version())
        configure_args.extend([
            '--enable-depend',
            '--with-openssl',
            '--with-libedit-preferred',
            '--with-includes=' + self.openssl_include_dir,
            '--with-libraries=' + self.openssl_lib_dir,
            # Debug symbols are enabled for all types of builds.
            '--enable-debug',
        ])

        use_config_cache = not get_bool_env_var('YB_NO_PG_CONFIG_CACHE')
        if use_config_cache:
            configure_args.append('--config-cache')

        # Readline is disabled in ASAN/TSAN because of readline-related errors there.
        # TODO: do we still need this limitation?
        if self.build_type in ('asan', 'tsan'):
            configure_args.append('--without-readline')

        if self.build_type != 'release':
            configure_args.append('--enable-cassert')

        outcome = run_program(configure_args, error_ok=True)
        if outcome.failure():
            # any() stops at the first matching line, like the explicit search loop would.
            stale_cache = any(
                REMOVE_CONFIG_CACHE_MSG_RE.search(stderr_line.strip())
                for stderr_line in outcome.stderr.splitlines())
            if not stale_cache:
                logging.error("Standard error from configure:\n" + outcome.stderr)
                raise RuntimeError("configure failed")

            logging.info("Configure failed because of stale config.cache, re-running.")
            run_program('rm -f config.cache')
            # NOTE(review): the retry passes an argument list together with shell=True, unlike
            # the first attempt -- confirm run_program handles this combination as intended.
            outcome = run_program(
                configure_args,
                shell=True,
                stdout_stderr_prefix='configure',
                error_ok=True)

        if is_verbose_mode() and outcome.success():
            outcome.print_output_to_stdout()

        outcome.print_output_and_raise_error_if_failed()

        logging.info("Successfully ran configure in the postgres build directory")
    def configure_postgres(self):
        """Run ./configure for postgres and save its output to a file.

        Calls self.set_env_vars('configure') first, then runs configure with options derived
        from self.pg_prefix and self.build_type. If the first attempt fails and a line of its
        stderr matches REMOVE_CONFIG_CACHE_MSG_RE, config.cache is removed and configure is run
        again. The final result is passed to write_program_output_to_file() together with
        self.pg_build_root.

        Raises:
            RuntimeError: if the first attempt fails and no stderr line matches
                REMOVE_CONFIG_CACHE_MSG_RE.
        """
        if is_verbose_mode():
            logging.info("Running configure in the postgres build directory")

        # -Werror is deliberately not enabled while running configure, because that can affect
        # the resulting configuration.
        self.set_env_vars('configure')

        configure_args = ['./configure', '--prefix', self.pg_prefix]
        # Debug symbols are enabled for all types of builds.
        configure_args.extend(['--enable-depend', '--enable-debug'])

        cache_disabled = get_bool_env_var('YB_NO_PG_CONFIG_CACHE')
        if not cache_disabled:
            configure_args.append('--config-cache')

        # Readline is disabled in ASAN/TSAN because of readline-related errors there.
        if self.build_type in ('asan', 'tsan'):
            configure_args.append('--without-readline')

        if self.build_type != 'release':
            configure_args.append('--enable-cassert')

        outcome = run_program(configure_args, error_ok=True)
        if outcome.failure():
            # any() stops at the first matching line, like the explicit search loop would.
            stale_cache = any(
                REMOVE_CONFIG_CACHE_MSG_RE.search(stderr_line.strip())
                for stderr_line in outcome.stderr.splitlines())
            if not stale_cache:
                logging.error("Standard error from configure:\n" + outcome.stderr)
                raise RuntimeError("configure failed")

            logging.info("Configure failed because of stale config.cache, re-running.")
            run_program('rm -f config.cache')
            outcome = run_program(configure_args)

        if is_verbose_mode():
            outcome.print_output_to_stdout()
        write_program_output_to_file('configure', outcome, self.pg_build_root)

        logging.info("Successfully ran configure in the postgres build directory")
Beispiel #6
0
    def configure_postgres(self):
        """Configure the postgres build and record the output of the configure run.

        After self.set_env_vars('configure'), ./configure is invoked with a prefix of
        self.pg_prefix and options that depend on self.build_type. A failure whose stderr
        contains a line matching REMOVE_CONFIG_CACHE_MSG_RE leads to config.cache being removed
        and configure being run once more. The final result goes to
        write_program_output_to_file() along with self.pg_build_root.

        Raises:
            RuntimeError: if the first attempt fails and no stderr line matches
                REMOVE_CONFIG_CACHE_MSG_RE.
        """
        if is_verbose_mode():
            logging.info("Running configure in the postgres build directory")

        # -Werror is deliberately not enabled while running configure, because that can affect
        # the resulting configuration.
        self.set_env_vars('configure')

        cmd = ['./configure']
        cmd += ['--prefix', self.pg_prefix]
        cmd += ['--enable-depend']
        # Debug symbols are enabled for all types of builds.
        cmd += ['--enable-debug']

        if not get_bool_env_var('YB_NO_PG_CONFIG_CACHE'):
            cmd += ['--config-cache']

        # Readline is disabled in ASAN/TSAN because of readline-related errors there.
        sanitizer_build = self.build_type in ['asan', 'tsan']
        if sanitizer_build:
            cmd += ['--without-readline']

        if self.build_type != 'release':
            cmd += ['--enable-cassert']

        result = run_program(cmd, error_ok=True)
        if result.failure():
            for stderr_line in result.stderr.splitlines():
                if REMOVE_CONFIG_CACHE_MSG_RE.search(stderr_line.strip()):
                    logging.info("Configure failed because of stale config.cache, re-running.")
                    run_program('rm -f config.cache')
                    break
            else:
                # The loop finished without finding the stale-cache message: give up.
                logging.error("Standard error from configure:\n" + result.stderr)
                raise RuntimeError("configure failed")

            result = run_program(cmd)

        if is_verbose_mode():
            result.print_output_to_stdout()
        write_program_output_to_file('configure', result, self.pg_build_root)

        logging.info("Successfully ran configure in the postgres build directory")
def main() -> None:
    """Command-line entry point of the dependency graph tool.

    Steps performed:
      1. Parse and validate the command-line arguments.
      2. Build the dependency graph and save it as dependency_graph.json under --build-root,
         or load it from that file if it already exists and --rebuild-graph is not given.
      3. Run commands that need no initial target set (SELF_TEST, DEBUG_DUMP) and return.
      4. Determine the initial set of nodes from --git-diff / --git-commit or from the
         configured file regex.
      5. Execute the requested command (LINK_WHOLE_PROGRAM, AFFECTED, DEPS, REVERSE_DEPS).
      6. Either write a JSON test configuration to --output-test-config or print the results.

    Raises:
        RuntimeError: on incompatible or missing command-line options, or when the initial
            set of files cannot be determined.
        ValueError: if the command is not one of the implemented ones.
    """
    parser = argparse.ArgumentParser(
        description='A tool for working with the dependency graph')
    parser.add_argument('--verbose',
                        action='store_true',
                        help='Enable debug output')
    parser.add_argument(
        '-r',
        '--rebuild-graph',
        action='store_true',
        help='Rebuild the dependecy graph and save it to a file')
    parser.add_argument('--node-type',
                        help='Node type to look for',
                        type=NodeType,
                        choices=list(NodeType),
                        default=NodeType.ANY)
    parser.add_argument(
        '--file-regex',
        help='Regular expression for file names to select as initial nodes for '
        'querying the dependency graph.')
    parser.add_argument(
        '--file-name-glob',
        help='Like file-regex, but applies only to file name and uses the glob '
        'syntax instead of regex.')
    parser.add_argument(
        '--git-diff',
        help='Figure out the list of files to use as starting points in the '
        'dependency graph traversal by diffing the current state of the code '
        'against this commit. This could also be anything that could be '
        'passed to "git diff" as a single argument.')
    parser.add_argument(
        '--git-commit',
        help='Similar to --git-diff, but takes a git commit ref (e.g. sha1 or '
        'branch) and uses the set of files from that commit.')
    parser.add_argument(
        '--build-root',
        required=True,
        help='E.g. <some_root>/build/debug-gcc-dynamic-community')
    parser.add_argument('command',
                        type=Command,
                        choices=list(Command),
                        help='Command to perform')
    parser.add_argument(
        '--output-test-config',
        help=
        'Output a "test configuration file", which is a JSON containing the '
        'resulting list of C++ tests to run to this file, a flag indicating '
        'wheter to run Java tests or not, etc.')
    parser.add_argument(
        '--incomplete-build',
        action='store_true',
        help='Skip checking for file existence. Allows using the tool after '
        'build artifacts have been deleted.')
    parser.add_argument(
        '--build-args',
        help='Extra arguments to pass to yb_build.sh. The build is invoked e.g. '
        'if the compilation database file is missing.')
    # The %s placeholder has to be filled in here: argparse itself %-formats help strings with
    # a dictionary of parameters, so a bare %s would be rendered as that whole dictionary.
    parser.add_argument(
        '--link-cmd-out-file',
        help='For the %s command, write the linker arguments (one per line ) '
        'to the given file.' % Command.LINK_WHOLE_PROGRAM.value)
    parser.add_argument(
        '--lto-output-suffix',
        default="-lto",
        help='The suffix to append to LTO-enabled binaries produced by '
        'the %s command' % Command.LINK_WHOLE_PROGRAM.value)
    parser.add_argument(
        '--run-linker',
        help=
        'Whether to actually run the linker. Setting this to false might be useful when '
        'debugging, combined with --link-cmd-out-file.',
        type=arg_str_to_bool,
        default=True)

    args = parser.parse_args()

    if args.file_regex and args.file_name_glob:
        raise RuntimeError(
            '--file-regex and --file-name-glob are incompatible')

    cmd = args.command
    if (not args.file_regex and not args.file_name_glob
            and not args.rebuild_graph and not args.git_diff
            and not args.git_commit
            and cmd not in COMMANDS_NOT_NEEDING_TARGET_SET):
        raise RuntimeError(
            "Neither of --file-regex, --file-name-glob, --git-{diff,commit}, or "
            "--rebuild-graph are specified, and the command is not one of: " +
            ", ".join([cmd.value for cmd in COMMANDS_NOT_NEEDING_TARGET_SET]))

    log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format=
        "[%(filename)s:%(lineno)d] %(asctime)s %(levelname)s: %(message)s")

    conf = DepGraphConf(verbose=args.verbose,
                        build_root=args.build_root,
                        incomplete_build=args.incomplete_build,
                        file_regex=args.file_regex,
                        file_name_glob=args.file_name_glob,
                        build_args=args.build_args)
    if conf.file_regex and args.git_diff:
        raise RuntimeError(
            "--git-diff is incompatible with --file-{regex,name-glob}")

    if args.git_diff and args.git_commit:
        raise RuntimeError('--git-diff and --git-commit are incompatible')

    if args.git_commit:
        # Express "the files changed by this commit" as a diff against the commit's parent.
        args.git_diff = "{}^..{}".format(args.git_commit, args.git_commit)

    graph_cache_path = os.path.join(args.build_root, 'dependency_graph.json')
    if args.rebuild_graph or not os.path.isfile(graph_cache_path):
        logging.info(
            "Generating a dependency graph at '{}'".format(graph_cache_path))
        dep_graph_builder = DependencyGraphBuilder(conf)
        dep_graph = dep_graph_builder.build()
        dep_graph.save_as_json(graph_cache_path)
    else:
        start_time = datetime.now()
        with open(graph_cache_path) as graph_input_file:
            dep_graph = DependencyGraph(conf,
                                        json_data=json.load(graph_input_file))
        logging.info("Loaded dependency graph from '%s' in %.2f sec" %
                     (graph_cache_path,
                      (datetime.now() - start_time).total_seconds()))
        dep_graph.validate_node_existence()

    # ---------------------------------------------------------------------------------------------
    # Commands that do not require an "initial set of targets"
    # ---------------------------------------------------------------------------------------------

    if cmd == Command.SELF_TEST:
        run_self_test(dep_graph)
        return
    if cmd == Command.DEBUG_DUMP:
        dep_graph.dump_debug_info()
        return

    # ---------------------------------------------------------------------------------------------
    # Figure out the initial set of targets based on a git commit, a regex, etc.
    # ---------------------------------------------------------------------------------------------

    updated_categories: Set[SourceFileCategory] = set()
    file_changes = []
    initial_nodes: Iterable[Node]
    if args.git_diff:
        with WorkDirContext(conf.yb_src_root):
            git_diff_output = subprocess.check_output(
                ['git', 'diff', args.git_diff, '--name-only']).decode('utf-8')

            initial_nodes = set()
            file_paths = set()
            for file_path in git_diff_output.split("\n"):
                file_path = file_path.strip()
                if not file_path:
                    continue
                file_changes.append(file_path)
                # It is important that we invoke os.path.realpath with the current directory set to
                # the git repository root.
                file_path = os.path.realpath(file_path)
                file_paths.add(file_path)
                node = dep_graph.node_by_path.get(file_path)
                if node:
                    initial_nodes.add(node)

        if not initial_nodes:
            logging.warning(
                "Did not find any graph nodes for this set of files: %s",
                file_paths)
            for basename in set(
                [os.path.basename(file_path) for file_path in file_paths]):
                logging.warning("Nodes for basename '{}': {}".format(
                    basename, dep_graph.find_nodes_by_basename(basename)))

    elif conf.file_regex:
        logging.info("Using file name regex: {}".format(conf.file_regex))
        initial_nodes = dep_graph.find_nodes_by_regex(conf.file_regex)
        if not initial_nodes:
            logging.warning(
                "Did not find any graph nodes for this pattern: %s",
                conf.file_regex)
        for node in initial_nodes:
            file_changes.append(node.path)
    else:
        raise RuntimeError(
            "Could not figure out how to generate the initial set of files")

    # Absolute paths are converted to paths relative to the source root; relative paths are
    # left unchanged.
    file_changes = [(os.path.relpath(file_path, conf.yb_src_root)
                     if os.path.isabs(file_path) else file_path)
                    for file_path in file_changes]

    if cmd == Command.LINK_WHOLE_PROGRAM:
        link_whole_program(dep_graph=dep_graph,
                           initial_nodes=initial_nodes,
                           link_cmd_out_file=args.link_cmd_out_file,
                           run_linker=args.run_linker,
                           lto_output_suffix=args.lto_output_suffix)
        return

    file_changes_by_category: Dict[SourceFileCategory, List[str]] = group_by(
        file_changes, get_file_category)

    # Same as file_changes_by_category, but with string values of categories instead of enum
    # elements.
    file_changes_by_category_str: Dict[str, List[str]] = {}
    for category, changes in file_changes_by_category.items():
        logging.info("File changes in category '%s':", category)
        for change in sorted(changes):
            logging.info("    %s", change)
        file_changes_by_category_str[category.value] = changes

    updated_categories = set(file_changes_by_category.keys())

    results: Set[Node] = set()
    if cmd == Command.AFFECTED:
        results = dep_graph.find_affected_nodes(set(initial_nodes),
                                                args.node_type)

    elif cmd == Command.DEPS:
        for node in initial_nodes:
            results.update(node.deps)
    elif cmd == Command.REVERSE_DEPS:
        for node in initial_nodes:
            results.update(node.reverse_deps)
    else:
        raise ValueError("Unimplemented command '{}'".format(cmd))

    if args.output_test_config:
        test_basename_list = sorted([
            os.path.basename(node.path) for node in results
            if node.node_type == NodeType.TEST
        ])
        affected_basenames = set(
            [os.path.basename(node.path) for node in results])

        # These are ALL tests, not just tests affected by the changes in question, used mostly
        # for logging.
        all_test_programs = [
            node for node in dep_graph.get_nodes()
            if node.node_type == NodeType.TEST
        ]
        all_test_basenames = set(
            [os.path.basename(node.path) for node in all_test_programs])

        # A very conservative way to decide whether to run all tests. If there are changes in any
        # categories (meaning the changeset is non-empty), and there are changes in categories other
        # than C++ / Java / files known not to affect unit tests, we force re-running all tests.
        unsafe_categories = updated_categories - CATEGORIES_NOT_CAUSING_RERUN_OF_ALL_TESTS
        user_said_all_tests = get_bool_env_var('YB_RUN_ALL_TESTS')

        test_filter_re = os.getenv('YB_TEST_EXECUTION_FILTER_RE')
        manual_test_filtering_with_regex = bool(test_filter_re)

        select_all_tests_for_now = (bool(unsafe_categories)
                                    or user_said_all_tests
                                    or manual_test_filtering_with_regex)

        user_said_all_cpp_tests = get_bool_env_var('YB_RUN_ALL_CPP_TESTS')
        user_said_all_java_tests = get_bool_env_var('YB_RUN_ALL_JAVA_TESTS')
        cpp_files_changed = SourceFileCategory.CPP in updated_categories
        java_files_changed = SourceFileCategory.JAVA in updated_categories
        yb_master_or_tserver_changed = bool(affected_basenames
                                            & set(['yb-master', 'yb-tserver']))

        run_cpp_tests = select_all_tests_for_now or cpp_files_changed or user_said_all_cpp_tests

        run_java_tests = (select_all_tests_for_now or java_files_changed
                          or yb_master_or_tserver_changed
                          or user_said_all_java_tests)

        if select_all_tests_for_now:
            if user_said_all_tests:
                logging.info(
                    "User explicitly specified that all tests should be run")
            elif manual_test_filtering_with_regex:
                logging.info(
                    "YB_TEST_EXECUTION_FILTER_RE specified: %s, will filter tests at a later step",
                    test_filter_re)
            else:
                logging.info(
                    "All tests should be run based on file changes in these categories: {}"
                    .format(', '.join(
                        sorted([
                            category.value for category in unsafe_categories
                        ]))))
        else:
            if run_cpp_tests:
                if user_said_all_cpp_tests:
                    logging.info(
                        "User explicitly specified that all C++ tests should be run"
                    )
                else:
                    logging.info(
                        'Will run some C++ tests, some C++ files changed')
            if run_java_tests:
                if user_said_all_java_tests:
                    logging.info(
                        "User explicitly specified that all Java tests should be run"
                    )
                else:
                    logging.info('Will run all Java tests, ' + ' and '.join(
                        (['some Java files changed'] if java_files_changed else
                         []) + (['yb-{master,tserver} binaries changed']
                                if yb_master_or_tserver_changed else [])))

        if run_cpp_tests and not test_basename_list and not select_all_tests_for_now:
            logging.info(
                'There are no C++ test programs affected by the changes, '
                'will skip running C++ tests.')
            run_cpp_tests = False

        test_conf = dict(run_cpp_tests=run_cpp_tests,
                         run_java_tests=run_java_tests,
                         file_changes_by_category=file_changes_by_category_str)
        if test_filter_re:
            test_conf.update(test_filter_re=test_filter_re)

        if not select_all_tests_for_now:
            # We only have this kind of fine-grained filtering for C++ test programs, and for Java
            # tests we either run all of them or none.
            test_conf['cpp_test_programs'] = test_basename_list
            # Guard against division by zero when the graph contains no test programs.
            if len(all_test_basenames) > 0:
                logging.info(
                    "{} C++ test programs should be run (out of {} possible, {}%)"
                    .format(
                        len(test_basename_list), len(all_test_basenames),
                        "%.1f" % (100.0 * len(test_basename_list) /
                                  len(all_test_basenames))))
            if len(test_basename_list) != len(all_test_basenames):
                logging.info(
                    "The following C++ test programs will be run: {}".format(
                        ", ".join(sorted(test_basename_list))))

        with open(args.output_test_config, 'w') as output_file:
            output_file.write(json.dumps(test_conf, indent=2) + "\n")
        logging.info("Wrote a test configuration to {}".format(
            args.output_test_config))
    else:
        # For ad-hoc command-line use, mostly for testing and sanity-checking.
        for node in sorted(results,
                           key=lambda node: [node.node_type.value, node.path]):
            print(node)
        logging.info("Found {} results".format(len(results)))
Beispiel #8
0
    def configure_postgres(self) -> None:
        """Run ./configure for postgres, retrying once if config.cache is stale.

        The command line is assembled from self.pg_prefix, self.get_yb_version(), the OpenSSL
        include/library directories and self.build_type. The first attempt runs with
        YB_SHOW_COMPILER_COMMAND_LINE set to None through EnvVarContext. If it fails and a line
        of its stderr matches REMOVE_CONFIG_CACHE_MSG_RE, config.cache is removed and configure
        is run a second time. Otherwise stderr and, if present, config.log from
        self.pg_build_root are reported before raising.

        Raises:
            RuntimeError: if the first attempt fails and no stderr line matches
                REMOVE_CONFIG_CACHE_MSG_RE.
        """
        if is_verbose_mode():
            logging.info("Running configure in the postgres build directory")

        # -Werror is deliberately not enabled while running configure, because that can affect
        # the resulting configuration.
        configure_args = ['./configure', '--prefix', self.pg_prefix]
        configure_args.append('--with-extra-version=-YB-' + self.get_yb_version())
        configure_args.extend([
            '--enable-depend',
            '--with-icu',
            '--with-ldap',
            '--with-openssl',
            '--with-gssapi',
            # Options are ossp (original/old implementation), bsd (BSD) and e2fs
            # (libuuid-based for Unix/Mac).
            '--with-uuid=e2fs',
            '--with-libedit-preferred',
            '--with-includes=' + self.openssl_include_dir,
            '--with-libraries=' + self.openssl_lib_dir,
            # Debug symbols are enabled for all types of builds.
            '--enable-debug',
        ])
        if is_macos_arm64():
            # On macOS arm64 the configure script is passed as an argument to this bash binary.
            configure_args.insert(0, '/opt/homebrew/bin/bash')

        use_config_cache = not get_bool_env_var('YB_NO_PG_CONFIG_CACHE')
        if use_config_cache:
            configure_args.append('--config-cache')

        # Readline is disabled in ASAN/TSAN because of readline-related errors there.
        # TODO: do we still need this limitation?
        if self.build_type in ('asan', 'tsan'):
            configure_args.append('--without-readline')

        if self.build_type != 'release':
            configure_args.append('--enable-cassert')

        # Unset YB_SHOW_COMPILER_COMMAND_LINE when configuring postgres to avoid unintended side
        # effects from additional compiler output.
        with EnvVarContext(YB_SHOW_COMPILER_COMMAND_LINE=None):
            outcome = run_program(configure_args, error_ok=True)

        if outcome.failure():
            for stderr_line in outcome.stderr.splitlines():
                if REMOVE_CONFIG_CACHE_MSG_RE.search(stderr_line.strip()):
                    logging.info("Configure failed because of stale config.cache, re-running.")
                    run_program('rm -f config.cache')
                    break
            else:
                # No stale-cache message was found: report everything we have and give up.
                logging.error("Standard error from configure:\n" + outcome.stderr)
                config_log_path = os.path.join(self.pg_build_root, "config.log")
                if not os.path.exists(config_log_path):
                    logging.warning(f"File not found: {config_log_path}")
                else:
                    with open(config_log_path) as config_log_file:
                        config_log_str = config_log_file.read()
                    logging.info(f"Contents of {config_log_path}:")
                    sys.stderr.write(config_log_str + "\n")
                raise RuntimeError("configure failed")

            # NOTE(review): the retry passes an argument list together with shell=True, unlike
            # the first attempt -- confirm run_program handles this combination as intended.
            outcome = run_program(
                configure_args,
                shell=True,
                stdout_stderr_prefix='configure',
                error_ok=True)

        if is_verbose_mode() and outcome.success():
            outcome.print_output_to_stdout()

        outcome.print_output_and_raise_error_if_failed()

        logging.info("Successfully ran configure in the postgres build directory")
Beispiel #9
0
def main():
    parser = argparse.ArgumentParser(
        description='A tool for working with the dependency graph')
    parser.add_argument('--verbose',
                        action='store_true',
                        help='Enable debug output')
    parser.add_argument(
        '-r',
        '--rebuild-graph',
        action='store_true',
        help='Rebuild the dependecy graph and save it to a file')
    parser.add_argument('--node-type',
                        help='Node type to look for',
                        default='any',
                        choices=['test', 'object', 'library', 'source', 'any'])
    parser.add_argument(
        '--file-regex',
        help='Regular expression for file names to select as initial nodes for '
        'querying the dependency graph.')
    parser.add_argument(
        '--file-name-glob',
        help='Like file-regex, but applies only to file name and uses the glob '
        'syntax instead of regex.')
    parser.add_argument(
        '--git-diff',
        help='Figure out the list of files to use as starting points in the '
        'dependency graph traversal by diffing the current state of the code '
        'against this commit. This could also be anything that could be '
        'passed to "git diff" as a single argument.')
    parser.add_argument(
        '--git-commit',
        help='Similar to --git-diff, but takes a git commit ref (e.g. sha1 or '
        'branch) and uses the set of files from that commit.')
    parser.add_argument(
        '--build-root',
        required=True,
        help='E.g. <some_root>/build/debug-gcc-dynamic-community')
    parser.add_argument('command', choices=COMMANDS, help='Command to perform')
    parser.add_argument(
        '--output-test-config',
        help=
        'Output a "test configuration file", which is a JSON containing the '
        'resulting list of C++ tests to run to this file, a flag indicating '
        'wheter to run Java tests or not, etc.')
    parser.add_argument(
        '--incomplete-build',
        action='store_true',
        help='Skip checking for file existence. Allows using the tool after '
        'build artifacts have been deleted.')
    args = parser.parse_args()

    if args.file_regex and args.file_name_glob:
        raise RuntimeError(
            '--file-regex and --file-name-glob are incompatible')

    cmd = args.command
    if (not args.file_regex and not args.file_name_glob
            and not args.rebuild_graph and not args.git_diff
            and not args.git_commit and cmd != SELF_TEST_CMD):
        raise RuntimeError(
            "Neither of --file-regex, --file-name-glob, --git-{diff,commit}, or "
            "--rebuild-graph are specified, and the command is not " +
            SELF_TEST_CMD)

    log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format=
        "[%(filename)s:%(lineno)d] %(asctime)s %(levelname)s: %(message)s")

    conf = Configuration(args)
    if conf.file_regex and args.git_diff:
        raise RuntimeError(
            "--git-diff is incompatible with --file-{regex,name-glob}")

    if args.git_diff and args.git_commit:
        raise RuntimeError('--git-diff and --git-commit are incompatible')

    if args.git_commit:
        args.git_diff = "{}^..{}".format(args.git_commit, args.git_commit)

    graph_cache_path = os.path.join(args.build_root, 'dependency_graph.json')
    if args.rebuild_graph or not os.path.isfile(graph_cache_path):
        logging.info(
            "Generating a dependency graph at '{}'".format(graph_cache_path))
        dep_graph_builder = DependencyGraphBuilder(conf)
        dep_graph = dep_graph_builder.build()
        dep_graph.save_as_json(graph_cache_path)
    else:
        start_time = datetime.now()
        with open(graph_cache_path) as graph_input_file:
            dep_graph = DependencyGraph(conf,
                                        json_data=json.load(graph_input_file))
        logging.info("Loaded dependency graph from '%s' in %.2f sec" %
                     (graph_cache_path,
                      (datetime.now() - start_time).total_seconds()))
        dep_graph.validate_node_existence()

    if cmd == SELF_TEST_CMD:
        run_self_test(dep_graph)
        return

    updated_categories = None
    file_changes = []
    if args.git_diff:
        old_working_dir = os.getcwd()
        with WorkDirContext(conf.yb_src_root):
            git_diff_output = subprocess.check_output(
                ['git', 'diff', args.git_diff, '--name-only'])

            initial_nodes = set()
            file_paths = set()
            for file_path in git_diff_output.split("\n"):
                file_path = file_path.strip()
                if not file_path:
                    continue
                file_changes.append(file_path)
                # It is important that we invoke os.path.realpath with the current directory set to
                # the git repository root.
                file_path = os.path.realpath(file_path)
                file_paths.add(file_path)
                node = dep_graph.node_by_path.get(file_path)
                if node:
                    initial_nodes.add(node)

        if not initial_nodes:
            logging.warning(
                "Did not find any graph nodes for this set of files: {}".
                format(file_paths))
            for basename in set(
                [os.path.basename(file_path) for file_path in file_paths]):
                logging.warning("Nodes for basename '{}': {}".format(
                    basename, dep_graph.find_nodes_by_basename(basename)))

        file_changes_by_category = group_by(file_changes, get_file_category)
        for category, changes in file_changes_by_category.items():
            logging.info("File changes in category '{}':".format(category))
            for change in sorted(changes):
                logging.info("    {}".format(change))
        updated_categories = set(file_changes_by_category.keys())

    elif conf.file_regex:
        logging.info("Using file name regex: {}".format(conf.file_regex))
        initial_nodes = dep_graph.find_nodes_by_regex(conf.file_regex)
    else:
        raise RuntimeError(
            "Could not figure out how to generate the initial set of files")

    results = set()
    if cmd == LIST_AFFECTED_CMD:
        results = dep_graph.find_affected_nodes(initial_nodes, args.node_type)
    elif cmd == LIST_DEPS_CMD:
        for node in initial_nodes:
            results.update(node.deps)
    elif cmd == LIST_REVERSE_DEPS_CMD:
        for node in initial_nodes:
            results.update(node.reverse_deps)
    else:
        raise RuntimeError("Unimplemented command '{}'".format(command))

    if args.output_test_config:
        test_basename_list = sorted([
            os.path.basename(node.path) for node in results
            if node.node_type == 'test'
        ])
        affected_basenames = set(
            [os.path.basename(node.path) for node in results])

        # These are ALL tests, not just tests affected by the changes in question, used mostly
        # for logging.
        all_test_programs = [
            node for node in dep_graph.get_nodes() if node.node_type == 'test'
        ]
        all_test_basenames = set(
            [os.path.basename(node.path) for node in all_test_programs])

        # A very conservative way to decide whether to run all tests. If there are changes in any
        # categories (meaning the changeset is non-empty), and there are changes in categories other
        # than C++ / Java / files known not to affect unit tests, we force re-running all tests.
        unsafe_categories = updated_categories - CATEGORIES_NOT_CAUSING_RERUN_OF_ALL_TESTS
        user_said_all_tests = get_bool_env_var('YB_RUN_ALL_TESTS')
        run_all_tests = bool(unsafe_categories) or user_said_all_tests

        user_said_all_cpp_tests = get_bool_env_var('YB_RUN_ALL_CPP_TESTS')
        user_said_all_java_tests = get_bool_env_var('YB_RUN_ALL_JAVA_TESTS')
        cpp_files_changed = 'c++' in updated_categories
        java_files_changed = 'java' in updated_categories
        yb_master_or_tserver_changed = bool(affected_basenames
                                            & set(['yb-master', 'yb-tserver']))

        run_cpp_tests = run_all_tests or cpp_files_changed or user_said_all_cpp_tests
        run_java_tests = (run_all_tests or java_files_changed
                          or yb_master_or_tserver_changed
                          or user_said_all_java_tests)

        if run_all_tests:
            if user_said_all_tests:
                logging.info(
                    "User explicitly specified that all tests should be run")
            else:
                logging.info(
                    "All tests should be run based on file changes in these categories: {}"
                    .format(', '.join(sorted(unsafe_categories))))
        else:
            if run_cpp_tests:
                if user_said_all_cpp_tests:
                    logging.info(
                        "User explicitly specified that all C++ tests should be run"
                    )
                else:
                    logging.info(
                        'Will run some C++ tests, some C++ files changed')
            if run_java_tests:
                if user_said_all_java_tests:
                    logging.info(
                        "User explicitly specified that all Java tests should be run"
                    )
                else:
                    logging.info('Will run all Java tests, ' + ' and '.join(
                        (['some Java files changed'] if java_files_changed else
                         []) + (['yb-{master,tserver} binaries changed']
                                if yb_master_or_tserver_changed else [])))

        if run_cpp_tests and not test_basename_list and not run_all_tests:
            logging.info(
                'There are no C++ test programs affected by the changes, '
                'will skip running C++ tests.')
            run_cpp_tests = False

        test_conf = dict(run_cpp_tests=run_cpp_tests,
                         run_java_tests=run_java_tests,
                         file_changes_by_category=file_changes_by_category)
        if not run_all_tests:
            test_conf['cpp_test_programs'] = test_basename_list
            logging.info(
                "{} C++ test programs should be run (out of {} possible, {}%)".
                format(
                    len(test_basename_list), len(all_test_basenames),
                    "%.1f" % (100.0 * len(test_basename_list) /
                              len(all_test_basenames))))
            if len(test_basename_list) != len(all_test_basenames):
                logging.info(
                    "The following C++ test programs will be run: {}".format(
                        ", ".join(sorted(test_basename_list))))

        with open(args.output_test_config, 'w') as output_file:
            output_file.write(json.dumps(test_conf, indent=2) + "\n")
        logging.info("Wrote a test configuration to {}".format(
            args.output_test_config))
    else:
        # For ad-hoc command-line use, mostly for testing and sanity-checking.
        for node in sorted(results,
                           key=lambda node: [node.node_type, node.path]):
            print(node)
        logging.info("Found {} results".format(len(results)))
def main():
    """
    Command-line entry point of the dependency graph tool.

    Parses arguments, builds the dependency graph (or loads it from the JSON cache located in the
    build root), determines the initial set of graph nodes either from a git diff / commit or
    from a file name regex, runs the requested command on those nodes, and then either writes a
    JSON "test configuration" file or prints the resulting nodes.

    Raises:
        RuntimeError: on incompatible or missing command-line options, or if the command is not
            implemented.
        subprocess.CalledProcessError: if "git diff" fails.
    """
    parser = argparse.ArgumentParser(
        description='A tool for working with the dependency graph')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable debug output')
    parser.add_argument('-r', '--rebuild-graph',
                        action='store_true',
                        help='Rebuild the dependecy graph and save it to a file')
    parser.add_argument('--node-type',
                        help='Node type to look for',
                        default='any',
                        choices=['test', 'object', 'library', 'source', 'any'])
    parser.add_argument('--file-regex',
                        help='Regular expression for file names to select as initial nodes for '
                             'querying the dependency graph.')
    parser.add_argument('--file-name-glob',
                        help='Like file-regex, but applies only to file name and uses the glob '
                             'syntax instead of regex.')
    parser.add_argument('--git-diff',
                        help='Figure out the list of files to use as starting points in the '
                             'dependency graph traversal by diffing the current state of the code '
                             'against this commit. This could also be anything that could be '
                             'passed to "git diff" as a single argument.')
    parser.add_argument('--git-commit',
                        help='Similar to --git-diff, but takes a git commit ref (e.g. sha1 or '
                             'branch) and uses the set of files from that commit.')
    parser.add_argument('--build-root',
                        required=True,
                        help='E.g. <some_root>/build/debug-gcc-dynamic-community')
    parser.add_argument('command',
                        choices=COMMANDS,
                        help='Command to perform')
    parser.add_argument('--output-test-config',
                        help='Output a "test configuration file", which is a JSON containing the '
                             'resulting list of C++ tests to run to this file, a flag indicating '
                             'wheter to run Java tests or not, etc.')
    parser.add_argument('--incomplete-build',
                        action='store_true',
                        help='Skip checking for file existence. Allows using the tool after '
                             'build artifacts have been deleted.')
    args = parser.parse_args()

    if args.file_regex and args.file_name_glob:
        raise RuntimeError('--file-regex and --file-name-glob are incompatible')

    cmd = args.command
    if (not args.file_regex and
            not args.file_name_glob and
            not args.rebuild_graph and
            not args.git_diff and
            not args.git_commit and
            cmd != SELF_TEST_CMD):
        raise RuntimeError(
                "Neither of --file-regex, --file-name-glob, --git-{diff,commit}, or "
                "--rebuild-graph are specified, and the command is not " + SELF_TEST_CMD)

    log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format="[%(filename)s:%(lineno)d] %(asctime)s %(levelname)s: %(message)s")

    conf = Configuration(args)
    if conf.file_regex and args.git_diff:
        raise RuntimeError(
                "--git-diff is incompatible with --file-{regex,name-glob}")

    if args.git_diff and args.git_commit:
        raise RuntimeError('--git-diff and --git-commit are incompatible')

    if args.git_commit:
        # A single commit is handled as a diff between that commit and its first parent.
        args.git_diff = "{}^..{}".format(args.git_commit, args.git_commit)

    # The graph is cached as JSON in the build root and only rebuilt on request or if missing.
    graph_cache_path = os.path.join(args.build_root, 'dependency_graph.json')
    if args.rebuild_graph or not os.path.isfile(graph_cache_path):
        logging.info("Generating a dependency graph at '{}'".format(graph_cache_path))
        dep_graph_builder = DependencyGraphBuilder(conf)
        dep_graph = dep_graph_builder.build()
        dep_graph.save_as_json(graph_cache_path)
    else:
        start_time = datetime.now()
        with open(graph_cache_path) as graph_input_file:
            dep_graph = DependencyGraph(conf, json_data=json.load(graph_input_file))
        logging.info("Loaded dependency graph from '%s' in %.2f sec" %
                     (graph_cache_path, (datetime.now() - start_time).total_seconds()))
        dep_graph.validate_node_existence()

    if cmd == SELF_TEST_CMD:
        run_self_test(dep_graph)
        return

    # These stay empty when the initial nodes come from a file regex rather than from git, so
    # that the test configuration logic below works in both modes.
    updated_categories = set()
    file_changes_by_category = {}
    file_changes = []
    if args.git_diff:
        initial_nodes = set()
        file_paths = set()
        old_working_dir = os.getcwd()
        os.chdir(conf.yb_src_root)
        try:
            git_diff_output = subprocess.check_output(
                    ['git', 'diff', args.git_diff, '--name-only'])
            # check_output returns bytes on Python 3; the parsing below needs text.
            if not isinstance(git_diff_output, str):
                git_diff_output = git_diff_output.decode('utf-8')

            for file_path in git_diff_output.split("\n"):
                file_path = file_path.strip()
                if not file_path:
                    continue
                file_changes.append(file_path)
                # It is important that we invoke os.path.realpath with the current directory
                # set to the git repository root.
                file_path = os.path.realpath(file_path)
                file_paths.add(file_path)
                node = dep_graph.node_by_path.get(file_path)
                if node:
                    initial_nodes.add(node)
        finally:
            # Restore the working directory even if git fails.
            os.chdir(old_working_dir)

        if not initial_nodes:
            logging.warning("Did not find any graph nodes for this set of files: {}".format(
                file_paths))
            for basename in set([os.path.basename(file_path) for file_path in file_paths]):
                logging.warning("Nodes for basename '{}': {}".format(
                    basename, dep_graph.find_nodes_by_basename(basename)))

        file_changes_by_category = group_by(file_changes, get_file_category)
        for category, changes in file_changes_by_category.items():
            logging.info("File changes in category '{}':".format(category))
            for change in sorted(changes):
                logging.info("    {}".format(change))
        updated_categories = set(file_changes_by_category.keys())

    elif conf.file_regex:
        logging.info("Using file name regex: {}".format(conf.file_regex))
        initial_nodes = dep_graph.find_nodes_by_regex(conf.file_regex)
    else:
        raise RuntimeError("Could not figure out how to generate the initial set of files")

    results = set()
    if cmd == LIST_AFFECTED_CMD:
        results = dep_graph.find_affected_nodes(initial_nodes, args.node_type)
    elif cmd == LIST_DEPS_CMD:
        for node in initial_nodes:
            results.update(node.deps)
    elif cmd == LIST_REVERSE_DEPS_CMD:
        for node in initial_nodes:
            results.update(node.reverse_deps)
    else:
        raise RuntimeError("Unimplemented command '{}'".format(cmd))

    if args.output_test_config:
        test_basename_list = sorted(
                [os.path.basename(node.path) for node in results if node.node_type == 'test'])
        affected_basenames = set([os.path.basename(node.path) for node in results])

        # These are ALL tests, not just tests affected by the changes in question, used mostly
        # for logging.
        all_test_programs = [node for node in dep_graph.get_nodes() if node.node_type == 'test']
        all_test_basenames = set([os.path.basename(node.path) for node in all_test_programs])

        # A very conservative way to decide whether to run all tests. If there are changes in any
        # categories (meaning the changeset is non-empty), and there are changes in categories other
        # than C++ / Java / files known not to affect unit tests, we force re-running all tests.
        unsafe_categories = updated_categories - CATEGORIES_NOT_CAUSING_RERUN_OF_ALL_TESTS
        user_said_all_tests = get_bool_env_var('YB_RUN_ALL_TESTS')
        run_all_tests = bool(unsafe_categories) or user_said_all_tests

        user_said_all_cpp_tests = get_bool_env_var('YB_RUN_ALL_CPP_TESTS')
        user_said_all_java_tests = get_bool_env_var('YB_RUN_ALL_JAVA_TESTS')
        cpp_files_changed = 'c++' in updated_categories
        java_files_changed = 'java' in updated_categories
        yb_master_or_tserver_changed = bool(affected_basenames & set(['yb-master', 'yb-tserver']))

        run_cpp_tests = run_all_tests or cpp_files_changed or user_said_all_cpp_tests
        run_java_tests = (
                run_all_tests or java_files_changed or yb_master_or_tserver_changed or
                user_said_all_java_tests
            )

        if run_all_tests:
            if user_said_all_tests:
                logging.info("User explicitly specified that all tests should be run")
            else:
                logging.info(
                    "All tests should be run based on file changes in these categories: {}".format(
                        ', '.join(sorted(unsafe_categories))))
        else:
            if run_cpp_tests:
                if user_said_all_cpp_tests:
                    logging.info("User explicitly specified that all C++ tests should be run")
                else:
                    logging.info('Will run some C++ tests, some C++ files changed')
            if run_java_tests:
                if user_said_all_java_tests:
                    logging.info("User explicitly specified that all Java tests should be run")
                else:
                    logging.info('Will run all Java tests, ' +
                                 ' and '.join(
                                     (['some Java files changed'] if java_files_changed else []) +
                                     (['yb-{master,tserver} binaries changed']
                                      if yb_master_or_tserver_changed else [])))

        if run_cpp_tests and not test_basename_list and not run_all_tests:
            logging.info('There are no C++ test programs affected by the changes, '
                         'will skip running C++ tests.')
            run_cpp_tests = False

        test_conf = dict(
            run_cpp_tests=run_cpp_tests,
            run_java_tests=run_java_tests,
            file_changes_by_category=file_changes_by_category
        )
        if not run_all_tests:
            test_conf['cpp_test_programs'] = test_basename_list
            num_all_tests = len(all_test_basenames)
            # Avoid dividing by zero when the graph contains no test programs at all.
            if num_all_tests:
                test_percentage = 100.0 * len(test_basename_list) / num_all_tests
            else:
                test_percentage = 0.0
            logging.info(
                    "{} C++ test programs should be run (out of {} possible, {}%)".format(
                        len(test_basename_list),
                        num_all_tests,
                        "%.1f" % test_percentage))
            if len(test_basename_list) != num_all_tests:
                logging.info("The following C++ test programs will be run: {}".format(
                    ", ".join(sorted(test_basename_list))))

        with open(args.output_test_config, 'w') as output_file:
            output_file.write(json.dumps(test_conf, indent=2) + "\n")
        logging.info("Wrote a test configuration to {}".format(args.output_test_config))
    else:
        # For ad-hoc command-line use, mostly for testing and sanity-checking.
        for node in sorted(results, key=lambda node: [node.node_type, node.path]):
            print(node)
        logging.info("Found {} results".format(len(results)))