Beispiel #1
0
def start():
    """Start Cassandra nodes.

    Builds a block of shell environment settings (mostly ``JVM_OPTS``
    additions) from the module-level ``config``, uploads it to the host as a
    uniquely named script, appends the product's existing
    ``conf/cassandra-env.sh`` to it, copies the result back over
    ``cassandra-env.sh`` and finally calls ``product.start(config)``.
    """

    product = dse if config['product'] == 'dse' else cstar
    cassandra_path = product.get_cassandra_path()

    # Place environment file on host:
    env = config.get('env', '')
    fab.puts('env is: {}'.format(env))

    if isinstance(env, (list, tuple)):
        env = "\n".join(env)
    env += "\n"
    fab.puts('env is: {}'.format(env))
    if not config['use_jna']:
        env = 'JVM_EXTRA_OPTS=-Dcassandra.boot_without_jna=true\n\n' + env
    # Turn on GC logging:
    fab.run("mkdir -p ~/fab/cassandra/logs")
    log_dir = fab.run("readlink -m {log_dir}".format(log_dir=config['log_dir']))
    try:
        ip_address = cluster_config['hosts'][fab.env.host]['internal_ip']
    except Exception:
        # Best effort: fall back to the fabric host when no internal IP can
        # be looked up. ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating.
        ip_address = fab.env.host
    env = "JVM_OPTS=\"$JVM_OPTS -Djava.rmi.server.hostname={hostname} -Xloggc:{log_dir}/gc.log\"\n\n".format(
        hostname=ip_address, log_dir=log_dir) + env
    # Enable JMX without authentication
    env = "JVM_OPTS=\"$JVM_OPTS -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false\"\n" + env

    # Flamegraph
    if flamegraph.is_enabled(config):
        # The trailing newline matters: without it, anything appended below
        # (yourkit options, initial tokens) would be glued onto this same
        # shell line and corrupt the assignment.
        env += "JVM_OPTS=\"$JVM_OPTS -XX:+PreserveFramePointer\"\n"

    if profiler.yourkit_is_enabled(config):
        execute(profiler.yourkit_clean)
        env += profiler.yourkit_get_jvm_opts()

    fab.puts("running with token allocation type: {}".format(config['token_allocation']))
    if config['use_vnodes'] and config['token_allocation'] in ('static-random', 'static-algorithmic'):
        env += "JVM_OPTS=\"$JVM_OPTS -Dcassandra.initial_token={}\"\n".format(
            get_static_vnode_tokens(fab.env.host,
                                    fab.env.hosts,
                                    partitioner=config['partitioner'],
                                    group=config['token_allocation']))

    env_script = "{name}.sh".format(name=uuid.uuid1())
    script_path = '~/fab/scripts/{env_script}'.format(env_script=env_script)
    env_file = StringIO(env)
    fab.run('mkdir -p ~/fab/scripts')
    fab.put(env_file, script_path)

    fab.puts('env is: {}'.format(env))
    # NOTE(review): env_script always ends in ".sh", so this guard is always
    # true; kept to preserve the original control flow.
    if env_script:
        env_path = os.path.join(cassandra_path, 'conf/cassandra-env.sh')
        # Blank line separator, then the stock cassandra-env.sh after our
        # settings, then install the combined file in place of the original.
        fab.run('echo >> {script_path}'.format(script_path=script_path))
        fab.run('cat {env_path} >> {script_path}'.format(
            env_path=env_path, script_path=script_path))
        fab.run('cp {script_path} {env_path}'.format(
            script_path=script_path, env_path=env_path))

    product.start(config)
Beispiel #2
0
def bootstrap_cluster(cfg):
    """Bootstrap a cluster using ``pristine_config`` overlaid with ``cfg``.

    A shallow copy of ``pristine_config`` is updated with ``cfg`` so the
    pristine mapping itself is not modified. The value returned by
    ``bootstrap`` (with ``destroy=True``) is passed back to the caller.
    """
    merged_config = copy.copy(pristine_config)
    merged_config.update(cfg)

    # Run the flamegraph setup task first when flamegraph is enabled.
    if flamegraph.is_enabled():
        execute(flamegraph.setup)

    return bootstrap(merged_config, destroy=True)
Beispiel #3
0
def bootstrap_cluster(cfg):
    """Bring up a cluster for ``cfg`` and return the result of ``bootstrap``.

    ``cfg`` is layered on top of a shallow copy of ``pristine_config``; the
    combined mapping is handed to ``bootstrap`` with ``destroy=True``.
    """
    effective = copy.copy(pristine_config)
    effective.update(cfg)

    flamegraph_on = flamegraph.is_enabled()
    if flamegraph_on:
        # Flamegraph Setup
        execute(flamegraph.setup)

    return bootstrap(effective, destroy=True)
def stress_compare(revisions,
                   title,
                   log,
                   operations=None,
                   subtitle='',
                   capture_fincore=False,
                   initial_destroy=True,
                   leave_data=False,
                   keep_page_cache=False):
    """
    Run Stress on multiple C* branches and compare them.

    revisions - List of dictionaries that contain cluster configurations
                to trial. This is combined with the default config.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations
                 (defaults to no operations). Example:
       [# cassandra-stress command, node defaults to cluster defined 'stress_node'
        {'type': 'stress',
         'command': 'write n=19M -rate threads=50',
         'node': 'node1',
         'wait_for_compaction': True},
        # nodetool command to run in parallel on nodes:
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script, node defaults to cluster defined 'stress_node'
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'}
       ]
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.

    Raises ValueError when initial_destroy (or leave_data) is True in the
    call but explicitly False in the job (or revision) configuration.
    """
    # None stands in for "no operations" so no list object is shared
    # between calls through the default argument.
    if operations is None:
        operations = []

    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # initial_destroy setting can be set in the job
    # configuration, or manually in the call to this function.
    # Either is fine, but they shouldn't conflict. If they do,
    # ValueError is raised.
    if initial_destroy == True and pristine_config.get('initial_destroy', None) == False:
        raise ValueError('setting for initial_destroy conflicts in job config and stress_compare() call')
    initial_destroy = bool(distutils.util.strtobool(str(pristine_config.get('initial_destroy', initial_destroy))))

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # Update our local cassandra git remotes and branches
    _, localhost_entry = get_localhost()
    with common.fab.settings(hosts=[localhost_entry]):
        execute(cstar.update_cassandra_git)

    # Flamegraph Setup
    if flamegraph.is_enabled():
        execute(flamegraph.setup)

    clean_stress()
    stress_revisions = {operation['stress_revision'] for operation in operations
                        if 'stress_revision' in operation}
    stress_shas = setup_stress(stress_revisions)

    with GracefulTerminationHandler():
        for rev_num, revision_config in enumerate(revisions):
            config = copy.copy(pristine_config)
            config.update(revision_config)
            revision = revision_config['revision']
            config['log'] = log
            config['title'] = title
            config['subtitle'] = subtitle
            product = dse if config.get('product') == 'dse' else cstar

            # leave_data setting can be set in the revision
            # configuration, or manually in the call to this function.
            # Either is fine, but they shouldn't conflict. If they do,
            # ValueError is raised.
            # NOTE(review): leave_data is rebound here, so a value taken from
            # one revision's config carries over into the check for the next
            # revision - confirm this is intended.
            if leave_data == True and revision_config.get('leave_data', None) == False:
                raise ValueError('setting for leave_data conflicts in job config and stress_compare() call')
            leave_data = bool(distutils.util.strtobool(str(revision_config.get('leave_data', leave_data))))

            logger.info("Bringing up {revision} cluster...".format(revision=revision))

            # Drop the page cache between each revision, especially
            # important when leave_data=True :
            if not keep_page_cache:
                drop_page_cache()

            # Only fetch from git on the first run:
            git_fetch = rev_num == 0
            revision_config['git_id'] = git_id = bootstrap(config,
                                                           destroy=initial_destroy,
                                                           leave_data=leave_data,
                                                           git_fetch=git_fetch)

            if flamegraph.is_enabled(revision_config):
                execute(flamegraph.ensure_stopped_perf_agent)
                execute(flamegraph.start_perf_agent, rev_num)

            if capture_fincore:
                start_fincore_capture(interval=10)

            last_stress_operation_id = 'None'
            for operation_i, operation in enumerate(operations, 1):
                op_type = operation['type']
                # Build the stats record *before* entering the try block so
                # the finally clause below always refers to this operation's
                # record (never an unbound name or the previous operation's).
                start = datetime.datetime.now()
                stats = {
                    "id": str(uuid.uuid1()),
                    "type": op_type,
                    "revision": revision,
                    "git_id": git_id,
                    "start_date": start.isoformat(),
                    "label": revision_config.get('label', revision_config['revision']),
                    "test": '{operation_i}_{operation}'.format(
                        operation_i=operation_i,
                        operation=op_type)
                }
                try:
                    if op_type == 'stress':
                        last_stress_operation_id = stats['id']
                        # Default to all the nodes of the cluster if no
                        # nodes were specified in the command:
                        if 'nodes' in operation:
                            cmd = "{command} -node {hosts}".format(
                                command=operation['command'],
                                hosts=",".join(operation['nodes']))
                        elif '-node' in operation['command']:
                            cmd = operation['command']
                        else:
                            cmd = "{command} -node {hosts}".format(
                                command=operation['command'],
                                hosts=",".join(fab_config['hosts']))
                        stats['command'] = cmd
                        stats['intervals'] = []
                        stats['test'] = '{operation_i}_{operation}'.format(
                            operation_i=operation_i, operation=cmd.strip().split(' ')[0]).replace(" ", "_")
                        logger.info('Running stress operation : {cmd}  ...'.format(cmd=cmd))
                        # Run stress:
                        # (stress takes the stats as a parameter, and adds
                        #  more as it runs):
                        stress_sha = stress_shas[operation.get('stress_revision', 'default')]
                        stats = stress(cmd, revision, stress_sha, stats=stats)
                        # Wait for all compactions to finish (unless disabled):
                        if operation.get('wait_for_compaction', True):
                            compaction_throughput = revision_config.get("compaction_throughput_mb_per_sec", 16)
                            wait_for_compaction(compaction_throughput=compaction_throughput)

                    elif op_type == 'nodetool':
                        if 'nodes' not in operation:
                            operation['nodes'] = 'all'
                        if operation['nodes'] in ['all', 'ALL']:
                            nodes = list(fab_config['hosts'])
                        else:
                            nodes = operation['nodes']

                        set_nodetool_path(os.path.join(product.get_bin_path(), 'nodetool'))
                        logger.info("Running nodetool on {nodes} with command: {command}".format(nodes=operation['nodes'], command=operation['command']))
                        stats['command'] = operation['command']
                        output = nodetool_multi(nodes, operation['command'])
                        stats['output'] = output
                        logger.info("Nodetool command finished on all nodes")

                    elif op_type == 'cqlsh':
                        logger.info("Running cqlsh commands on {node}".format(node=operation['node']))
                        set_cqlsh_path(os.path.join(product.get_bin_path(), 'cqlsh'))
                        output = cqlsh(operation['script'], operation['node'])
                        stats['output'] = output.split("\n")
                        stats['command'] = operation['script']
                        logger.info("Cqlsh commands finished")

                    elif op_type == 'bash':
                        nodes = operation.get('nodes', list(fab_config['hosts']))
                        logger.info("Running bash commands on: {nodes}".format(nodes=nodes))
                        stats['output'] = bash(operation['script'], nodes)
                        stats['command'] = operation['script']
                        logger.info("Bash commands finished")

                    elif op_type == 'spark_cassandra_stress':
                        node = operation['node']
                        logger.info("Running spark_cassandra_stress on {node}".format(node=node))
                        output = spark_cassandra_stress(operation['script'], node)
                        stats['output'] = output
                        logger.info("spark_cassandra_stress finished")

                    elif op_type == 'ctool':
                        logger.info("Running ctool with parameters: {command}".format(command=operation['command']))
                        ctool = Ctool(operation['command'], common.config)
                        output = execute(ctool.run)
                        stats['output'] = output
                        logger.info("ctool finished")

                    elif op_type == 'dsetool':
                        if 'nodes' not in operation:
                            operation['nodes'] = 'all'
                        if operation['nodes'] in ['all', 'ALL']:
                            nodes = list(fab_config['hosts'])
                        else:
                            nodes = operation['nodes']

                        dsetool_options = operation['script']
                        logger.info("Running dsetool {command} on {nodes}".format(nodes=operation['nodes'], command=dsetool_options))
                        stats['command'] = dsetool_options
                        output = dsetool_cmd(nodes=nodes, options=dsetool_options)
                        stats['output'] = output
                        logger.info("dsetool command finished on all nodes")

                    elif op_type == 'dse':
                        logger.info("Running dse command on {node}".format(node=operation['node']))
                        output = dse_cmd(node=operation['node'], options=operation['script'])
                        stats['output'] = output.split("\n")
                        stats['command'] = operation['script']
                        logger.info("dse commands finished")

                    end = datetime.datetime.now()
                    stats['end_date'] = end.isoformat()
                    stats['op_duration'] = str(end - start)
                    log_stats(stats, file=log)
                finally:
                    # Copy node logs (also when the operation failed):
                    retrieve_logs_and_create_tarball(job_id=stats['id'])
                    revision_config['last_log'] = stats['id']

                if capture_fincore:
                    stop_fincore_capture()
                    log_dir = os.path.join(CSTAR_PERF_LOGS_DIR, stats['id'])
                    retrieve_fincore_logs(log_dir)
                    # Restart fincore capture if this is not the last
                    # operation:
                    if operation_i < len(operations):
                        start_fincore_capture(interval=10)

            # NOTE(review): the blocks below read stats['id'] of the last
            # operation; with an empty operations list `stats` is unbound
            # here - confirm callers always pass at least one operation.
            if flamegraph.is_enabled(revision_config):
                # Generate and Copy node flamegraphs
                execute(flamegraph.stop_perf_agent)
                execute(flamegraph.generate_flamegraph, rev_num)
                flamegraph_dir = os.path.join(os.path.expanduser('~'), '.cstar_perf', 'flamegraph')
                flamegraph_test_dir = os.path.join(flamegraph_dir, last_stress_operation_id)
                retrieve_flamegraph(flamegraph_test_dir, rev_num+1)
                sh.tar('cfvz', "{}.tar.gz".format(stats['id']), last_stress_operation_id, _cwd=flamegraph_dir)
                shutil.rmtree(flamegraph_test_dir)

            log_add_data(log, {'title': title,
                               'subtitle': subtitle,
                               'revisions': revisions})

            # NOTE(review): this consults the *last* revision's config rather
            # than the current revision_config - confirm this is intended.
            if revisions[-1].get('leave_data', leave_data):
                teardown(destroy=False, leave_data=True)
            else:
                kill_delay = 300 if profiler.yourkit_is_enabled(revision_config) else 0
                teardown(destroy=True, leave_data=False, kill_delay=kill_delay)

            if profiler.yourkit_is_enabled(revision_config):
                # Result is unused here; the call is kept in case it has
                # side effects.
                profiler.yourkit_get_config()
                yourkit_dir = os.path.join(os.path.expanduser('~'), '.cstar_perf', 'yourkit')
                yourkit_test_dir = os.path.join(yourkit_dir, last_stress_operation_id)
                retrieve_yourkit(yourkit_test_dir, rev_num+1)
                sh.tar('cfvz', "{}.tar.gz".format(stats['id']),
                       last_stress_operation_id, _cwd=yourkit_dir)
                shutil.rmtree(yourkit_test_dir)
Beispiel #5
0
def stress_compare(revisions,
                   title,
                   log,
                   operations=None,
                   subtitle='',
                   capture_fincore=False,
                   initial_destroy=True,
                   leave_data=False,
                   keep_page_cache=False,
                   git_fetch_before_test=True,
                   bootstrap_before_test=True,
                   teardown_after_test=True):
    """
    Run Stress on multiple C* branches and compare them.

    revisions - List of dictionaries that contain cluster configurations
                to trial. This is combined with the default config.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations
                 (defaults to no operations). Example:
       [# cassandra-stress command, node defaults to cluster defined 'stress_node'
        {'type': 'stress',
         'command': 'write n=19M -rate threads=50',
         'node': 'node1',
         'wait_for_compaction': True},
        # nodetool command to run in parallel on nodes:
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script, node defaults to cluster defined 'stress_node'
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'}
       ]
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.
    git_fetch_before_test (bool): If True, will update the cassandra.git with fab_common.git_repos
    bootstrap_before_test (bool): If True, will bootstrap DSE / C* before running the operations
    teardown_after_test (bool): If True, will shutdown DSE / C* after all of the operations
    """
    # None stands in for "no operations" so no list object is shared
    # between calls through the default argument.
    if operations is None:
        operations = []

    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # initial_destroy and git_fetch_before_test can be set in the job configuration,
    # or manually in the call to this function.
    # Either is fine, but they shouldn't conflict. If they do, a ValueError is raised.
    initial_destroy = get_bool_if_method_and_config_values_do_not_conflict('initial_destroy',
                                                                           initial_destroy,
                                                                           pristine_config,
                                                                           method_name='stress_compare')

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # https://datastax.jira.com/browse/CSTAR-633
    git_fetch_before_test = get_bool_if_method_and_config_values_do_not_conflict('git_fetch_before_test',
                                                                                 git_fetch_before_test,
                                                                                 pristine_config,
                                                                                 method_name='stress_compare')

    stress_shas = maybe_update_cassandra_git_and_setup_stress(operations, git_fetch=git_fetch_before_test)

    # Flamegraph Setup
    if flamegraph.is_enabled():
        execute(flamegraph.setup)

    with GracefulTerminationHandler():
        for rev_num, revision_config in enumerate(revisions):
            config = copy.copy(pristine_config)
            config.update(revision_config)
            revision = revision_config['revision']
            config['log'] = log
            config['title'] = title
            config['subtitle'] = subtitle
            product = dse if config.get('product') == 'dse' else cstar

            # leave_data, bootstrap_before_test, and teardown_after_test can be set in the job configuration,
            # or manually in the call to this function.
            # Either is fine, but they shouldn't conflict. If they do, a ValueError is raised.
            # NOTE(review): these three names are rebound on every iteration,
            # so a value resolved for one revision is what gets compared
            # against the next revision's config - confirm this is intended.
            leave_data = get_bool_if_method_and_config_values_do_not_conflict('leave_data',
                                                                              leave_data,
                                                                              revision_config,
                                                                              method_name='stress_compare')

            # https://datastax.jira.com/browse/CSTAR-638
            bootstrap_before_test = get_bool_if_method_and_config_values_do_not_conflict('bootstrap_before_test',
                                                                                         bootstrap_before_test,
                                                                                         revision_config,
                                                                                         method_name='stress_compare')

            # https://datastax.jira.com/browse/CSTAR-639
            teardown_after_test = get_bool_if_method_and_config_values_do_not_conflict('teardown_after_test',
                                                                                       teardown_after_test,
                                                                                       revision_config,
                                                                                       method_name='stress_compare')

            logger.info("Bringing up {revision} cluster...".format(revision=revision))

            # Drop the page cache between each revision, especially
            # important when leave_data=True :
            if not keep_page_cache:
                drop_page_cache()

            # Only fetch from git on the first run and if git_fetch_before_test is True
            git_fetch_before_bootstrap = bool(rev_num == 0 and git_fetch_before_test)
            if bootstrap_before_test:
                revision_config['git_id'] = git_id = bootstrap(config,
                                                               destroy=initial_destroy,
                                                               leave_data=leave_data,
                                                               git_fetch=git_fetch_before_bootstrap)
            else:
                # No bootstrap: record the configured revision as the git id.
                revision_config['git_id'] = git_id = config['revision']

            if flamegraph.is_enabled(revision_config):
                execute(flamegraph.ensure_stopped_perf_agent)
                execute(flamegraph.start_perf_agent, rev_num)

            if capture_fincore:
                start_fincore_capture(interval=10)

            last_stress_operation_id = 'None'
            for operation_i, operation in enumerate(operations, 1):
                op_type = operation['type']
                # Build the stats record *before* entering the try block so
                # the finally clause below always refers to this operation's
                # record (never an unbound name or the previous operation's).
                start = datetime.datetime.now()
                stats = {
                    "id": str(uuid.uuid1()),
                    "type": op_type,
                    "revision": revision,
                    "git_id": git_id,
                    "start_date": start.isoformat(),
                    "label": revision_config.get('label', revision_config['revision']),
                    "test": '{operation_i}_{operation}'.format(
                        operation_i=operation_i,
                        operation=op_type)
                }
                try:
                    if op_type == 'stress':
                        last_stress_operation_id = stats['id']
                        # Default to all the nodes of the cluster if no
                        # nodes were specified in the command:
                        if 'nodes' in operation:
                            cmd = "{command} -node {hosts}".format(
                                command=operation['command'],
                                hosts=",".join(operation['nodes']))
                        elif '-node' in operation['command']:
                            cmd = operation['command']
                        else:
                            cmd = "{command} -node {hosts}".format(
                                command=operation['command'],
                                hosts=",".join(fab_config['hosts']))
                        stats['command'] = cmd
                        stats['intervals'] = []
                        stats['test'] = '{operation_i}_{operation}'.format(
                            operation_i=operation_i, operation=cmd.strip().split(' ')[0]).replace(" ", "_")
                        logger.info('Running stress operation : {cmd}  ...'.format(cmd=cmd))
                        # Run stress:
                        # (stress takes the stats as a parameter, and adds
                        #  more as it runs):
                        stress_sha = stress_shas[operation.get('stress_revision', 'default')]
                        stats = stress(cmd, revision, stress_sha, stats=stats)
                        # Wait for all compactions to finish (unless disabled):
                        if operation.get('wait_for_compaction', True):
                            compaction_throughput = revision_config.get("compaction_throughput_mb_per_sec", 16)
                            wait_for_compaction(compaction_throughput=compaction_throughput)

                    elif op_type == 'nodetool':
                        if 'nodes' not in operation:
                            operation['nodes'] = 'all'
                        if operation['nodes'] in ['all', 'ALL']:
                            nodes = list(fab_config['hosts'])
                        else:
                            nodes = operation['nodes']

                        set_nodetool_path(os.path.join(product.get_bin_path(), 'nodetool'))
                        logger.info("Running nodetool on {nodes} with command: {command}".format(nodes=operation['nodes'], command=operation['command']))
                        stats['command'] = operation['command']
                        output = nodetool_multi(nodes, operation['command'])
                        stats['output'] = output
                        logger.info("Nodetool command finished on all nodes")

                    elif op_type == 'cqlsh':
                        logger.info("Running cqlsh commands on {node}".format(node=operation['node']))
                        set_cqlsh_path(os.path.join(product.get_bin_path(), 'cqlsh'))
                        output = cqlsh(operation['script'], operation['node'])
                        stats['output'] = output.split("\n")
                        stats['command'] = operation['script']
                        logger.info("Cqlsh commands finished")

                    elif op_type == 'bash':
                        nodes = operation.get('nodes', list(fab_config['hosts']))
                        logger.info("Running bash commands on: {nodes}".format(nodes=nodes))
                        stats['output'] = bash(operation['script'], nodes)
                        stats['command'] = operation['script']
                        logger.info("Bash commands finished")

                    elif op_type == 'spark_cassandra_stress':
                        nodes = operation.get('nodes', list(fab_config['hosts']))
                        stress_node = config.get('stress_node', None)
                        # Note: once we have https://datastax.jira.com/browse/CSTAR-617, we should fix this to use
                        # client-tool when DSE_VERSION >= 4.8.0
                        # https://datastax.jira.com/browse/DSP-6025: dse client-tool
                        master_regex = re.compile(r"(.|\n)*(?P<master>spark:\/\/\d+.\d+.\d+.\d+:\d+)(.|\n)*")
                        # The spark master address is read from the first node's
                        # "dsetool sparkmaster" output.
                        master_out = dsetool_cmd(nodes[0], options='sparkmaster')[nodes[0]]
                        master_match = master_regex.match(master_out)
                        if not master_match:
                            raise ValueError('Could not find master address from "dsetool sparkmaster" cmd\n'
                                             'Found output: {f}'.format(f=master_out))
                        master_string = master_match.group('master')
                        build_spark_cassandra_stress = bool(distutils.util.strtobool(
                            str(operation.get('build_spark_cassandra_stress', 'True'))))
                        remove_existing_spark_data = bool(distutils.util.strtobool(
                            str(operation.get('remove_existing_spark_data', 'True'))))
                        logger.info("Running spark_cassandra_stress on {stress_node} "
                                    "using spark.cassandra.connection.host={node} and "
                                    "spark-master {master}".format(stress_node=stress_node,
                                                                   node=nodes[0],
                                                                   master=master_string))
                        output = spark_cassandra_stress(operation['script'], nodes, stress_node=stress_node,
                                                        master=master_string,
                                                        build_spark_cassandra_stress=build_spark_cassandra_stress,
                                                        remove_existing_spark_data=remove_existing_spark_data)
                        spark_stats = output.get('stats', {})
                        stats['output'] = output.get('output', 'No output captured')
                        stats['spark_cass_stress_time_in_seconds'] = spark_stats.get('TimeInSeconds', 'No time captured')
                        stats['spark_cass_stress_ops_per_second'] = spark_stats.get('OpsPerSecond', 'No ops/s captured')
                        logger.info("spark_cassandra_stress finished")

                    elif op_type == 'ctool':
                        logger.info("Running ctool with parameters: {command}".format(command=operation['command']))
                        ctool = Ctool(operation['command'], common.config)
                        output = execute(ctool.run)
                        stats['output'] = output
                        logger.info("ctool finished")

                    elif op_type == 'dsetool':
                        if 'nodes' not in operation:
                            operation['nodes'] = 'all'
                        if operation['nodes'] in ['all', 'ALL']:
                            nodes = list(fab_config['hosts'])
                        else:
                            nodes = operation['nodes']

                        dsetool_options = operation['script']
                        logger.info("Running dsetool {command} on {nodes}".format(nodes=operation['nodes'], command=dsetool_options))
                        stats['command'] = dsetool_options
                        output = dsetool_cmd(nodes=nodes, options=dsetool_options)
                        stats['output'] = output
                        logger.info("dsetool command finished on all nodes")

                    elif op_type == 'dse':
                        logger.info("Running dse command on {node}".format(node=operation['node']))
                        output = dse_cmd(node=operation['node'], options=operation['script'])
                        stats['output'] = output.split("\n")
                        stats['command'] = operation['script']
                        logger.info("dse commands finished")

                    end = datetime.datetime.now()
                    stats['end_date'] = end.isoformat()
                    stats['op_duration'] = str(end - start)
                    log_stats(stats, file=log)
                finally:
                    # Copy node logs (also when the operation failed):
                    retrieve_logs_and_create_tarball(job_id=stats['id'])
                    revision_config['last_log'] = stats['id']

                if capture_fincore:
                    stop_fincore_capture()
                    log_dir = os.path.join(CSTAR_PERF_LOGS_DIR, stats['id'])
                    retrieve_fincore_logs(log_dir)
                    # Restart fincore capture if this is not the last
                    # operation:
                    if operation_i < len(operations):
                        start_fincore_capture(interval=10)

            # NOTE(review): the blocks below read stats['id'] of the last
            # operation; with an empty operations list `stats` is unbound
            # here - confirm callers always pass at least one operation.
            if flamegraph.is_enabled(revision_config):
                # Generate and Copy node flamegraphs
                execute(flamegraph.stop_perf_agent)
                execute(flamegraph.generate_flamegraph, rev_num)
                flamegraph_dir = os.path.join(os.path.expanduser('~'), '.cstar_perf', 'flamegraph')
                flamegraph_test_dir = os.path.join(flamegraph_dir, last_stress_operation_id)
                retrieve_flamegraph(flamegraph_test_dir, rev_num+1)
                sh.tar('cfvz', "{}.tar.gz".format(stats['id']), last_stress_operation_id, _cwd=flamegraph_dir)
                shutil.rmtree(flamegraph_test_dir)

            log_add_data(log, {'title': title,
                               'subtitle': subtitle,
                               'revisions': revisions})
            if teardown_after_test:
                # NOTE(review): this consults the *last* revision's config
                # rather than the current revision_config - confirm intended.
                if revisions[-1].get('leave_data', leave_data):
                    teardown(destroy=False, leave_data=True)
                else:
                    kill_delay = 300 if profiler.yourkit_is_enabled(revision_config) else 0
                    teardown(destroy=True, leave_data=False, kill_delay=kill_delay)

            if profiler.yourkit_is_enabled(revision_config):
                # Result is unused here; the call is kept in case it has
                # side effects.
                profiler.yourkit_get_config()
                yourkit_dir = os.path.join(os.path.expanduser('~'), '.cstar_perf', 'yourkit')
                yourkit_test_dir = os.path.join(yourkit_dir, last_stress_operation_id)
                retrieve_yourkit(yourkit_test_dir, rev_num+1)
                sh.tar('cfvz', "{}.tar.gz".format(stats['id']),
                       last_stress_operation_id, _cwd=yourkit_dir)
                shutil.rmtree(yourkit_test_dir)
Beispiel #6
0
def stress_compare(revisions,
                   title,
                   log,
                   operations=None,
                   subtitle='',
                   capture_fincore=False,
                   initial_destroy=True,
                   leave_data=False,
                   keep_page_cache=False,
                   git_fetch_before_test=True,
                   bootstrap_before_test=True,
                   teardown_after_test=True):
    """
    Run a list of operations against several C* / DSE revisions and log stats.

    For every entry in ``revisions`` the cluster is (optionally) bootstrapped,
    each operation is executed in order, its stats are appended to ``log``,
    node logs are archived, and the cluster is (optionally) torn down.

    revisions - List of dictionaries that contain cluster configurations
                to trial. Each one is layered on top of a copy of the
                default config. Each dictionary is updated in place with
                'git_id' and 'last_log' entries.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations. Defaults to
                 an empty list when omitted. The 'type' key selects the
                 handler; recognized types are 'stress', 'nodetool', 'cqlsh',
                 'bash', 'spark_cassandra_stress', 'ctool', 'dsetool' and
                 'dse'. Example:
       [# cassandra-stress command; when neither 'nodes' nor a '-node'
        # argument is given, all cluster hosts are used
        {'type': 'stress',
         'command': 'write n=19M -rate threads=50',
         'node': 'node1',
         'wait_for_compaction': True},
        # nodetool command to run on the given nodes ('all' when omitted):
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script run against the given node
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'}
       ]
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.
    git_fetch_before_test (bool): If True, will update the cassandra.git with fab_common.git_repos
    bootstrap_before_test (bool): If True, will bootstrap DSE / C* before running the operations
    teardown_after_test (bool): If True, will shutdown DSE / C* after all of the operations

    Raises ValueError when the Spark master address cannot be parsed from the
    "dsetool sparkmaster" output of a 'spark_cassandra_stress' operation.
    The flag-resolution helper is also expected to raise ValueError when a
    flag passed here conflicts with the same flag in the job configuration.
    """
    # A list literal as the default would be shared between calls; None is
    # used as the sentinel and normalized here instead.
    if operations is None:
        operations = []

    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # initial_destroy and git_fetch_before_test can be set in the job configuration,
    # or manually in the call to this function.
    # Either is fine, but they shouldn't conflict. If they do, a ValueError is raised.
    initial_destroy = get_bool_if_method_and_config_values_do_not_conflict(
        'initial_destroy',
        initial_destroy,
        pristine_config,
        method_name='stress_compare')

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # https://datastax.jira.com/browse/CSTAR-633
    git_fetch_before_test = get_bool_if_method_and_config_values_do_not_conflict(
        'git_fetch_before_test',
        git_fetch_before_test,
        pristine_config,
        method_name='stress_compare')

    # Maps a stress revision name (or 'default') to the sha handed to stress().
    stress_shas = maybe_update_cassandra_git_and_setup_stress(
        operations, git_fetch=git_fetch_before_test)

    # Flamegraph Setup
    if flamegraph.is_enabled():
        execute(flamegraph.setup)

    with GracefulTerminationHandler() as handler:
        for rev_num, revision_config in enumerate(revisions):
            # Every revision starts from an untouched copy of the defaults.
            config = copy.copy(pristine_config)
            config.update(revision_config)
            revision = revision_config['revision']
            config['log'] = log
            config['title'] = title
            config['subtitle'] = subtitle
            product = dse if config.get('product') == 'dse' else cstar

            # leave_data, bootstrap_before_test, and teardown_after_test can be set in the job configuration,
            # or manually in the call to this function.
            # Either is fine, but they shouldn't conflict. If they do, a ValueError is raised.
            leave_data = get_bool_if_method_and_config_values_do_not_conflict(
                'leave_data',
                leave_data,
                revision_config,
                method_name='stress_compare')

            # https://datastax.jira.com/browse/CSTAR-638
            bootstrap_before_test = get_bool_if_method_and_config_values_do_not_conflict(
                'bootstrap_before_test',
                bootstrap_before_test,
                revision_config,
                method_name='stress_compare')

            # https://datastax.jira.com/browse/CSTAR-639
            teardown_after_test = get_bool_if_method_and_config_values_do_not_conflict(
                'teardown_after_test',
                teardown_after_test,
                revision_config,
                method_name='stress_compare')

            logger.info(
                "Bringing up {revision} cluster...".format(revision=revision))

            # Drop the page cache between each revision, especially
            # important when leave_data=True :
            if not keep_page_cache:
                drop_page_cache()

            # Only fetch from git on the first run and if git_fetch_before_test is True
            git_fetch_before_bootstrap = bool(
                rev_num == 0 and git_fetch_before_test)
            if bootstrap_before_test:
                revision_config['git_id'] = git_id = bootstrap(
                    config,
                    destroy=initial_destroy,
                    leave_data=leave_data,
                    git_fetch=git_fetch_before_bootstrap)
            else:
                # Without a bootstrap the configured revision doubles as git id.
                revision_config['git_id'] = git_id = config['revision']

            if flamegraph.is_enabled(revision_config):
                execute(flamegraph.ensure_stopped_perf_agent)
                execute(flamegraph.start_perf_agent, rev_num)

            if capture_fincore:
                start_fincore_capture(interval=10)

            # Stays the literal string 'None' when no stress operation runs.
            last_stress_operation_id = 'None'
            for operation_i, operation in enumerate(operations, 1):
                # The stats record is built before entering the try block so
                # that the finally clause always sees the id of *this*
                # operation (never an unbound name or a stale record from the
                # previous iteration).
                start = datetime.datetime.now()
                stats = {
                    "id": str(uuid.uuid1()),
                    "type": operation['type'],
                    "revision": revision,
                    "git_id": git_id,
                    "start_date": start.isoformat(),
                    "label": revision_config.get('label',
                                                 revision_config['revision']),
                    "test": '{operation_i}_{operation}'.format(
                        operation_i=operation_i,
                        operation=operation['type'])
                }
                try:
                    if operation['type'] == 'stress':
                        last_stress_operation_id = stats['id']
                        # Default to all the nodes of the cluster if no
                        # nodes were specified in the command:
                        if 'nodes' in operation:
                            cmd = "{command} -node {hosts}".format(
                                command=operation['command'],
                                hosts=",".join(operation['nodes']))
                        elif '-node' in operation['command']:
                            cmd = operation['command']
                        else:
                            cmd = "{command} -node {hosts}".format(
                                command=operation['command'],
                                hosts=",".join(fab_config['hosts']))
                        stats['command'] = cmd
                        stats['intervals'] = []
                        # Name the test after the first word of the command.
                        stats['test'] = '{operation_i}_{operation}'.format(
                            operation_i=operation_i,
                            operation=cmd.strip().split(' ')[0]).replace(
                                " ", "_")
                        logger.info(
                            'Running stress operation : {cmd}  ...'.format(
                                cmd=cmd))
                        # Run stress:
                        # (stress takes the stats as a parameter, and adds
                        #  more as it runs):
                        stress_sha = stress_shas[operation.get(
                            'stress_revision', 'default')]
                        stats = stress(cmd, revision, stress_sha, stats=stats)
                        # Wait for all compactions to finish (unless disabled):
                        if operation.get('wait_for_compaction', True):
                            compaction_throughput = revision_config.get(
                                "compaction_throughput_mb_per_sec", 16)
                            wait_for_compaction(
                                compaction_throughput=compaction_throughput)

                    elif operation['type'] == 'nodetool':
                        # Note: the default is written back into the
                        # caller's operation dictionary.
                        if 'nodes' not in operation:
                            operation['nodes'] = 'all'
                        if operation['nodes'] in ['all', 'ALL']:
                            nodes = list(fab_config['hosts'])
                        else:
                            nodes = operation['nodes']

                        set_nodetool_path(
                            os.path.join(product.get_bin_path(), 'nodetool'))
                        logger.info(
                            "Running nodetool on {nodes} with command: {command}"
                            .format(nodes=operation['nodes'],
                                    command=operation['command']))
                        stats['command'] = operation['command']
                        output = nodetool_multi(nodes, operation['command'])
                        stats['output'] = output
                        logger.info("Nodetool command finished on all nodes")

                    elif operation['type'] == 'cqlsh':
                        logger.info("Running cqlsh commands on {node}".format(
                            node=operation['node']))
                        set_cqlsh_path(
                            os.path.join(product.get_bin_path(), 'cqlsh'))
                        output = cqlsh(operation['script'], operation['node'])
                        stats['output'] = output.split("\n")
                        stats['command'] = operation['script']
                        logger.info("Cqlsh commands finished")

                    elif operation['type'] == 'bash':
                        nodes = operation.get('nodes',
                                              list(fab_config['hosts']))
                        logger.info("Running bash commands on: {nodes}".format(
                            nodes=nodes))
                        stats['output'] = bash(operation['script'], nodes)
                        stats['command'] = operation['script']
                        logger.info("Bash commands finished")

                    elif operation['type'] == 'spark_cassandra_stress':
                        nodes = operation.get('nodes',
                                              list(fab_config['hosts']))
                        stress_node = config.get('stress_node', None)
                        # Note: once we have https://datastax.jira.com/browse/CSTAR-617, we should fix this to use
                        # client-tool when DSE_VERSION >= 4.8.0
                        # https://datastax.jira.com/browse/DSP-6025: dse client-tool
                        master_regex = re.compile(
                            r"(.|\n)*(?P<master>spark:\/\/\d+.\d+.\d+.\d+:\d+)(.|\n)*"
                        )
                        # The Spark master address is taken from the first
                        # node's "dsetool sparkmaster" output.
                        master_out = dsetool_cmd(
                            nodes[0], options='sparkmaster')[nodes[0]]
                        master_match = master_regex.match(master_out)
                        if not master_match:
                            raise ValueError(
                                'Could not find master address from "dsetool sparkmaster" cmd\n'
                                'Found output: {f}'.format(f=master_out))
                        master_string = master_match.group('master')
                        # Both flags may arrive as booleans or as strings,
                        # hence the str() / strtobool round trip.
                        build_spark_cassandra_stress = bool(
                            distutils.util.strtobool(
                                str(
                                    operation.get(
                                        'build_spark_cassandra_stress',
                                        'True'))))
                        remove_existing_spark_data = bool(
                            distutils.util.strtobool(
                                str(
                                    operation.get('remove_existing_spark_data',
                                                  'True'))))
                        logger.info(
                            "Running spark_cassandra_stress on {stress_node} "
                            "using spark.cassandra.connection.host={node} and "
                            "spark-master {master}".format(
                                stress_node=stress_node,
                                node=nodes[0],
                                master=master_string))
                        output = spark_cassandra_stress(
                            operation['script'],
                            nodes,
                            stress_node=stress_node,
                            master=master_string,
                            build_spark_cassandra_stress=
                            build_spark_cassandra_stress,
                            remove_existing_spark_data=
                            remove_existing_spark_data)
                        stats['output'] = output.get('output',
                                                     'No output captured')
                        stats[
                            'spark_cass_stress_time_in_seconds'] = output.get(
                                'stats', {}).get('TimeInSeconds',
                                                 'No time captured')
                        stats['spark_cass_stress_ops_per_second'] = output.get(
                            'stats', {}).get('OpsPerSecond',
                                             'No ops/s captured')
                        logger.info("spark_cassandra_stress finished")

                    elif operation['type'] == 'ctool':
                        logger.info(
                            "Running ctool with parameters: {command}".format(
                                command=operation['command']))
                        ctool = Ctool(operation['command'], common.config)
                        output = execute(ctool.run)
                        stats['output'] = output
                        logger.info("ctool finished")

                    elif operation['type'] == 'dsetool':
                        # Note: the default is written back into the
                        # caller's operation dictionary.
                        if 'nodes' not in operation:
                            operation['nodes'] = 'all'
                        if operation['nodes'] in ['all', 'ALL']:
                            nodes = list(fab_config['hosts'])
                        else:
                            nodes = operation['nodes']

                        dsetool_options = operation['script']
                        logger.info(
                            "Running dsetool {command} on {nodes}".format(
                                nodes=operation['nodes'],
                                command=dsetool_options))
                        stats['command'] = dsetool_options
                        output = dsetool_cmd(nodes=nodes,
                                             options=dsetool_options)
                        stats['output'] = output
                        logger.info("dsetool command finished on all nodes")

                    elif operation['type'] == 'dse':
                        logger.info("Running dse command on {node}".format(
                            node=operation['node']))
                        output = dse_cmd(node=operation['node'],
                                         options=operation['script'])
                        stats['output'] = output.split("\n")
                        stats['command'] = operation['script']
                        logger.info("dse commands finished")

                    end = datetime.datetime.now()
                    stats['end_date'] = end.isoformat()
                    stats['op_duration'] = str(end - start)
                    log_stats(stats, file=log)
                finally:
                    # Copy node logs (also when the operation failed):
                    retrieve_logs_and_create_tarball(job_id=stats['id'])
                    revision_config['last_log'] = stats['id']

                if capture_fincore:
                    stop_fincore_capture()
                    log_dir = os.path.join(CSTAR_PERF_LOGS_DIR, stats['id'])
                    retrieve_fincore_logs(log_dir)
                    # Restart fincore capture if this is not the last
                    # operation:
                    if operation_i < len(operations):
                        start_fincore_capture(interval=10)

            if flamegraph.is_enabled(revision_config):
                # Generate and Copy node flamegraphs
                execute(flamegraph.stop_perf_agent)
                execute(flamegraph.generate_flamegraph, rev_num)
                flamegraph_dir = os.path.join(os.path.expanduser('~'),
                                              '.cstar_perf', 'flamegraph')
                flamegraph_test_dir = os.path.join(flamegraph_dir,
                                                   last_stress_operation_id)
                retrieve_flamegraph(flamegraph_test_dir, rev_num + 1)
                # The archive is named after the last operation of any type,
                # while its content is the last *stress* operation's directory.
                # NOTE(review): `stats` is only bound once an operation has
                # run; with an empty operations list this raises NameError.
                sh.tar('cfvz',
                       "{}.tar.gz".format(stats['id']),
                       last_stress_operation_id,
                       _cwd=flamegraph_dir)
                shutil.rmtree(flamegraph_test_dir)

            log_add_data(log, {
                'title': title,
                'subtitle': subtitle,
                'revisions': revisions
            })
            if teardown_after_test:
                # The leave_data lookup uses the last revision's
                # configuration, not the current one.
                if revisions[-1].get('leave_data', leave_data):
                    teardown(destroy=False, leave_data=True)
                else:
                    # A kill delay is requested only when yourkit is enabled.
                    kill_delay = 300 if profiler.yourkit_is_enabled(
                        revision_config) else 0
                    teardown(destroy=True,
                             leave_data=False,
                             kill_delay=kill_delay)

            if profiler.yourkit_is_enabled(revision_config):
                # The result is unused here; the call is kept in case it has
                # side effects -- TODO confirm and drop if it has none.
                profiler.yourkit_get_config()
                yourkit_dir = os.path.join(os.path.expanduser('~'),
                                           '.cstar_perf', 'yourkit')
                yourkit_test_dir = os.path.join(yourkit_dir,
                                                last_stress_operation_id)
                retrieve_yourkit(yourkit_test_dir, rev_num + 1)
                # NOTE(review): same `stats` caveat as for the flamegraph
                # archive above when no operation has run.
                sh.tar('cfvz',
                       "{}.tar.gz".format(stats['id']),
                       last_stress_operation_id,
                       _cwd=yourkit_dir)
                shutil.rmtree(yourkit_test_dir)