コード例 #1
0
def stress_compare(revisions,
                   title,
                   log,
                   operations = [],
                   subtitle = '',
                   capture_fincore=False,
                   initial_destroy=True,
                   leave_data=False,
                   keep_page_cache=False
               ):
    """
    Run Stress on multiple C* branches and compare them.

    For each revision, in order: bootstrap a cluster, run every
    operation against it, append the per-operation stats to ``log``,
    archive the node logs, then tear the cluster down again.

    revisions - List of dictionaries that contain cluster configurations
                to trial. Each one is layered over a copy of the default
                config and must contain a 'revision' key. 'label',
                'leave_data', 'product' and
                'compaction_throughput_mb_per_sec' are read if present.
                Each dictionary is updated in place with 'git_id' and
                'last_log'.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations. Example:
       [# cassandra-stress command, node defaults to cluster defined 'stress_node'
        {'type': 'stress',
         'command': 'write n=19M -rate threads=50',
         'node': 'node1',
         'wait_for_compaction': True},
        # nodetool command to run in parallel on nodes:
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script, node defaults to cluster defined 'stress_node'
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'}
       ]
       Keys read by this function, per 'type':
         stress   - 'command'; optional 'nodes' (list, appended as
                    "-node a,b"; when absent and the command has no
                    '-node', all hosts of fab_config are used);
                    optional 'stress_revision' (default 'default');
                    optional 'wait_for_compaction' (default True)
         nodetool - 'command'; optional 'nodes' (list, or 'all'/'ALL';
                    default 'all')
         cqlsh    - 'script', 'node'
         bash     - 'script'; optional 'nodes' (default: all hosts)
         spark_cassandra_stress - 'script', 'node'
         ctool    - 'command'
       An operation of any other type runs nothing but is still logged.
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.

    Raises ValueError when initial_destroy or leave_data is True in the
    call but explicitly False in the job / revision configuration.
    """
    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # The initial_destroy setting can be set in the job
    # configuration, or manually in the call to this function. Either
    # is fine, but they shouldn't conflict. If they do, ValueError is
    # raised.
    # NOTE: the `==` comparisons are kept as-is; switching to `is`
    # would change how non-bool values such as 0/1 are treated.
    if initial_destroy == True and pristine_config.get('initial_destroy', None) == False:
        raise ValueError('setting for initial_destroy conflicts in job config and stress_compare() call')
    else:
        initial_destroy = pristine_config.get('initial_destroy', initial_destroy)

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # Update our local cassandra git remotes and branches
    _, localhost_entry = get_localhost()
    with common.fab.settings(hosts=[localhost_entry]):
        execute(cstar.update_cassandra_git)

    # Flamegraph Setup
    if flamegraph.is_enabled():
        execute(flamegraph.setup)

    clean_stress()
    # Every distinct stress revision requested by any operation; the
    # resulting mapping is indexed by revision name below ('default'
    # when an operation does not name one).
    stress_revisions = set(operation['stress_revision'] for operation in operations
                           if 'stress_revision' in operation)
    stress_shas = setup_stress(stress_revisions)

    for rev_num, revision_config in enumerate(revisions):
        # Per-revision config = default config overlaid with the revision.
        config = copy.copy(pristine_config)
        config.update(revision_config)
        revision = revision_config['revision']
        config['log'] = log
        config['title'] = title
        config['subtitle'] = subtitle
        product = dse if config.get('product') == 'dse' else cstar

        # The leave_data setting can be set in the revision
        # configuration, or manually in the call to this function.
        # Either is fine, but they shouldn't conflict. If they do,
        # ValueError is raised.
        # NOTE: leave_data is reassigned here, so a value taken from one
        # revision carries over to the following revisions.
        if leave_data == True and revision_config.get('leave_data', None) == False:
            raise ValueError('setting for leave_data conflicts in job config and stress_compare() call')
        else:
            leave_data = revision_config.get('leave_data', leave_data)

        logger.info("Bringing up {revision} cluster...".format(revision=revision))

        # Drop the page cache between each revision, especially
        # important when leave_data=True :
        if not keep_page_cache:
            drop_page_cache()

        # Only fetch from git on the first run:
        git_fetch = rev_num == 0
        revision_config['git_id'] = git_id = bootstrap(config, destroy=True, leave_data=leave_data, git_fetch=git_fetch)

        if flamegraph.is_enabled(revision_config):
            execute(flamegraph.ensure_stopped_perf_agent)
            execute(flamegraph.start_perf_agent, rev_num)

        if capture_fincore:
            start_fincore_capture(interval=10)

        # Stays the string 'None' (not the None object) when the
        # revision runs no stress operation; it is used as a directory
        # name further down.
        last_stress_operation_id = 'None'
        for operation_i, operation in enumerate(operations, 1):
            # Build the stats record before entering the try block so the
            # finally clause can always rely on stats['id'] existing.
            start = datetime.datetime.now()
            stats = {
                "id": str(uuid.uuid1()),
                "type": operation['type'],
                "revision": revision,
                "git_id": git_id,
                "start_date": start.isoformat(),
                "label": revision_config.get('label', revision_config['revision']),
                "test": '{operation_i}_{operation}'.format(
                    operation_i=operation_i,
                    operation=operation['type'])
            }
            try:
                if operation['type'] == 'stress':
                    last_stress_operation_id = stats['id']
                    # Default to all the nodes of the cluster if no
                    # nodes were specified in the command:
                    if 'nodes' in operation:
                        cmd = "{command} -node {hosts}".format(
                            command=operation['command'],
                            hosts=",".join(operation['nodes']))
                    elif '-node' in operation['command']:
                        cmd = operation['command']
                    else:
                        cmd = "{command} -node {hosts}".format(
                            command=operation['command'],
                            hosts=",".join(fab_config['hosts']))
                    stats['command'] = cmd
                    stats['intervals'] = []
                    # Name the test after the first word of the stress
                    # command (e.g. "1_write").
                    stats['test'] = '{operation_i}_{operation}'.format(
                        operation_i=operation_i, operation=cmd.strip().split(' ')[0]).replace(" ", "_")
                    logger.info('Running stress operation : {cmd}  ...'.format(cmd=cmd))
                    # Run stress:
                    # (stress takes the stats as a parameter, and adds
                    #  more as it runs):
                    stress_sha = stress_shas[operation.get('stress_revision', 'default')]
                    stats = stress(cmd, revision, stress_sha, stats=stats)
                    # Wait for all compactions to finish (unless disabled):
                    if operation.get('wait_for_compaction', True):
                        compaction_throughput = revision_config.get("compaction_throughput_mb_per_sec", 16)
                        wait_for_compaction(compaction_throughput=compaction_throughput)

                elif operation['type'] == 'nodetool':
                    # Resolve the target nodes without writing a default
                    # back into the caller's operation dictionary.
                    node_spec = operation.get('nodes', 'all')
                    if node_spec in ['all', 'ALL']:
                        nodes = list(fab_config['hosts'])
                    else:
                        nodes = node_spec

                    set_nodetool_path(os.path.join(product.get_bin_path(), 'nodetool'))
                    logger.info("Running nodetool on {nodes} with command: {command}".format(nodes=node_spec, command=operation['command']))
                    stats['command'] = operation['command']
                    output = nodetool_multi(nodes, operation['command'])
                    stats['output'] = output
                    logger.info("Nodetool command finished on all nodes")

                elif operation['type'] == 'cqlsh':
                    logger.info("Running cqlsh commands on {node}".format(node=operation['node']))
                    set_cqlsh_path(os.path.join(product.get_bin_path(), 'cqlsh'))
                    output = cqlsh(operation['script'], operation['node'])
                    stats['output'] = output.split("\n")
                    stats['command'] = operation['script']
                    logger.info("Cqlsh commands finished")

                elif operation['type'] == 'bash':
                    nodes = operation.get('nodes', list(fab_config['hosts']))
                    logger.info("Running bash commands on: {nodes}".format(nodes=nodes))
                    stats['output'] = bash(operation['script'], nodes)
                    stats['command'] = operation['script']
                    logger.info("Bash commands finished")

                elif operation['type'] == 'spark_cassandra_stress':
                    node = operation['node']
                    logger.info("Running spark_cassandra_stress on {node}".format(node=node))
                    output = spark_cassandra_stress(operation['script'], node)
                    stats['output'] = output
                    logger.info("spark_cassandra_stress finished")

                elif operation['type'] == 'ctool':
                    logger.info("Running ctool with parameters: {command}".format(command=operation['command']))
                    ctool = Ctool(operation['command'], common.config)
                    output = execute(ctool.run)
                    stats['output'] = output
                    logger.info("ctool finished")

                end = datetime.datetime.now()
                stats['end_date'] = end.isoformat()
                stats['op_duration'] = str(end - start)
                log_stats(stats, file=log)
            finally:
                # Node logs are collected and archived even when the
                # operation itself failed.
                # Copy node logs:
                logs_dir = os.path.join(os.path.expanduser('~'),'.cstar_perf','logs')
                log_dir = os.path.join(logs_dir, stats['id'])
                os.makedirs(log_dir)
                retrieve_logs(log_dir)
                revision_config['last_log'] = stats['id']
                # Tar them for archiving:
                subprocess.Popen(shlex.split('tar cfvz {id}.tar.gz {id}'.format(id=stats['id'])), cwd=logs_dir).communicate()
                shutil.rmtree(log_dir)

            if capture_fincore:
                stop_fincore_capture()
                # NOTE(review): log_dir was removed just above, after the
                # archive was written - confirm retrieve_fincore_logs is
                # meant to run after the tar/rmtree step.
                retrieve_fincore_logs(log_dir)
                # Restart fincore capture if this is not the last
                # operation:
                if operation_i < len(operations):
                    start_fincore_capture(interval=10)

        if flamegraph.is_enabled(revision_config):
            # Generate and Copy node flamegraphs
            execute(flamegraph.stop_perf_agent)
            execute(flamegraph.generate_flamegraph, rev_num)
            flamegraph_dir = os.path.join(os.path.expanduser('~'),'.cstar_perf', 'flamegraph')
            flamegraph_test_dir = os.path.join(flamegraph_dir, last_stress_operation_id)
            retrieve_flamegraph(flamegraph_test_dir, rev_num+1)
            # NOTE(review): the archive is named after the id of the last
            # operation run, while the directory archived is that of the
            # last *stress* operation - confirm this is intended.
            sh.tar('cfvz', "{}.tar.gz".format(stats['id']), last_stress_operation_id, _cwd=flamegraph_dir)
            shutil.rmtree(flamegraph_test_dir)

        log_add_data(log, {'title':title,
                           'subtitle': subtitle,
                           'revisions': revisions})

        # The leave_data decision is read from the LAST revision's config
        # (falling back to the current leave_data value) for every
        # revision.
        if revisions[-1].get('leave_data', leave_data):
            teardown(destroy=False, leave_data=True)
        else:
            kill_delay = 300 if profiler.yourkit_is_enabled(revision_config) else 0
            teardown(destroy=True, leave_data=False, kill_delay=kill_delay)

        if profiler.yourkit_is_enabled(revision_config):
            # The returned config is not used below; the call is kept
            # in case it has side effects.
            yourkit_config = profiler.yourkit_get_config()
            yourkit_dir = os.path.join(os.path.expanduser('~'),'.cstar_perf', 'yourkit')
            yourkit_test_dir = os.path.join(yourkit_dir, last_stress_operation_id)
            retrieve_yourkit(yourkit_test_dir, rev_num+1)
            sh.tar('cfvz', "{}.tar.gz".format(stats['id']),
                   last_stress_operation_id, _cwd=yourkit_dir)
            shutil.rmtree(yourkit_test_dir)
コード例 #2
0
def stress_compare(revisions,
                   title,
                   log,
                   operations = [],
                   subtitle = '',
                   capture_fincore=False,
                   initial_destroy=True,
                   leave_data=False,
                   keep_page_cache=False
               ):
    """
    Run Stress on multiple C* branches and compare them.

    For each revision, in order: bootstrap a cluster, run every
    operation against it, append the per-operation stats to ``log``,
    archive the node logs, then tear the cluster down again.

    revisions - List of dictionaries that contain cluster configurations
                to trial. Each one is layered over a copy of the default
                config and must contain a 'revision' key. 'label',
                'leave_data', 'stress_revision' and
                'compaction_throughput_mb_per_sec' are read if present.
                Each dictionary is updated in place with 'git_id' and
                'last_log'.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations. Example:
       [# cassandra-stress command, node defaults to cluster defined 'stress_node'
        {'type': 'stress',
         'command': 'write n=19000000 -rate threads=50',
         'node': 'node1',
         'wait_for_compaction': True},
        # nodetool command to run in parallel on nodes:
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script, node defaults to cluster defined 'stress_node'
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'}
       ]
       Keys read by this function, per 'type':
         stress   - 'command'; optional 'nodes' (list, appended as
                    "-node a,b"; when absent and the command has no
                    '-node', all hosts of fab_config are used);
                    optional 'wait_for_compaction' (default True)
         nodetool - 'command'; optional 'nodes' (list, or 'all'/'ALL';
                    default 'all')
         cqlsh    - 'script', 'node'
         bash     - 'script'; optional 'nodes' (default: all hosts)
       An operation of any other type runs nothing but is still logged.
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.

    Raises ValueError when initial_destroy or leave_data is True in the
    call but explicitly False in the job / revision configuration.
    """
    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # The initial_destroy setting can be set in the job
    # configuration, or manually in the call to this function. Either
    # is fine, but they shouldn't conflict. If they do, ValueError is
    # raised.
    # NOTE: the `==` comparisons are kept as-is; switching to `is`
    # would change how non-bool values such as 0/1 are treated.
    if initial_destroy == True and pristine_config.get('initial_destroy', None) == False:
        raise ValueError('setting for initial_destroy conflicts in job config and stress_compare() call')
    else:
        initial_destroy = pristine_config.get('initial_destroy', initial_destroy)

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # Clean stress builds: remove every entry under CASSANDRA_STRESS_PATH
    # except the 'default' and 'trunk' builds.
    stress_builds = [b for b in os.listdir(CASSANDRA_STRESS_PATH)
                     if b not in ['default', 'trunk']]
    for stress_build in stress_builds:
        path = os.path.join(CASSANDRA_STRESS_PATH, stress_build)
        logger.info("Removing stress build '{}'".format(path))
        shutil.rmtree(path)

    for rev_num, revision_config in enumerate(revisions):
        # Per-revision config = default config overlaid with the revision.
        config = copy.copy(pristine_config)
        config.update(revision_config)
        revision = revision_config['revision']
        config['log'] = log
        config['title'] = title
        config['subtitle'] = subtitle

        # The leave_data setting can be set in the revision
        # configuration, or manually in the call to this function.
        # Either is fine, but they shouldn't conflict. If they do,
        # ValueError is raised.
        # NOTE: leave_data is reassigned here, so a value taken from one
        # revision carries over to the following revisions.
        if leave_data == True and revision_config.get('leave_data', None) == False:
            raise ValueError('setting for leave_data conflicts in job config and stress_compare() call')
        else:
            leave_data = revision_config.get('leave_data', leave_data)

        logger.info("Bringing up {revision} cluster...".format(revision=revision))

        # Drop the page cache between each revision, especially
        # important when leave_data=True :
        if not keep_page_cache:
            drop_page_cache()

        # Only fetch from git on the first run:
        git_fetch = rev_num == 0
        revision_config['git_id'] = git_id = bootstrap(config, destroy=True, leave_data=leave_data, git_fetch=git_fetch)

        if capture_fincore:
            start_fincore_capture(interval=10)

        for operation_i, operation in enumerate(operations, 1):
            start = datetime.datetime.now()
            stats = {"id":str(uuid.uuid1()), "type":operation['type'],
                     "revision": revision, "git_id": git_id, "start_date":start.isoformat(),
                     "label":revision_config.get('label', revision_config['revision'])}

            if operation['type'] == 'stress':
                # Default to all the nodes of the cluster if no
                # nodes were specified in the command:
                if 'nodes' in operation:
                    # str.join accepts only a positional iterable; the
                    # former keyword form raised TypeError.
                    cmd = "{command} -node {hosts}".format(
                        command=operation['command'],
                        hosts=",".join(operation['nodes']))
                elif '-node' in operation['command']:
                    cmd = operation['command']
                else:
                    cmd = "{command} -node {hosts}".format(
                        command=operation['command'],
                        hosts=",".join(fab_config['hosts']))
                stats['command'] = cmd
                stats['intervals'] = []
                # Name the test after the first word of the stress
                # command (e.g. "1_write").
                stats['test'] = '{operation_i}_{operation}'.format(
                    operation_i=operation_i, operation=cmd.strip().split(' ')[0]).replace(" ","_")
                logger.info('Running stress operation : {cmd}  ...'.format(cmd=cmd))
                # Run stress:
                # (stress takes the stats as a parameter, and adds
                #  more as it runs):
                stats = stress(cmd, revision, stats, stress_revision=revision_config.get('stress_revision', None))
                # Wait for all compactions to finish (unless disabled):
                if operation.get('wait_for_compaction', True):
                    compaction_throughput = revision_config.get("compaction_throughput_mb_per_sec", 16)
                    wait_for_compaction(compaction_throughput=compaction_throughput)

            elif operation['type'] == 'nodetool':
                # Resolve the target nodes without writing a default
                # back into the caller's operation dictionary.
                node_spec = operation.get('nodes', 'all')
                if node_spec in ['all', 'ALL']:
                    nodes = list(fab_config['hosts'])
                else:
                    nodes = node_spec

                logger.info("Running nodetool on {nodes} with command: {command}".format(nodes=node_spec, command=operation['command']))
                stats['command'] = operation['command']
                output = nodetool_multi(nodes, operation['command'])
                stats['output'] = output
                logger.info("Nodetool command finished on all nodes")

            elif operation['type'] == 'cqlsh':
                logger.info("Running cqlsh commands on {node}".format(node=operation['node']))
                output = cqlsh(operation['script'], operation['node'])
                stats['output'] = output.split("\n")
                logger.info("Cqlsh commands finished")

            elif operation['type'] == 'bash':
                nodes = operation.get('nodes', list(fab_config['hosts']))
                # The placeholder must match the keyword passed to
                # format(); the former "{node}" raised KeyError.
                logger.info("Running bash commands on: {nodes}".format(nodes=nodes))
                output = bash(operation['script'], nodes)
                # NOTE(review): assumes bash() returns a single string -
                # confirm against its definition.
                stats['output'] = output.split("\n")
                logger.info("Bash commands finished")


            end = datetime.datetime.now()
            stats['end_date'] = end.isoformat()
            stats['op_duration'] = str(end - start)
            log_stats(stats, file=log)

            # Copy node logs:
            logs_dir = os.path.join(os.path.expanduser('~'),'.cstar_perf','logs')
            log_dir = os.path.join(logs_dir, stats['id'])
            os.makedirs(log_dir)
            retrieve_logs(log_dir)
            revision_config['last_log'] = stats['id']
            # Tar them for archiving:
            subprocess.Popen(shlex.split('tar cfvz {id}.tar.gz {id}'.format(id=stats['id'])), cwd=logs_dir).communicate()
            shutil.rmtree(log_dir)

            if capture_fincore:
                stop_fincore_capture()
                # NOTE(review): log_dir was removed just above, after the
                # archive was written - confirm retrieve_fincore_logs is
                # meant to run after the tar/rmtree step.
                retrieve_fincore_logs(log_dir)
                # Restart fincore capture if this is not the last
                # operation:
                if operation_i < len(operations):
                    start_fincore_capture(interval=10)

        log_add_data(log, {'title':title,
                           'subtitle': subtitle,
                           'revisions': revisions})

        # The leave_data decision is read from the LAST revision's config
        # (default False) for every revision.
        if revisions[-1].get('leave_data', False):
            teardown(destroy=False, leave_data=True)
        else:
            teardown(destroy=True, leave_data=False)
コード例 #3
0
def stress_compare(revisions,
                   title,
                   log,
                   operations=[],
                   subtitle='',
                   capture_fincore=False,
                   initial_destroy=True,
                   leave_data=False,
                   keep_page_cache=False):
    """
    Run Stress on multiple C* branches and compare them.

    For each revision, in order: bootstrap a cluster, run every
    operation against it, append the per-operation stats to ``log``,
    archive the node logs, then tear the cluster down again.

    revisions - List of dictionaries that contain cluster configurations
                to trial. Each one is layered over a copy of the default
                config and must contain a 'revision' key. 'label',
                'leave_data' and 'compaction_throughput_mb_per_sec' are
                read if present. Each dictionary is updated in place
                with 'git_id' and 'last_log'.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations. Example:
       [# cassandra-stress command, node defaults to cluster defined 'stress_node'
        {'type': 'stress',
         'command': 'write n=19M -rate threads=50',
         'node': 'node1',
         'wait_for_compaction': True},
        # nodetool command to run in parallel on nodes:
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script, node defaults to cluster defined 'stress_node'
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'}
       ]
       Keys read by this function, per 'type':
         stress   - 'command'; optional 'nodes' (list, appended as
                    "-node a,b"; when absent and the command has no
                    '-node', all hosts of fab_config are used);
                    optional 'stress_revision' (default 'default');
                    optional 'wait_for_compaction' (default True)
         nodetool - 'command'; optional 'nodes' (list, or 'all'/'ALL';
                    default 'all')
         cqlsh    - 'script', 'node'
         bash     - 'script'; optional 'nodes' (default: all hosts)
       An operation of any other type runs nothing but is still logged.
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.

    Raises ValueError when initial_destroy or leave_data is True in the
    call but explicitly False in the job / revision configuration.
    """
    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # The initial_destroy setting can be set in the job
    # configuration, or manually in the call to this function. Either
    # is fine, but they shouldn't conflict. If they do, ValueError is
    # raised.
    # NOTE: the `==` comparisons are kept as-is; switching to `is`
    # would change how non-bool values such as 0/1 are treated.
    if initial_destroy == True and pristine_config.get('initial_destroy',
                                                       None) == False:
        raise ValueError(
            'setting for initial_destroy conflicts in job config and stress_compare() call'
        )
    else:
        initial_destroy = pristine_config.get('initial_destroy',
                                              initial_destroy)

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # Update our local cassandra git remotes and branches
    _, localhost_entry = get_localhost()
    with common.fab.settings(hosts=[localhost_entry]):
        execute(cstar.update_cassandra_git)

    clean_stress()
    # Every distinct stress revision requested by any operation; the
    # resulting mapping is indexed by revision name below ('default'
    # when an operation does not name one).
    stress_revisions = set(
        operation['stress_revision'] for operation in operations
        if 'stress_revision' in operation)
    stress_shas = setup_stress(stress_revisions)

    for rev_num, revision_config in enumerate(revisions):
        # Per-revision config = default config overlaid with the revision.
        config = copy.copy(pristine_config)
        config.update(revision_config)
        revision = revision_config['revision']
        config['log'] = log
        config['title'] = title
        config['subtitle'] = subtitle
        # Raises KeyError when the merged config has no 'product' key.
        product = dse if config['product'] == 'dse' else cstar

        # The leave_data setting can be set in the revision
        # configuration, or manually in the call to this function.
        # Either is fine, but they shouldn't conflict. If they do,
        # ValueError is raised.
        # NOTE: leave_data is reassigned here, so a value taken from one
        # revision carries over to the following revisions.
        if leave_data == True and revision_config.get('leave_data',
                                                      None) == False:
            raise ValueError(
                'setting for leave_data conflicts in job config and stress_compare() call'
            )
        else:
            leave_data = revision_config.get('leave_data', leave_data)

        logger.info(
            "Bringing up {revision} cluster...".format(revision=revision))

        # Drop the page cache between each revision, especially
        # important when leave_data=True :
        if not keep_page_cache:
            drop_page_cache()

        # Only fetch from git on the first run:
        git_fetch = rev_num == 0
        revision_config['git_id'] = git_id = bootstrap(config,
                                                       destroy=True,
                                                       leave_data=leave_data,
                                                       git_fetch=git_fetch)

        if capture_fincore:
            start_fincore_capture(interval=10)

        for operation_i, operation in enumerate(operations, 1):
            start = datetime.datetime.now()
            stats = {
                "id": str(uuid.uuid1()),
                "type": operation['type'],
                "revision": revision,
                "git_id": git_id,
                "start_date": start.isoformat(),
                "label": revision_config.get('label',
                                             revision_config['revision'])
            }

            if operation['type'] == 'stress':
                # Default to all the nodes of the cluster if no
                # nodes were specified in the command:
                if 'nodes' in operation:
                    # str.join accepts only a positional iterable; the
                    # former keyword form raised TypeError.
                    cmd = "{command} -node {hosts}".format(
                        command=operation['command'],
                        hosts=",".join(operation['nodes']))
                elif '-node' in operation['command']:
                    cmd = operation['command']
                else:
                    cmd = "{command} -node {hosts}".format(
                        command=operation['command'],
                        hosts=",".join(fab_config['hosts']))
                stats['command'] = cmd
                stats['intervals'] = []
                # Name the test after the first word of the stress
                # command (e.g. "1_write").
                stats['test'] = '{operation_i}_{operation}'.format(
                    operation_i=operation_i,
                    operation=cmd.strip().split(' ')[0]).replace(" ", "_")
                logger.info(
                    'Running stress operation : {cmd}  ...'.format(cmd=cmd))
                # Run stress:
                # (stress takes the stats as a parameter, and adds
                #  more as it runs):
                stress_sha = stress_shas[operation.get('stress_revision',
                                                       'default')]
                stats = stress(cmd, revision, stress_sha, stats=stats)
                # Wait for all compactions to finish (unless disabled):
                if operation.get('wait_for_compaction', True):
                    compaction_throughput = revision_config.get(
                        "compaction_throughput_mb_per_sec", 16)
                    wait_for_compaction(
                        compaction_throughput=compaction_throughput)

            elif operation['type'] == 'nodetool':
                # Resolve the target nodes without writing a default
                # back into the caller's operation dictionary.
                node_spec = operation.get('nodes', 'all')
                if node_spec in ['all', 'ALL']:
                    nodes = list(fab_config['hosts'])
                else:
                    nodes = node_spec

                set_nodetool_path(
                    os.path.join(product.get_bin_path(), 'nodetool'))
                logger.info(
                    "Running nodetool on {nodes} with command: {command}".
                    format(nodes=node_spec,
                           command=operation['command']))
                stats['command'] = operation['command']
                output = nodetool_multi(nodes, operation['command'])
                stats['output'] = output
                logger.info("Nodetool command finished on all nodes")

            elif operation['type'] == 'cqlsh':
                logger.info("Running cqlsh commands on {node}".format(
                    node=operation['node']))
                set_cqlsh_path(os.path.join(product.get_bin_path(), 'cqlsh'))
                output = cqlsh(operation['script'], operation['node'])
                stats['output'] = output.split("\n")
                logger.info("Cqlsh commands finished")

            elif operation['type'] == 'bash':
                nodes = operation.get('nodes', list(fab_config['hosts']))
                logger.info(
                    "Running bash commands on: {nodes}".format(nodes=nodes))
                stats['output'] = bash(operation['script'], nodes)
                logger.info("Bash commands finished")

            end = datetime.datetime.now()
            stats['end_date'] = end.isoformat()
            stats['op_duration'] = str(end - start)
            log_stats(stats, file=log)

            # Copy node logs:
            logs_dir = os.path.join(os.path.expanduser('~'), '.cstar_perf',
                                    'logs')
            log_dir = os.path.join(logs_dir, stats['id'])
            os.makedirs(log_dir)
            retrieve_logs(log_dir)
            revision_config['last_log'] = stats['id']
            # Tar them for archiving:
            subprocess.Popen(shlex.split(
                'tar cfvz {id}.tar.gz {id}'.format(id=stats['id'])),
                             cwd=logs_dir).communicate()
            shutil.rmtree(log_dir)

            if capture_fincore:
                stop_fincore_capture()
                # NOTE(review): log_dir was removed just above, after the
                # archive was written - confirm retrieve_fincore_logs is
                # meant to run after the tar/rmtree step.
                retrieve_fincore_logs(log_dir)
                # Restart fincore capture if this is not the last
                # operation:
                if operation_i < len(operations):
                    start_fincore_capture(interval=10)

        log_add_data(log, {
            'title': title,
            'subtitle': subtitle,
            'revisions': revisions
        })

        # The leave_data decision is read from the LAST revision's config
        # (falling back to the current leave_data value) for every
        # revision.
        if revisions[-1].get('leave_data', leave_data):
            teardown(destroy=False, leave_data=True)
        else:
            teardown(destroy=True, leave_data=False)
コード例 #4
0
def stress_compare(
    revisions,
    title,
    log,
    operations=[],
    subtitle="",
    capture_fincore=False,
    initial_destroy=True,
    leave_data=False,
    keep_page_cache=False,
):
    """
    Run Stress on multiple C* branches and compare them.

    For every revision the cluster is bootstrapped, each operation is run
    in order, its stats are written to ``log`` and the node logs are
    archived as ``~/.cstar_perf/logs/<stats id>.tar.gz``.

    revisions - List of dictionaries that contain cluster configurations
                to trial. This is combined with the default config.
                Each dictionary is updated in place with 'git_id' and
                'last_log' entries.
    title - The title of the comparison
    subtitle - A subtitle for more information (displayed smaller underneath)
    log - The json file path to record stats to
    operations - List of dictionaries indicating the operations. Example:
       [# cassandra-stress command; '-node <hosts>' is appended from
        # 'nodes' if given, otherwise from all cluster hosts, unless the
        # command already contains '-node'
        {'type': 'stress',
         'command': 'write n=19M -rate threads=50',
         'nodes': ['node1', 'node2'],
         'wait_for_compaction': True},
        # nodetool command to run in parallel on nodes
        # ('nodes' defaults to 'all'):
        {'type': 'nodetool',
         'command': 'decommission',
         'nodes': ['node1','node2']},
        # cqlsh script to run on a single node
        {'type': 'cqlsh',
         'script': "use my_ks; INSERT INTO blah (col1, col2) VALUES (val1, val2);",
         'node': 'node1'},
        # bash script ('nodes' defaults to all cluster hosts)
        {'type': 'bash',
         'script': 'df -h',
         'nodes': ['node1']}
       ]
    capture_fincore - Enables capturing of linux-fincore logs of C* data files.
    initial_destroy - Destroy all data before the first revision is run.
    leave_data - Whether to leave the Cassandra data/commitlog/etc directories intact between revisions.
    keep_page_cache - Whether to leave the linux page cache intact between revisions.

    Raises ValueError if initial_destroy or leave_data is True in the call
    but explicitly False in the job / revision configuration.
    """
    validate_revisions_list(revisions)
    validate_operations_list(operations)

    pristine_config = copy.copy(fab_config)

    # initial_destroy setting can be set in the job
    # configuration, or manually in the call to this function. Either
    # is fine, but they shouldn't conflict. If they do, ValueError is
    # raised.
    if initial_destroy == True and pristine_config.get("initial_destroy", None) == False:
        raise ValueError("setting for initial_destroy conflicts in job config and stress_compare() call")
    initial_destroy = pristine_config.get("initial_destroy", initial_destroy)

    if initial_destroy:
        logger.info("Cleaning up from prior runs of stress_compare ...")
        teardown(destroy=True, leave_data=False)

    # Update our local cassandra git remotes and branches
    _, localhost_entry = get_localhost()
    with common.fab.settings(hosts=[localhost_entry]):
        execute(cstar.update_cassandra_git)

    clean_stress()
    # Build every distinct stress revision requested by the operations:
    stress_revisions = {operation["stress_revision"] for operation in operations if "stress_revision" in operation}
    stress_shas = setup_stress(stress_revisions)

    for rev_num, revision_config in enumerate(revisions):
        # Revision settings override the job-wide defaults:
        config = copy.copy(pristine_config)
        config.update(revision_config)
        revision = revision_config["revision"]
        config["log"] = log
        config["title"] = title
        config["subtitle"] = subtitle
        product = dse if config["product"] == "dse" else cstar

        # leave_data setting can be set in the revision
        # configuration, or manually in the call to this function.
        # Either is fine, but they shouldn't conflict. If they do,
        # ValueError is raised.
        if leave_data == True and revision_config.get("leave_data", None) == False:
            raise ValueError("setting for leave_data conflicts in job config and stress_compare() call")
        # NOTE(review): this rebinds the function-level leave_data, so a
        # value picked up from one revision carries over to the following
        # revisions - confirm that this is intended.
        leave_data = revision_config.get("leave_data", leave_data)

        logger.info("Bringing up {revision} cluster...".format(revision=revision))

        # Drop the page cache between each revision, especially
        # important when leave_data=True :
        if not keep_page_cache:
            drop_page_cache()

        # Only fetch from git on the first run:
        git_fetch = rev_num == 0
        revision_config["git_id"] = git_id = bootstrap(config, destroy=True, leave_data=leave_data, git_fetch=git_fetch)

        if capture_fincore:
            start_fincore_capture(interval=10)

        # Operations are numbered from 1; the number is used in the
        # stress test name and to detect the last operation below.
        for operation_i, operation in enumerate(operations, 1):
            start = datetime.datetime.now()
            stats = {
                "id": str(uuid.uuid1()),
                "type": operation["type"],
                "revision": revision,
                "git_id": git_id,
                "start_date": start.isoformat(),
                "label": revision_config.get("label", revision_config["revision"]),
            }

            if operation["type"] == "stress":
                # Default to all the nodes of the cluster if no
                # nodes were specified in the command:
                if "nodes" in operation:
                    cmd = "{command} -node {hosts}".format(
                        command=operation["command"], hosts=",".join(operation["nodes"])
                    )
                elif "-node" in operation["command"]:
                    cmd = operation["command"]
                else:
                    cmd = "{command} -node {hosts}".format(
                        command=operation["command"], hosts=",".join(list(fab_config["hosts"]))
                    )
                stats["command"] = cmd
                stats["intervals"] = []
                # Test name is "<operation number>_<first word of command>":
                stats["test"] = "{operation_i}_{operation}".format(
                    operation_i=operation_i, operation=cmd.strip().split(" ")[0]
                ).replace(" ", "_")
                logger.info("Running stress operation : {cmd}  ...".format(cmd=cmd))
                # Run stress:
                # (stress takes the stats as a parameter, and adds
                #  more as it runs):
                stress_sha = stress_shas[operation.get("stress_revision", "default")]
                stats = stress(cmd, revision, stress_sha, stats=stats)
                # Wait for all compactions to finish (unless disabled):
                if operation.get("wait_for_compaction", True):
                    compaction_throughput = revision_config.get("compaction_throughput_mb_per_sec", 16)
                    wait_for_compaction(compaction_throughput=compaction_throughput)

            elif operation["type"] == "nodetool":
                # Use a local default rather than writing "all" back into
                # the caller's operation dictionary:
                requested_nodes = operation.get("nodes", "all")
                if requested_nodes in ["all", "ALL"]:
                    nodes = list(fab_config["hosts"])
                else:
                    nodes = requested_nodes

                set_nodetool_path(os.path.join(product.get_bin_path(), "nodetool"))
                logger.info(
                    "Running nodetool on {nodes} with command: {command}".format(
                        nodes=requested_nodes, command=operation["command"]
                    )
                )
                stats["command"] = operation["command"]
                output = nodetool_multi(nodes, operation["command"])
                stats["output"] = output
                logger.info("Nodetool command finished on all nodes")

            elif operation["type"] == "cqlsh":
                # NOTE(review): 'node' is read without a default here;
                # presumably validate_operations_list() fills it in - confirm.
                logger.info("Running cqlsh commands on {node}".format(node=operation["node"]))
                set_cqlsh_path(os.path.join(product.get_bin_path(), "cqlsh"))
                output = cqlsh(operation["script"], operation["node"])
                stats["output"] = output.split("\n")
                logger.info("Cqlsh commands finished")

            elif operation["type"] == "bash":
                nodes = operation.get("nodes", list(fab_config["hosts"]))
                logger.info("Running bash commands on: {nodes}".format(nodes=nodes))
                stats["output"] = bash(operation["script"], nodes)
                logger.info("Bash commands finished")

            end = datetime.datetime.now()
            stats["end_date"] = end.isoformat()
            stats["op_duration"] = str(end - start)
            log_stats(stats, file=log)

            # Copy node logs:
            logs_dir = os.path.join(os.path.expanduser("~"), ".cstar_perf", "logs")
            log_dir = os.path.join(logs_dir, stats["id"])
            os.makedirs(log_dir)
            retrieve_logs(log_dir)
            revision_config["last_log"] = stats["id"]
            # Tar them for archiving (run from logs_dir so the archive
            # contains the relative <id>/ directory), then drop the
            # uncompressed copy:
            subprocess.Popen(
                shlex.split("tar cfvz {id}.tar.gz {id}".format(id=stats["id"])), cwd=logs_dir
            ).communicate()
            shutil.rmtree(log_dir)

            if capture_fincore:
                stop_fincore_capture()
                # NOTE(review): log_dir was removed by the rmtree above and
                # the tarball has already been written - confirm that
                # retrieve_fincore_logs() recreates the directory and that
                # leaving the fincore logs out of the archive is intended.
                retrieve_fincore_logs(log_dir)
                # Restart fincore capture if this is not the last
                # operation:
                if operation_i < len(operations):
                    start_fincore_capture(interval=10)

        # Record the comparison metadata (including the git_id / last_log
        # values added to the revisions above) once the revision is done:
        log_add_data(log, {"title": title, "subtitle": subtitle, "revisions": revisions})

        # Teardown after each revision; whether data is kept is decided by
        # the *last* revision's leave_data setting (falling back to the
        # current leave_data value).
        if revisions[-1].get("leave_data", leave_data):
            teardown(destroy=False, leave_data=True)
        else:
            teardown(destroy=True, leave_data=False)