def postprocess(runtime_info, experiment_directory):
    """Build the list of postprocessed epoch records, in execution order.

    ``runtime_info`` must provide a "stats" and a "stage_info" mapping, both
    keyed by ``(job, phase, epoch)`` tuples. Every epoch found in "stats" is
    run through ``postprocess_epoch`` and the result is tagged with its
    "job", "phase" and "epoch" before being appended to the returned list.

    Jobs whose description directory does not exist are processed with a
    description of None and an empty phase list.
    """
    stats_info = runtime_info["stats"]
    stage_info = runtime_info["stage_info"]

    # Jobs in the order in which they were executed
    job_sequence = utils.job_sequence(experiment_directory)

    # Load a description for every job that has one on disk
    descriptions = {}
    for job_name in job_sequence:
        job_description_dir = utils.job_description(
            experiment_directory, job_name)
        if os.path.exists(job_description_dir):
            descriptions[job_name] = Description(job_description_dir)

    # Phase list per job; jobs without a description get no phases
    phase_sequence = dict(
        (job_name,
         descriptions[job_name].getPhaseList()
         if job_name in descriptions else [])
        for job_name in job_sequence)

    # Order the epochs by job sequence and phase sequence
    epoch_keys = list(stats_info.keys())
    epoch_keys.sort(key=epoch_comparator(job_sequence, phase_sequence))

    results = []

    for epoch_key in epoch_keys:
        job, phase, epoch = epoch_key

        record = postprocess_epoch(
            stats_info[epoch_key], stage_info[epoch_key], phase,
            descriptions.get(job))

        record["job"] = job
        record["phase"] = phase
        record["epoch"] = epoch

        results.append(record)

    return results
def run_benchmark_iterations(
    binary, log_directory, config, peer_ips, profiler, profiler_options,
    iterations, sleep, delete_output, per_peer_config, dump_core_directory,
    solo_mode, stage_stats, interfaces, params=""):
    """Run a single-phase benchmark several times and report throughputs.

    Every iteration gets a fresh ``batch_<N>`` directory under
    ``log_directory`` that holds a copy of the description files, a copy of
    the config and a ``results`` file. After the last iteration a summary of
    all per-node throughputs is printed.

    Args:
        binary: path to the benchmark binary. A ``description`` directory is
            expected in the same directory.
        log_directory: directory in which the ``batch_<N>`` directories are
            created.
        config: path to the YAML application config. It must define
            ``BENCHMARK_DATA_SIZE_PER_NODE`` for the benchmark's phase;
            ``NUM_INTERFACES`` and ``OUTPUT_DISK_LIST`` are optional.
        peer_ips: comma-separated peer addresses (a trailing comma is
            tolerated).
        profiler, profiler_options, per_peer_config, dump_core_directory,
        solo_mode, params: forwarded unchanged to ``run_benchmark``.
        iterations: number of benchmark runs.
        sleep: seconds to sleep between two consecutive iterations.
        delete_output: if true, the phase's ``OUTPUT_DISK_LIST`` entries are
            removed on every node after each iteration.
        stage_stats: comma-separated names of stages whose per-node
            processing rate should be reported, or None.
        interfaces: comma-separated interface names, or None. The first
            non-empty name is forwarded to ``run_benchmark``; if there is
            none, None is forwarded.

    Exits via ``sys.exit`` when the config file or the description directory
    is missing, or when the benchmark does not have exactly one phase.
    """
    # Get ssh username and themis directory. Only the username is used
    # below; the other values are still read as before.
    username, themis_directory = read_conf_file(
        "cluster.conf", "cluster", ["username", "themis_directory"])
    themis_directory = os.path.expanduser(themis_directory)
    # Get cloud provider if applicable.
    provider = read_conf_file("cluster.conf", "cluster", "provider")

    # Use the first non-empty interface name, if any. An empty or
    # comma-only string is treated like "no interface given" instead of
    # raising an IndexError.
    vnstat_interface = None
    if interfaces is not None:
        interface_list = [name for name in interfaces.split(",") if name]
        if interface_list:
            vnstat_interface = interface_list[0]

    if not os.path.exists(config):
        sys.exit("Config file %s does not exist." % config)

    with open(config, "r") as fp:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects and is rejected by recent PyYAML
        # releases. Consider yaml.safe_load if the config only contains
        # plain scalars, lists and mappings.
        app_config = yaml.load(fp)

    # If we're using more than 1 network interface per peer, the peer list is
    # going to look like:
    # Peer1_interface1, Peer1_interface2, Peer2_interface1, Peer2_interface2, ..
    # In this case, we only want to launch the benchmark once per peer, so
    # make sure we only look at the first interface for each peer, and let
    # the application itself deal with the other interfaces.
    num_interfaces = 1
    if "NUM_INTERFACES" in app_config:
        num_interfaces = app_config["NUM_INTERFACES"]

    # Remove trailing comma if any from the IP list. This will be the string we
    # pass into the benchmark binary.
    peer_list = peer_ips.rstrip(",")

    # If we're using multiple interfaces, only launch the benchmark once per
    # node.
    node_list = peer_list.split(",")[::num_interfaces]

    # Look for description files in the same directory as the binary.
    binary_dir = os.path.dirname(binary)
    description_directory = os.path.join(binary_dir, "description")

    if not os.path.exists(description_directory):
        sys.exit(
            "Could not find description directory %s" % description_directory)

    # Check for the phase name. For simplicity we're going to require that
    # the benchmark have only 1 phase
    description = Description(description_directory)
    phases = description.getPhaseList()
    if len(phases) != 1:
        sys.exit("Benchmark must have exactly one phase. Got %s" % phases)
    phase_name = phases[0]

    data_size_per_node = int(
        app_config["BENCHMARK_DATA_SIZE_PER_NODE"][phase_name])
    data_size = data_size_per_node * len(node_list)

    # Running sum of the per-node rate of every requested stage, used for
    # the average printed at the very end.
    total_throughputs = {}
    if stage_stats is not None:
        stage_stats = stage_stats.split(",")
        for stage in stage_stats:
            total_throughputs[stage] = 0.0

    node_benchmark_throughputs = []

    for i in xrange(iterations):
        # Pick a unique batch ID
        batch = 0
        while os.path.exists(os.path.join(log_directory, "batch_%d" % batch)):
            batch += 1
        batch_directory = os.path.join(log_directory, "batch_%d" % batch)

        # Create directories
        phase_directory = os.path.join(batch_directory, phase_name)
        parallel_ssh(
            None, "mkdir -p %s" % phase_directory, username, node_list, False,
            True, False)

        # Copy description files and create phase directory.
        if not os.path.exists(batch_directory):
            os.makedirs(batch_directory)
        for description_file in ("stages.json", "structure.json"):
            shutil.copy(
                os.path.join(description_directory, description_file),
                batch_directory)
        for description_file in ("stages.json", "structure.json"):
            os.chmod(os.path.join(batch_directory, description_file), 0o777)

        # Copy config file
        shutil.copyfile(config, os.path.join(batch_directory, "config.yaml"))

        print("\nLogging to %s" % batch_directory)
        print("Running %s with batch ID %d on %d nodes..." % (
            phase_name, batch, len(node_list)))

        (elapsed, elapsed_times, completed_ips) = run_benchmark(
            binary, config, batch_directory, phase_directory, profiler,
            profiler_options, peer_list, node_list, per_peer_config,
            dump_core_directory, solo_mode, vnstat_interface, params)

        # Compute overall throughput. float() forces true division so the
        # result is not truncated should an elapsed time be an integer.
        throughput = (float(data_size) / elapsed) / 1000000
        per_node_throughput = (float(data_size_per_node) / elapsed) / 1000000
        print("Completed in %.2f seconds." % elapsed)
        print("  Throughput: %.2f MB/s" % throughput)
        print("  Per-server: %.2f MB/s" % per_node_throughput)

        # Record individual throughputs
        throughputs = [
            (float(data_size_per_node) / x) / 1000000 for x in elapsed_times]
        node_benchmark_throughputs += throughputs

        # Dump these results to a file in the batch directory. The context
        # manager closes the file even if gathering stage stats fails.
        with open(os.path.join(batch_directory, "results"), "w") as \
                results_file:
            results_file.write(
                "Runtime: %.2f seconds\nThroughput: %.2f MB/s\nPer-server: "
                "%.2f MB/s\n\n" % (elapsed, throughput, per_node_throughput))
            results_file.write("Node throughputs: %s\n\n" % throughputs)
            for ip, elapsed_time, node_throughput in zip(
                    completed_ips, elapsed_times, throughputs):
                results_file.write(
                    "Node %s completed in %.2f seconds (%.2f MB/s)\n" % (
                        ip, elapsed_time, node_throughput))
            results_file.write("\n")

            if stage_stats is not None:
                # Compute runtime stat throughputs.
                # NOTE(review): there is no retry limit, so a persistent
                # ValueError keeps this loop running forever.
                done = False
                while not done:
                    # Upload all logs.
                    upload_logs()

                    # Download logs locally.
                    download_logs()

                    try:
                        runtime_info = gather_runtime_info(
                            batch_directory, False)
                        done = True
                    except ValueError:
                        print("Runtime info script failed. Retrying log "
                              "upload/downloads.")

                stage_info = runtime_info[0]["stages"]
                node_throughputs = {}
                for worker_info in stage_info:
                    stats_info = worker_info["stats_info"]
                    # We only want to look at the overall stats, which
                    # includes all nodes (hostname or worker ID won't be
                    # specified)
                    if len(stats_info) == 1:
                        stage_name = stats_info["stage"]

                        if stage_name in stage_stats:
                            # This is one of the stages we care about
                            node_throughputs[stage_name] = worker_info[
                                "observed_processing_rate_per_node"]
                            total_throughputs[stage_name] += \
                                node_throughputs[stage_name]

                # Print throughputs in the correct order.
                for stage_name in stage_stats:
                    print("  %s throughput: %.2f MB/s/node" % (
                        stage_name, node_throughputs[stage_name]))
                    results_file.write("%s throughput: %.2f MB/s\n" % (
                        stage_name, node_throughputs[stage_name]))

        if delete_output and "OUTPUT_DISK_LIST" in app_config and \
                phase_name in app_config["OUTPUT_DISK_LIST"]:
            output_disk_list = app_config["OUTPUT_DISK_LIST"][phase_name]
            for disk in output_disk_list.split(","):
                print("Clearing %s" % disk)
                parallel_ssh(
                    None, "rm -rf %s" % disk, username, node_list, False,
                    False, False)

        if sleep > 0 and i != iterations - 1:
            print("Sleeping %d seconds" % sleep)
            time.sleep(sleep)

    print("\nCompleted %d iterations\n" % iterations)

    if node_benchmark_throughputs:
        # Format node throughputs
        node_benchmark_throughput_strings = [
            "%.2f" % x for x in node_benchmark_throughputs]
        print("  Node throughputs (MB/s):")
        print("    %s" % node_benchmark_throughput_strings)
        print("  Average node throughput: %.2f MB/s" % (
            numpy.mean(node_benchmark_throughputs)))
        print("  Standard deviation: %.2f MB/s" % (
            numpy.std(node_benchmark_throughputs)))
        print("  Min node throughput: %.2f MB/s" % (
            numpy.min(node_benchmark_throughputs)))
        print("  Max node throughput: %.2f MB/s\n" % (
            numpy.max(node_benchmark_throughputs)))
    else:
        # numpy.min/max raise on empty input, so skip the statistics.
        print("  No node throughputs were recorded.\n")

    # Averages are only defined when at least one iteration ran.
    if stage_stats is not None and iterations > 0:
        for stage_name in stage_stats:
            print("  Average %s throughput: %.2f MB/s/node" % (
                stage_name, total_throughputs[stage_name] / iterations))
def run_benchmark_iterations(
    binary, log_directory, config, peer_ips, profiler, profiler_options,
    iterations, sleep, delete_output, per_peer_config, dump_core_directory,
    solo_mode, stage_stats, interfaces, params=""):
    """Run a single-phase benchmark several times and report throughputs.

    Every iteration gets a fresh ``batch_<N>`` directory under
    ``log_directory`` that holds a copy of the description files, a copy of
    the config and a ``results`` file. After the last iteration a summary of
    all per-node throughputs is printed.

    Args:
        binary: path to the benchmark binary. A ``description`` directory is
            expected in the same directory.
        log_directory: directory in which the ``batch_<N>`` directories are
            created.
        config: path to the YAML application config. It must define
            ``BENCHMARK_DATA_SIZE_PER_NODE`` for the benchmark's phase;
            ``NUM_INTERFACES`` and ``OUTPUT_DISK_LIST`` are optional.
        peer_ips: comma-separated peer addresses (a trailing comma is
            tolerated).
        profiler, profiler_options, per_peer_config, dump_core_directory,
        solo_mode, params: forwarded unchanged to ``run_benchmark``.
        iterations: number of benchmark runs.
        sleep: seconds to sleep between two consecutive iterations.
        delete_output: if true, the phase's ``OUTPUT_DISK_LIST`` entries are
            removed on every node after each iteration.
        stage_stats: comma-separated names of stages whose per-node
            processing rate should be reported, or None.
        interfaces: comma-separated interface names, or None. The first
            non-empty name is forwarded to ``run_benchmark``; if there is
            none, None is forwarded.

    Exits via ``sys.exit`` when the config file or the description directory
    is missing, or when the benchmark does not have exactly one phase.
    """
    # Get ssh username and themis directory. Only the username is used
    # below; the other values are still read as before.
    username, themis_directory = read_conf_file(
        "cluster.conf", "cluster", ["username", "themis_directory"])
    themis_directory = os.path.expanduser(themis_directory)
    # Get cloud provider if applicable.
    provider = read_conf_file("cluster.conf", "cluster", "provider")

    # Use the first non-empty interface name, if any. An empty or
    # comma-only string is treated like "no interface given" instead of
    # raising an IndexError.
    vnstat_interface = None
    if interfaces is not None:
        interface_list = [name for name in interfaces.split(",") if name]
        if interface_list:
            vnstat_interface = interface_list[0]

    if not os.path.exists(config):
        sys.exit("Config file %s does not exist." % config)

    with open(config, "r") as fp:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects and is rejected by recent PyYAML
        # releases. Consider yaml.safe_load if the config only contains
        # plain scalars, lists and mappings.
        app_config = yaml.load(fp)

    # If we're using more than 1 network interface per peer, the peer list is
    # going to look like:
    # Peer1_interface1, Peer1_interface2, Peer2_interface1, Peer2_interface2, ..
    # In this case, we only want to launch the benchmark once per peer, so
    # make sure we only look at the first interface for each peer, and let
    # the application itself deal with the other interfaces.
    num_interfaces = 1
    if "NUM_INTERFACES" in app_config:
        num_interfaces = app_config["NUM_INTERFACES"]

    # Remove trailing comma if any from the IP list. This will be the string we
    # pass into the benchmark binary.
    peer_list = peer_ips.rstrip(",")

    # If we're using multiple interfaces, only launch the benchmark once per
    # node.
    node_list = peer_list.split(",")[::num_interfaces]

    # Look for description files in the same directory as the binary.
    binary_dir = os.path.dirname(binary)
    description_directory = os.path.join(binary_dir, "description")

    if not os.path.exists(description_directory):
        sys.exit(
            "Could not find description directory %s" % description_directory)

    # Check for the phase name. For simplicity we're going to require that
    # the benchmark have only 1 phase
    description = Description(description_directory)
    phases = description.getPhaseList()
    if len(phases) != 1:
        sys.exit("Benchmark must have exactly one phase. Got %s" % phases)
    phase_name = phases[0]

    data_size_per_node = int(
        app_config["BENCHMARK_DATA_SIZE_PER_NODE"][phase_name])
    data_size = data_size_per_node * len(node_list)

    # Running sum of the per-node rate of every requested stage, used for
    # the average printed at the very end.
    total_throughputs = {}
    if stage_stats is not None:
        stage_stats = stage_stats.split(",")
        for stage in stage_stats:
            total_throughputs[stage] = 0.0

    node_benchmark_throughputs = []

    for i in xrange(iterations):
        # Pick a unique batch ID
        batch = 0
        while os.path.exists(os.path.join(log_directory, "batch_%d" % batch)):
            batch += 1
        batch_directory = os.path.join(log_directory, "batch_%d" % batch)

        # Create directories
        phase_directory = os.path.join(batch_directory, phase_name)
        parallel_ssh(
            None, "mkdir -p %s" % phase_directory, username, node_list, False,
            True, False)

        # Copy description files and create phase directory.
        if not os.path.exists(batch_directory):
            os.makedirs(batch_directory)
        for description_file in ("stages.json", "structure.json"):
            shutil.copy(
                os.path.join(description_directory, description_file),
                batch_directory)
        for description_file in ("stages.json", "structure.json"):
            os.chmod(os.path.join(batch_directory, description_file), 0o777)

        # Copy config file
        shutil.copyfile(config, os.path.join(batch_directory, "config.yaml"))

        print("\nLogging to %s" % batch_directory)
        print("Running %s with batch ID %d on %d nodes..." % (
            phase_name, batch, len(node_list)))

        (elapsed, elapsed_times, completed_ips) = run_benchmark(
            binary, config, batch_directory, phase_directory, profiler,
            profiler_options, peer_list, node_list, per_peer_config,
            dump_core_directory, solo_mode, vnstat_interface, params)

        # Compute overall throughput. float() forces true division so the
        # result is not truncated should an elapsed time be an integer.
        throughput = (float(data_size) / elapsed) / 1000000
        per_node_throughput = (float(data_size_per_node) / elapsed) / 1000000
        print("Completed in %.2f seconds." % elapsed)
        print("  Throughput: %.2f MB/s" % throughput)
        print("  Per-server: %.2f MB/s" % per_node_throughput)

        # Record individual throughputs
        throughputs = [
            (float(data_size_per_node) / x) / 1000000 for x in elapsed_times]
        node_benchmark_throughputs += throughputs

        # Dump these results to a file in the batch directory. The context
        # manager closes the file even if gathering stage stats fails.
        with open(os.path.join(batch_directory, "results"), "w") as \
                results_file:
            results_file.write(
                "Runtime: %.2f seconds\nThroughput: %.2f MB/s\nPer-server: "
                "%.2f MB/s\n\n" % (elapsed, throughput, per_node_throughput))
            results_file.write("Node throughputs: %s\n\n" % throughputs)
            for ip, elapsed_time, node_throughput in zip(
                    completed_ips, elapsed_times, throughputs):
                results_file.write(
                    "Node %s completed in %.2f seconds (%.2f MB/s)\n" % (
                        ip, elapsed_time, node_throughput))
            results_file.write("\n")

            if stage_stats is not None:
                # Compute runtime stat throughputs.
                # NOTE(review): there is no retry limit, so a persistent
                # ValueError keeps this loop running forever.
                done = False
                while not done:
                    # Upload all logs.
                    upload_logs()

                    # Download logs locally.
                    download_logs()

                    try:
                        runtime_info = gather_runtime_info(
                            batch_directory, False)
                        done = True
                    except ValueError:
                        print("Runtime info script failed. Retrying log "
                              "upload/downloads.")

                stage_info = runtime_info[0]["stages"]
                node_throughputs = {}
                for worker_info in stage_info:
                    stats_info = worker_info["stats_info"]
                    # We only want to look at the overall stats, which
                    # includes all nodes (hostname or worker ID won't be
                    # specified)
                    if len(stats_info) == 1:
                        stage_name = stats_info["stage"]

                        if stage_name in stage_stats:
                            # This is one of the stages we care about
                            node_throughputs[stage_name] = worker_info[
                                "observed_processing_rate_per_node"]
                            total_throughputs[stage_name] += \
                                node_throughputs[stage_name]

                # Print throughputs in the correct order.
                for stage_name in stage_stats:
                    print("  %s throughput: %.2f MB/s/node" % (
                        stage_name, node_throughputs[stage_name]))
                    results_file.write("%s throughput: %.2f MB/s\n" % (
                        stage_name, node_throughputs[stage_name]))

        if delete_output and "OUTPUT_DISK_LIST" in app_config and \
                phase_name in app_config["OUTPUT_DISK_LIST"]:
            output_disk_list = app_config["OUTPUT_DISK_LIST"][phase_name]
            for disk in output_disk_list.split(","):
                print("Clearing %s" % disk)
                parallel_ssh(
                    None, "rm -rf %s" % disk, username, node_list, False,
                    False, False)

        if sleep > 0 and i != iterations - 1:
            print("Sleeping %d seconds" % sleep)
            time.sleep(sleep)

    print("\nCompleted %d iterations\n" % iterations)

    if node_benchmark_throughputs:
        # Format node throughputs
        node_benchmark_throughput_strings = [
            "%.2f" % x for x in node_benchmark_throughputs]
        print("  Node throughputs (MB/s):")
        print("    %s" % node_benchmark_throughput_strings)
        print("  Average node throughput: %.2f MB/s" % (
            numpy.mean(node_benchmark_throughputs)))
        print("  Standard deviation: %.2f MB/s" % (
            numpy.std(node_benchmark_throughputs)))
        print("  Min node throughput: %.2f MB/s" % (
            numpy.min(node_benchmark_throughputs)))
        print("  Max node throughput: %.2f MB/s\n" % (
            numpy.max(node_benchmark_throughputs)))
    else:
        # numpy.min/max raise on empty input, so skip the statistics.
        print("  No node throughputs were recorded.\n")

    # Averages are only defined when at least one iteration ran.
    if stage_stats is not None and iterations > 0:
        for stage_name in stage_stats:
            print("  Average %s throughput: %.2f MB/s/node" % (
                stage_name, total_throughputs[stage_name] / iterations))
def plot_timeline_for_phase(log_directory, job, phase, phase_data):
    """Build a layout with one worker-timeline plot per stage of a phase.

    Args:
        log_directory: directory that contains the job's log directory.
        job: job name; the description is loaded from
            ``<log_directory>/<job>/description``.
        phase: phase whose stage ordering determines which plots are
            produced and in which order.
        phase_data: mapping with the special keys "min_timestamp" and
            "max_timestamp" plus ``(hostname, stage, worker_id)`` keys whose
            values hold the worker's start timestamp at index 0 and stop
            timestamp at index 1. Timestamps are divided by 1e6 to obtain
            the seconds shown on the y axis (presumably they are in
            microseconds -- confirm against the producer).

    Returns:
        A PlotLayout holding one plot per stage, in stage order. In every
        plot each worker is one bar that starts at the worker's start time
        and spans its run time; bars are grouped and coloured per host.

    Raises:
        KeyError: if phase_data references a stage that is not part of the
            phase's stage ordering.
    """
    min_timestamp = phase_data["min_timestamp"]
    max_timestamp = phase_data["max_timestamp"]

    description = Description(os.path.join(log_directory, job, "description"))
    stage_ordering = description.getStageOrdering(phase)

    duration_lists = {}

    for stage in stage_ordering:
        duration_lists[stage] = []

    for key in phase_data:
        if key in ("min_timestamp", "max_timestamp"):
            continue

        hostname, stage, worker_id = key

        # Times are made relative to the start of the phase. Only the short
        # host name (up to the first dot) is kept.
        worker_duration_info = Duration(
            hostname.split('.')[0], stage, worker_id,
            (phase_data[key][0] - min_timestamp) / 1000000.0,
            (phase_data[key][1] - min_timestamp) / 1000000.0)

        duration_lists[stage].append(worker_duration_info)

    def sort_function(x):
        return (x.hostname, x.worker_id, x.start_time, x.stop_time)

    layout = PlotLayout()

    bar_colors = [
        "red", "blue", "green", "orange", "gray", "pink", "purple", "black"
    ]

    for stage in stage_ordering:
        duration_list = duration_lists[stage]

        # After sorting, all workers of a host are adjacent and hosts appear
        # in ascending host name order.
        duration_list.sort(key=sort_function)

        bars = {}

        # Set up a "padding" bar that will appear to move bars up so that they
        # start when the worker starts
        start_bar = Bar()
        start_bar.linewidth = 0
        start_bar.color = "white"

        for duration in duration_list:
            if duration.hostname not in bars:
                bars[duration.hostname] = Bar()

            bars[duration.hostname].yValues.append(
                duration.stop_time - duration.start_time)
            start_bar.yValues.append(duration.start_time)

        # Make sure that all bars have the same number of y-axis values,
        # give them x-axis values and set their colors

        start_bar.xValues = range(len(start_bar.yValues))
        start_bar.xTickLabelProperties = {"rotation": 90}

        # Walk the hosts in sorted order. The padding bar's values were
        # appended in that order, so each host's offset must be computed in
        # the same order; plain dict iteration order is not guaranteed to
        # match and would stack a host's bars on another host's start
        # times.
        hostnames = sorted(bars)

        offset = 0

        for i, hostname in enumerate(hostnames):
            bar = bars[hostname]

            # Pad y axis with zeroes so that bars can be laid out next to
            # each other with a StackedBars
            num_y_values = len(bar.yValues)

            bar.yValues = (
                ([0] * offset) + bar.yValues +
                ([0] * (len(duration_list) - (num_y_values + offset))))

            # Put the label for this hostname roughly in the middle of its bar
            # cluster
            start_bar.xTickLabels.append(hostname)
            # Subtracting 0.5 to account for half the width of the bar
            start_bar.xTickLabelPoints.append(
                offset + (num_y_values / 2.0) - 0.5)

            offset += num_y_values

            bar.xValues = range(len(bar.yValues))
            bar.color = bar_colors[i % len(bar_colors)]
            bar.label = hostname

        stacked_bars = StackedBars()
        stacked_bars.add(start_bar)

        for hostname in hostnames:
            stacked_bars.add(bars[hostname])

        plot = Plot()
        # Leave 5% headroom above the end of the phase.
        plot.setYLimits(
            0, ((max_timestamp - min_timestamp) / 1000000.0) * 1.05)
        plot.setXLabel("Worker")
        plot.setYLabel("Time (s)")
        plot.setTitle(stage)

        plot.add(stacked_bars)
        layout.addPlot(plot)

    return layout
def plot_timeline_for_phase(log_directory, job, phase, phase_data):
    """Build a layout with one worker-timeline plot per stage of a phase.

    Args:
        log_directory: directory that contains the job's log directory.
        job: job name; the description is loaded from
            ``<log_directory>/<job>/description``.
        phase: phase whose stage ordering determines which plots are
            produced and in which order.
        phase_data: mapping with the special keys "min_timestamp" and
            "max_timestamp" plus ``(hostname, stage, worker_id)`` keys whose
            values hold the worker's start timestamp at index 0 and stop
            timestamp at index 1. Timestamps are divided by 1e6 to obtain
            the seconds shown on the y axis (presumably they are in
            microseconds -- confirm against the producer).

    Returns:
        A PlotLayout holding one plot per stage, in stage order. In every
        plot each worker is one bar that starts at the worker's start time
        and spans its run time; bars are grouped and coloured per host.

    Raises:
        KeyError: if phase_data references a stage that is not part of the
            phase's stage ordering.
    """
    min_timestamp = phase_data["min_timestamp"]
    max_timestamp = phase_data["max_timestamp"]

    description = Description(os.path.join(log_directory, job, "description"))
    stage_ordering = description.getStageOrdering(phase)

    duration_lists = {}

    for stage in stage_ordering:
        duration_lists[stage] = []

    for key in phase_data:
        if key in ("min_timestamp", "max_timestamp"):
            continue

        hostname, stage, worker_id = key

        # Times are made relative to the start of the phase. Only the short
        # host name (up to the first dot) is kept.
        worker_duration_info = Duration(
            hostname.split('.')[0], stage, worker_id,
            (phase_data[key][0] - min_timestamp) / 1000000.0,
            (phase_data[key][1] - min_timestamp) / 1000000.0)

        duration_lists[stage].append(worker_duration_info)

    def sort_function(x):
        return (x.hostname, x.worker_id, x.start_time, x.stop_time)

    layout = PlotLayout()

    bar_colors = ["red", "blue", "green", "orange", "gray", "pink",
                  "purple", "black"]

    for stage in stage_ordering:
        duration_list = duration_lists[stage]

        # After sorting, all workers of a host are adjacent and hosts appear
        # in ascending host name order.
        duration_list.sort(key=sort_function)

        bars = {}

        # Set up a "padding" bar that will appear to move bars up so that they
        # start when the worker starts
        start_bar = Bar()
        start_bar.linewidth = 0
        start_bar.color = "white"

        for duration in duration_list:
            if duration.hostname not in bars:
                bars[duration.hostname] = Bar()

            bars[duration.hostname].yValues.append(
                duration.stop_time - duration.start_time)
            start_bar.yValues.append(duration.start_time)

        # Make sure that all bars have the same number of y-axis values,
        # give them x-axis values and set their colors

        start_bar.xValues = range(len(start_bar.yValues))
        start_bar.xTickLabelProperties = {
            "rotation" : 90
            }

        # Walk the hosts in sorted order. The padding bar's values were
        # appended in that order, so each host's offset must be computed in
        # the same order; plain dict iteration order is not guaranteed to
        # match and would stack a host's bars on another host's start
        # times.
        hostnames = sorted(bars)

        offset = 0

        for i, hostname in enumerate(hostnames):
            bar = bars[hostname]

            # Pad y axis with zeroes so that bars can be laid out next to
            # each other with a StackedBars
            num_y_values = len(bar.yValues)

            bar.yValues = (([0] * offset) + bar.yValues +
                           ([0] *
                            (len(duration_list) - (num_y_values + offset))))

            # Put the label for this hostname roughly in the middle of its bar
            # cluster
            start_bar.xTickLabels.append(hostname)
            # Subtracting 0.5 to account for half the width of the bar
            start_bar.xTickLabelPoints.append(offset + (num_y_values / 2.0)
                                              - 0.5)

            offset += num_y_values

            bar.xValues = range(len(bar.yValues))
            bar.color = bar_colors[i % len(bar_colors)]
            bar.label = hostname

        stacked_bars = StackedBars()
        stacked_bars.add(start_bar)

        for hostname in hostnames:
            stacked_bars.add(bars[hostname])

        plot = Plot()
        # Leave 5% headroom above the end of the phase.
        plot.setYLimits(0, ((max_timestamp - min_timestamp) / 1000000.0) * 1.05)
        plot.setXLabel("Worker")
        plot.setYLabel("Time (s)")
        plot.setTitle(stage)

        plot.add(stacked_bars)
        layout.addPlot(plot)

    return layout