def execute_query(aws_key_id, aws_key, args, query, branch, is_first_branch):
    """
  Executes the specified query using the branch that is currently checked out (whose name is
  `branch`). Copies the event log and continuous monitors file to  `args.output_dir`.

  `is_first_branch` should be set to True if this is the first branch to be tested.
  """
    print_heading("Executing {} {} of query {} using branch '{}'".format(
        args.num_trials, "trial" if (args.num_trials == 1) else "trials",
        query, branch))

    # Restart Spark and the Thrift server to make sure that we are using the correct
    # version, and to start using new log files.
    stop_thriftserver()
    stop_spark()
    start_spark()

    benchmark_runner_dir = path.join(args.benchmark_dir, "runner")
    driver_addr = subprocess.check_output(
        "curl -s http://169.254.169.254/latest/meta-data/public-hostname",
        shell=True)

    print "Creating benchmark tables and starting the Thrift server"
    prepare_benchmark_script = path.join(benchmark_runner_dir,
                                         "prepare-benchmark.sh")
    prepare_benchmark_command = "{} \
    --spark \
    --aws-key-id={} \
    --aws-key={} \
    --spark-host={} \
    --spark-identity-file={} \
    --scale-factor={} \
    --file-format={} \
    --skip-s3-import".format(prepare_benchmark_script, aws_key_id, aws_key,
                             driver_addr, args.identity_file,
                             args.scale_factor, args.file_format)

    if args.parquet:
        prepare_benchmark_command += " --parquet"
        if not is_first_branch or args.skip_parquet_conversion:
            prepare_benchmark_command += " --skip-parquet-conversion"
    execute_shell_command(prepare_benchmark_command)

    if args.memory:
        cache_table_for_query(query)

    print "Executing query"
    run_query_script = path.join(benchmark_runner_dir, "run-query.sh")
    run_query_command = "{} \
    --spark \
    --spark-host={} \
    --spark-identity-file={} \
    --query-num={} \
    --num-trials={} \
    --clear-buffer-cache".format(run_query_script, driver_addr,
                                 args.identity_file, query, args.num_trials)
    if args.compress_output:
        run_query_command += " --compress"
    if args.memory:
        run_query_command += " --spark-cache-output-tables"
    execute_shell_command(run_query_command)

    # Stop the Thrift server and Spark to stop using the Spark log files.
    stop_thriftserver()
    stop_spark()

    print "Retrieving logs"
    parameters = [query, branch]
    log_dir = utils.copy_all_logs(parameters, utils.get_workers())
    utils.copy_all_traces(log_dir, driver_addr, utils.get_workers())
    log_files = path.join(log_dir, "*")

    # Move the logs into a new directory: output_dir/query/branch/
    output_dir = path.join(args.output_dir, query, branch)
    execute_shell_command("mkdir -pv {}".format(output_dir))
    execute_shell_command("mv -v {} {}".format(log_files, output_dir))
    execute_shell_command("rm -rf {}".format(log_dir))
Example #2
def solver(data):
    # initialise solver
    solver = pywrapcp.Solver("allocations")

    tasks = utils.get_tasks(data['scheduledTasks'])
    workers = utils.get_workers(data['workers'])

    cost_matrix = data['costMatrix']
    solver_option = data['solverOption']
    time_limit = data['timeLimit']
    extra_constraints = data.get('constraints', {})

    print('solver_option', solver_option)

    num_tasks = len(tasks)
    num_workers = len(workers)

    # declare decision variables and a reference matrix
    assignment_costs = []
    assignments = []
    assignments_ref = []
    for worker in workers:
        worker_assignments = []
        worker_assignments_ref = []
        worker_assignment_costs = []
        for task in tasks:
            worker_assignments.append(
                solver.IntVar(0, 1, f'worker: {worker.id}, task: {task.id}'))
            worker_assignments_ref.append(Worker_task(worker, task))
            worker_assignment_costs.append(cost_matrix[str(
                worker.id)][task.id])
        assignments.append(worker_assignments)
        assignments_ref.append(worker_assignments_ref)
        assignment_costs.append(worker_assignment_costs)
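    # After this loop, assignments is a num_workers x num_tasks matrix of 0/1
    # decision variables, assignments_ref mirrors it with Worker_task records,
    # and assignment_costs holds the matching entries from cost_matrix.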

    constraints = Constraints(
        tasks,
        workers,
        assignment_costs,
        assignments,
        assignments_ref,
    )

    # objective

    # Only add objective if optimisation requested
    if solver_option != 'noOptimisation':
        total_cost = solver.IntVar(0, 3000, "total_cost")

        solver.Add(total_cost == solver.Sum([
            assignment_costs[i][j] * assignments[i][j]
            for i in range(num_workers) for j in range(num_tasks)
        ]))

        objective = solver.Minimize(total_cost, 5)

    # constraints

    # each task is assigned its given qty
    constraints.add_task_qty_constraint(solver)

    # a worker cannot work on two tasks that are on at the same time
    constraints.add_same_worker_same_task_time(solver)

    # a worker can be assigned to the same orderTask date at most once (i.e. cannot take up multiple qty)
    # add any 'must work' constraints, if given
    # add any 'cannot work' constraints, if given
    must_map = extra_constraints.get('mustWork')
    cannot_map = extra_constraints.get('cannotWork')
    constraints.must_cannot_work(solver, must_map, cannot_map)

    # add must combined must work
    if 'combinedMustWork' in extra_constraints:
        constraints.combined_must_work_all(
            solver, extra_constraints['combinedMustWork'])

    # add at least has to work constraint
    if 'atLeastWork' in extra_constraints:
        constraints.add_at_least_work_task(solver,
                                           extra_constraints['atLeastWork'])

    # add total time fatigue constraints
    if 'timeFatigueTotal' in extra_constraints:
        constraints.add_time_fatigue_total(
            solver, extra_constraints['timeFatigueTotal'])

    # add total overall time fatigue constraints
    if 'overallTimeFatigueTotal' in extra_constraints:
        constraints.add_overall_total_fatigue_time(
            solver, extra_constraints['overallTimeFatigueTotal'])

    # add consecutive fatigue constraints
    if 'overallTimeFatigueConsecutive' in extra_constraints:
        constraints.add_overall_consecutive_total_fatigue_time(
            solver, extra_constraints['overallTimeFatigueConsecutive'])

    # add unavailable time constraints
    if 'unavailable' in extra_constraints:
        constraints.add_unavailability(solver,
                                       extra_constraints['unavailable'])

    # add buddy constraints
    if 'buddy' in extra_constraints:
        constraints.add_buddy(solver, extra_constraints['buddy'])

    # add nemesis constraints
    if 'nemesis' in extra_constraints:
        constraints.add_nemesis(solver, extra_constraints['nemesis'])

    # workers must be assigned to at least n tasks (this could change later per worker)
    # [solver.Add(solver.Sum(assignments[i][j] for j in range(num_tasks)) >= 3) for i in range(num_workers)]

    # Create the decision builder.

    # Want to sort the decision variables by least cost to the solution

    if solver_option != 'noOptimisation':
        assignment_ref_copy = copy.deepcopy(assignments_ref)
        assignment_ref_copy_flat = [
            assignment_ref_copy[i][j] for i in range(num_workers)
            for j in range(num_tasks)
        ]
        # Sort by least cost
        assignment_ref_copy_flat.sort(key=lambda wrk_tsk: cost_matrix[str(
            wrk_tsk.worker.id)][wrk_tsk.task.id])
        # map to assignment vars
        assignments_flat = [
            assignments[ref.worker.index][ref.task.index]
            for ref in assignment_ref_copy_flat
        ]
    else:
        assignments_flat = [
            assignments[i][j] for i in range(num_workers)
            for j in range(num_tasks)
        ]

    db = solver.Phase(assignments_flat, solver.CHOOSE_FIRST_UNBOUND,
                      solver.ASSIGN_MAX_VALUE)
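    # CHOOSE_FIRST_UNBOUND visits assignments_flat in the order built above, and
    # ASSIGN_MAX_VALUE tries value 1 before 0 for each 0/1 variable, so in the
    # optimisation case the cheapest worker/task pairs are attempted first.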

    # Create solution collector depending on solver option requested
    if (solver_option == 'optimise'
            and time_limit is not None) or solver_option == 'optimal':
        collector = solver.BestValueSolutionCollector(
            False)  # False finds minimum as best solution
    else:
        collector = solver.FirstSolutionCollector()

    # Add decision vars to collector
    collector.Add(assignments_flat)

    monitor = pywrapcp.SearchMonitor(solver)

    monitor.RestartSearch()

    # Set time limit if given
    if solver_option == 'optimise' and time_limit is not None:
        print('time_limit', time_limit)
        solver_time_limit = solver.TimeLimit(time_limit * 60 * 1000)

    # Solve appropriately
    if solver_option == 'optimal':
        collector.AddObjective(total_cost)
        status = solver.Solve(db, [objective, collector, monitor])
    elif solver_option == 'optimise' and time_limit is not None:
        collector.AddObjective(total_cost)
        status = solver.Solve(
            db, [objective, collector, solver_time_limit, monitor])
    else:
        status = solver.Solve(db, [collector])

    print("Time:", solver.WallTime(), "ms")
    print('status', status)

    # If solution found, collect all assignments
    if status:
        solution_by_task = {}
        solution_by_worker = {}
        for i in range(num_workers):
            for j in range(num_tasks):
                if collector.Value(0, assignments[i][j]) == 1:
                    worker_task = assignments_ref[i][j]
                    # Group the solution by task and by worker
                    solution_by_task.setdefault(worker_task.task.id,
                                                []).append(worker_task.worker.id)
                    solution_by_worker.setdefault(worker_task.worker.id,
                                                  []).append(worker_task.task.id)

        if solver_option == 'optimal' or (solver_option == 'optimise'
                                          and time_limit is not None):
            objective_value = collector.ObjectiveValue(0)
        else:
            objective_value = get_non_optimised_cost(cost_matrix,
                                                     solution_by_task)

        return {
            "status": status,
            "solutionByTask": solution_by_task,
            "solutionByWorker": solution_by_worker,
            "objectiveValue": objective_value
        }

    return {
        "status": status,
        "solutionByTask": None,
        "solutionByWorker": None,
        "objectiveValue": None
    }
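
For reference, the keys read from data at the top of solver imply an input of
roughly the shape below. This is an illustrative sketch inferred from those
accesses, not a documented schema; the record structures expected by
utils.get_tasks and utils.get_workers are left empty here.

example_data = {
    "scheduledTasks": [],        # task records consumed by utils.get_tasks
    "workers": [],               # worker records consumed by utils.get_workers
    "costMatrix": {},            # indexed as cost_matrix[str(worker.id)][task.id]
    "solverOption": "optimise",  # 'optimal', 'optimise' or 'noOptimisation'
    "timeLimit": 5,              # minutes; converted to milliseconds for solver.TimeLimit
    "constraints": {},           # optional maps such as 'mustWork', 'cannotWork' or 'buddy'
}
result = solver(example_data)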
Example #3
"""
This script runs a matrix workload that solves a least squares problem using
a series of matrix multiplications.
"""

import subprocess

import utils

CORES_PER_WORKER = 8
# Figure out the public hostname of the machine.
get_hostname_command = "curl -s http://169.254.169.254/latest/meta-data/public-hostname"
MASTER_HOSTNAME = subprocess.check_output(get_hostname_command, shell=True)
print "Running job with master", MASTER_HOSTNAME

workers = utils.get_workers()
total_cores = len(workers) * CORES_PER_WORKER

# Compute the parameters for the experiment.

# Increasing the number of rows increases the CPU time.
total_rows_values = [1024 * 1024, 2 * 1024 * 1024]
# This is basically the number of tasks; increase this for more tasks.
num_row_blocks_values = [total_cores, total_cores * 2]
# Reducing this reduces the computation by the square of the reduction factor.
cols_per_block = 4096
# This is the number of times the shuffle stage will happen.
# To use less memory, reduce this number, and instead increase
# num_repeats (there's one RDD stored in memory for each shuffle
# block).
num_col_blocks = 5
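
A minimal sketch of how the parameter lists above could be expanded into
individual experiment configurations; run_experiment is a hypothetical
placeholder, not part of the original script.

for total_rows in total_rows_values:
    for num_row_blocks in num_row_blocks_values:
        rows_per_block = total_rows / num_row_blocks
        print "Configuration:", total_rows, num_row_blocks, rows_per_block
        # run_experiment is a hypothetical placeholder for submitting the job.
        run_experiment(total_rows, num_row_blocks, cols_per_block, num_col_blocks)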
Example #4

"""
This script runs jobs that process the same amount of data, but use different
numbers of tasks to do so.

Each job reads data from memory, sorts the data (saving intermediate shuffle
data in memory), and stores the output in memory.
"""

import os
import subprocess
import time

import utils

MEGABYTES_PER_GIGABYTE = 1024

slaves = utils.get_workers()
print "Running experiment assuming slaves %s" % slaves

num_machines = len(slaves)
values_per_key = 8
num_shuffles = 5

base_num_tasks = num_machines * 8
num_tasks_multipliers = [8, 4]
target_total_data_gb = num_machines * 0.5

for num_tasks_multiplier in num_tasks_multipliers:
    num_tasks = base_num_tasks * num_tasks_multiplier

    total_num_items = target_total_data_gb / (4.9 + values_per_key * 1.92) * (
        64 * 4000000)
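    # Worked example with illustrative numbers: for 10 machines,
    # target_total_data_gb = 10 * 0.5 = 5 and values_per_key = 8, so
    # total_num_items = 5 / (4.9 + 8 * 1.92) * 256000000
    #                 = 5 / 20.26 * 256000000, roughly 63 million items.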