Example 1
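These snippets are excerpted from larger files, so each omits its imports. As an illustration, a preamble that Example 1 appears to assume is sketched below; the module names (scheduler, utils, job_id_pair) and the SLEEP_TIME constant are inferred from the identifiers used and should be treated as assumptions.

import datetime
import queue
import time

import scheduler  # assumed project module providing Scheduler
import utils      # assumed project module providing parse_trace()/get_policy()
from job_id_pair import JobIdPair  # assumed home of JobIdPair

SLEEP_TIME = 2  # assumed polling interval, in seconds
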
def main(args):
    # Set up jobs.
    jobs_to_complete = set()
    jobs, arrival_times = utils.parse_trace(args.trace_file)
    if args.window_start is not None and args.window_end is not None:
        for i in range(args.window_start, args.window_end):
            jobs_to_complete.add(JobIdPair(i, None))
    else:
        for i in range(len(jobs)):
            jobs_to_complete.add(JobIdPair(i, None))
    job_queue = queue.Queue()
    for (job, arrival_time) in zip(jobs, arrival_times):
        job_queue.put((job, arrival_time))

    # Instantiate scheduler.
    policy = utils.get_policy(args.policy, solver=args.solver, seed=args.seed)
    sched = scheduler.Scheduler(policy,
                                seed=args.seed,
                                throughputs_file=args.throughputs_file,
                                time_per_iteration=args.time_per_iteration,
                                expected_num_workers=args.expected_num_workers,
                                max_rounds=args.max_rounds)

    try:
        # Submit jobs to the scheduler.
        start_time = datetime.datetime.now()
        while not job_queue.empty() and not sched.is_done(jobs_to_complete):
            job, arrival_time = job_queue.get()
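            # Poll until this job's arrival time is reached (sleeping
            # SLEEP_TIME seconds between checks), then submit it; bail out
            # early if the target window of jobs has already completed.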
            while True:
                current_time = datetime.datetime.now()
                elapsed_seconds = (current_time - start_time).total_seconds()
                remaining_time = arrival_time - elapsed_seconds
                if remaining_time <= 0:
                    job_id = sched.add_job(job)
                    break
                elif sched.is_done(jobs_to_complete):
                    break
                else:
                    time.sleep(SLEEP_TIME)

        # Wait for scheduler to complete.
        while not sched.is_done(jobs_to_complete):
            time.sleep(SLEEP_TIME)

        # Print summary information.
        sched.get_average_jct(jobs_to_complete)
        sched.get_completed_steps(jobs_to_complete)
        sched.get_cluster_utilization()
        sched.get_num_lease_extensions()
        if args.timeline_dir is not None:
            sched.save_job_timelines(args.timeline_dir)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        print('Total time taken: %d seconds' % elapsed_time)
    except KeyboardInterrupt:
        pass
    finally:
        sched.shutdown()
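
Example 1's main expects an argparse-style namespace with the attributes accessed above. A minimal driver might look like the following sketch; the flag names mirror the attribute accesses, but the types and defaults are assumptions:

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Run the scheduler over a job trace.')
    parser.add_argument('--trace_file', type=str, required=True)
    parser.add_argument('--policy', type=str, default='fifo')
    parser.add_argument('--solver', type=str, default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--throughputs_file', type=str, default=None)
    parser.add_argument('--time_per_iteration', type=int, default=360)
    parser.add_argument('--expected_num_workers', type=int, default=None)
    parser.add_argument('--max_rounds', type=int, default=None)
    parser.add_argument('--window_start', type=int, default=None)
    parser.add_argument('--window_end', type=int, default=None)
    parser.add_argument('--timeline_dir', type=str, default=None)
    main(parser.parse_args())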
Example 2
def sweep(policy_names_and_num_sub_clusters,
          all_num_jobs,
          num_trials,
          introduce_skew=False):
    all_runtimes = {}
    all_effective_throughputs = {}
    for num_jobs in all_num_jobs:
        all_runtimes[num_jobs] = []
        all_effective_throughputs[num_jobs] = []
        cluster_spec = {
            'v100': max(num_jobs // 4, 1),
            'p100': max(num_jobs // 4, 1),
            'k80': max(num_jobs // 4, 1),
        }
        for i in range(num_trials):
            throughputs, scale_factors, priority_weights = \
                create_problem_instance(num_jobs, cluster_spec,
                                        policy_names_and_num_sub_clusters[0][0], seed=i,
                                        introduce_skew=introduce_skew)
            all_runtimes[num_jobs].append([])
            allocations = []
            for (policy_name,
                 num_sub_clusters) in policy_names_and_num_sub_clusters:
                policy = utils.get_policy(policy_name, solver='ECOS')
                allocation, runtime = harness(
                    policy,
                    throughputs,
                    scale_factors,
                    priority_weights,
                    cluster_spec,
                    num_sub_clusters=num_sub_clusters)
                all_runtimes[num_jobs][-1].append(runtime)
                allocations.append(allocation)

            all_effective_throughputs[num_jobs].append([])
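            # An allocation maps job_id -> worker_type -> fraction of time.
            # A job's effective throughput is the sum over worker types of
            # (allocated fraction) * (throughput on that worker type), with
            # each half of a packed job pair credited separately.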
            for allocation in allocations:
                effective_throughputs = {}
                for job_id in allocation:
                    for single_job_id in job_id.singletons():
                        if single_job_id not in effective_throughputs:
                            effective_throughputs[single_job_id] = 0.0
                    for worker_type in allocation[job_id]:
                        if job_id.is_pair():
                            for k, single_job_id in enumerate(
                                    job_id.singletons()):
                                effective_throughputs[single_job_id] += (
                                    allocation[job_id][worker_type] *
                                    throughputs[job_id][worker_type][k])
                        else:
                            effective_throughputs[job_id] += (
                                allocation[job_id][worker_type] *
                                throughputs[job_id][worker_type])
                all_effective_throughputs[num_jobs][-1].append(
                    effective_throughputs)
    return all_runtimes, all_effective_throughputs
Example 3
def main(args):
    jobs, arrival_times = utils.parse_trace(args.trace_file)
    policy = utils.get_policy(args.policy, solver=args.solver, seed=args.seed)

    sched = scheduler.Scheduler(policy,
                                throughputs_file=args.throughputs_file,
                                simulate=True,
                                seed=args.seed,
                                time_per_iteration=args.time_per_iteration)

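    # Cluster specs arrive as colon-separated GPU counts, e.g. '16:16:16'
    # for 16 V100s, 16 P100s, and 16 K80s; the per-server GPU counts use
    # the same format.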
    num_gpus = args.cluster_spec.split(':')
    cluster_spec = {
        'v100': int(num_gpus[0]),
        'p100': int(num_gpus[1]),
        'k80': int(num_gpus[2]),
    }
    num_gpus_per_server_split = args.num_gpus_per_server.split(':')
    num_gpus_per_server = {
        'v100': int(num_gpus_per_server_split[0]),
        'p100': int(num_gpus_per_server_split[1]),
        'k80': int(num_gpus_per_server_split[2]),
    }
    if args.window_start is not None and args.window_end is not None:
        jobs_to_complete = set()
        for i in range(args.window_start, args.window_end):
            jobs_to_complete.add(JobIdPair(i, None))
    else:
        jobs_to_complete = None

    sched.simulate(cluster_spec,
                   arrival_times,
                   jobs,
                   debug=args.debug,
                   checkpoint_threshold=args.checkpoint_threshold,
                   checkpoint_file=args.checkpoint_file,
                   num_gpus_per_server=num_gpus_per_server,
                   jobs_to_complete=jobs_to_complete)
    sched.get_average_jct(jobs_to_complete)
    sched.get_cluster_utilization()
    sched.get_num_lease_extensions()
    sched.shutdown()
Example 4
def simulate_with_timeout(experiment_id, policy_name, throughputs_file,
                          per_instance_type_prices_dir, available_clouds,
                          assign_SLOs, cluster_spec, lam, seed, interval,
                          fixed_job_duration, generate_multi_gpu_jobs,
                          enable_global_queue, num_total_jobs, solver, log_dir,
                          timeout, verbose, num_gpus_per_server, ideal):
    # Add some random delay to prevent outputs from overlapping.
    # TODO: Replace this with postprocessing in the log parsing script.
    time.sleep(random.uniform(0, 5))
    num_total_jobs_str = 'num_total_jobs=%d.log' % (num_total_jobs)

    cluster_spec_str = 'v100:%d|p100:%d|k80:%d' % (
        cluster_spec['v100'], cluster_spec['p100'], cluster_spec['k80'])
    policy = utils.get_policy(policy_name, seed=seed, solver=solver)
    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Configuration: cluster_spec=%s, policy=%s, '
              'seed=%d, num_total_jobs=%d' %
              (current_time, experiment_id, cluster_spec_str, policy.name,
               seed, num_total_jobs))

    with open(os.path.join(log_dir, num_total_jobs_str), 'w') as f:
        with contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):
            sched = \
                scheduler.Scheduler(
                    policy, throughputs_file=throughputs_file,
                    seed=seed, time_per_iteration=interval,
                    per_instance_type_prices_dir=per_instance_type_prices_dir,
                    available_clouds=available_clouds,
                    assign_SLOs=assign_SLOs,
                    enable_global_queue=enable_global_queue,
                    simulate=True)

            if timeout is None:
                sched.simulate(cluster_spec,
                               lam=lam,
                               fixed_job_duration=fixed_job_duration,
                               generate_multi_gpu_jobs=generate_multi_gpu_jobs,
                               num_total_jobs=num_total_jobs,
                               num_gpus_per_server=num_gpus_per_server,
                               ideal=ideal)
                average_jct = sched.get_average_jct()
                utilization = sched.get_cluster_utilization()
                makespan = sched.get_current_timestamp()
                total_cost = sched.get_total_cost()
            else:
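                # Bound the simulation by wall-clock time: func_timeout runs
                # sched.simulate for at most `timeout` seconds and raises
                # FunctionTimedOut if it does not finish, in which case
                # sentinel results are recorded below.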
                try:
                    func_timeout(timeout,
                                 sched.simulate,
                                 args=(cluster_spec, ),
                                 kwargs={
                                     'lam': lam,
                                     'fixed_job_duration': fixed_job_duration,
                                     'generate_multi_gpu_jobs':
                                     generate_multi_gpu_jobs,
                                     'num_total_jobs': num_total_jobs,
                                     'num_gpus_per_server':
                                     num_gpus_per_server,
                                     'ideal': ideal
                                 })
                    average_jct = sched.get_average_jct()
                    utilization = sched.get_cluster_utilization()
                    makespan = sched.get_current_timestamp()
                    total_cost = sched.get_total_cost()
                except FunctionTimedOut:
                    average_jct = float('inf')
                    utilization = 1.0
                    makespan = float('inf')
                    total_cost = float('inf')

    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Results: average JCT=%f, utilization=%f, '
              'makespan=%f, total_cost=$%.2f' %
              (current_time, experiment_id, average_jct, utilization, makespan,
               total_cost))

    sched.shutdown()

    return average_jct, utilization
Example 5
def do_train(sess, args):
    # Set CPU as the default device for the graph; some operations will be
    # moved to GPUs later.
    with tf.device('/cpu:0'):

        # Images and labels placeholders
        images_ph = tf.placeholder(tf.float32,
                                   shape=(None, ) + tuple(args.processed_size),
                                   name='input')
        labels_ph = tf.placeholder(tf.int32, shape=(None,), name='label')

        # A placeholder indicating whether we are training or validating;
        # it is used to set dropout rates and batch-norm parameters.
        is_training_ph = tf.placeholder(tf.bool, name='is_training')

        # Epoch number
        epoch_number = tf.get_variable(
            'epoch_number', [],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, SAVE_VARIABLES])
        global_step = tf.get_variable(
            'global_step', [],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, SAVE_VARIABLES])

        # Weight Decay policy
        wd = utils.get_policy(args.WD_policy, args.WD_details)

        # Learning rate decay policy (if needed)
        lr = utils.get_policy(args.LR_policy, args.LR_details)

        # Create an optimizer that performs gradient descent.
        optimizer = utils.get_optimizer(args.optimizer, lr)

        # build the computational graph using the provided configuration.
        dnn_model = model(images_ph,
                          labels_ph,
                          utils.loss,
                          optimizer,
                          wd,
                          args.architecture,
                          args.num_classes,
                          is_training_ph,
                          args.transfer_mode,
                          num_gpus=args.num_gpus)

        # Create a pipeline to read data from disk.
        # A placeholder for the input-pipeline batch size, used so that each
        # validation example is fed to the network exactly once. Validation
        # runs on a single GPU, so its batch size is capped at 512.
        batch_size_tf = tf.placeholder_with_default(min(512, args.batch_size),
                                                    shape=())

        # A data loader pipeline to read training images and their labels
        train_loader = loader(args.train_info, args.delimiter, args.raw_size,
                              args.processed_size, True,
                              args.chunked_batch_size, args.num_prefetch,
                              args.num_threads, args.path_prefix, args.shuffle)
        # The loader returns images, their labels, and their paths
        images, labels, info = train_loader.load()

        # If validation data are provided, we create an input pipeline to load the validation data
        if args.run_validation:
            val_loader = loader(args.val_info, args.delimiter, args.raw_size,
                                args.processed_size, False, batch_size_tf,
                                args.num_prefetch, args.num_threads,
                                args.path_prefix)
            val_images, val_labels, val_info = val_loader.load()

        # Get training operations to run from the deep learning model
        train_ops = dnn_model.train_ops()

        # Build an initialization operation to run below.
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        sess.run(init)

        if args.retrain_from is not None:
            dnn_model.load(sess, args.retrain_from)

        # Set the start epoch number
        start_epoch = sess.run(epoch_number + 1)

        # Start the queue runners.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Setup a summary writer
        summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

        # The main training loop
        for epoch in range(start_epoch, start_epoch + args.num_epochs):

            # update epoch_number
            sess.run(epoch_number.assign(epoch))

            print("Epoch %d of %d started" %
                  (epoch, start_epoch + args.num_epochs - 1))
            # Training batches
            for step in range(args.num_batches):
                sess.run(global_step.assign(step + epoch * args.num_batches))
                # Train the network on a batch of data (also measuring time)
                start_time = time.time()

                # load a batch from input pipeline
                img, lbl = sess.run([images, labels],
                                    options=args.run_options,
                                    run_metadata=args.run_metadata)

                # train on the loaded batch of data
                _, loss_value, top1_accuracy, topn_accuracy = sess.run(
                    train_ops,
                    feed_dict={
                        images_ph: img,
                        labels_ph: lbl,
                        is_training_ph: True
                    },
                    options=args.run_options,
                    run_metadata=args.run_metadata)
                duration = time.time() - start_time

                # Check for errors
                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Log every ten batches and write TensorBoard summaries
                # every hundred batches
                if step % 10 == 0:

                    num_examples_per_step = args.chunked_batch_size * args.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / args.num_gpus

                    # Log
                    format_str = (
                        '%s: epoch %d of %d, step %d of %d, loss = %.2f, Top-1 = %.2f Top-'
                        + str(args.top_n) +
                        ' = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(
                        format_str %
                        (datetime.now(), epoch, start_epoch + args.num_epochs -
                         1, step, args.num_batches, loss_value, top1_accuracy,
                         topn_accuracy, examples_per_sec, sec_per_batch))
                    sys.stdout.flush()

                if step % 100 == 0:
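                    # Note: calling tf.summary.merge_all() inside the loop
                    # re-creates the merged summary op on every pass, growing
                    # the graph; hoisting it above the training loop would
                    # avoid that.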
                    summary_str = sess.run(tf.summary.merge_all(),
                                           feed_dict={
                                               images_ph: img,
                                               labels_ph: lbl,
                                               is_training_ph: True
                                           })
                    summary_writer.add_summary(summary_str,
                                               args.num_batches * epoch + step)
                    if args.log_debug_info:
                        summary_writer.add_run_metadata(
                            args.run_metadata,
                            'epoch%d step%d' % (epoch, step))

            # Save the model checkpoint periodically after each training epoch
            checkpoint_path = os.path.join(args.log_dir, args.snapshot_prefix)
            dnn_model.save(sess, checkpoint_path, global_step=epoch)

            print("Epoch %d of %d ended. Checkpoint saved at %s" %
                  (epoch, start_epoch + args.num_epochs - 1, args.log_dir))
            sys.stdout.flush()
            # if validation data are provided, evaluate accuracy on the validation set after the end of each epoch
            if args.run_validation:

                print("Evaluating on validation set")
                total_loss = utils.AverageMeter()  # Measures cross-entropy loss
                top1 = utils.AverageMeter()  # Measures top-1 accuracy
                topn = utils.AverageMeter()  # Measures top-n accuracy

                # The validation loop
                for step in range(args.num_val_batches):
                    # Load a batch of data
                    val_img, val_lbl = sess.run(
                        [val_images, val_labels],
                        feed_dict={
                            batch_size_tf:
                            args.num_val_samples % min(512, args.batch_size)
                        } if step == args.num_val_batches - 1 else None,
                        options=args.run_options,
                        run_metadata=args.run_metadata)

                    # validate the network on the loaded batch
                    val_loss, top1_predictions, topn_predictions = sess.run(
                        [train_ops[1], train_ops[2], train_ops[3]],
                        feed_dict={
                            images_ph: val_img,
                            labels_ph: val_lbl,
                            is_training_ph: False
                        },
                        options=args.run_options,
                        run_metadata=args.run_metadata)

                    current_batch_size = val_lbl.shape[0]
                    total_loss.update(val_loss, current_batch_size)
                    top1.update(top1_predictions, current_batch_size)
                    topn.update(topn_predictions, current_batch_size)

                    if step % 10 == 0 or step == args.num_val_batches - 1:
                        print(
                            "Validation step %d of %d, Loss %.2f, Top-1 Accuracy %.2f, Top-%d Accuracy %.2f "
                            % (step, args.num_val_batches, total_loss.avg,
                               top1.avg, args.top_n, topn.avg))
                        sys.stdout.flush()

        coord.request_stop()
        coord.join(threads)
        sess.close()
Example 6
def simulate(policy_name, throughputs_file, cluster_spec, lam, seed, interval,
             jobs_to_complete, fixed_job_duration, generate_multi_gpu_jobs,
             generate_multi_priority_jobs, simulate_steady_state, solver,
             debug, checkpoint_threshold, checkpoint_file,
             profiling_percentage, per_instance_type_prices_dir,
             available_clouds, assign_SLOs, enable_global_queue,
             num_gpus_per_server, output_trace_file_name):
    policy = utils.get_policy(policy_name, solver=solver, seed=seed)
    sched = scheduler.Scheduler(
        policy,
        throughputs_file=throughputs_file,
        seed=seed,
        time_per_iteration=interval,
        simulate=True,
        profiling_percentage=profiling_percentage,
        per_instance_type_prices_dir=per_instance_type_prices_dir,
        available_clouds=available_clouds,
        assign_SLOs=assign_SLOs,
        enable_global_queue=enable_global_queue)

    cluster_spec_str = 'v100:%d|p100:%d|k80:%d' % (
        cluster_spec['v100'], cluster_spec['p100'], cluster_spec['k80'])
    current_time = datetime.datetime.now()
    print('[%s] Configuration: cluster_spec=%s, policy=%s, '
          'seed=%d, lam=%f' %
          (current_time, cluster_spec_str, policy.name, seed, lam),
          file=sys.stderr)

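    # With lam == 0 there is no ongoing arrival process, so bound the
    # simulation by the number of jobs to complete instead.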
    if lam == 0:
        num_total_jobs = len(jobs_to_complete)
    else:
        num_total_jobs = None

    sched.simulate(cluster_spec,
                   lam=lam,
                   jobs_to_complete=jobs_to_complete,
                   fixed_job_duration=fixed_job_duration,
                   generate_multi_gpu_jobs=generate_multi_gpu_jobs,
                   generate_multi_priority_jobs=generate_multi_priority_jobs,
                   simulate_steady_state=simulate_steady_state,
                   num_total_jobs=num_total_jobs,
                   debug=debug,
                   checkpoint_threshold=checkpoint_threshold,
                   checkpoint_file=checkpoint_file,
                   num_gpus_per_server=num_gpus_per_server,
                   output_trace_file_name=output_trace_file_name)
    average_jct = sched.get_average_jct(jobs_to_complete)
    utilization = sched.get_cluster_utilization()
    total_cost = sched.get_total_cost()
    num_SLO_violations = sched.get_num_SLO_violations()
    lease_extension_freq = sched.get_num_lease_extensions()

    current_time = datetime.datetime.now()
    print('[%s] Results: average JCT=%f, utilization=%f, '
          'total_cost=$%.2f, num_SLO_violations=%d, '
          'lease_extension_frequency=%.2f%%' %
          (current_time, average_jct, utilization, total_cost,
           num_SLO_violations, lease_extension_freq),
          file=sys.stderr)

    sched.shutdown()
Example 7
def measure_runtime(num_active_jobs, policy_name, oracle_throughputs,
                    generate_multi_gpu_jobs, generate_multi_priority_jobs,
                    num_trials, solver):
    cluster_spec = {
        'v100': num_active_jobs // 4,
        'p100': num_active_jobs // 4,
        'k80': num_active_jobs // 4,
    }
    print(cluster_spec)

    results_str = '%s,%d' % (policy_name, num_active_jobs)
    results = []
    for trial in range(num_trials):
        throughputs, jobs, scale_factors = generate_input(
            num_active_jobs,
            cluster_spec,
            policy_name,
            oracle_throughputs,
            generate_multi_gpu_jobs,
            generate_multi_priority_jobs,
            seed=trial + 2)
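        # Water-filling policies additionally need entities (e.g. users or
        # teams) with weights and a job-to-entity mapping; build a small
        # random one here.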
        if "water_filling" in policy_name:
            num_entities = 5
            priority_reweighting_policies = {}
            entity_to_job_mapping = {}
            entity_weights = {}
            for i in range(num_entities):
                entity_id = 'entity%d' % i
                priority_reweighting_policies[entity_id] = 'fairness'
                entity_to_job_mapping[entity_id] = []
                entity_weights[entity_id] = random.randint(1, 3)
            policy = utils.get_policy(
                policy_name,
                solver=solver,
                priority_reweighting_policies=priority_reweighting_policies)
        else:
            policy = utils.get_policy(policy_name, solver=solver)
        start_time = time.time()
        with open('/dev/null', 'w') as f:
            with contextlib.redirect_stdout(f):
                if policy.name.startswith('MaxMinFairness'):
                    priority_weights = {
                        JobIdPair(i, None): jobs[i].priority_weight
                        for i in range(num_active_jobs)
                    }
                    if "WaterFilling" in policy.name:
                        for i in range(num_active_jobs):
                            entity_id = 'entity%d' % random.randint(
                                0, num_entities - 1)
                            entity_to_job_mapping[entity_id].append(
                                JobIdPair(i, None))
                        policy.get_allocation(
                            throughputs,
                            scale_factors,
                            priority_weights,
                            cluster_spec,
                            entity_weights=entity_weights,
                            entity_to_job_mapping=entity_to_job_mapping)
                    else:
                        policy.get_allocation(throughputs, scale_factors,
                                              priority_weights, cluster_spec)
                elif policy.name.startswith('MinTotalDuration'):
                    num_steps_remaining = {
                        JobIdPair(i, None): jobs[i].num_steps
                        for i in range(num_active_jobs)
                    }
                    policy.get_allocation(throughputs, scale_factors,
                                          num_steps_remaining, cluster_spec)
                else:
                    policy.get_allocation(throughputs, scale_factors,
                                          cluster_spec)

        runtime = time.time() - start_time
        results.append(runtime)
    results_str += ',' + ','.join(str(result) for result in results)
    results_str += ',' + str(np.mean(results))
    return results_str
Example 8
def simulate_with_timeout(experiment_id, policy_name, throughputs_file,
                          cluster_spec, lam, seed, interval, jobs_to_complete,
                          fixed_job_duration, solver, generate_multi_gpu_jobs,
                          generate_multi_priority_jobs, simulate_steady_state,
                          log_dir, timeout, verbose, checkpoint_threshold,
                          profiling_percentage, num_reference_models,
                          num_gpus_per_server, ideal):
    lam_str = 'lambda=%f.log' % (lam)
    checkpoint_file = None
    if checkpoint_threshold is not None:
        checkpoint_file = os.path.join(log_dir, 'lambda=%f.pickle' % lam)

    cluster_spec_str = 'v100:%d|p100:%d|k80:%d' % (
        cluster_spec['v100'], cluster_spec['p100'], cluster_spec['k80'])
    policy = utils.get_policy(policy_name, solver=solver, seed=seed)
    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Configuration: cluster_spec=%s, policy=%s, '
              'seed=%d, lam=%f, '
              'profiling_percentage=%f, '
              'num_reference_models=%d' %
              (current_time, experiment_id, cluster_spec_str, policy.name,
               seed, lam, profiling_percentage, num_reference_models))

    with open(os.path.join(log_dir, lam_str), 'w') as f:
        with contextlib.redirect_stderr(f), contextlib.redirect_stdout(f):
            sched = scheduler.Scheduler(
                policy,
                throughputs_file=throughputs_file,
                seed=seed,
                time_per_iteration=interval,
                simulate=True,
                profiling_percentage=profiling_percentage,
                num_reference_models=num_reference_models)

            if timeout is None:
                sched.simulate(
                    cluster_spec,
                    lam=lam,
                    jobs_to_complete=jobs_to_complete,
                    fixed_job_duration=fixed_job_duration,
                    generate_multi_gpu_jobs=generate_multi_gpu_jobs,
                    generate_multi_priority_jobs=generate_multi_priority_jobs,
                    simulate_steady_state=simulate_steady_state,
                    checkpoint_file=checkpoint_file,
                    checkpoint_threshold=checkpoint_threshold,
                    num_gpus_per_server=num_gpus_per_server,
                    ideal=ideal)
                average_jct = sched.get_average_jct(jobs_to_complete)
                utilization = 1.0
                if not ideal:
                    utilization = sched.get_cluster_utilization()
            else:
                try:
                    func_timeout(
                        timeout,
                        sched.simulate,
                        args=(cluster_spec, ),
                        kwargs={
                            'lam': lam,
                            'jobs_to_complete': jobs_to_complete,
                            'fixed_job_duration': fixed_job_duration,
                            'generate_multi_gpu_jobs': generate_multi_gpu_jobs,
                            'generate_multi_priority_jobs':
                            generate_multi_priority_jobs,
                            'simulate_steady_state': simulate_steady_state,
                            'checkpoint_file': checkpoint_file,
                            'checkpoint_threshold': checkpoint_threshold,
                            'num_gpus_per_server': num_gpus_per_server,
                            'ideal': ideal
                        })
                    average_jct = sched.get_average_jct(jobs_to_complete)
                    utilization = sched.get_cluster_utilization()
                except FunctionTimedOut:
                    average_jct = float('inf')
                    utilization = 1.0

    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Results: average JCT=%f, utilization=%f' %
              (current_time, experiment_id, average_jct, utilization))
    sched.shutdown()

    return average_jct, utilization
Example 9
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Set up datasets for the data prep step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(
        workspace=workspace,
        name=data_prep_settings.get("dataset_input_name", None)
    ).as_named_input(
        data_prep_settings.get("dataset_input_name", None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment the next lines to register the intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ##########################################
    ### Creating model train Pipeline Step ###
    ##########################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Set up datasets for the model train step
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment the next lines to register the intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(workspace=workspace,
                                               name=model_train_settings.get(
                                                   "compute_target_name",
                                                   None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get("parameters", {})
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get Policy definition
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items() if key not in
            ["method", "policy_method", "evaluation_interval",
             "delay_evaluation"]
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", "") else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", None))
    except Exception:
        print("Not all required parameters specified for HyperDrive step")

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", None))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
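
A caller would typically fetch a Workspace, build the pipeline, and submit it as an experiment run. A minimal sketch using the Azure ML SDK follows; the experiment name is an assumption, and Workspace.from_config() expects a config.json describing the workspace:

from azureml.core import Experiment, Workspace

workspace = Workspace.from_config()  # reads config.json for the workspace
pipeline = create_experiment_config(workspace)
run = Experiment(workspace, 'training-pipeline').submit(pipeline)
run.wait_for_completion(show_output=True)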
Example 10
def main(args):
    rng = random.Random()
    rng.seed(0)
    v100s, p100s, k80s = args.cluster_spec.split(':')
    cluster_spec = {
        'v100': int(v100s),
        'p100': int(p100s),
        'k80': int(k80s),
    }
    worker_types = sorted(cluster_spec.keys())
    oracle_throughputs = \
        utils.read_all_throughputs_json_v2(args.throughputs_file)
    jobs = []
    for i in range(args.num_jobs):
        job_template = JobTable[i % len(JobTable)]
        job_id = JobIdPair(i, None)
        job = utils.generate_job(
            throughputs=oracle_throughputs,
            reference_worker_type='v100',
            rng=rng,
            job_id=job_id,
            generate_multi_gpu_jobs=args.generate_multi_gpu_jobs,
            generate_multi_priority_jobs=args.generate_multi_priority_jobs)
        jobs.append(job)
    policy = utils.get_policy('max_min_fairness_packed', solver=args.solver)
    scale_factors = {job.job_id: job.scale_factor for job in jobs}
    priority_weights = {job.job_id: job.priority_weight for job in jobs}

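    # Time two formulations of the same allocation problem: the exact one
    # over per-job(-pair) throughputs and a coarser one that aggregates
    # throughputs by job type, then compare the resulting allocations and
    # solver runtimes.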
    start = datetime.datetime.now()
    original_allocation = get_allocation(policy, jobs, oracle_throughputs,
                                         cluster_spec, worker_types,
                                         scale_factors, priority_weights)
    original_runtime = datetime.datetime.now() - start

    start = datetime.datetime.now()
    job_type_allocation = \
        get_allocation_using_job_type_throughputs(policy, jobs,
                                                  oracle_throughputs,
                                                  cluster_spec, worker_types,
                                                  scale_factors,
                                                  priority_weights)
    job_id_to_job_type_key = \
        {job.job_id: (job.job_type, job.scale_factor) for job in jobs}
    job_type_runtime = datetime.datetime.now() - start

    if args.verbose:
        print('Original allocation:')
        utils.print_allocation(original_allocation)
        print('')

        print('Allocation using job type throughputs:')
        utils.print_allocation(job_type_allocation)
        print('')

    print('Original effective throughputs:')
    print_effective_throughputs(original_allocation, oracle_throughputs,
                                job_id_to_job_type_key)
    print('')

    print('Effective throughputs using job type throughputs:')
    print_effective_throughputs(job_type_allocation, oracle_throughputs,
                                job_id_to_job_type_key)

    print('Original runtime:', original_runtime.total_seconds())
    print('Runtime using job type throughputs:',
          job_type_runtime.total_seconds())
Example 11
def do_train(args):
    # create model
    dnn_model = model(args.architecture, args.num_classes)

    if args.num_gpus == 1:
        dnn_model = dnn_model.cuda()
    else:
        dnn_model = torch.nn.DataParallel(
            dnn_model, device_ids=range(args.num_gpus)).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    lr = utils.get_policy(args.LR_policy, args.LR_details)
    wd = utils.get_policy(args.WD_policy, args.WD_details)
    optimizer = utils.get_optimizer(args.optimizer, dnn_model.parameters(), 0.01)
    train_loader = data_loader.CSVDataset(
        args.train_info, args.delimiter, args.raw_size, args.processed_size,
        args.batch_size, args.num_workers, args.path_prefix, True,
        shuffle=True).load()
    start_epoch = 0
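    # Note: the `.module` accesses below assume the DataParallel (multi-GPU)
    # path; with args.num_gpus == 1 the model is not wrapped, so loading
    # would need dnn_model.load_state_dict(...) instead.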
    if args.retrain_from is not None:
        checkpoint = torch.load(utils.smart_load(args.retrain_from))
        dnn_model.module.load_state_dict(checkpoint['model'])
        if args.transfer_mode[0] == 0:
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
        if args.transfer_mode[0] == 1 or args.transfer_mode[0] == 3:
            dnn_model.freeze()
    

    if args.run_validation:
        val_loader = data_loader.CSVDataset(
            args.val_info, args.delimiter, args.raw_size, args.processed_size,
            args.batch_size, args.num_workers, args.path_prefix, False,
            shuffle=False).load()
    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        if args.optimizer not in ['adam', 'adadelta']:
            utils.adjust_param(optimizer, 'lr', lr, epoch)
        utils.adjust_param(optimizer, 'weight_decay', wd, epoch)
        if args.transfer_mode[0] == 3 and epoch == args.transfer_mode[1]:
            dnn_model.unfreeze()
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses = utils.AverageMeter()
        top1 = utils.AverageMeter()
        topn = utils.AverageMeter()

        # switch to train mode
        dnn_model.train()
        end = time.time()
        for step, (input, target, _) in islice(enumerate(train_loader), args.num_batches):
            # measure data loading time
            data_time.update(time.time() - end)
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = dnn_model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, precn = utils.accuracy(output, target, topk=(1, args.top_n))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            topn.update(precn[0], input.size(0))
            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if step % 10 == 0:
                format_str = ('%s: epoch %d, step %d, loss = %.2f, Top-1 = %.2f Top-' + str(args.top_n) + ' = %.2f')
                print(format_str % (datetime.now(), epoch, step, losses.val, top1.val, topn.val))
                sys.stdout.flush()
        state = {'epoch': epoch + 1,
                 'arch': args.architecture,
                 'num_classes': args.num_classes,
                 'model': dnn_model.module.state_dict(),
                 'optimizer': optimizer.state_dict()}
        torch.save(state,
                   utils.smart_save(
                       os.path.join(args.log_dir,
                                    'checkpoint%04d.pth.tar' % epoch),
                       max_to_keep=args.max_to_keep))
        # If validation data are provided, evaluate accuracy on the
        # validation set after each epoch
        if args.run_validation:
            valbatch_time = utils.AverageMeter()
            vallosses = utils.AverageMeter()
            valtop1 = utils.AverageMeter()
            valtop5 = utils.AverageMeter()

            # switch to evaluate mode
            dnn_model.eval()

            with torch.no_grad():
                end = time.time()
                for i, (input, target, _) in enumerate(val_loader):
                    input = input.cuda(non_blocking=True)
                    target = target.cuda(non_blocking=True)

                    # compute output
                    output = dnn_model(input)
                    loss = criterion(output, target)

                    # measure accuracy and record loss
                    prec1, prec5 = utils.accuracy(output, target, topk=(1, 5))
                    vallosses.update(loss.item(), input.size(0))
                    valtop1.update(prec1[0], input.size(0))
                    valtop5.update(prec5[0], input.size(0))

                    # measure elapsed time
                    valbatch_time.update(time.time() - end)
                    end = time.time()

                    print('Test: [{0}/{1}]\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                              i, len(val_loader), batch_time=valbatch_time,
                              loss=vallosses, top1=valtop1, top5=valtop5))
                    sys.stdout.flush()
    print('Training finished')