def main(args):
    # Set up jobs.
    jobs_to_complete = set()
    jobs, arrival_times = utils.parse_trace(args.trace_file)
    if args.window_start is not None and args.window_end is not None:
        for i in range(args.window_start, args.window_end):
            jobs_to_complete.add(JobIdPair(i, None))
    else:
        for i in range(len(jobs)):
            jobs_to_complete.add(JobIdPair(i, None))

    job_queue = queue.Queue()
    for (job, arrival_time) in zip(jobs, arrival_times):
        job_queue.put((job, arrival_time))

    # Instantiate scheduler.
    policy = utils.get_policy(args.policy, solver=args.solver, seed=args.seed)
    sched = scheduler.Scheduler(
        policy,
        seed=args.seed,
        throughputs_file=args.throughputs_file,
        time_per_iteration=args.time_per_iteration,
        expected_num_workers=args.expected_num_workers,
        max_rounds=args.max_rounds)

    try:
        # Submit jobs to the scheduler at their arrival times.
        start_time = datetime.datetime.now()
        while not job_queue.empty() and not sched.is_done(jobs_to_complete):
            job, arrival_time = job_queue.get()
            while True:
                current_time = datetime.datetime.now()
                # Use total_seconds() so waits longer than a day are
                # measured correctly.
                elapsed_seconds = (current_time - start_time).total_seconds()
                remaining_time = arrival_time - elapsed_seconds
                if remaining_time <= 0:
                    job_id = sched.add_job(job)
                    break
                elif sched.is_done(jobs_to_complete):
                    break
                else:
                    time.sleep(SLEEP_TIME)

        # Wait for the scheduler to complete.
        while not sched.is_done(jobs_to_complete):
            time.sleep(SLEEP_TIME)

        # Print summary information.
        sched.get_average_jct(jobs_to_complete)
        sched.get_completed_steps(jobs_to_complete)
        sched.get_cluster_utilization()
        sched.get_num_lease_extensions()
        if args.timeline_dir is not None:
            sched.save_job_timelines(args.timeline_dir)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        print('Total time taken: %d seconds' % elapsed_time)
    except KeyboardInterrupt:
        pass
    finally:
        sched.shutdown()
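# The sketch below shows how main() above might be wired to a command-line
# driver. It is a hypothetical usage example: the flag names simply mirror the
# attributes accessed on `args` and the defaults are placeholders, not
# necessarily the script's real CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Run the scheduler against a job trace.')
    parser.add_argument('--trace_file', type=str, required=True)
    parser.add_argument('--policy', type=str, default='fifo')
    parser.add_argument('--solver', type=str, default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--throughputs_file', type=str, default=None)
    parser.add_argument('--time_per_iteration', type=int, default=360)
    parser.add_argument('--expected_num_workers', type=int, default=None)
    parser.add_argument('--max_rounds', type=int, default=None)
    parser.add_argument('--window_start', type=int, default=None)
    parser.add_argument('--window_end', type=int, default=None)
    parser.add_argument('--timeline_dir', type=str, default=None)
    main(parser.parse_args())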
def sweep(policy_names_and_num_sub_clusters, all_num_jobs, num_trials,
          introduce_skew=False):
    all_runtimes = {}
    all_effective_throughputs = {}
    for num_jobs in all_num_jobs:
        all_runtimes[num_jobs] = []
        all_effective_throughputs[num_jobs] = []
        cluster_spec = {
            'v100': max(num_jobs // 4, 1),
            'p100': max(num_jobs // 4, 1),
            'k80': max(num_jobs // 4, 1),
        }
        for i in range(num_trials):
            throughputs, scale_factors, priority_weights = \
                create_problem_instance(
                    num_jobs, cluster_spec,
                    policy_names_and_num_sub_clusters[0][0],
                    seed=i, introduce_skew=introduce_skew)
            all_runtimes[num_jobs].append([])
            allocations = []
            for (policy_name,
                 num_sub_clusters) in policy_names_and_num_sub_clusters:
                policy = utils.get_policy(policy_name, solver='ECOS')
                allocation, runtime = harness(
                    policy, throughputs, scale_factors, priority_weights,
                    cluster_spec, num_sub_clusters=num_sub_clusters)
                all_runtimes[num_jobs][-1].append(runtime)
                allocations.append(allocation)
            all_effective_throughputs[num_jobs].append([])
            for allocation in allocations:
                effective_throughputs = {}
                for job_id in allocation:
                    for single_job_id in job_id.singletons():
                        if single_job_id not in effective_throughputs:
                            effective_throughputs[single_job_id] = 0.0
                    for worker_type in allocation[job_id]:
                        if job_id.is_pair():
                            for i, single_job_id in enumerate(
                                    job_id.singletons()):
                                effective_throughputs[single_job_id] += (
                                    allocation[job_id][worker_type] *
                                    throughputs[job_id][worker_type][i])
                        else:
                            effective_throughputs[job_id] += (
                                allocation[job_id][worker_type] *
                                throughputs[job_id][worker_type])
                all_effective_throughputs[num_jobs][-1].append(
                    effective_throughputs)
    return all_runtimes, all_effective_throughputs
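# Hypothetical call to sweep() above: compare one full-cluster solve against
# the same policy split into four sub-clusters, over increasing job counts.
# The policy name is one accepted by utils.get_policy elsewhere in this repo;
# the job counts and trial count are placeholders.
all_runtimes, all_effective_throughputs = sweep(
    policy_names_and_num_sub_clusters=[('max_min_fairness_packed', 1),
                                       ('max_min_fairness_packed', 4)],
    all_num_jobs=[32, 64, 128],
    num_trials=3)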
def main(args):
    jobs, arrival_times = utils.parse_trace(args.trace_file)
    policy = utils.get_policy(args.policy, solver=args.solver, seed=args.seed)

    sched = scheduler.Scheduler(policy,
                                throughputs_file=args.throughputs_file,
                                simulate=True,
                                seed=args.seed,
                                time_per_iteration=args.time_per_iteration)

    num_gpus = args.cluster_spec.split(':')
    cluster_spec = {
        'v100': int(num_gpus[0]),
        'p100': int(num_gpus[1]),
        'k80': int(num_gpus[2]),
    }
    num_gpus_per_server_split = args.num_gpus_per_server.split(':')
    num_gpus_per_server = {
        'v100': int(num_gpus_per_server_split[0]),
        'p100': int(num_gpus_per_server_split[1]),
        'k80': int(num_gpus_per_server_split[2]),
    }
    if args.window_start is not None and args.window_end is not None:
        jobs_to_complete = set()
        for i in range(args.window_start, args.window_end):
            jobs_to_complete.add(JobIdPair(i, None))
    else:
        jobs_to_complete = None

    sched.simulate(cluster_spec, arrival_times, jobs,
                   debug=args.debug,
                   checkpoint_threshold=args.checkpoint_threshold,
                   checkpoint_file=args.checkpoint_file,
                   num_gpus_per_server=num_gpus_per_server,
                   jobs_to_complete=jobs_to_complete)
    sched.get_average_jct(jobs_to_complete)
    sched.get_cluster_utilization()
    sched.get_num_lease_extensions()
    sched.shutdown()
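# Example (assumed) invocation of the simulation driver above; both
# --cluster_spec and --num_gpus_per_server are colon-delimited
# v100:p100:k80 counts, and the script name is a placeholder:
#
#   python simulate_from_trace.py --trace_file trace.txt \
#       --policy max_min_fairness_packed --cluster_spec 36:36:36 \
#       --num_gpus_per_server 4:4:4 --time_per_iteration 360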
def simulate_with_timeout(experiment_id, policy_name, throughputs_file,
                          per_instance_type_prices_dir, available_clouds,
                          assign_SLOs, cluster_spec, lam, seed, interval,
                          fixed_job_duration, generate_multi_gpu_jobs,
                          enable_global_queue, num_total_jobs, solver,
                          log_dir, timeout, verbose, num_gpus_per_server,
                          ideal):
    # Add some random delay to prevent outputs from overlapping.
    # TODO: Replace this with postprocessing in the log parsing script.
    time.sleep(random.uniform(0, 5))

    num_total_jobs_str = 'num_total_jobs=%d.log' % (num_total_jobs)
    cluster_spec_str = 'v100:%d|p100:%d|k80:%d' % (
        cluster_spec['v100'], cluster_spec['p100'], cluster_spec['k80'])
    policy = utils.get_policy(policy_name, seed=seed, solver=solver)
    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Configuration: cluster_spec=%s, policy=%s, '
              'seed=%d, num_total_jobs=%d' %
              (current_time, experiment_id, cluster_spec_str, policy.name,
               seed, num_total_jobs))

    with open(os.path.join(log_dir, num_total_jobs_str), 'w') as f:
        with contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):
            sched = scheduler.Scheduler(
                policy,
                throughputs_file=throughputs_file,
                seed=seed,
                time_per_iteration=interval,
                per_instance_type_prices_dir=per_instance_type_prices_dir,
                available_clouds=available_clouds,
                assign_SLOs=assign_SLOs,
                enable_global_queue=enable_global_queue,
                simulate=True)
            if timeout is None:
                sched.simulate(cluster_spec,
                               lam=lam,
                               fixed_job_duration=fixed_job_duration,
                               generate_multi_gpu_jobs=generate_multi_gpu_jobs,
                               num_total_jobs=num_total_jobs,
                               num_gpus_per_server=num_gpus_per_server,
                               ideal=ideal)
                average_jct = sched.get_average_jct()
                utilization = sched.get_cluster_utilization()
                makespan = sched.get_current_timestamp()
                total_cost = sched.get_total_cost()
            else:
                try:
                    func_timeout(timeout, sched.simulate,
                                 args=(cluster_spec,),
                                 kwargs={
                                     'lam': lam,
                                     'fixed_job_duration': fixed_job_duration,
                                     'generate_multi_gpu_jobs':
                                         generate_multi_gpu_jobs,
                                     'num_total_jobs': num_total_jobs,
                                     'num_gpus_per_server':
                                         num_gpus_per_server,
                                     'ideal': ideal,
                                 })
                    average_jct = sched.get_average_jct()
                    utilization = sched.get_cluster_utilization()
                    makespan = sched.get_current_timestamp()
                    total_cost = sched.get_total_cost()
                except FunctionTimedOut:
                    average_jct = float('inf')
                    utilization = 1.0
                    makespan = float('inf')
                    total_cost = float('inf')

    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Results: average JCT=%f, utilization=%f, '
              'makespan=%f, total_cost=$%.2f' %
              (current_time, experiment_id, average_jct, utilization,
               makespan, total_cost))

    sched.shutdown()

    return average_jct, utilization
def do_train(sess, args):
    # Set CPU as the default device for the graph; some of the operations
    # will be moved to GPU later.
    with tf.device('/cpu:0'):
        # Placeholders for images and labels.
        images_ph = tf.placeholder(tf.float32,
                                   shape=(None,) + tuple(args.processed_size),
                                   name='input')
        labels_ph = tf.placeholder(tf.int32, shape=(None), name='label')
        # A placeholder for determining whether we train or validate the
        # network; used to set dropout rates and batchnorm parameters.
        is_training_ph = tf.placeholder(tf.bool, name='is_training')

        # Epoch number and global step.
        epoch_number = tf.get_variable(
            'epoch_number', [], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, SAVE_VARIABLES])
        global_step = tf.get_variable(
            'global_step', [], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, SAVE_VARIABLES])

        # Weight decay policy.
        wd = utils.get_policy(args.WD_policy, args.WD_details)
        # Learning rate decay policy (if needed).
        lr = utils.get_policy(args.LR_policy, args.LR_details)
        # Create an optimizer that performs gradient descent.
        optimizer = utils.get_optimizer(args.optimizer, lr)

        # Build the computational graph using the provided configuration.
        dnn_model = model(images_ph, labels_ph, utils.loss, optimizer, wd,
                          args.architecture, args.num_classes, is_training_ph,
                          args.transfer_mode, num_gpus=args.num_gpus)

        # Create a pipeline to read data from disk.
        # A placeholder for setting the input pipeline batch size, employed to
        # ensure that we feed each validation example to the network exactly
        # once. Because we only use one GPU for validation, the validation
        # batch size should not exceed 512.
        batch_size_tf = tf.placeholder_with_default(
            min(512, args.batch_size), shape=())

        # A data loader pipeline to read training images and their labels.
        train_loader = loader(args.train_info, args.delimiter, args.raw_size,
                              args.processed_size, True,
                              args.chunked_batch_size, args.num_prefetch,
                              args.num_threads, args.path_prefix, args.shuffle)
        # The loader returns images, their labels, and their paths.
        images, labels, info = train_loader.load()

        # If validation data are provided, create an input pipeline for them.
        if args.run_validation:
            val_loader = loader(args.val_info, args.delimiter, args.raw_size,
                                args.processed_size, False, batch_size_tf,
                                args.num_prefetch, args.num_threads,
                                args.path_prefix)
            val_images, val_labels, val_info = val_loader.load()

        # Get training operations to run from the deep learning model.
        train_ops = dnn_model.train_ops()

        # Build an initialization operation to run below.
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        sess.run(init)

        if args.retrain_from is not None:
            dnn_model.load(sess, args.retrain_from)

        # Set the start epoch number.
        start_epoch = sess.run(epoch_number + 1)

        # Start the queue runners.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Set up a summary writer.
        summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

        # The main training loop.
        for epoch in range(start_epoch, start_epoch + args.num_epochs):
            # Update epoch_number.
            sess.run(epoch_number.assign(epoch))
            print("Epoch %d of %d started" %
                  (epoch, start_epoch + args.num_epochs - 1))

            # Training batches.
            for step in range(args.num_batches):
                sess.run(global_step.assign(step + epoch * args.num_batches))
                # Train the network on a batch of data (also measures time).
                start_time = time.time()
                # Load a batch from the input pipeline.
                img, lbl = sess.run([images, labels],
                                    options=args.run_options,
                                    run_metadata=args.run_metadata)
                # Train on the loaded batch of data.
                _, loss_value, top1_accuracy, topn_accuracy = sess.run(
                    train_ops,
                    feed_dict={images_ph: img,
                               labels_ph: lbl,
                               is_training_ph: True},
                    options=args.run_options,
                    run_metadata=args.run_metadata)
                duration = time.time() - start_time

                # Check for errors.
                assert not np.isnan(loss_value), \
                    'Model diverged with loss = NaN'

                # Log every ten batches and write TensorBoard summaries every
                # hundred batches.
                if step % 10 == 0:
                    num_examples_per_step = \
                        args.chunked_batch_size * args.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / args.num_gpus
                    format_str = (
                        '%s: epoch %d of %d, step %d of %d, loss = %.2f, '
                        'Top-1 = %.2f Top-' + str(args.top_n) +
                        ' = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str %
                          (datetime.now(), epoch,
                           start_epoch + args.num_epochs - 1, step,
                           args.num_batches, loss_value, top1_accuracy,
                           topn_accuracy, examples_per_sec, sec_per_batch))
                    sys.stdout.flush()
                if step % 100 == 0:
                    summary_str = sess.run(tf.summary.merge_all(),
                                           feed_dict={images_ph: img,
                                                      labels_ph: lbl,
                                                      is_training_ph: True})
                    summary_writer.add_summary(
                        summary_str, args.num_batches * epoch + step)
                    if args.log_debug_info:
                        summary_writer.add_run_metadata(
                            args.run_metadata,
                            'epoch%d step%d' % (epoch, step))

            # Save a model checkpoint after each training epoch.
            checkpoint_path = os.path.join(args.log_dir, args.snapshot_prefix)
            dnn_model.save(sess, checkpoint_path, global_step=epoch)
            print("Epoch %d of %d ended. A checkpoint saved at %s" %
                  (epoch, start_epoch + args.num_epochs - 1, args.log_dir))
            sys.stdout.flush()

            # If validation data are provided, evaluate accuracy on the
            # validation set at the end of each epoch.
            if args.run_validation:
                print("Evaluating on validation set")
                total_loss = utils.AverageMeter()  # Measures cross-entropy loss.
                top1 = utils.AverageMeter()  # Measures top-1 accuracy.
                topn = utils.AverageMeter()  # Measures top-n accuracy.

                # The validation loop.
                for step in range(args.num_val_batches):
                    # Load a batch of data; on the last batch, shrink the
                    # pipeline batch size so each example is seen exactly once.
                    val_img, val_lbl = sess.run(
                        [val_images, val_labels],
                        feed_dict={
                            batch_size_tf:
                                args.num_val_samples %
                                min(512, args.batch_size)
                        } if step == args.num_val_batches - 1 else None,
                        options=args.run_options,
                        run_metadata=args.run_metadata)
                    # Validate the network on the loaded batch.
                    val_loss, top1_predictions, topn_predictions = sess.run(
                        [train_ops[1], train_ops[2], train_ops[3]],
                        feed_dict={images_ph: val_img,
                                   labels_ph: val_lbl,
                                   is_training_ph: False},
                        options=args.run_options,
                        run_metadata=args.run_metadata)
                    current_batch_size = val_lbl.shape[0]
                    total_loss.update(val_loss, current_batch_size)
                    top1.update(top1_predictions, current_batch_size)
                    topn.update(topn_predictions, current_batch_size)
                    if step % 10 == 0 or step == args.num_val_batches - 1:
                        print("Validation step %d of %d, Loss %.2f, "
                              "Top-1 Accuracy %.2f, Top-%d Accuracy %.2f" %
                              (step, args.num_val_batches, total_loss.avg,
                               top1.avg, args.top_n, topn.avg))
                        sys.stdout.flush()

        coord.request_stop()
        coord.join(threads)
        sess.close()
def simulate(policy_name, throughputs_file, cluster_spec, lam, seed, interval,
             jobs_to_complete, fixed_job_duration, generate_multi_gpu_jobs,
             generate_multi_priority_jobs, simulate_steady_state, solver,
             debug, checkpoint_threshold, checkpoint_file,
             profiling_percentage, per_instance_type_prices_dir,
             available_clouds, assign_SLOs, enable_global_queue,
             num_gpus_per_server, output_trace_file_name):
    policy = utils.get_policy(policy_name, solver=solver, seed=seed)
    sched = scheduler.Scheduler(
        policy,
        throughputs_file=throughputs_file,
        seed=seed,
        time_per_iteration=interval,
        simulate=True,
        profiling_percentage=profiling_percentage,
        per_instance_type_prices_dir=per_instance_type_prices_dir,
        available_clouds=available_clouds,
        assign_SLOs=assign_SLOs,
        enable_global_queue=enable_global_queue)

    cluster_spec_str = 'v100:%d|p100:%d|k80:%d' % (
        cluster_spec['v100'], cluster_spec['p100'], cluster_spec['k80'])
    current_time = datetime.datetime.now()
    print('[%s] Configuration: cluster_spec=%s, policy=%s, '
          'seed=%d, lam=%f' %
          (current_time, cluster_spec_str, policy.name, seed, lam),
          file=sys.stderr)

    if lam == 0:
        num_total_jobs = len(jobs_to_complete)
    else:
        num_total_jobs = None
    sched.simulate(cluster_spec,
                   lam=lam,
                   jobs_to_complete=jobs_to_complete,
                   fixed_job_duration=fixed_job_duration,
                   generate_multi_gpu_jobs=generate_multi_gpu_jobs,
                   generate_multi_priority_jobs=generate_multi_priority_jobs,
                   simulate_steady_state=simulate_steady_state,
                   num_total_jobs=num_total_jobs,
                   debug=debug,
                   checkpoint_threshold=checkpoint_threshold,
                   checkpoint_file=checkpoint_file,
                   num_gpus_per_server=num_gpus_per_server,
                   output_trace_file_name=output_trace_file_name)
    average_jct = sched.get_average_jct(jobs_to_complete)
    utilization = sched.get_cluster_utilization()
    total_cost = sched.get_total_cost()
    num_SLO_violations = sched.get_num_SLO_violations()
    lease_extension_freq = sched.get_num_lease_extensions()

    current_time = datetime.datetime.now()
    print('[%s] Results: average JCT=%f, utilization=%f, '
          'total_cost=$%.2f, num_SLO_violations=%d, '
          'lease_extension_frequency=%.2f%%' %
          (current_time, average_jct, utilization, total_cost,
           num_SLO_violations, lease_extension_freq),
          file=sys.stderr)
    sched.shutdown()
def measure_runtime(num_active_jobs, policy_name, oracle_throughputs,
                    generate_multi_gpu_jobs, generate_multi_priority_jobs,
                    num_trials, solver):
    cluster_spec = {
        'v100': num_active_jobs // 4,
        'p100': num_active_jobs // 4,
        'k80': num_active_jobs // 4,
    }
    print(cluster_spec)
    results_str = '%s,%d' % (policy_name, num_active_jobs)
    results = []
    for trial in range(num_trials):
        throughputs, jobs, scale_factors = generate_input(
            num_active_jobs, cluster_spec, policy_name, oracle_throughputs,
            generate_multi_gpu_jobs, generate_multi_priority_jobs,
            seed=trial + 2)
        if 'water_filling' in policy_name:
            num_entities = 5
            priority_reweighting_policies = {}
            entity_to_job_mapping = {}
            entity_weights = {}
            for i in range(num_entities):
                entity_id = 'entity%d' % i
                priority_reweighting_policies[entity_id] = 'fairness'
                entity_to_job_mapping[entity_id] = []
                entity_weights[entity_id] = random.randint(1, 3)
            policy = utils.get_policy(
                policy_name, solver=solver,
                priority_reweighting_policies=priority_reweighting_policies)
        else:
            policy = utils.get_policy(policy_name, solver=solver)

        start_time = time.time()
        # Suppress solver output while timing; os.devnull is portable across
        # platforms, unlike a hard-coded '/dev/null'.
        with open(os.devnull, 'w') as f:
            with contextlib.redirect_stdout(f):
                if policy.name.startswith('MaxMinFairness'):
                    priority_weights = {
                        JobIdPair(i, None): jobs[i].priority_weight
                        for i in range(num_active_jobs)
                    }
                    if 'WaterFilling' in policy.name:
                        for i in range(num_active_jobs):
                            entity_id = 'entity%d' % random.randint(
                                0, num_entities - 1)
                            entity_to_job_mapping[entity_id].append(
                                JobIdPair(i, None))
                        policy.get_allocation(
                            throughputs, scale_factors, priority_weights,
                            cluster_spec,
                            entity_weights=entity_weights,
                            entity_to_job_mapping=entity_to_job_mapping)
                    else:
                        policy.get_allocation(throughputs, scale_factors,
                                              priority_weights, cluster_spec)
                elif policy.name.startswith('MinTotalDuration'):
                    num_steps_remaining = {
                        JobIdPair(i, None): jobs[i].num_steps
                        for i in range(num_active_jobs)
                    }
                    policy.get_allocation(throughputs, scale_factors,
                                          num_steps_remaining, cluster_spec)
                else:
                    policy.get_allocation(throughputs, scale_factors,
                                          cluster_spec)
        runtime = time.time() - start_time
        results.append(runtime)

    for result in results:
        results_str += ',' + str(result)
    results_str += ',' + str(np.mean(results))
    return results_str
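# Illustrative sweep over active-job counts using measure_runtime() above.
# The throughputs path and job counts are placeholders; 'ECOS' and
# utils.read_all_throughputs_json_v2 appear elsewhere in this repo.
oracle_throughputs = utils.read_all_throughputs_json_v2(
    'oracle_throughputs.json')
for num_active_jobs in [32, 64, 128, 256]:
    print(measure_runtime(num_active_jobs, 'max_min_fairness_packed',
                          oracle_throughputs,
                          generate_multi_gpu_jobs=False,
                          generate_multi_priority_jobs=False,
                          num_trials=3, solver='ECOS'))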
def simulate_with_timeout(experiment_id, policy_name, throughputs_file,
                          cluster_spec, lam, seed, interval, jobs_to_complete,
                          fixed_job_duration, solver, generate_multi_gpu_jobs,
                          generate_multi_priority_jobs, simulate_steady_state,
                          log_dir, timeout, verbose, checkpoint_threshold,
                          profiling_percentage, num_reference_models,
                          num_gpus_per_server, ideal):
    lam_str = 'lambda=%f.log' % (lam)
    checkpoint_file = None
    if checkpoint_threshold is not None:
        checkpoint_file = os.path.join(log_dir, 'lambda=%f.pickle' % lam)
    cluster_spec_str = 'v100:%d|p100:%d|k80:%d' % (
        cluster_spec['v100'], cluster_spec['p100'], cluster_spec['k80'])
    policy = utils.get_policy(policy_name, solver=solver, seed=seed)
    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Configuration: cluster_spec=%s, policy=%s, '
              'seed=%d, lam=%f, '
              'profiling_percentage=%f, '
              'num_reference_models=%d' %
              (current_time, experiment_id, cluster_spec_str, policy.name,
               seed, lam, profiling_percentage, num_reference_models))

    with open(os.path.join(log_dir, lam_str), 'w') as f:
        with contextlib.redirect_stderr(f), contextlib.redirect_stdout(f):
            sched = scheduler.Scheduler(
                policy,
                throughputs_file=throughputs_file,
                seed=seed,
                time_per_iteration=interval,
                simulate=True,
                profiling_percentage=profiling_percentage,
                num_reference_models=num_reference_models)
            if timeout is None:
                sched.simulate(
                    cluster_spec,
                    lam=lam,
                    jobs_to_complete=jobs_to_complete,
                    fixed_job_duration=fixed_job_duration,
                    generate_multi_gpu_jobs=generate_multi_gpu_jobs,
                    generate_multi_priority_jobs=generate_multi_priority_jobs,
                    simulate_steady_state=simulate_steady_state,
                    checkpoint_file=checkpoint_file,
                    checkpoint_threshold=checkpoint_threshold,
                    num_gpus_per_server=num_gpus_per_server,
                    ideal=ideal)
                average_jct = sched.get_average_jct(jobs_to_complete)
                utilization = 1.0
                if not ideal:
                    utilization = sched.get_cluster_utilization()
            else:
                try:
                    func_timeout(
                        timeout, sched.simulate,
                        args=(cluster_spec,),
                        kwargs={
                            'lam': lam,
                            'jobs_to_complete': jobs_to_complete,
                            'fixed_job_duration': fixed_job_duration,
                            'generate_multi_gpu_jobs': generate_multi_gpu_jobs,
                            'generate_multi_priority_jobs':
                                generate_multi_priority_jobs,
                            'simulate_steady_state': simulate_steady_state,
                            'checkpoint_file': checkpoint_file,
                            'checkpoint_threshold': checkpoint_threshold,
                            'num_gpus_per_server': num_gpus_per_server,
                            'ideal': ideal,
                        })
                    average_jct = sched.get_average_jct(jobs_to_complete)
                    utilization = sched.get_cluster_utilization()
                except FunctionTimedOut:
                    average_jct = float('inf')
                    utilization = 1.0

    if verbose:
        current_time = datetime.datetime.now()
        print('[%s] [Experiment ID: %2d] '
              'Results: average JCT=%f, utilization=%f' %
              (current_time, experiment_id, average_jct, utilization))

    sched.shutdown()

    return average_jct, utilization
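# Experiments like simulate_with_timeout() above are typically fanned out in
# parallel, one process per lambda value. A minimal sketch with
# multiprocessing; the grid values, pool size, and base_kwargs contents are
# assumptions, not this repo's actual sweep driver.
from multiprocessing import Pool

def _run_one(kwargs):
    # Helper so keyword arguments can be shipped to worker processes.
    return simulate_with_timeout(**kwargs)

def run_grid(base_kwargs, lams, num_workers=8):
    # base_kwargs must supply every simulate_with_timeout() parameter
    # except experiment_id and lam, which vary per grid point.
    args_list = [dict(base_kwargs, experiment_id=i, lam=lam)
                 for i, lam in enumerate(lams)]
    with Pool(processes=num_workers) as pool:
        return pool.map(_run_one, args_list)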
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(
        workspace=workspace,
        name=data_prep_settings.get("dataset_input_name", None)
    ).as_named_input(
        data_prep_settings.get("dataset_input_name", None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Load compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(
        workspace=workspace,
        name=data_prep_settings.get("compute_target_name", None))

    # Create python script step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ###########################################
    ### Creating model train Pipeline Step ###
    ###########################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets of train step
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Load compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(
        workspace=workspace,
        name=model_train_settings.get("compute_target_name", None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get("parameters", {})
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get policy definition; exclude the keys that are passed explicitly
        # below so they are not duplicated in **kwargs.
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items()
            if key not in ["method", "evaluation_interval", "delay_evaluation"]
        }

        # Create early termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", "") else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except Exception:
        print("Not all required parameters specified for HyperDrive step")

        # Fall back to a plain EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
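# Hypothetical driver for create_experiment_config() above: load the workspace
# from a local config.json and submit the assembled pipeline. The experiment
# name is a placeholder; Workspace.from_config and Experiment.submit are
# standard azureml-core APIs.
from azureml.core import Workspace, Experiment

workspace = Workspace.from_config()
pipeline = create_experiment_config(workspace)
run = Experiment(workspace=workspace, name="training-pipeline").submit(pipeline)
run.wait_for_completion(show_output=True)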
def main(args):
    rng = random.Random()
    rng.seed(0)
    v100s, p100s, k80s = args.cluster_spec.split(':')
    cluster_spec = {
        'v100': int(v100s),
        'p100': int(p100s),
        'k80': int(k80s),
    }
    worker_types = sorted(cluster_spec.keys())
    oracle_throughputs = \
        utils.read_all_throughputs_json_v2(args.throughputs_file)
    jobs = []
    for i in range(args.num_jobs):
        job_template = JobTable[i % len(JobTable)]
        job_id = JobIdPair(i, None)
        job = utils.generate_job(
            throughputs=oracle_throughputs,
            reference_worker_type='v100',
            rng=rng,
            job_id=job_id,
            generate_multi_gpu_jobs=args.generate_multi_gpu_jobs,
            generate_multi_priority_jobs=args.generate_multi_priority_jobs)
        jobs.append(job)
    policy = utils.get_policy('max_min_fairness_packed', solver=args.solver)

    scale_factors = {job.job_id: job.scale_factor for job in jobs}
    priority_weights = {job.job_id: job.priority_weight for job in jobs}

    start = datetime.datetime.now()
    original_allocation = get_allocation(policy, jobs, oracle_throughputs,
                                         cluster_spec, worker_types,
                                         scale_factors, priority_weights)
    original_runtime = datetime.datetime.now() - start

    start = datetime.datetime.now()
    job_type_allocation = get_allocation_using_job_type_throughputs(
        policy, jobs, oracle_throughputs, cluster_spec, worker_types,
        scale_factors, priority_weights)
    job_id_to_job_type_key = {
        job.job_id: (job.job_type, job.scale_factor) for job in jobs
    }
    job_type_runtime = datetime.datetime.now() - start

    if args.verbose:
        print('Original allocation:')
        utils.print_allocation(original_allocation)
        print('')

        print('Allocation using job type throughputs:')
        utils.print_allocation(job_type_allocation)
        print('')

        print('Original effective throughputs:')
        print_effective_throughputs(original_allocation, oracle_throughputs,
                                    job_id_to_job_type_key)
        print('')

        print('Effective throughputs using job type throughputs:')
        print_effective_throughputs(job_type_allocation, oracle_throughputs,
                                    job_id_to_job_type_key)

    print('Original runtime:',
          original_runtime.seconds + original_runtime.microseconds / 1.0e6)
    print('Runtime using job type throughputs:',
          job_type_runtime.seconds + job_type_runtime.microseconds / 1.0e6)
def do_train(args):
    # Create model
    dnn_model = model(args.architecture, args.num_classes)
    if args.num_gpus == 1:
        dnn_model = dnn_model.cuda()
    else:
        dnn_model = torch.nn.DataParallel(
            dnn_model, device_ids=range(0, args.num_gpus)).cuda()

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    lr = utils.get_policy(args.LR_policy, args.LR_details)
    wd = utils.get_policy(args.WD_policy, args.WD_details)
    optimizer = utils.get_optimizer(args.optimizer, dnn_model.parameters(),
                                    0.01)

    train_loader = data_loader.CSVDataset(
        args.train_info, args.delimiter, args.raw_size, args.processed_size,
        args.batch_size, args.num_workers, args.path_prefix, True,
        shuffle=True).load()

    start_epoch = 0
    if args.retrain_from is not None:
        checkpoint = torch.load(utils.smart_load(args.retrain_from))
        dnn_model.module.load_state_dict(checkpoint['model'])
        if args.transfer_mode[0] == 0:
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
        if args.transfer_mode[0] == 1 or args.transfer_mode[0] == 3:
            dnn_model.freeze()

    if args.run_validation:
        val_loader = data_loader.CSVDataset(
            args.val_info, args.delimiter, args.raw_size,
            args.processed_size, args.batch_size, args.num_workers,
            args.path_prefix, False, shuffle=False).load()

    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        if args.optimizer not in ['adam', 'adadelta']:
            utils.adjust_param(optimizer, 'lr', lr, epoch)
            utils.adjust_param(optimizer, 'weight_decay', wd, epoch)
        if args.transfer_mode[0] == 3 and epoch == args.transfer_mode[1]:
            dnn_model.unfreeze()

        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses = utils.AverageMeter()
        top1 = utils.AverageMeter()
        topn = utils.AverageMeter()

        # Switch to train mode
        dnn_model.train()

        end = time.time()
        for step, (input, target, _) in islice(enumerate(train_loader),
                                               args.num_batches):
            # Measure data loading time
            data_time.update(time.time() - end)
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # Compute output
            output = dnn_model(input)
            loss = criterion(output, target)

            # Measure accuracy and record loss
            prec1, precn = utils.accuracy(output, target,
                                          topk=(1, args.top_n))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            topn.update(precn[0], input.size(0))

            # Compute gradient and do an SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if step % 10 == 0:
                format_str = ('%s: epoch %d, step %d, loss = %.2f, '
                              'Top-1 = %.2f Top-' + str(args.top_n) +
                              ' = %.2f')
                print(format_str % (datetime.now(), epoch, step, losses.val,
                                    top1.val, topn.val))
                sys.stdout.flush()

        state = {
            'epoch': epoch + 1,
            'arch': args.architecture,
            'num_classes': args.num_classes,
            'model': dnn_model.module.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(state,
                   utils.smart_save(
                       os.path.join(args.log_dir,
                                    'checkpoint%04d.pth.tar' % (epoch)),
                       max_to_keep=args.max_to_keep))

        # If validation data are provided, evaluate accuracy on the
        # validation set at the end of each epoch.
        if args.run_validation:
            valbatch_time = utils.AverageMeter()
            vallosses = utils.AverageMeter()
            valtop1 = utils.AverageMeter()
            valtop5 = utils.AverageMeter()

            # Switch to evaluate mode
            dnn_model.eval()

            with torch.no_grad():
                end = time.time()
                for i, (input, target, _) in enumerate(val_loader):
                    input = input.cuda(non_blocking=True)
                    target = target.cuda(non_blocking=True)

                    # Compute output
                    output = dnn_model(input)
                    loss = criterion(output, target)

                    # Measure accuracy and record loss
                    prec1, prec5 = utils.accuracy(output, target,
                                                  topk=(1, 5))
                    vallosses.update(loss.item(), input.size(0))
                    valtop1.update(prec1[0], input.size(0))
                    valtop5.update(prec5[0], input.size(0))

                    # Measure elapsed time
                    valbatch_time.update(time.time() - end)
                    end = time.time()

                    print('Test: [{0}/{1}]\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                              i, len(val_loader), batch_time=valbatch_time,
                              loss=vallosses, top1=valtop1, top5=valtop5))
                    sys.stdout.flush()

    print('Training finished')