def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir,
                    mode, temp_dir):
    """
    This function is the entry point for performance testing

    family: List
    A family may contain one or more algorithms, depending on the data
    generation script used

    algo: List
    Input algorithms

    exec_type: String
    Contains the execution type: singlenode / hybrid_spark

    mat_type: List
    Type of matrix to generate: dense, sparse, all

    mat_shape: List
    Dimensions of the input matrix with rows and columns

    config_dir: String
    Location to store all configuration

    mode: List
    Type of workload to run: data-gen, train ...

    temp_dir: String
    Location to store all output files created during the perf test
    """
    # algos_to_run is a list of tuples, e.g.
    # [(m-svm, binomial), (m-svm, multinomial), ...]
    # Basic block for execution of scripts
    algos_to_run = []

    # The sections below build algos_to_run for our performance test

    # Handles algorithms like m-svm and MultiLogReg which have multiple
    # data generation scripts (dual datagen).
    # --family is taken into consideration only when an algorithm has
    # multiple datagen scripts
    if family is not None and algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            if len(family_list) == 1:
                algos_to_run.append((current_algo, family_list[0]))
            else:
                intersection = set(family).intersection(family_list)
                for valid_family in intersection:
                    algos_to_run.append((current_algo, valid_family))

    # When the user inputs just algorithms to run
    elif algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            for f in family_list:
                algos_to_run.append((current_algo, f))

    # When the user specifies only families to run
    elif family is not None:
        for current_family in family:
            algos = ML_ALGO[current_family]
            for current_algo in algos:
                algos_to_run.append((current_algo, current_family))

    if 'data-gen' in mode:
        # Create config directories
        data_gen_config_dir = join(config_dir, 'data-gen')
        create_dir_local(data_gen_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')

        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape,
                                             data_gen_dir, DENSE_TYPE_ALGOS,
                                             data_gen_config_dir)
        for family_name, config_folders in conf_packet.items():
            for config in config_folders:
                file_name = ML_GENDATA[family_name]
                success_file = algorithm_workflow(family_name, exec_type,
                                                  config, file_name,
                                                  'data-gen', data_gen_dir)
                # Statistics families do not require splitting
                if family_name not in ['stats1', 'stats2']:
                    if not success_file:
                        exec_test_data(exec_type, backend_args_dict,
                                       systemml_args_dict, data_gen_dir, config)

    if 'train' in mode:
        # Create config directories
        train_config_dir = join(config_dir, 'train')
        create_dir_local(train_config_dir)

        # Create output paths
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')

        conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape,
                                           data_gen_dir, train_dir,
                                           DENSE_TYPE_ALGOS, train_config_dir)
        for algo_name, config_files in conf_packet.items():
            for config in config_files:
                file_name = ML_TRAIN[algo_name]
                algorithm_workflow(algo_name, exec_type, config, file_name,
                                   'train', train_dir)

    if 'predict' in mode:
        # Create config directories
        predict_config_dir = join(config_dir, 'predict')
        create_dir_local(predict_config_dir)

        # Create output paths
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')
        predict_dir = join(temp_dir, 'predict')

        algos_to_run = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT),
                                   algos_to_run))
        if len(algos_to_run) < 1:
            # No algorithms with predict found
            pass

        conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape,
                                             data_gen_dir, train_dir,
                                             predict_dir, DENSE_TYPE_ALGOS,
                                             predict_config_dir)
        for algo_name, config_files in conf_packet.items():
            for config in config_files:
                file_name = ML_PREDICT[algo_name]
                algorithm_workflow(algo_name, exec_type, config, file_name,
                                   'predict', predict_dir)
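# NOTE: a minimal sketch of the get_families() helper used above, assuming
# ML_ALGO maps a family name to the list of algorithms it contains (as the
# ML_ALGO[current_family] lookup above suggests). The real helper may
# differ; this is illustrative only.
def get_families(current_algo, ml_algo):
    """Return every family whose algorithm list contains current_algo."""
    return [family for family, algo_list in ml_algo.items()
            if current_algo in algo_list]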
def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir,
                    mode, temp_dir, file_system_type):
    """
    This function is the entry point for performance testing

    family: List
    A family may contain one or more algorithms, depending on the data
    generation script used

    algo: List
    Input algorithms

    exec_type: String
    Contains the execution type: singlenode / hybrid_spark

    mat_type: List
    Type of matrix to generate: dense, sparse, all

    mat_shape: List
    Dimensions of the input matrix with rows and columns

    config_dir: String
    Location to store all configuration

    mode: List
    Type of workload to run: data-gen, train ...

    temp_dir: String
    Location to store all output files created during the perf test

    file_system_type: String
    File system to use: local / hdfs
    """
    # algos_to_run is a list of tuples, e.g.
    # [(m-svm, binomial), (m-svm, multinomial), ...]
    # Basic block for execution of scripts
    algos_to_run = []

    # The sections below build algos_to_run for our performance test

    # Handles algorithms like m-svm and MultiLogReg which have multiple
    # data generation scripts (dual datagen).
    # --family is taken into consideration only when an algorithm has
    # multiple datagen scripts
    if family is not None and algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            if len(family_list) == 1:
                algos_to_run.append((current_algo, family_list[0]))
            else:
                intersection = set(family).intersection(family_list)
                for valid_family in intersection:
                    algos_to_run.append((current_algo, valid_family))

    # When the user inputs just algorithms to run
    elif algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            for f in family_list:
                algos_to_run.append((current_algo, f))

    # When the user specifies only families to run
    elif family is not None:
        for current_family in family:
            algos = ML_ALGO[current_family]
            for current_algo in algos:
                algos_to_run.append((current_algo, current_family))

    if 'data-gen' in mode:
        # Create config directories
        data_gen_config_dir = join(config_dir, 'data-gen')
        create_dir_local(data_gen_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')

        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape,
                                             data_gen_dir, DENSE_TYPE_ALGOS,
                                             data_gen_config_dir)
        for family_name, config_folders in conf_packet.items():
            for config in config_folders:
                file_name = ML_GENDATA[family_name]
                success_file = algorithm_workflow(family_name, exec_type,
                                                  config, file_name,
                                                  'data-gen', data_gen_dir)
                # Statistics families do not require splitting
                if family_name not in ['stats1', 'stats2']:
                    if not success_file:
                        exec_test_data(exec_type, backend_args_dict,
                                       systemml_args_dict, data_gen_dir, config)

    if 'train' in mode:
        # Create config directories
        train_config_dir = join(config_dir, 'train')
        create_dir_local(train_config_dir)

        # Create output paths
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')

        conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape,
                                           data_gen_dir, train_dir,
                                           DENSE_TYPE_ALGOS, train_config_dir)
        for algo_family_name, config_files in conf_packet.items():
            for config in config_files:
                # Config keys look like '<algo>.<family>'; recover the
                # algorithm name for the script lookup
                algo_name = algo_family_name.split('.')[0]
                file_name = ML_TRAIN[algo_name]
                algorithm_workflow(algo_family_name, exec_type, config,
                                   file_name, 'train', train_dir)

    if 'predict' in mode:
        # Create config directories
        predict_config_dir = join(config_dir, 'predict')
        create_dir_local(predict_config_dir)

        # Create output paths
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')
        predict_dir = join(temp_dir, 'predict')

        algos_to_run = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT),
                                   algos_to_run))
        if len(algos_to_run) < 1:
            # No algorithms with predict found
            pass

        conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape,
                                             data_gen_dir, train_dir,
                                             predict_dir, DENSE_TYPE_ALGOS,
                                             predict_config_dir)
        for algo_family_name, config_files in conf_packet.items():
            for config in config_files:
                # Same '<algo>.<family>' key convention as in the train step
                algo_name = algo_family_name.split('.')[0]
                file_name = ML_PREDICT[algo_name]
                algorithm_workflow(algo_family_name, exec_type, config,
                                   file_name, 'predict', predict_dir)
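# Hypothetical invocation of perf_test_entry() above; every argument value
# here is illustrative only, and the 'rows_cols'-style matrix shape string
# is an assumption, not confirmed by this file.
perf_test_entry(family=['binomial'],
                algo=['m-svm'],
                exec_type='singlenode',
                mat_type=['dense'],
                mat_shape=['10k_100'],
                config_dir='/tmp/perftest/config',
                mode=['data-gen', 'train', 'predict'],
                temp_dir='/tmp/perftest/temp',
                file_system_type='local')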
if fam not in ML_ALGO.keys():
    print('{} family not present in the performance test suite'.format(fam))
    sys.exit()

if args.algo is not None:
    for algo in args.algo:
        if algo not in all_algos:
            print('{} algorithm not present in the performance test suite'.format(algo))
            sys.exit()

    # This section checks the validity of dual datagen algorithms like m-svm
    algo_families = {}
    for current_algo in args.algo:
        algo_families[current_algo] = get_families(current_algo, ML_ALGO)
        if len(algo_families[current_algo]) > 1:
            if args.family is None:
                print('family should be present for {}'.format(current_algo))
                sys.exit()
            valid_families = set(algo_families[current_algo])
            input_families = set(args.family)
            common_families = input_families.intersection(valid_families)
            if len(common_families) == 0:
                print('Please specify a valid family for {}; the valid '
                      'families are {}'.format(current_algo,
                                               ' '.join(valid_families)))
                sys.exit()
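# Illustrative shape of the ML_ALGO map that the checks above run against;
# only m-svm and MultiLogReg (the dual-datagen examples named in the
# comments in this file) are shown, not the full suite.
ML_ALGO = {'binomial': ['MultiLogReg', 'm-svm'],
           'multinomial': ['MultiLogReg', 'm-svm']}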
    args.file_system_type = 'hdfs'
else:
    args.file_system_type = 'local'

perftest_args_dict['temp_dir'] = get_default_dir(args.file_system_type,
                                                 args.temp_dir,
                                                 args.exec_type,
                                                 default_config_dir)

# default_mat_type validity
if len(args.mat_type) > 2:
    print('length of --mat-type argument cannot be greater than two')
    sys.exit()

if args.algo is not None:
    # This section checks the validity of dual datagen algorithms like m-svm
    algo_families = {}
    for current_algo in args.algo:
        algo_families[current_algo] = get_families(current_algo, ML_ALGO)
        if len(algo_families[current_algo]) > 1:
            if args.family is None:
                print('family should be present for {}'.format(current_algo))
                sys.exit()
            valid_families = set(algo_families[current_algo])
            input_families = set(args.family)
            common_families = input_families.intersection(valid_families)
            if len(common_families) == 0:
                sys.exit('Please specify a valid family for {}; the valid '
                         'families are {}'.format(current_algo,
                                                  ' '.join(valid_families)))

# Set level to 0 -> debug mode
# Set level to 20 -> Plain metrics
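# A hedged sketch of how the logging levels described above might be set;
# the log file name is illustrative, not taken from this file. Level 20
# corresponds to logging.INFO in the standard library, while level 0
# disables the threshold so everything (debug output included) is logged.
import logging
logging.basicConfig(filename='perftest.out', level=20)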