Example #1
def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir,
                    mode, temp_dir):
    """
    This function is the entry point for performance testing

    family: List
    A family may contain one or more algorithm based on data generation script used

    algo: List
    Input algorithms

    exec_type: String
    Contains the execution type singlenode / hybrid_spark

    mat_type: List
    Type of matrix to generate dense, sparse, all

    mat_shape: List
    Dimensions of the input matrix with rows and columns

    config_dir: String
    Location to store all configuration

    mode: List
    Type of workload to run. data-gen, train ...

    temp_dir: String
    Location to store all output files created during perf test
    """
    # algos_to_run is a list of (algorithm, family) tuples, e.g.
    # [('m-svm', 'binomial'), ('m-svm', 'multinomial'), ...]
    # It is the basic unit of work for the script runs below.
    algos_to_run = []

    # The sections below build algos_to_run for the performance test.
    # They handle algorithms like m-svm and MultiLogReg that have multiple
    # data generation scripts (dual datagen).
    # --family is taken into account only when an algorithm has multiple
    # datagen scripts.

    if family is not None and algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            if len(family_list) == 1:
                algos_to_run.append((current_algo, family_list[0]))
            else:
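                # e.g. algo=['m-svm'] with family=['binomial'] keeps only
                # the ('m-svm', 'binomial') pair after the intersection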
                intersection = set(family).intersection(family_list)
                for valid_family in intersection:
                    algos_to_run.append((current_algo, valid_family))

    # When the user inputs just algorithms to run
    elif algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            for f in family_list:
                algos_to_run.append((current_algo, f))

    # When the user specifies only families to run
    elif family is not None:
        for current_family in family:
            algos = ML_ALGO[current_family]
            for current_algo in algos:
                algos_to_run.append((current_algo, current_family))

    if 'data-gen' in mode:
        # Create config directories
        data_gen_config_dir = join(config_dir, 'data-gen')
        create_dir_local(data_gen_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')
        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape,
                                             data_gen_dir, DENSE_TYPE_ALGOS,
                                             data_gen_config_dir)

        for family_name, config_folders in conf_packet.items():
            for config in config_folders:
                file_name = ML_GENDATA[family_name]
                success_file = algorithm_workflow(family_name, exec_type,
                                                  config, file_name,
                                                  'data-gen', data_gen_dir)
                # The statistics families do not require the generated data to be split
                if family_name not in ['stats1', 'stats2']:
                    if not success_file:
                        exec_test_data(exec_type, backend_args_dict,
                                       systemml_args_dict, data_gen_dir,
                                       config)

    if 'train' in mode:
        # Create config directories
        train_config_dir = join(config_dir, 'train')
        create_dir_local(train_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')

        conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape,
                                           data_gen_dir, train_dir,
                                           DENSE_TYPE_ALGOS, train_config_dir)
        for algo_name, config_files in conf_packet.items():
            for config in config_files:
                file_name = ML_TRAIN[algo_name]
                algorithm_workflow(algo_name, exec_type, config, file_name,
                                   'train', train_dir)

    if 'predict' in mode:
        # Create config directories
        predict_config_dir = join(config_dir, 'predict')
        create_dir_local(predict_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')
        predict_dir = join(temp_dir, 'predict')

        algos_to_run = list(
            filter(lambda algo: check_predict(algo[0], ML_PREDICT),
                   algos_to_run))
        if len(algos_to_run) < 1:
            # No algorithms with a predict step were found
            pass
        conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape,
                                             data_gen_dir, train_dir,
                                             predict_dir, DENSE_TYPE_ALGOS,
                                             predict_config_dir)
        for algo_name, config_files in conf_packet.items():
            for config in config_files:
                file_name = ML_PREDICT[algo_name]
                algorithm_workflow(algo_name, exec_type, config, file_name,
                                   'predict', predict_dir)
Example #2
def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir,
                    mode, temp_dir, file_system_type):
    """
    This function is the entry point for performance testing

    family: List
    A family may contain one or more algorithm based on data generation script used

    algo: List
    Input algorithms

    exec_type: String
    Contains the execution type singlenode / hybrid_spark

    mat_type: List
    Type of matrix to generate dense, sparse, all

    mat_shape: List
    Dimensions of the input matrix with rows and columns

    config_dir: String
    Location to store all configuration

    mode: List
    Type of workload to run. data-gen, train ...

    temp_dir: String
    Location to store all output files created during perf test

    file_system_type: String

    """
    # algos_to_run is a list of (algorithm, family) tuples, e.g.
    # [('m-svm', 'binomial'), ('m-svm', 'multinomial'), ...]
    # It is the basic unit of work for the script runs below.
    algos_to_run = []

    # The sections below build algos_to_run for the performance test.
    # They handle algorithms like m-svm and MultiLogReg that have multiple
    # data generation scripts (dual datagen).
    # --family is taken into account only when an algorithm has multiple
    # datagen scripts.

    if family is not None and algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            if len(family_list) == 1:
                algos_to_run.append((current_algo, family_list[0]))
            else:
                intersection = set(family).intersection(family_list)
                for valid_family in intersection:
                    algos_to_run.append((current_algo, valid_family))

    # When the user inputs just algorithms to run
    elif algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            for f in family_list:
                algos_to_run.append((current_algo, f))

    # When the user specifies only families to run
    elif family is not None:
        for current_family in family:
            algos = ML_ALGO[current_family]
            for current_algo in algos:
                algos_to_run.append((current_algo, current_family))

    if 'data-gen' in mode:
        # Create config directories
        data_gen_config_dir = join(config_dir, 'data-gen')
        create_dir_local(data_gen_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')
        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir,
                                             DENSE_TYPE_ALGOS, data_gen_config_dir)

        for family_name, config_folders in conf_packet.items():
            for config in config_folders:
                file_name = ML_GENDATA[family_name]
                success_file = algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen', data_gen_dir)
                # The statistics families do not require the generated data to be split
                if family_name not in ['stats1', 'stats2']:
                    if not success_file:
                        exec_test_data(exec_type, backend_args_dict, systemml_args_dict, data_gen_dir, config)

    if 'train' in mode:
        # Create config directories
        train_config_dir = join(config_dir, 'train')
        create_dir_local(train_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')

        conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir,
                                           train_dir, DENSE_TYPE_ALGOS, train_config_dir)
        for algo_family_name, config_files in conf_packet.items():
            for config in config_files:
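                # conf_packet keys have the form '<algo>.<family>'; keep the
                # algorithm part to look up the training script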
                algo_name = algo_family_name.split('.')[0]
                file_name = ML_TRAIN[algo_name]
                algorithm_workflow(algo_family_name, exec_type, config, file_name, 'train', train_dir)

    if 'predict' in mode:
        # Create config directories
        predict_config_dir = join(config_dir, 'predict')
        create_dir_local(predict_config_dir)

        # Create output path
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')
        predict_dir = join(temp_dir, 'predict')

        algos_to_run = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
        if len(algos_to_run) < 1:
            # No algorithms with a predict step were found
            pass
        conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape, data_gen_dir,
                                             train_dir, predict_dir, DENSE_TYPE_ALGOS,
                                             predict_config_dir)

        for algo_family_name, config_files in conf_packet.items():
            for config in config_files:
                algo_name = algo_family_name.split('.')[0]
                file_name = ML_PREDICT[algo_name]
                algorithm_workflow(algo_family_name, exec_type, config, file_name, 'predict', predict_dir)
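As a usage sketch for the entry point above (a minimal, hypothetical
invocation: ML_ALGO, ML_GENDATA, ML_TRAIN and ML_PREDICT are lookup tables
defined elsewhere in the module, and the paths and matrix shape below are
placeholder values):

# ML_ALGO maps family -> algorithms; per the comments above, a dual datagen
# algorithm such as m-svm appears under both 'binomial' and 'multinomial'.
perf_test_entry(family=['binomial'],
                algo=['m-svm'],
                exec_type='singlenode',
                mat_type=['dense'],
                mat_shape=['10k_100'],  # placeholder shape value
                config_dir='/tmp/perftest/config',
                mode=['data-gen', 'train', 'predict'],
                temp_dir='/tmp/perftest',
                file_system_type='local')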
Example #3
    if args.family is not None:
        for fam in args.family:
            if fam not in ML_ALGO.keys():
                print('{} family not present in the performance test suite'.
                      format(fam))
                sys.exit()

    if args.algo is not None:
        for algo in args.algo:
            if algo not in all_algos:
                print('{} algorithm not present in the performance test suite'.
                      format(algo))
                sys.exit()

        # This section checks the validity of dual datagen algorithms like m-svm
        algo_families = {}
        for current_algo in args.algo:
            algo_families[current_algo] = get_families(current_algo, ML_ALGO)

            if len(algo_families[current_algo]) > 1:
                if args.family is None:
                    print('family should be present for {}'.format(current_algo))
                    sys.exit()

                valid_families = set(algo_families[current_algo])
                input_families = set(args.family)
                common_families = input_families.intersection(valid_families)
                if len(common_families) == 0:
                    print('Please specify a valid family for {} and the '
                          'valid families are {}'.format(current_algo,
                                                         ' '.join(valid_families)))
                    sys.exit()

    # If the user did not specify a file system, pick one from the execution
    # type: hybrid_spark runs use HDFS, singlenode runs use the local FS
    if args.file_system_type is None:
        if args.exec_type == 'hybrid_spark':
            args.file_system_type = 'hdfs'
        else:
            args.file_system_type = 'local'
    perftest_args_dict['temp_dir'] = get_default_dir(args.file_system_type, args.temp_dir, args.exec_type, default_config_dir)

    # default_mat_type validity
    if len(args.mat_type) > 2:
        print('length of --mat-type argument cannot be greater than two')
        sys.exit()

    # Set level to 0 -> debug mode
    # Set level to 20 -> Plain metrics
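The two trailing comments refer to Python logging levels. A minimal sketch of
what that configuration might look like (the hard-coded level below is an
assumption; in the real script it would come from a command-line flag):

import logging

# Level 0 (logging.NOTSET) lets every record through -> debug mode;
# level 20 (logging.INFO) keeps only the plain metrics output.
log_level = 20  # hypothetical; would normally come from a verbosity flag
logging.basicConfig(level=log_level, format='%(message)s')
logging.info('plain metrics are logged at INFO (20)')
logging.debug('this record only appears when the level is set to 0')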