Example 1
def relevant_folders(path, algo, family, matrix_type, matrix_shape, mode):
    """
    Finds the right folders to read data from, based on the given parameters

    path: String
    Location of data-gen and training folders

    algo: String
    Current algorithm being processed by this function

    family: String
    Current family being processed by this function

    matrix_type: List
    Types of matrices to generate: dense, sparse, or all

    matrix_shape: List
    Dimensions of the input matrix with rows and columns

    mode: String
    Determines which folders are read, e.g. the data-gen folder or the train folder

    return: List
    List of folder locations to read data from
    """

    folders = []

    for current_matrix_type in matrix_type:
        for current_matrix_shape in matrix_shape:
            if path.startswith('hdfs'):
                if mode == 'data-gen':
                    sub_folder_name = '.'.join([family, current_matrix_type, current_matrix_shape])
                elif mode == 'train':
                    sub_folder_name = '.'.join([algo, family, current_matrix_type, current_matrix_shape])

                # List the parent directory once and keep the sub-directories
                # whose names contain the composed sub-folder name
                cmd = ['hdfs', 'dfs', '-ls', path]
                path_subdir = subprocess_exec(' '.join(cmd), extract='dir')
                path_folders = list(filter(lambda x: contains_dir(x, sub_folder_name), path_subdir))

            else:
                if mode == 'data-gen':
                    data_gen_path = join(path, family)
                    sub_folder_name = '.'.join([current_matrix_type, current_matrix_shape])
                    path_subdir = glob.glob(data_gen_path + '.' + sub_folder_name + '*')
                elif mode == 'train':
                    train_path = join(path, algo)
                    sub_folder_name = '.'.join([family, current_matrix_type, current_matrix_shape])
                    path_subdir = glob.glob(train_path + '.' + sub_folder_name + '*')

                path_folders = list(filter(os.path.isdir, path_subdir))

            folders.append(path_folders)

    # Flatten the per-type, per-shape folder lists into a single list;
    # the initializer guards against an empty input
    folders_flat = reduce(lambda x, y: x + y, folders, [])
    return folders_flat
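
A minimal usage sketch; every argument value below is a hypothetical placeholder chosen only to illustrate the expected shape of each parameter.

folders = relevant_folders(
    path='/tmp/perf-test/data-gen',  # hypothetical local path
    algo='MultiLogReg',              # hypothetical algorithm name
    family='binomial',               # hypothetical family name
    matrix_type=['dense', 'sparse'],
    matrix_shape=['10k_100'],        # hypothetical rows/cols label
    mode='data-gen')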
Example 2
def get_default_dir(temp_dir, exec_mode, config_dir):
    """
    temp_dir: String
    Temporary directory supplied by the user, if any

    exec_mode: String
    Execution mode, singlenode / hybrid_spark

    config_dir: String
    Configuration directory to fall back to in singlenode mode

    return: String
    Local or HDFS home directory
    """

    if exec_mode == 'singlenode':
        if temp_dir is None:
            return config_dir
        return temp_dir

    if exec_mode == 'hybrid_spark':
        # Resolve the HDFS base URI (e.g. hdfs://host:port) from the
        # cluster configuration
        cmd = ['hdfs', 'getconf', '-confKey', 'fs.default.name']
        hdfs_base = subprocess_exec(' '.join(cmd), extract='hdfs_base')

        if temp_dir is None:
            hdfs_home = join(hdfs_base, 'user', getpass.getuser())
            check_hdfs_path(hdfs_home)
            return hdfs_home

        if temp_dir.startswith('hdfs'):
            return temp_dir
        # Treat a relative temp_dir as a sub-directory of the user's HDFS home
        return join(hdfs_base, 'user', getpass.getuser(), temp_dir)
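
A brief usage sketch of both modes; the directory names are illustrative placeholders, not values from the original suite.

# Singlenode: fall back to the configured directory when no temp dir is given
local_dir = get_default_dir(None, 'singlenode', './conf')

# Hybrid Spark: a relative temp dir is resolved under the user's HDFS home
hdfs_dir = get_default_dir('perf-tmp', 'hybrid_spark', './conf')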
Example 3
def exec_dml_and_parse_time(exec_type,
                            dml_file_name,
                            args,
                            backend_args_dict,
                            systemml_args_dict,
                            log_file_name=None):
    """
    Executes the given DML script with its arguments in a Python subprocess
    and parses the execution time from the subprocess output

    exec_type: String
    Contains the execution type singlenode / hybrid_spark

    dml_file_name: String
    DML file name to be used while processing the given arguments

    args: Dictionary
    Key-value pairs depending on the arg type

    backend_args_dict: Dictionary
    Spark configuration arguments / singlenode config arguments

    systemml_args_dict: Dictionary
    Supplementary arguments required by the script

    log_file_name: String
    Path to write the logfile

    return: String
    Execution time parsed from the logs, or an error string
    """

    algorithm = dml_file_name + '.dml'

    sup_args = ' '.join(
        ['{} {}'.format(k, v) for k, v in systemml_args_dict.items()])
    if exec_type == 'singlenode':
        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin',
                           'systemml-standalone.py')
        singlenode_pre_args = ''.join(
            [' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
        args = ' '.join(['{} {}'.format(k, v) for k, v in args.items()])
        cmd = [
            exec_script, singlenode_pre_args, '-f', algorithm, args, sup_args
        ]
        cmd_string = ' '.join(cmd)

    elif exec_type == 'hybrid_spark':
        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin',
                           'systemml-spark-submit.py')
        spark_pre_args = ''.join(
            [' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
        args = ' '.join(['{} {}'.format(k, v) for k, v in args.items()])
        cmd = [exec_script, spark_pre_args, '-f', algorithm, args, sup_args]
        cmd_string = ' '.join(cmd)

    time = subprocess_exec(cmd_string, log_file_name, 'time')

    return time
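
A hypothetical invocation: the dictionaries mirror the structure the function expects, but every key and value shown is a placeholder, not an option confirmed by the original suite.

time_taken = exec_dml_and_parse_time(
    exec_type='hybrid_spark',
    dml_file_name='GLM',                          # resolves to GLM.dml
    args={'-nvargs': 'X=data/X Y=data/Y'},        # placeholder DML arguments
    backend_args_dict={'--driver-memory': '4g'},  # placeholder Spark config
    systemml_args_dict={'-stats': ''},            # placeholder extra args
    log_file_name='logs/GLM.log')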
Example 4
def check_hdfs_path(path):
    """
    Check that a path exists in HDFS; exit with a message if it does not
    """
    cmd = ['hdfs', 'dfs', '-test', '-e', path]
    return_code = subprocess_exec(' '.join(cmd))
    if return_code != 0:
        sys.exit('Please create {}'.format(path))
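
Usage is a single guarded call; if the path is missing, the process exits with a message (the path below is a placeholder).

check_hdfs_path('hdfs://namenode/user/perf/data-gen')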
Example 5
def write_success(time, path):
    """
    Write a _SUCCESS marker file in the given directory

    time: String
    Time taken to execute the DML script

    path: String
    Location to write the _SUCCESS file
    """
    if 'data-gen' in path:
        # A valid elapsed time looks like '12.34' (exactly one dot);
        # error strings do not, so no marker is written for failed runs
        if path.startswith('hdfs') and len(time.split('.')) == 2:
            full_path = join(path, '_SUCCESS')
            cmd = ['hdfs', 'dfs', '-touchz', full_path]
            subprocess_exec(' '.join(cmd))
        elif len(time.split('.')) == 2:
            full_path = join(path, '_SUCCESS')
            open(full_path, 'w').close()
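
A short usage sketch; the time string and path are illustrative. The marker is written only because the path contains 'data-gen' and the time string parses like a float (one dot).

write_success('42.73', '/tmp/perf-test/data-gen/binomial.dense.10k_100')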