# Standard-library imports used by the helpers below; subprocess_exec and
# contains_dir are helper functions defined elsewhere in this suite.
import os
import sys
import glob
import getpass

from os.path import join
from functools import reduce


def relevant_folders(path, algo, family, matrix_type, matrix_shape, mode):
    """
    Finds the right folders to read the data from, based on the given parameters

    path: String
    Location of the data-gen and training folders

    algo: String
    Current algorithm being processed by this function

    family: String
    Current family being processed by this function

    matrix_type: List
    Type of matrix to generate: dense, sparse, all

    matrix_shape: List
    Dimensions of the input matrix (rows and columns)

    mode: String
    Which kind of folder to read, e.g. the data-gen folder or the train folder

    return: List
    List of folder locations to read data from
    """

    folders = []

    for current_matrix_type in matrix_type:
        for current_matrix_shape in matrix_shape:
            if path.startswith('hdfs'):
                # On HDFS, list the base path and keep the sub-directories whose
                # names match the family/algorithm, matrix type and shape.
                if mode == 'data-gen':
                    sub_folder_name = '.'.join([family, current_matrix_type, current_matrix_shape])
                    cmd = ['hdfs', 'dfs', '-ls', path]
                    path_subdir = subprocess_exec(' '.join(cmd), extract='dir')

                if mode == 'train':
                    sub_folder_name = '.'.join([algo, family, current_matrix_type, current_matrix_shape])
                    cmd = ['hdfs', 'dfs', '-ls', path]
                    path_subdir = subprocess_exec(' '.join(cmd), extract='dir')

                path_folders = list(filter(lambda x: contains_dir(x, sub_folder_name), path_subdir))
            else:
                # On the local file system, glob for matching directories.
                if mode == 'data-gen':
                    data_gen_path = join(path, family)
                    sub_folder_name = '.'.join([current_matrix_type, current_matrix_shape])
                    path_subdir = glob.glob(data_gen_path + '.' + sub_folder_name + '*')

                if mode == 'train':
                    train_path = join(path, algo)
                    sub_folder_name = '.'.join([family, current_matrix_type, current_matrix_shape])
                    path_subdir = glob.glob(train_path + '.' + sub_folder_name + '*')

                path_folders = list(filter(lambda x: os.path.isdir(x), path_subdir))

            folders.append(path_folders)

    folders_flat = reduce(lambda x, y: x + y, folders)

    return folders_flat
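
# Hedged usage sketch, not part of the original suite: the base path, algorithm,
# family, and matrix settings below are hypothetical placeholders.
def _example_relevant_folders():
    """Show how relevant_folders() might collect local data-gen folders."""
    # For a local path this would glob for directories such as
    # /tmp/perftest/binomial.dense.10k_100* on the local file system.
    return relevant_folders('/tmp/perftest', 'MultiLogReg', 'binomial',
                            ['dense'], ['10k_100'], 'data-gen')
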
def get_default_dir(temp_dir, exec_mode, config_dir):
    """
    temp_dir: String
    Temporary directory requested by the user (may be None)

    exec_mode: String
    Execution mode, singlenode or hybrid_spark

    config_dir: String
    Default directory to fall back to when no temp_dir is given

    return: String
    Local or HDFS home directory
    """

    if exec_mode == 'singlenode':
        if temp_dir is None:
            return config_dir
        if temp_dir is not None:
            return temp_dir

    if exec_mode == 'hybrid_spark':
        # Resolve the HDFS base URI from the cluster configuration.
        cmd = ['hdfs', 'getconf', '-confKey', 'fs.default.name']
        hdfs_base = subprocess_exec(' '.join(cmd), extract='hdfs_base')

        if temp_dir is None:
            hdfs_home = join(hdfs_base, 'user', getpass.getuser())
            check_hdfs_path(hdfs_home)
            return hdfs_home

        if temp_dir is not None:
            if temp_dir.startswith('hdfs'):
                return temp_dir
            else:
                # A relative temp_dir is placed under the user's HDFS home.
                hdfs_home = join(hdfs_base, 'user', getpass.getuser(), temp_dir)
                return hdfs_home
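
# Hedged usage sketch, not part of the original suite: 'conf/perftest' is a
# hypothetical configuration directory.
def _example_get_default_dir():
    """Resolve the working directory for a singlenode run."""
    # With no temp_dir given, singlenode runs fall back to the config directory.
    return get_default_dir(None, 'singlenode', 'conf/perftest')
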
def exec_dml_and_parse_time(exec_type, dml_file_name, args, backend_args_dict,
                            systemml_args_dict, log_file_name=None):
    """
    Executes the DML script with the given arguments in a Python subprocess
    and parses the execution time from the output of that subprocess

    exec_type: String
    Contains the execution type, singlenode / hybrid_spark

    dml_file_name: String
    DML file name to be used while processing the arguments given

    args: Dictionary
    Key-value pairs depending on the arg type

    backend_args_dict: Dictionary
    Spark configuration arguments / singlenode configuration arguments

    systemml_args_dict: Dictionary
    Supplementary arguments required by the script

    log_file_name: String
    Path to write the logfile to

    return: String
    The value of time parsed from the logs / error
    """

    algorithm = dml_file_name + '.dml'

    sup_args = ''.join(['{} {}'.format(k, v) for k, v in systemml_args_dict.items()])

    if exec_type == 'singlenode':
        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-standalone.py')

        singlenode_pre_args = ''.join([' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
        args = ''.join(['{} {}'.format(k, v) for k, v in args.items()])
        cmd = [exec_script, singlenode_pre_args, '-f', algorithm, args, sup_args]
        cmd_string = ' '.join(cmd)

    if exec_type == 'hybrid_spark':
        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-spark-submit.py')

        spark_pre_args = ''.join([' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
        args = ''.join(['{} {}'.format(k, v) for k, v in args.items()])
        cmd = [exec_script, spark_pre_args, '-f', algorithm, args, sup_args]
        cmd_string = ' '.join(cmd)

    time = subprocess_exec(cmd_string, log_file_name, 'time')

    return time
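
# Hedged usage sketch, not part of the original suite: the argument values below
# (data paths, driver memory, statistics flag, log path) are hypothetical
# placeholders and assume SYSTEMML_HOME is set in the environment.
def _example_exec_dml_and_parse_time():
    """Run a DML script on the hybrid_spark backend and return the parsed time."""
    args = {'-nvargs': 'X=/tmp/perftest/X Y=/tmp/perftest/Y'}
    backend_args = {'--driver-memory': '4g'}
    systemml_args = {'-stats': ''}
    return exec_dml_and_parse_time('hybrid_spark', 'MultiLogReg', args,
                                   backend_args, systemml_args,
                                   log_file_name='/tmp/perftest/logs/multilogreg.log')
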
def check_hdfs_path(path):
    """
    Check if a path is present in HDFS
    """
    cmd = ['hdfs', 'dfs', '-test', '-e', path]
    return_code = subprocess_exec(' '.join(cmd))
    if return_code != 0:
        # 'hdfs dfs -test -e' returns a non-zero code when the path is missing.
        sys.exit('Please create {}'.format(path))
def write_success(time, path):
    """
    Write a _SUCCESS file in the given directory

    time: String
    Time taken to execute the dml script

    path: String
    Location to write the _SUCCESS file to
    """
    # A valid time looks like '12.34'; any other string (e.g. an error message)
    # does not split into two parts on '.' and no marker file is written.
    if 'data-gen' in path:
        if path.startswith('hdfs') and len(time.split('.')) == 2:
            full_path = join(path, '_SUCCESS')
            cmd = ['hdfs', 'dfs', '-touchz', full_path]
            subprocess_exec(' '.join(cmd))
        else:
            if len(time.split('.')) == 2:
                full_path = join(path, '_SUCCESS')
                open(full_path, 'w').close()
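
# Hedged usage sketch, not part of the original suite: the time string and the
# data-gen output path below are hypothetical.
def _example_write_success():
    """Mark a finished local data-gen run so it can be detected as complete."""
    # '12.34' splits into two parts on '.', so a local _SUCCESS file is created;
    # a time string without a decimal point would be ignored.
    write_success('12.34', '/tmp/perftest/data-gen/binomial.dense.10k_100')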