def create_dir_structure(directory_path, create_sub_dir):
    """Creates required directory structures inside the parent directory figures.

    Args:
        directory_path: string
            Given path that already exists.

        create_sub_dir: string
            Sub directory to create a given folder path.

    Returns:
        Returns back the created directory.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    # Build the nested structure one segment at a time.
    for directory in create_sub_dir.split("/"):
        directory_path += "/" + directory
        # exist_ok=True replaces the original exists()/makedirs() pair,
        # closing the race where the directory appears between the two calls.
        os.makedirs(directory_path, exist_ok=True)

    return correct_directory_path(directory_path)
def get_all_files_from_path(directory_path, file_extension=None):
    """Gets all filenames with the provided path.

    Args:
        directory_path: string
            Given path that already exists.

        file_extension: string
            Only return files that have a given extension.

    Returns:
        Returns back a set a filenames with the provided path.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    # Only the top level of the directory is inspected (first os.walk yield).
    for _, _, filenames in os.walk(directory_path):
        if file_extension:
            suffix = f'.{file_extension.replace(".", "")}'
            return set(name for name in filenames if name.endswith(suffix))
        return set(filenames)

    # os.walk produced nothing; no files to report.
    return set()
def dict_to_json_file(dict_obj, directory_path, filename, remove_file_extension=True):
    """Writes a dict to a json file.

    Args:
        dict_obj: dict
            Dictionary object.

        directory_path: string
            Given path that already exists.

        filename: string
            Json file's name.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    filename = convert_to_filename(filename,
                                   remove_file_extension=remove_file_extension)

    # BUG FIX: the sanitized 'filename' was computed but never used — the
    # output path contained the literal text "(unknown)" instead of the name.
    with open(f'{directory_path}{filename}.json', 'w', encoding='utf-8') as outfile:
        json.dump(dict_obj, outfile, ensure_ascii=False, indent=2)
def write_object_text_to_file(obj, directory_path, filename, remove_file_extension=True):
    """Writes the object's string representation to a text file.

    Args:
        obj: any
            Any object that has a string 'repr'.

        directory_path: string
            Given path that already exists.

        filename: string
            Text file's name.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    filename = convert_to_filename(filename,
                                   remove_file_extension=remove_file_extension)

    # BUG FIX: the sanitized 'filename' was computed but never used — the
    # output path contained the literal text "(unknown)" instead of the name.
    # Also swapped the raw open()/close() pair for a 'with' block so the
    # handle is released even if write() raises.
    file_dir = f'{directory_path}{filename}.txt'
    with open(file_dir, 'w', encoding='utf-8') as f:
        f.write('obj = ' + repr(obj) + '\n')
def pickle_object_to_file(obj, directory_path, filename, remove_file_extension=True):
    """Writes the object to a pickle file.

    Args:
        obj: any object
            Any python object that can be pickled.

        directory_path: string
            Given path that already exists.

        filename: string
            Pickle file's name.

    Returns:
        The full path of the created pickle file.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    # Ensures no file extensions in filename
    filename = convert_to_filename(filename,
                                   remove_file_extension=remove_file_extension)

    # BUG FIX: the sanitized 'filename' was computed but never used — the
    # output path contained the literal text "(unknown)" instead of the name.
    file_dir = f'{directory_path}{filename}.pkl'

    # BUG FIX: the original 'finally: list_pickle.close()' raised a
    # NameError whenever an exception fired before open() was reached;
    # a 'with' block closes the handle safely in every case.
    with open(file_dir, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)

    return file_dir
def __init__(self, dataset_name, overwrite_full_path=None): """ Args: dataset_name: string Sub directory to create on top of the directory 'PARENT_OUTPUT_FOLDER_NAME'. overwrite_full_path: string The passed directory path must already exist. Will completely ignore the project name and attempt to point to this already created directory. """ # Setup project structure if not overwrite_full_path: parent_structure = "/" + SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME \ + "/" + dataset_name + "/" create_dir_structure(os.getcwd(), parent_structure) tmp_path = correct_directory_path( os.getcwd() + parent_structure) # Trusting the user that this path must already exist else: overwrite_full_path = correct_directory_path(overwrite_full_path) # Path doesn't contain eflow's main output if f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/" not in overwrite_full_path: raise UnsatisfiedRequirments(f"Directory path must have {SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME} " f"as a directory name or this program will not work correctly.") # Unknown path found if not os.path.exists(overwrite_full_path): raise SystemError("The path must already be defined in full on " "your system to use a different directory " "structure than orginally intended.") tmp_path = overwrite_full_path from eflow._hidden.general_objects import enum self.__PROJECT = enum(PATH_TO_OUTPUT_FOLDER=tmp_path, RELATIVE_PATH_TO_OUTPUT_FOLDER=tmp_path.split(f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/")[1])
def __create_dataframe_snapshot_json_file(self, df, output_folder_path):
    """Creates a json file based on the dataframe's generated snapshot dict.

    Args:
        df: pd.Dataframe
            Pandas Dataframe object

        output_folder_path: string
            Output path the json object will move to.
    """
    snapshot_dict = self.__generate_dataframe_snapshot_dict(df)
    dict_to_json_file(snapshot_dict,
                      correct_directory_path(output_folder_path),
                      "Dataframe Snapshot")
def create_plt_png(directory_path, sub_dir, filename, sharpness=1.7):
    """Saves the plt based image in the correct directory.

    Args:
        directory_path:
            Already existing directory path.

        sub_dir:
            Directory structure to create on top of the already generated
            path of 'directory_path'.

        filename:
            Filename to save into the full path of
            'directory_path' + 'sub_dir'.

        sharpness:
            Changes the image's sharpness to look better.
    """
    directory_path = correct_directory_path(directory_path)

    # Ensure directory structure is init correctly
    abs_path = create_dir_structure(directory_path,
                                    sub_dir)

    # Ensure file ext is on the file.
    if not filename.endswith(".png"):
        filename += ".png"

    full_path = abs_path + "/" + filename
    plt.savefig(full_path, bbox_inches='tight')

    if sharpness:
        # BUG FIX: sharpening previously re-built the path as
        # directory_path + sub_dir + "/" + filename, which can differ from
        # the normalized path the image was actually saved to; reuse the
        # exact saved path instead.
        adjust_sharpness(full_path,
                         full_path,
                         sharpness)
def get_unique_directory_path(directory_path, folder_name):
    """Iterate through directory structure until a unique folder name can be found.

    Note:
        Keeps changing the folder name by appending 1 each iteration.

    Args:
        directory_path: string
            Given path that already exists.

        folder_name: string
            Folder name to compare against other directories that exist in
            the directory_path.

    Returns:
        Returns back a directory path with a unique folder name.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)
    create_dir_structure(directory_path=directory_path,
                         create_sub_dir="")

    # Try the bare folder name first, then "name 1", "name 2", ... until
    # a path that does not yet exist is found.
    candidate = f'{directory_path}{folder_name}'
    attempt = 0
    while os.path.exists(candidate):
        attempt += 1
        candidate = f'{directory_path}{folder_name} {attempt}'

    return candidate
def get_all_directories_from_path(directory_path):
    """Gets directories names with the provided path.

    Args:
        directory_path: string
            Given path that already exists.

    Returns:
        Returns back a set a directories with the provided path.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    # os.walk's first yield lists the immediate sub-directories only;
    # returning from inside the loop mirrors the original's break.
    for _, dirnames, _ in os.walk(directory_path):
        return set(dirnames)

    return set()
def __init__(self, pipeline_name, df, df_features=None, pipeline_modify_id=None, remove_past_contents=False):
    """
    Args:
        pipeline_name (str):
            Points to/generates a folder based on the pipeline's name.

        pipeline_modify_id (str,NoneType):
            If set to 'None' then will point the 'root' or the main template
            of the pipeline.

        remove_past_contents:
            If an already existing folder exists for this then move to
            eflow's personal garbage.
    """
    # Set up directory structure
    dir_path_to_pipeline = correct_directory_path(
        f"{os.getcwd()}/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/_Extras/Pipeline Structure/Data Pipeline/{pipeline_name}"
    )

    configure_existing_file = False

    # Get json proper file name
    if pipeline_modify_id:
        json_file = f"{pipeline_modify_id.split('.')[0]}.json"
    else:
        json_file = "root_pipeline.json"

    self.__df_features = None

    # Check if folder/file exist for the pipeline
    if os.path.exists(dir_path_to_pipeline):
        # NOTE(review): dir_path_to_pipeline already ends with a separator
        # (correct_directory_path) — presumably plain concatenation is safe;
        # confirm against correct_directory_path's contract.
        if os.path.exists(dir_path_to_pipeline + json_file):
            print(f"The file '{json_file}' exist!")
            FileOutput.__init__(
                self,
                f'_Extras/Pipeline Structure/Data Pipeline/{pipeline_name}'
            )
            configure_existing_file = True
        else:
            raise PipelineError(f"The file '{json_file}' does not exist!")

    # Create/Load in df_features to given object. 
    if os.path.exists(dir_path_to_pipeline + "df_features.json"):
        # A saved feature-type file exists — load it, ignoring any
        # df_features argument the caller passed.
        df_features = DataFrameTypes(None)
        df_features.init_on_json_file(dir_path_to_pipeline + "df_features.json")
    else:
        if df_features is None:
            raise PipelineError(
                "When initializing a data pipeline structure "
                "you must pass a DataFrameTypes object with "
                "the correctly defined types!")
        # Create file representation
        else:
            df_features.create_json_file_representation(
                dir_path_to_pipeline,
                "df_features")

    # -----
    # NOTE(review): this None-check is unreachable in practice — both
    # branches above either assign df_features or raise already.
    if df_features is None:
        raise PipelineError("When initializing a data pipeline structure "
                            "you must pass a DataFrameTypes object with "
                            "the correctly defined types!")

    self.__df_features = copy.deepcopy(df_features)

    # Check if root file exist or if pipeline modify id
    self.__pipeline_name = copy.deepcopy(pipeline_name)
    self.__pipeline_segment_deque = deque()
    self.__pipeline_segment_names = set()
    self.__pipeline_segment_path_id = set()
    self.__pipeline_modify_id = copy.deepcopy(pipeline_modify_id)
    self.__json_file_name = json_file

    # Json file does exist; init DataPipeline object correctly
    if configure_existing_file:
        if remove_past_contents:
            print("Moving past contents to eFlow's garbage.")
            move_folder_to_eflow_garbage(dir_path_to_pipeline,
                                         "Data Pipelines")
        else:
            print(
                "Now configuring object with proper pipeline segments...")
            self.__configure_pipeline_with_existing_file()

            # NOTE(review): '^' is a symmetric difference — a feature present
            # in df_features but absent from df.columns would also be
            # iterated here and make df.drop raise; confirm intended.
            for removal_feature in set(df.columns) ^ set(
                    df_features.all_features()):
                print(f"Removing the feature: \"{removal_feature}\"")
                df.drop(columns=removal_feature,
                        inplace=True)
def generate_meta_data(df, output_folder_path, sub_dir):
    """Creates files representing the shape and feature types of the dataframe.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object

        output_folder_path: str
            Pre defined path to already existing directory to output file(s).

        sub_dir: str
            Path to be possibly generated.

    Returns:
        Creates meta data on the passed datafrane.
    """
    create_dir_structure(output_folder_path,
                         correct_directory_path(sub_dir + "/Meta Data"))
    output_folder_path = correct_directory_path(output_folder_path)

    # All artifacts land under <output_folder_path>/<sub_dir>/Meta Data.
    target_dir = f"{output_folder_path}/{sub_dir}"

    # Create files relating to dataframe's shape
    shape_table = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                          'Columns': [df.shape[1]]})
    if shape_table.shape[0]:
        df_to_image(shape_table,
                    target_dir,
                    "Meta Data",
                    "Dataframe Shape Table",
                    show_index=False)
        write_object_text_to_file(shape_table.to_dict('records'),
                                  f"{target_dir}/Meta Data",
                                  "Dataframe Shape Text")

    # Create files relating to dataframe's types
    type_table = data_types_table(df)
    if type_table.shape[0]:
        df_to_image(type_table,
                    target_dir,
                    "Meta Data",
                    "Dataframe Types Table",
                    show_index=True)
        plt.close("all")

    # Missing value table
    missing_table = missing_values_table(df)
    if missing_table.shape[0]:
        df_to_image(missing_table,
                    target_dir,
                    "Meta Data",
                    "Missing Data Table",
                    show_index=True)
        plt.close("all")
def df_to_image(df, directory_path, sub_dir, filename, sharpness=1.7,
                col_width=8, row_height=0.625, font_size=14,
                header_color='#40466e', row_colors=('#f1f1f2', 'w'),
                edge_color='w', bbox=(0, 0, 1, 1), header_columns=0,
                ax=None, show_index=False, index_color="#add8e6",
                format_float_pos=None, show_plot=False, **kwargs):
    """Renders a dataframe as a styled matplotlib table and saves it as a png.

    Args:
        df: pd.DataFrame
            Dataframe to render (a deep copy is made; caller's df is untouched).
        directory_path: str
            Already existing directory path.
        sub_dir: str
            Directory structure created on top of 'directory_path'.
        filename: str
            Name of the png file to write.
        sharpness: float
            Forwarded to create_plt_png to post-process the image.
        col_width, row_height, font_size:
            Table cell geometry and text size.
        header_color, row_colors, edge_color, index_color:
            Color scheme; row_colors alternate per row.
        bbox: sequence of 4 floats
            Table bounding box passed to ax.table.
        header_columns: int
            Columns left of this index are styled like the header row.
        ax: matplotlib Axes or None
            Target axes; a new figure is created when None.
        show_index: bool
            Render the dataframe index as the first column.
        format_float_pos: int or None
            When >= 1, format all float columns to that many decimals.
        show_plot: bool
            Call plt.show() after saving.
        **kwargs:
            Extra keyword arguments forwarded to ax.table.
    """
    # FIX: row_colors/bbox defaults were mutable lists shared across calls;
    # tuples are safe and behave identically for indexing/len and ax.table.
    directory_path = correct_directory_path(directory_path)
    df = copy.deepcopy(df)

    # Optionally fix float columns to a set number of decimal places.
    if format_float_pos and format_float_pos >= 1:
        float_format = '{:,.' + str(format_float_pos) + 'f}'
        for col_feature in set(df.select_dtypes(include=["float"]).columns):
            df[col_feature] = df[col_feature].map(float_format.format)

    if ax is None:
        # Figure size derived from the table's (cols, rows) footprint;
        # +1 row accounts for the header.
        size = (np.array(df.shape[::-1]) + np.array([0, 1])) * \
               np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    if show_index:
        df.reset_index(inplace=True)

    mpl_table = ax.table(cellText=df.values,
                         bbox=bbox,
                         colLabels=df.columns,
                         **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    # FIX: replaced six.iteritems(mpl_table._cells) — a py2 shim over a
    # private attribute — with the public, equivalent get_celld().items().
    for cell_key, cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        # Row 0 is the header; columns before header_columns get header style.
        if cell_key[0] == 0 or cell_key[1] < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        elif index_color and show_index and cell_key[1] == 0:
            cell.set_facecolor(index_color)
        else:
            cell.set_facecolor(row_colors[cell_key[0] % len(row_colors)])

    if not sub_dir:
        sub_dir = ""

    create_plt_png(directory_path,
                   sub_dir,
                   filename,
                   sharpness)

    if show_plot:
        plt.show()
    plt.close("all")