def generate_matrix_meta_data(self, X, sub_dir):
    """
    Generates files/graphics in the proper directory for the matrix.

    Args:
        X: list of lists, numpy array of numpy arrays, or numpy matrix
            Matrix-like object to describe.

        sub_dir: string
            Sub directory to append to the pre-defined folder path.
    """
    # Convert to numpy array if possible
    X = np.array(X)

    create_dir_structure(self.folder_path,
                         correct_directory_path(sub_dir + "/Meta Data"))

    output_folder_path = correct_directory_path(self.folder_path)

    # Create files relating to the matrix's shape
    shape_df = pd.DataFrame.from_dict({'Rows': [X.shape[0]],
                                       'Columns': [X.shape[1]]})

    df_to_image(shape_df,
                f"{output_folder_path}/{sub_dir}",
                "Meta Data",
                "Matrix Shape Table",
                show_index=False)
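# Usage sketch (not part of the original module). A minimal, hypothetical
# example assuming `modeler` is an eflow object exposing `folder_path`:
#
#   X = [[1, 2, 3],
#        [4, 5, 6]]
#   modeler.generate_matrix_meta_data(X, "PCA")
#   # -> writes "Matrix Shape Table.png" under <folder_path>/PCA/Meta Data/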
def move_folder_to_eflow_garbage(directory_path, create_sub_dir=None):
    """
    Renames and moves the given folder to a folder labeled 'Garbage' for
    the user/system to later handle.

    Args:
        directory_path:
            Path to the folder to move to 'Garbage'.

        create_sub_dir:
            If the folder 'Garbage' needs further organization, then you
            can specify a sub folder for the given folder to be embedded in.
    """
    directory_path = correct_directory_path(directory_path)
    check_if_directory_exists(directory_path)

    if not create_sub_dir:
        create_sub_dir = ""
    else:
        create_sub_dir = correct_directory_path(create_sub_dir)

    garbage_folder_path = create_dir_structure(
        os.getcwd(),
        f"{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/_Extras/Garbage/{create_sub_dir}")

    path_to_folder, folder_name = directory_path[:-1].rsplit('/', 1)

    # Generate a unique name so the folder doesn't collide inside 'Garbage'.
    _, folder_name = get_unique_directory_path(garbage_folder_path,
                                               folder_name).rsplit('/', 1)

    os.rename(directory_path,
              f'{path_to_folder}/{folder_name}')
    shutil.move(f'{path_to_folder}/{folder_name}',
                garbage_folder_path)
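# Usage sketch (not part of the original module; paths are illustrative):
#
#   # Move an old dataset folder into eflow's garbage, filed under "Datasets":
#   move_folder_to_eflow_garbage("eflow Data/Old Project/",
#                                create_sub_dir="Datasets")
#   # -> <cwd>/<PARENT_OUTPUT_FOLDER_NAME>/_Extras/Garbage/Datasets/Old Project/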
def __init__(self, dataset_name, overwrite_full_path=None):
    """
    Args:
        dataset_name: string
            Sub directory to create on top of the directory
            'PARENT_OUTPUT_FOLDER_NAME'.

        overwrite_full_path: string
            The passed directory path must already exist. Will completely
            ignore the project name and attempt to point to this already
            created directory.
    """
    # Setup project structure
    if not overwrite_full_path:
        parent_structure = "/" + SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME \
                           + "/" + dataset_name + "/"

        create_dir_structure(os.getcwd(), parent_structure)
        tmp_path = correct_directory_path(os.getcwd() + parent_structure)

    # Trusting the user that this path must already exist
    else:
        overwrite_full_path = correct_directory_path(overwrite_full_path)

        # Path doesn't contain eflow's main output folder
        if f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/" not in overwrite_full_path:
            raise UnsatisfiedRequirments(
                f"Directory path must have {SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME} "
                f"as a directory name or this program will not work correctly.")

        # Unknown path found
        if not os.path.exists(overwrite_full_path):
            raise SystemError("The path must already be defined in full on "
                              "your system to use a different directory "
                              "structure than originally intended.")

        tmp_path = overwrite_full_path

    from eflow._hidden.general_objects import enum
    self.__PROJECT = enum(
        PATH_TO_OUTPUT_FOLDER=tmp_path,
        RELATIVE_PATH_TO_OUTPUT_FOLDER=tmp_path.split(
            f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/")[1])
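# Usage sketch (hypothetical; assumes this __init__ belongs to eflow's base
# file-output object, here called FileOutput for illustration):
#
#   # Default: builds <cwd>/<PARENT_OUTPUT_FOLDER_NAME>/Titanic Data/
#   obj = FileOutput("Titanic Data")
#
#   # Or point at an already existing eflow directory:
#   obj = FileOutput("ignored name",
#                    overwrite_full_path="/home/me/project/eflow Data/Titanic Data/")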
def __create_json_pipeline_file(self):
    """
    Creates a dict based on the given contents of the variable
    'self.__pipeline_segment_deque' and converts it to a json file. This
    file will later be used to instruct our object to execute specific code.
    """
    # -------------
    json_dict = dict()

    segment_order = 1
    json_dict["Pipeline Name"] = self.__pipeline_name
    json_dict["Pipeline Segment Order"] = dict()

    for segment_name, segment_path_id, pipeline_segment_obj in self.__pipeline_segment_deque:
        json_dict["Pipeline Segment Order"][segment_order] = dict()
        json_dict["Pipeline Segment Order"][segment_order][
            "Pipeline Segment Path"] = segment_path_id
        json_dict["Pipeline Segment Order"][segment_order][
            "Pipeline Segment Type"] = pipeline_segment_obj.__class__.__name__
        json_dict["Pipeline Segment Order"][segment_order][
            "Pipeline Segment Name"] = segment_name
        json_dict["Pipeline Segment Order"][segment_order][
            "Pipeline Segment ID"] = segment_path_id.split("/")[-1].split(".")[0]

        segment_order += 1

    json_dict["Pipeline Segment Count"] = segment_order - 1

    # Create a folder for all non-root json files.
    if self.__pipeline_modify_id:
        create_dir_structure(self.folder_path,
                             "/Modified Pipelines")
        dict_to_json_file(json_dict,
                          self.folder_path + "/Modified Pipelines",
                          self.__json_file_name)
    # Root json files only
    else:
        dict_to_json_file(json_dict,
                          self.folder_path,
                          self.__json_file_name)
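# Resulting json layout (derived from the code above; segment names, ids, and
# paths are illustrative). Note that the integer order keys become strings
# once serialized, which is why readers index with str(i):
#
#   {
#       "Pipeline Name": "Cleaning Pipeline",
#       "Pipeline Segment Order": {
#           "1": {
#               "Pipeline Segment Path": ".../DataTransformer/abc123.json",
#               "Pipeline Segment Type": "DataTransformer",
#               "Pipeline Segment Name": "Remove nulls",
#               "Pipeline Segment ID": "abc123"
#           }
#       },
#       "Pipeline Segment Count": 1
#   }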
def __save_update_best_model_clusters(self):

    create_dir_structure(self.folder_path,
                         "_Extras")

    pickle_object_to_file(self.__models_suggested_clusters,
                          self.folder_path + "_Extras",
                          "All suggested clusters")

    write_object_text_to_file(self.__models_suggested_clusters,
                              self.folder_path + "_Extras",
                              "All suggested clusters")

    all_clusters = []
    for model_name, best_clusters in self.__models_suggested_clusters.items():
        write_object_text_to_file(best_clusters,
                                  self.folder_path + "_Extras",
                                  f"{model_name} suggested clusters")
        all_clusters += best_clusters

    write_object_text_to_file(round(sum(all_clusters) / len(all_clusters)),
                              self.folder_path + "_Extras",
                              "Average of suggested clusters")
def create_plt_png(directory_path, sub_dir, filename, sharpness=1.7):
    """
    Saves the plt based image in the correct directory.

    Args:
        directory_path:
            Already existing directory path.

        sub_dir:
            Directory structure to create on top of the already generated
            path of 'directory_path'.

        filename:
            Filename to save into the full path of
            'directory_path' + 'sub_dir'.

        sharpness:
            Changes the image's sharpness to look better.
    """
    directory_path = correct_directory_path(directory_path)

    # Ensure directory structure is init correctly
    abs_path = create_dir_structure(directory_path,
                                    sub_dir)

    # Ensure the file extension is on the filename.
    if filename[-4:] != ".png":
        filename += ".png"

    plt.savefig(abs_path + "/" + filename, bbox_inches='tight')

    if sharpness:
        full_path = directory_path + sub_dir + "/" + filename
        adjust_sharpness(full_path,
                         full_path,
                         sharpness)
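# Usage sketch (not part of the original module; the path is illustrative):
#
#   plt.plot([1, 2, 3], [2, 4, 8])
#   create_plt_png(os.getcwd() + "/eflow Data/Example/",
#                  "Graphics",
#                  "Growth Curve")    # ".png" is appended automatically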
def remove_unconnected_pipeline_segments():
    """
    Removes all pipeline segments that aren't connected to a pipeline
    structure.
    """
    pipeline_struct_dir = os.getcwd() + \
        f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/_Extras/Pipeline Structure/"

    if not os.path.exists(pipeline_struct_dir):
        print("Project structure for pipelines has yet to be initialized. "
              "Can't clean/remove any files related to pipelines...")
    else:
        segment_dict = dict()

        # Get all segment files by their types.
        if os.path.exists(pipeline_struct_dir + "/Data Pipeline Segments/"):
            all_segment_dirs = get_all_directories_from_path(
                pipeline_struct_dir + "/Data Pipeline Segments")

            for segment_type in all_segment_dirs:
                segment_dict[segment_type] = get_all_files_from_path(
                    pipeline_struct_dir +
                    f"/Data Pipeline Segments/{segment_type}")

        # Remove every segment referenced by a pipeline from the dict;
        # whatever remains is unconnected.
        if os.path.exists(pipeline_struct_dir + "/Data Pipeline/"):
            all_pipeline_dirs = get_all_directories_from_path(
                pipeline_struct_dir + "/Data Pipeline/")

            for pipeline_name in all_pipeline_dirs:
                json_file = json_file_to_dict(
                    f"{pipeline_struct_dir}/Data Pipeline/{pipeline_name}/root_pipeline.json")

                for i in range(1, json_file["Pipeline Segment Count"] + 1):
                    segment_id = json_file["Pipeline Segment Order"][
                        str(i)]['Pipeline Segment ID']
                    segment_type = json_file["Pipeline Segment Order"][
                        str(i)]['Pipeline Segment Type']

                    if segment_type in segment_dict.keys() and \
                            segment_id + ".json" in segment_dict[segment_type]:
                        segment_dict[segment_type].remove(segment_id + ".json")

        # Create path to eflow's garbage. (Note: all segment types are
        # currently filed under the 'DataTransformer' garbage folder.)
        garbage_folder_path = create_dir_structure(
            os.getcwd(),
            f"{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/_Extras/Garbage/Data Pipeline Segments/DataTransformer/")

        # Rename the unconnected files and move them to the garbage directory.
        for segment_type, segment_ids in segment_dict.items():

            files_in_garbage = get_all_files_from_path(garbage_folder_path)

            for _id in segment_ids:
                file_to_remove = _id

                # Append a numeric suffix until the name is unique in garbage.
                i = 1
                while file_to_remove in files_in_garbage:
                    file_to_remove = _id.split(".")[0] + f"_{i}.json"
                    i += 1

                os.rename(
                    pipeline_struct_dir +
                    f"Data Pipeline Segments/{segment_type}/{_id}",
                    pipeline_struct_dir +
                    f"Data Pipeline Segments/{segment_type}/{file_to_remove}")

                shutil.move(
                    pipeline_struct_dir +
                    f"Data Pipeline Segments/{segment_type}/{file_to_remove}",
                    garbage_folder_path + file_to_remove)
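# Effect sketch (illustrative layout, derived from the code above): a segment
# file that no root_pipeline.json references gets renamed if needed and moved:
#
#   _Extras/Pipeline Structure/Data Pipeline Segments/DataTransformer/abc.json
#     -> _Extras/Garbage/Data Pipeline Segments/DataTransformer/abc.json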
def check_create_snapshot(self, df, df_features, directory_path, sub_dir):
    """
    Compares the passed pandas dataframe object to a pre defined json file.

    Args:
        df: pd.Dataframe
            Pandas dataframe object.

        df_features: DataFrameTypes
            DataFrameTypes object describing the dataframe's features.

        directory_path: string
            Output path of the dataset.

        sub_dir: string
            Sub directory to create on top of 'directory_path'.

    Raises:
        Raises a SnapshotMismatchError if the json file didn't match up
        with the passed dataframe snapshot, causing the program to stop
        at runtime.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"'df' must be a pandas dataframe object not a {type(df)}")

    if not isinstance(df_features, DataFrameTypes):
        raise TypeError(
            f"'df_features' must be a DataFrameTypes object not a {type(df_features)}")

    output_folder_path = create_dir_structure(directory_path,
                                              sub_dir)
    json_file = output_folder_path + "Dataframe Snapshot.json"

    # Meta data has already been generated; compare data
    if os.path.isfile(json_file):
        with open(json_file) as file:
            data = json.load(file)

        mismatch_error = None

        # Not used for looping; used for logic breaks
        while True:

            if self.__compare_shape:
                list_shape = list(df.shape)
                if list(data["shape"]) != list_shape:
                    mismatch_error = f'the saved shape {data["shape"]} of the' \
                                     f' dataframe snapshot did not match up with' \
                                     f' the passed dataframe shape {list_shape}.'
                    break

            # Ensure feature names match up
            if self.__compare_feature_names:

                snapshot_features = set(data["feature_names"])
                passed_features = set(df_features.all_features())
                feature_difference = snapshot_features.symmetric_difference(
                    passed_features)

                if feature_difference:
                    mismatch_error = "the following feature name conflicts occurred:\n"

                    missing_features = [feature
                                        for feature in feature_difference
                                        if feature in snapshot_features]

                    extra_features = [feature
                                      for feature in feature_difference
                                      if feature in passed_features]

                    if extra_features:
                        mismatch_error += f"--- Passed dataframe has additional feature(s) than snapshot:\n {extra_features}.\n"

                    if missing_features:
                        mismatch_error += f"--- Passed dataframe is missing the following snapshot feature(s):\n {missing_features}.\n"

                    if extra_features or missing_features:
                        break

            # Ensure pseudo random numbers are chosen again
            if self.__compare_random_values:
                compared_data = self.__create_random_values_dict(df,
                                                                 df_features)
                random_values_matched_flag = True
                for k, v in data["random_values"].items():
                    if k in compared_data:
                        if data["random_values"][k] != compared_data[k]:
                            random_values_matched_flag = False
                            break

                if not random_values_matched_flag:
                    mismatch_error = f"the 'random' values did not match at feature name '{k}' in the dataframe " \
                                     + "(these 'random' values are based on the shape and name of the column)"
                    break

            # Break main loop
            break

        # Error found; raise it
        if mismatch_error is not None:
            raise SnapshotMismatchError(
                f"DataFrameSnapshot has raised an error because {mismatch_error}."
                + "\nThis error was invoked because the directory structure saved a json file "
                  "containing attributes of the dataframe or a 'snapshot'."
                  "\nThe given error can be resolved by performing any of the following:"
                  "\n\t* Pass in the same dataframe as expected."
                  "\n\t* Disable the snapshot check by changing 'dataframe_snapshot' to False."
                  "\n\t* Disable the save file option by changing the parameter 'save_file' to False."
                  "\n\t* Or delete the json object file in the dataset directory under _Extras.")

    # JSON file doesn't exist; create file
    else:
        self.__create_dataframe_snapshot_json_file(df,
                                                   output_folder_path)
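# Usage sketch (hypothetical; assumes eflow's DataFrameSnapshot and
# DataFrameTypes with their default constructors):
#
#   snapshot = DataFrameSnapshot()
#   df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
#   df_features = DataFrameTypes(df)
#   snapshot.check_create_snapshot(df,
#                                  df_features,
#                                  "eflow Data/Example/",
#                                  "_Extras")
#   # First call writes "Dataframe Snapshot.json"; later calls raise
#   # SnapshotMismatchError if the frame's shape/features/values changed.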
def __init__(self,
             df,
             feature_names=[],
             dataset_sub_dir="",
             dataset_name="Default Dataset Name",
             overwrite_full_path=None,
             notebook_mode=False,
             pca_perc=1.00):
    """
    Args:
        df: pd.Dataframe
            pd.Dataframe dataset.

        feature_names: list of strings
            Names of the matrix's features. Only required when 'df' is a
            matrix-like object rather than a dataframe.

        dataset_sub_dir: string
            Sub directory to write data.

        dataset_name: string
            Main project directory.

        overwrite_full_path: string
            Overwrite full directory path to a given output folder.

        notebook_mode: bool
            Display and show in notebook if set to true.

        pca_perc: float
            Proportion of cumulative explained variance to keep when
            applying PCA. Pass a falsy value to skip PCA entirely.
    """
    if isinstance(df, pd.DataFrame):
        self.__feature_names = copy.deepcopy(list(df.columns))
    else:
        if not feature_names:
            raise UnsatisfiedRequirments("If passing in a matrix-like object, "
                                         "you must init feature names!")
        else:
            self.__feature_names = copy.deepcopy(feature_names)

    AutoModeler.__init__(self,
                         f'{dataset_name}/{dataset_sub_dir}',
                         overwrite_full_path)

    # Define model
    self.__cluster_models_paths = dict()

    self.__notebook_mode = copy.deepcopy(notebook_mode)

    self.__models_suggested_clusters = dict()

    self.__pca = None
    self.__first_scaler = None
    self.__second_scaler = None
    self.__cutoff_index = None
    self.__ordered_dp_indexes = None
    self.__pca_perc = pca_perc

    # --- Apply pca ---
    if pca_perc:

        # Create scaler object
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df)

        self.__first_scaler = copy.deepcopy(scaler)

        print("\nInspecting scaled results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=self.__feature_names,
                                      sub_dir="PCA",
                                      filename="Applied scaler results")

        pca, scaled = self.__visualize_pca_variance(scaled)

        self.__pca = pca

        # Generate "dummy" feature names
        pca_feature_names = ["PCA_Feature_" + str(i)
                             for i in range(1,
                                            len(self.__feature_names) + 1)]

        print("\nInspecting applied scaler and pca results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=pca_feature_names,
                                      sub_dir="PCA",
                                      filename="Applied scaler and PCA results")

        if pca_perc < 1.0:
            # Find the cut off point on the cumulative sum
            cutoff_index = np.where(
                pca.explained_variance_ratio_.cumsum() > pca_perc)[0][0]
        else:
            cutoff_index = scaled.shape[1] - 1

        print(
            "After applying pca with a cutoff percentage of {0}"
            " for the cumulative variance, using features 1 to {1}".format(
                pca_perc, cutoff_index + 1))

        print("Old shape {0}".format(scaled.shape))

        scaled = scaled[:, :cutoff_index + 1]
        pca_feature_names = pca_feature_names[0: cutoff_index + 1]

        print("New shape {0}".format(scaled.shape))

        scaled = scaler.fit_transform(scaled)

        print("\nInspecting data after the final scaler was applied!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=pca_feature_names,
                                      sub_dir="PCA",
                                      filename="Applied final scaler to process.")

        self.__second_scaler = copy.deepcopy(scaler)

        self.__scaled = scaled
        self.__cutoff_index = cutoff_index

    # Assumed PCA has already been applied; pass as matrix
    else:
        self.__scaled = df.values

    # Save objects to directory structure
    if self.__pca:
        pipeline_path = create_dir_structure(self.folder_path,
                                             "Data Cluster Pipeline")

        # Pickle data pipeline objects
        pickle_object_to_file(self.__pca,
                              pipeline_path,
                              "PCA")

        pickle_object_to_file(self.__first_scaler,
                              pipeline_path,
                              "First Scaler")

        pickle_object_to_file(self.__second_scaler,
                              pipeline_path,
                              "Second Scaler")

        pickle_object_to_file(self.__pca_perc,
                              pipeline_path,
                              "PCA Percentage")

        # Save dimensions and cutoff index
        write_object_text_to_file(self.__cutoff_index,
                                  pipeline_path,
                                  "Cutoff Index")

        write_object_text_to_file(self.__cutoff_index + 1,
                                  pipeline_path,
                                  "Dimensions")
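# Usage sketch (hypothetical; assumes this __init__ belongs to eflow's
# AutoCluster auto-modeler):
#
#   df = pd.DataFrame(np.random.rand(100, 5),
#                     columns=[f"f{i}" for i in range(5)])
#   modeler = AutoCluster(df,
#                         dataset_name="Example",
#                         pca_perc=0.8)  # keep components covering 80% variance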
def generate_entropy_table(df,
                           df_features,
                           output_folder_path,
                           sub_dir,
                           file_name="Entropy Table"):
    """
    Calculates the entropy of each non-continuous numerical feature in a
    pandas dataframe object and stores the result in a pandas dataframe
    object in the proper directory structure.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        output_folder_path: str
            Pre defined path to an already existing directory to output
            file(s).

        sub_dir: str
            Path to be possibly generated.

        file_name: str
            Name of the given file to save.

    Returns:
        Nothing
    """
    entropy_dict = dict()
    for feature_name in df.columns:
        if feature_name in df_features.all_features() and \
                feature_name not in df_features.null_only_features() and \
                feature_name not in df_features.continuous_numerical_features():
            entropy_dict[feature_name] = calculate_entropy(
                df[feature_name].dropna())

    entropy_table = pd.DataFrame.from_dict(
        entropy_dict,
        orient='index').rename(columns={0: "Entropy"})
    entropy_table.index.name = "Features"
    entropy_table.sort_values(by=["Entropy"],
                              ascending=True,
                              inplace=True)

    create_dir_structure(output_folder_path,
                         sub_dir)

    pickle_object_to_file(entropy_table,
                          output_folder_path + sub_dir,
                          file_name)

    df_to_image(entropy_table,
                output_folder_path,
                sub_dir,
                file_name,
                show_index=True,
                format_float_pos=5)
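# Worked example (hypothetical; assumes `calculate_entropy` implements
# Shannon entropy over the value counts of the series):
#
#   s = pd.Series(["a", "a", "b", "b"])   # p = [0.5, 0.5]
#   # H = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit
#   calculate_entropy(s)                  # -> 1.0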
def generate_meta_data(df,
                       output_folder_path,
                       sub_dir):
    """
    Creates files representing the shape and feature types of the dataframe.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        output_folder_path: str
            Pre defined path to an already existing directory to output
            file(s).

        sub_dir: str
            Path to be possibly generated.

    Returns:
        Creates meta data files based on the passed dataframe.
    """
    create_dir_structure(output_folder_path,
                         correct_directory_path(sub_dir + "/Meta Data"))

    output_folder_path = correct_directory_path(output_folder_path)

    # Create files relating to dataframe's shape
    shape_df = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                       'Columns': [df.shape[1]]})

    if shape_df.shape[0]:
        df_to_image(shape_df,
                    f"{output_folder_path}/{sub_dir}",
                    "Meta Data",
                    "Dataframe Shape Table",
                    show_index=False)

        write_object_text_to_file(shape_df.to_dict('records'),
                                  f"{output_folder_path}/{sub_dir}/Meta Data",
                                  "Dataframe Shape Text")

    # Create files relating to dataframe's types
    dtypes_df = data_types_table(df)

    if dtypes_df.shape[0]:
        df_to_image(dtypes_df,
                    f"{output_folder_path}/{sub_dir}",
                    "Meta Data",
                    "Dataframe Types Table",
                    show_index=True)

    plt.close("all")

    # Missing value table
    mis_val_table = missing_values_table(df)

    if mis_val_table.shape[0]:
        df_to_image(mis_val_table,
                    f"{output_folder_path}/{sub_dir}",
                    "Meta Data",
                    "Missing Data Table",
                    show_index=True)

    plt.close("all")
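# Usage sketch (not part of the original module; the path is illustrative):
#
#   df = pd.DataFrame({"age": [22, 35, None], "sex": ["m", "f", "f"]})
#   generate_meta_data(df,
#                      os.getcwd() + "/eflow Data/Example/",
#                      "Descriptive Statistics")
#   # -> shape table, dtype table, and missing-value table under
#   #    .../Descriptive Statistics/Meta Data/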
def __find_best_elbow_models(self,
                             model_name,
                             k_models,
                             inertias,
                             display_visuals=True):

    ks = range(1, len(inertias[0]) + 1)

    plt.figure(figsize=(13, 6))
    plt.title(f"All possible {model_name} Elbows", fontsize=15)
    plt.xlabel('Number of clusters, k')
    plt.ylabel('Inertia')
    plt.xticks(ks)

    elbow_inertias_matrix = None
    inertias_matrix = None
    elbow_models = []
    elbow_sections = []
    center_elbow_count = dict()
    proximity_elbow_count = dict()

    # Plot ks vs inertias
    for i in range(0, len(inertias)):

        elbow_cluster = KneeLocator(ks,
                                    inertias[i],
                                    curve='convex',
                                    direction='decreasing').knee

        if elbow_cluster == 1 or not elbow_cluster:
            print("Elbow was either one or None for the elbow seq.")
            continue

        plt.plot(ks,
                 inertias[i],
                 '-o',
                 color='#367588',
                 alpha=0.5)

        if str(elbow_cluster) not in center_elbow_count.keys():
            center_elbow_count[str(elbow_cluster)] = 1
        else:
            center_elbow_count[str(elbow_cluster)] += 1

        # Mark the elbow and its immediate neighbors as candidate sections.
        for k_val in [elbow_cluster - 1, elbow_cluster, elbow_cluster + 1]:
            elbow_sections.append([ks[k_val - 1], inertias[i][k_val - 1]])

            if str(k_val) not in proximity_elbow_count.keys():
                proximity_elbow_count[str(k_val)] = 1
            else:
                proximity_elbow_count[str(k_val)] += 1

        if isinstance(elbow_inertias_matrix, type(None)):
            inertias_matrix = np.matrix(inertias[i])
            elbow_inertias_matrix = np.matrix(
                inertias[i][elbow_cluster - 2:elbow_cluster + 1])
        else:
            inertias_matrix = np.vstack([inertias_matrix, inertias[i]])
            elbow_inertias_matrix = np.vstack(
                [elbow_inertias_matrix,
                 inertias[i][elbow_cluster - 2:elbow_cluster + 1]])

        elbow_models.append(
            k_models[i][elbow_cluster - 2:elbow_cluster + 1])

    for elbow in elbow_sections:
        k_val = elbow[0]
        inertia = elbow[1]
        plt.plot(k_val,
                 inertia,
                 'r*')

    del inertias
    del k_models
    del elbow_cluster

    self.save_plot(f"Models/{model_name}",
                   f"All possible {model_name} Elbows")

    if display_visuals and self.__notebook_mode:
        plt.show()
    plt.close("all")

    center_elbow_count = pd.DataFrame(
        {"Main Knees": list(center_elbow_count.keys()),
         "Counts": list(center_elbow_count.values())})
    center_elbow_count.sort_values(by=['Counts'],
                                   ascending=False,
                                   inplace=True)
    self.save_table_as_plot(center_elbow_count,
                            sub_dir=f"Models/{model_name}",
                            filename="Center Elbow Count")

    proximity_elbow_count = pd.DataFrame(
        {"Proximity Knees": list(proximity_elbow_count.keys()),
         "Counts": list(proximity_elbow_count.values())})
    proximity_elbow_count.sort_values(by=['Counts'],
                                      ascending=False,
                                      inplace=True)
    self.save_table_as_plot(proximity_elbow_count,
                            sub_dir=f"Models/{model_name}",
                            filename="Proximity Elbow Count")

    plt.figure(figsize=(13, 6))
    plt.title(f"Best of all {model_name} Elbows", fontsize=15)
    plt.xlabel('Number of clusters, k')
    plt.ylabel('Inertia')
    plt.xticks(ks)

    # The "best" elbow sequence is the one closest to the average of all
    # elbow sections.
    average_elbow_inertias = elbow_inertias_matrix.mean(0)

    knee_vote = []
    for vector in elbow_inertias_matrix:
        knee_vote.append(
            np.absolute(vector - average_elbow_inertias).sum())

    best_elbow_index = np.array(knee_vote).argmin()

    plt.plot(ks,
             inertias_matrix[best_elbow_index].tolist()[0],
             '-o',
             color='#367588')

    best_clusters = []
    for model in elbow_models[best_elbow_index]:
        k_val = len(model.get_clusters())
        self.__all_cluster_models[
            f"{model_name}_Cluster_" + str(k_val)] = model

        create_dir_structure(self.folder_path,
                             f"Models/{model_name}/Clusters={k_val}")

        try:
            pickle_object_to_file(
                model,
                self.folder_path + f"Models/{model_name}/Clusters={k_val}",
                f"{model_name}_Cluster_" + str(k_val))
        except Exception:
            print(f"Something went wrong when trying to save the model: {model_name}")

        plt.plot(ks[k_val - 1],
                 inertias_matrix[best_elbow_index].tolist()[0][k_val - 1],
                 'r*')

        best_clusters.append(k_val)

    self.save_plot(f"Models/{model_name}",
                   f"Best of all {model_name} Elbows")

    if display_visuals and self.__notebook_mode:
        plt.show()
    plt.close("all")

    best_clusters.sort()

    if display_visuals and self.__notebook_mode:
        display(proximity_elbow_count)
        display(center_elbow_count)

    return best_clusters
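# Sketch of the elbow detection used above (hypothetical data; assumes the
# `kneed` package, which provides KneeLocator):
#
#   from kneed import KneeLocator
#   ks = range(1, 11)
#   inertias = [1000, 420, 180, 90, 60, 48, 40, 35, 31, 28]
#   elbow = KneeLocator(ks,
#                       inertias,
#                       curve='convex',
#                       direction='decreasing').knee
#   # `elbow` is the k at the bend of the curve, i.e. where adding more
#   # clusters stops meaningfully reducing inertia.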