def __add_function_to_que(self, function_name, parameters, params_dict):
    """
    Adds the function info to the function queue. If the segment has no
    json file name then generate one for it in the given directory.

    Args:
        function_name: string
            Function's name.

        parameters: collection of strings
            Names of the parameters the function accepts.

        params_dict: dict
            Parameter names mapped to their associated values.

    Note:
        This function should only ever be called by children of this object.
    """
    if self.__lock_interaction:
        raise PipelineSegmentError(
            "This pipeline has been locked down and "
            "will prevent further changes to the generated flat file.")

    # Remove keys that should never be serialized with the function call.
    for delete_key in {"self", "df", "df_features", "_add_to_que",
                       "params_dict"}:
        if delete_key in params_dict.keys():
            del params_dict[delete_key]

    # Drop keys that aren't real parameters of the function and convert
    # sets to lists so the values are json serializable.
    for k, v in {k: v for k, v in params_dict.items()}.items():
        if k not in parameters:
            del params_dict[k]
        elif isinstance(v, set):
            params_dict[k] = list(v)

    self.__function_pipe.append((function_name, params_dict))

    # Generate new json file name with proper file/folder output attributes
    if len(self.__function_pipe) == 1 and not self.__json_file_name:
        FileOutput.__init__(
            self,
            f'_Extras/Pipeline Structure/Data Pipeline Segments/{self.__object_type}')

        all_json_files = get_all_files_from_path(self.folder_path, ".json")
        while True:
            random_file_name = create_hex_decimal_string().upper()
            if random_file_name not in all_json_files:
                break

        self.__segment_id = random_file_name
        self.__json_file_name = random_file_name + ".json"

    # Update json file
    if self.__create_file:
        self.__create_json_pipeline_segment_file()
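# --- Hedged usage sketch (not part of the library) ---------------------------
# The keys stripped above ("self", "df", "df_features", "_add_to_que",
# "params_dict") suggest that child segments forward locals() from their own
# transformation methods into this queue. A hypothetical child method might
# look like the following; 'MyCleaner', 'drop_nulls', and the use of
# inspect.signature are illustrative guesses, not names from this codebase.
#
#   import inspect
#
#   class MyCleaner(DataPipelineSegment):
#       def drop_nulls(self, df, df_features, _add_to_que=True):
#           params_dict = locals()
#           df.dropna(inplace=True)
#           if _add_to_que:
#               parameters = inspect.signature(self.drop_nulls).parameters
#               self._DataPipelineSegment__add_function_to_que("drop_nulls",
#                                                              parameters,
#                                                              params_dict)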
def __init__(self, dataset_name, overwrite_full_path=None):
    """
    Args:
        dataset_name: string
            Sub directory to create on top of the directory
            'PARENT_OUTPUT_FOLDER_NAME'.

        overwrite_full_path: string
            The passed directory path must already exist. Will completely
            ignore the project name and attempt to point to this already
            created directory.
    """
    # Create/Setup project directory
    FileOutput.__init__(self, dataset_name, overwrite_full_path)
def __replace_function_in_que(self, function_name, params_dict, param,
                              param_val):
    raise ValueError("This function hasn't been completed yet!")

    # NOTE: Everything below this point is unreachable placeholder code
    # copied from '__add_function_to_que'; it references 'parameters', which
    # is not defined in this function's scope.
    if self.__lock_interaction:
        raise PipelineSegmentError(
            "This pipeline has been locked down and "
            "will prevent further changes to the generated flat file.")

    for delete_key in {"self", "df", "df_features", "_add_to_que",
                       "params_dict"}:
        if delete_key in params_dict.keys():
            del params_dict[delete_key]

    for k, v in {k: v for k, v in params_dict.items()}.items():
        if k not in parameters:
            del params_dict[k]
        elif isinstance(v, set):
            params_dict[k] = list(v)

    self.__function_pipe.append((function_name, params_dict))

    # Generate new json file name with proper file/folder output attributes
    if len(self.__function_pipe) == 1 and not self.__json_file_name:
        FileOutput.__init__(
            self,
            f'_Extras/Pipeline Structure/Data Pipeline Segments/{self.__object_type}')

        all_json_files = get_all_files_from_path(self.folder_path, ".json")
        while True:
            random_file_name = create_hex_decimal_string().upper()
            if random_file_name not in all_json_files:
                break

        self.__segment_id = random_file_name
        self.__json_file_name = random_file_name + ".json"

    # Update json file
    if self.__create_file:
        self.__create_json_pipeline_segment_file()
def __configure_pipeline_segment_with_existing_file(self):
    """
    Attempts to get a json file and then re-initializes the
    'function_pipe' and the 'json_file_name'.
    """
    FileOutput.__init__(
        self,
        f'_Extras/Pipeline Structure/Data Pipeline Segments/{self.__object_type}')

    self.__function_pipe = deque()
    self.__json_file_name = copy.deepcopy(self.__segment_id) + ".json"

    # File/Folder error checks
    if not os.path.exists(self.folder_path):
        raise PipelineSegmentError(
            "Couldn't find the pipeline segment's folder when trying to "
            "configure this object with the provided json file.")
    if not os.path.exists(self.folder_path +
                          copy.deepcopy(self.__json_file_name)):
        raise PipelineSegmentError(
            f"Couldn't find the pipeline segment's file named "
            f"'{self.__json_file_name}' in the pipeline's directory when "
            f"trying to configure this object with the provided json file.")

    json_dict = json_file_to_dict(self.folder_path +
                                  copy.deepcopy(self.__json_file_name))

    # Push functions into function pipe
    for function_order in range(
            1, json_dict["Pipeline Segment"]["Function Count"] + 1):
        function_name = list(
            json_dict["Pipeline Segment"]["Functions Performed Order"]
            [f"Function Order {function_order}"].keys())[0]

        params_dict = json_dict["Pipeline Segment"][
            "Functions Performed Order"][
                f"Function Order {function_order}"][function_name][
                    "Params Dict"]

        self.__function_pipe.append((function_name, params_dict))
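# --- Expected json layout (reconstructed from the keys accessed above) -------
# The loop above implies a segment file shaped roughly like this; the real
# file may carry additional fields that this method never reads.
#
#   {
#       "Pipeline Segment": {
#           "Function Count": 2,
#           "Functions Performed Order": {
#               "Function Order 1": {
#                   "some_function": {"Params Dict": {"arg": "value"}}
#               },
#               "Function Order 2": {
#                   "another_function": {"Params Dict": {}}
#               }
#           }
#       }
#   }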
def __init__(self, widget_child_name):
    FileOutput.__init__(
        self,
        f'_Extras/Pipeline Structure/Widgets/{widget_child_name}')
def __init__(self,
             pipeline_name,
             df,
             df_features=None,
             pipeline_modify_id=None,
             remove_past_contents=False):
    """
    Args:
        pipeline_name (str):
            Points to/generates a folder based on the pipeline's name.

        pipeline_modify_id (str, NoneType):
            If set to 'None' then will point to the 'root' or the main
            template of the pipeline.

        remove_past_contents (bool):
            If a folder already exists for this pipeline then move its
            contents to eflow's personal garbage.
    """
    # Set up directory structure
    dir_path_to_pipeline = correct_directory_path(
        f"{os.getcwd()}/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/_Extras/Pipeline Structure/Data Pipeline/{pipeline_name}")
    configure_existing_file = False

    # Get proper json file name
    if pipeline_modify_id:
        json_file = f"{pipeline_modify_id.split('.')[0]}.json"
    else:
        json_file = "root_pipeline.json"

    self.__df_features = None

    # Check if folder/file exist for the pipeline
    if os.path.exists(dir_path_to_pipeline):
        if os.path.exists(dir_path_to_pipeline + json_file):
            print(f"The file '{json_file}' exists!")
            FileOutput.__init__(
                self,
                f'_Extras/Pipeline Structure/Data Pipeline/{pipeline_name}')
            configure_existing_file = True
        else:
            raise PipelineError(f"The file '{json_file}' does not exist!")

        # Create/Load in df_features to given object.
        if os.path.exists(dir_path_to_pipeline + "df_features.json"):
            df_features = DataFrameTypes(None)
            df_features.init_on_json_file(dir_path_to_pipeline +
                                          "df_features.json")
        else:
            if df_features is None:
                raise PipelineError(
                    "When initializing a data pipeline structure "
                    "you must pass a DataFrameTypes object with "
                    "the correctly defined types!")
            # Create file representation
            else:
                df_features.create_json_file_representation(
                    dir_path_to_pipeline, "df_features")

    # -----
    if df_features is None:
        raise PipelineError("When initializing a data pipeline structure "
                            "you must pass a DataFrameTypes object with "
                            "the correctly defined types!")

    self.__df_features = copy.deepcopy(df_features)

    # Check if root file exists or if pipeline modify id
    self.__pipeline_name = copy.deepcopy(pipeline_name)
    self.__pipeline_segment_deque = deque()
    self.__pipeline_segment_names = set()
    self.__pipeline_segment_path_id = set()
    self.__pipeline_modify_id = copy.deepcopy(pipeline_modify_id)
    self.__json_file_name = json_file

    # Json file does exist; init DataPipeline object correctly
    if configure_existing_file:
        if remove_past_contents:
            print("Moving past contents to eFlow's garbage.")
            move_folder_to_eflow_garbage(dir_path_to_pipeline,
                                         "Data Pipelines")
        else:
            print("Now configuring object with proper pipeline segments...")
            self.__configure_pipeline_with_existing_file()

            for removal_feature in set(df.columns) ^ set(
                    df_features.all_features()):
                print(f"Removing the feature: \"{removal_feature}\"")
                df.drop(columns=removal_feature, inplace=True)
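# --- Hedged usage sketch (not part of the library) ---------------------------
# A minimal way to construct a pipeline, assuming a pandas DataFrame 'df' and
# a DataFrameTypes object describing it. Only 'DataFrameTypes(None)' and
# 'init_on_json_file' appear in the code above; the json path and pipeline
# name below are illustrative assumptions.
#
#   df_features = DataFrameTypes(None)
#   df_features.init_on_json_file("path/to/df_features.json")
#   pipeline = DataPipeline("My Pipeline",
#                           df,
#                           df_features=df_features)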
def add(self, segment_name, pipeline_segment_obj):
    """
    Attempts to add a pipeline segment object to the object's queue and
    update its related json object.

    Args:
        segment_name (str):
            An alias used to refer to this segment.

        pipeline_segment_obj (child of DataPipelineSegment):
            A child object of type DataPipelineSegment.
    """
    # pipeline_segment_obj = copy.deepcopy(pipeline_segment_obj)

    # Type check
    if not isinstance(pipeline_segment_obj, DataPipelineSegment):
        raise UnsatisfiedRequirments(
            f"Expected a 'DataPipelineSegment' object; "
            f"received '{type(pipeline_segment_obj)}'")

    # Check if alias has already been used
    if segment_name in self.__pipeline_segment_names:
        raise PipelineError(
            f"The '{segment_name}' pipeline segment is already in this "
            f"pipeline. Please choose a different segment name.")

    try:
        # Check if the pipeline segment has already been used
        segment_path_id = pipeline_segment_obj.relative_folder_path + \
                          pipeline_segment_obj.file_name
    except AttributeError:
        raise UnsatisfiedRequirments(
            "The given pipeline segment didn't perform any functions.")

    if segment_path_id in self.__pipeline_segment_path_id:
        raise PipelineError(
            "The segment was already found in this pipeline. "
            f"Segment path id: '{segment_path_id}'.\n"
            "This can be resolved by:"
            "\n\t* Creating a completely new segment object "
            "and adding it to the pipeline with the 'add' method."
            "\n\t* Referring to a different segment path id.")
    else:
        # Que has yet to have data pushed; set up output directory for new data
        if len(self.__pipeline_segment_deque) == 0:
            FileOutput.__init__(
                self,
                f'_Extras/Pipeline Structure/Data Pipeline/{self.__pipeline_name}')

            if os.path.exists(self.folder_path + "df_features.json"):
                self.__df_features.init_on_json_file(self.folder_path +
                                                     "df_features.json")
            else:
                self.__df_features.create_json_file_representation(
                    self.folder_path, "df_features.json")

        # Update data types for error checking
        self.__pipeline_segment_names.add(segment_name)
        self.__pipeline_segment_path_id.add(segment_path_id)

        # Main component of the project
        self.__pipeline_segment_deque.append(
            (segment_name, segment_path_id, pipeline_segment_obj))

        # Lock down the object to prevent users from continuing to
        # interact with it after adding it to the pipeline.
        pipeline_segment_obj._DataPipelineSegment__lock_interaction = True

        # Update/Create the json file
        self.__create_json_pipeline_file()
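# --- Hedged usage sketch (not part of the library) ---------------------------
# 'SomeSegment' and 'some_transformation' are hypothetical names; the segment
# must have queued at least one function first, otherwise its
# 'relative_folder_path' / 'file_name' attributes won't exist and add() raises
# UnsatisfiedRequirments.
#
#   segment = SomeSegment()
#   segment.some_transformation(df, df_features)
#   pipeline.add("cleaning step", segment)
#
# After add(), the segment is locked; further attempts to queue functions on
# it raise PipelineSegmentError (see '__add_function_to_que').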