Example #1
    def __init__(self,
                 compare_shape=True,
                 compare_feature_names=True,
                 compare_random_values=True):
        """
        Args:
            compare_shape: bool
                Determines whether or not to create/compare the dataframe's shape for the snapshot

            compare_feature_names: bool
                Determines whether or not to create/compare the dataframe's feature names for the snapshot

            compare_random_values: bool
                Determines whether or not to create/compare 10 'random' values
                found on each feature of the dataframe. As long as the same
                dataframe is passed, the random values should be the same.
                Note:
                    Float features are ignored because of the trailing-value
                    precision problem that all floats have.
        """

        # Copy values
        self.__compare_shape = copy.deepcopy(compare_shape)
        self.__compare_feature_names = copy.deepcopy(compare_feature_names)
        self.__compare_random_values = copy.deepcopy(compare_random_values)

        # Error check; must have at least one boolean
        if not self.__compare_shape and \
                not self.__compare_feature_names and \
                not self.__compare_random_values:
            raise UnsatisfiedRequirments(
                "At least one compare boolean must be "
                "set to True for the snapshot check to work properly.")
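
To make the 'compare_random_values' behavior concrete, here is a minimal standalone sketch (an assumption about the idea, not the library's actual implementation) of sampling 10 reproducible values per non-float feature:

import pandas as pd

df = pd.DataFrame({"id": range(100),
                   "score": [i / 3 for i in range(100)]})

snapshot = {}
for feature in df.columns:
    # Float features are skipped because of precision issues
    if df[feature].dtype == float:
        continue
    # A fixed seed means the same dataframe always yields the same 10 values
    snapshot[feature] = df[feature].sample(10, random_state=0).tolist()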
Example #2
    def create_elbow_models(self,
                            model_names=["K-Means",
                                         "K-Medians",
                                         "K-Medoids",
                                         "Somsc",
                                         "Cure",
                                         "Fuzzy C-means"],
                            repeat_operation=3,
                            max_k_value=15,
                            display_visuals=True):

        model_names = set(model_names)

        names_model_dict = {"K-Means":kmeans,
                            "K-Medians":kmedians,
                            "K-Medoids":kmedoids,
                            "Somsc":somsc,
                            "Cure":cure,
                            "Fuzzy C-means": fcm}

        # Iterate through passed model names
        for name in model_names:

            if name in names_model_dict.keys():

                # Only requires 1 elbow sequence
                if name == "Somsc" or name == "Cure":
                    best_clusters = self.__create_elbow_seq(name,
                                                            names_model_dict[name],
                                                            repeat_operation=1,
                                                            max_k_value=max_k_value,
                                                            display_visuals=display_visuals)
                else:
                    best_clusters = self.__create_elbow_seq(name,
                                                            names_model_dict[name],
                                                            repeat_operation=repeat_operation,
                                                            max_k_value=max_k_value,
                                                            display_visuals=display_visuals)

                # Save sorted cluster results
                best_clusters.sort()
                self.__models_suggested_clusters[name] = best_clusters
                self.__save_update_best_model_clusters()
            else:
                raise UnsatisfiedRequirments(f"Unknown model name passed: \"{name}\"")

        return best_clusters
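
The private helper __create_elbow_seq is not shown here; the sketch below illustrates the general elbow idea it presumably implements, using scikit-learn's KMeans in place of the pyclustering models referenced above (stand-in data and metric, not the library's code):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(200, 4)              # stand-in feature matrix
max_k_value = 15
repeat_operation = 3

errors = np.zeros(max_k_value)
for _ in range(repeat_operation):
    for k in range(1, max_k_value + 1):
        # Sum of squared distances to the closest centroid for each k
        errors[k - 1] += KMeans(n_clusters=k, n_init=10).fit(X).inertia_
errors /= repeat_operation              # averaged curve; look for the 'elbow'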
Example #3
    def __init__(self,
                 dataset_name,
                 overwrite_full_path=None):
        """
        Args:
            dataset_name: string
                Sub-directory to create under the directory
                'PARENT_OUTPUT_FOLDER_NAME'.

            overwrite_full_path: string
                The passed directory path must already exist. Completely
                ignores 'dataset_name' and points to this already-created
                directory.
        """

        # Setup project structure
        if not overwrite_full_path:
            parent_structure = "/" + SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME \
                               + "/" + dataset_name + "/"

            create_dir_structure(os.getcwd(),
                                 parent_structure)
            tmp_path = correct_directory_path(
                os.getcwd() + parent_structure)

        # Trust that the user-supplied path already exists
        else:
            overwrite_full_path = correct_directory_path(overwrite_full_path)

            # Path doesn't contain eflow's main output
            if f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/" not in overwrite_full_path:
                raise UnsatisfiedRequirments(f"Directory path must have {SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME} "
                                             f"as a directory name or this program will not work correctly.")

            # Unknown path found
            if not os.path.exists(overwrite_full_path):
                raise SystemError("The path must already be defined in full on "
                                  "your system to use a different directory "
                                  "structure than originally intended.")

            tmp_path = overwrite_full_path

        from eflow._hidden.general_objects import enum
        self.__PROJECT = enum(PATH_TO_OUTPUT_FOLDER=tmp_path,
                              RELATIVE_PATH_TO_OUTPUT_FOLDER=tmp_path.split(f"/{SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME}/")[1])
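
For orientation, here is a standalone sketch of the path derivation performed above; the folder and dataset names are placeholder assumptions, and directory creation plus validation are omitted:

import os

PARENT_OUTPUT_FOLDER_NAME = "eflow Data"     # placeholder for the SYS_CONSTANTS value
dataset_name = "Titanic Data"                # placeholder dataset name

parent_structure = "/" + PARENT_OUTPUT_FOLDER_NAME + "/" + dataset_name + "/"
path_to_output_folder = os.getcwd() + parent_structure

# The relative path is everything after the parent output folder
relative_path = path_to_output_folder.split(
    f"/{PARENT_OUTPUT_FOLDER_NAME}/")[1]     # -> 'Titanic Data/'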
Example #4
    def __init__(self, object_type, segment_id=None, create_file=True):
        """
        Args:
            object_type: string
                The type of the child object that inherits from DataPipelineSegment.

            segment_id: string
                 If initialized as a string instead of None, the object will
                 attempt to find the json file in the provided directory.
        Note:
            Essentially we are serializing the object with json files.
        """

        self.__json_file_name = None
        self.__object_type = copy.deepcopy(object_type)

        if not isinstance(segment_id, str) and segment_id:
            raise UnsatisfiedRequirments(
                "Segment id must be a string or set to 'None'!")

        if segment_id and not create_file:
            raise PipelineSegmentError(
                "Parameter conflict: segment_id is referring "
                "to a saved file but create_file is set to False.")

        # File extension removal
        if isinstance(segment_id, str):
            segment_id = segment_id.split(".")[0]
        self.__segment_id = copy.deepcopy(segment_id)

        # Stores each function's info in the order the functions are called
        self.__function_pipe = deque()

        self.__create_file = create_file
        self.__lock_interaction = False

        # Attempt to load the json file into the object's attributes.
        if self.__segment_id:
            self.__configure_pipeline_segment_with_existing_file()
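
A small standalone sketch of the segment_id handling in this constructor follows; the id value is made up, and the built-in TypeError stands in for the library's UnsatisfiedRequirments exception:

segment_id = "a1b2c3d4.json"                 # hypothetical id value

# Reject anything that is neither a string nor None
if segment_id is not None and not isinstance(segment_id, str):
    raise TypeError("Segment id must be a string or set to 'None'!")

# Strip the file extension, as done in the constructor above
if isinstance(segment_id, str):
    segment_id = segment_id.split(".")[0]    # -> 'a1b2c3d4'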
Example #5
    def __init__(self,
                 df,
                 feature_names=[],
                 dataset_sub_dir="",
                 dataset_name="Default Dataset Name",
                 overwrite_full_path=None,
                 notebook_mode=False,
                 pca_perc=1.00):
        """
        Args:
            df: pd.DataFrame
                Pandas DataFrame (or matrix-like object) holding the data.

            feature_names: collection of strings
                Feature names; required when 'df' is a matrix-like object
                rather than a pd.DataFrame.

            dataset_sub_dir: string
                Sub directory to write data.

            dataset_name: string
                Main project directory

            overwrite_full_path: string
                Overwrite full directory path to a given output folder

            notebook_mode: bool
                Display and show in notebook if set to true.

            pca_perc: float
                Cumulative variance percentage cutoff used when applying PCA.
                If set to None, PCA is assumed to have already been applied
                to the passed matrix.
        """

        if isinstance(df, pd.DataFrame):
            self.__feature_names = copy.deepcopy(list(df.columns))
        else:
            if not feature_names:
                raise UnsatisfiedRequirments("If passing in a matrix-like object, "
                                             "you must pass in feature names!")
            else:
                self.__feature_names = copy.deepcopy(feature_names)


        AutoModeler.__init__(self,
                             f'{dataset_name}/{dataset_sub_dir}',
                             overwrite_full_path)

        # Storage for cluster model file paths
        self.__cluster_models_paths = dict()

        self.__notebook_mode = copy.deepcopy(notebook_mode)

        self.__models_suggested_clusters = dict()

        self.__pca = None

        self.__first_scaler = None
        self.__second_scaler = None
        self.__cutoff_index = None
        self.__ordered_dp_indexes = None
        self.__pca_perc = pca_perc

        # --- Apply pca ---
        if pca_perc:

            # Create scaler object
            scaler = StandardScaler()
            scaled = scaler.fit_transform(df)

            self.__first_scaler = copy.deepcopy(scaler)

            print("\nInspecting scaled results!")
            self.__inspect_feature_matrix(matrix=scaled,
                                          feature_names=self.__feature_names,
                                          sub_dir="PCA",
                                          filename="Applied scaler results")

            pca, scaled = self.__visualize_pca_variance(scaled)

            self.__pca = pca

            # Generate "dummy" feature names
            pca_feature_names = ["PCA_Feature_" +
                                 str(i) for i in range(1,
                                                       len(self.__feature_names) + 1)]

            print("\nInspecting applied scaler and pca results!")
            self.__inspect_feature_matrix(matrix=scaled,
                                          feature_names=pca_feature_names,
                                          sub_dir="PCA",
                                          filename="Applied scaler and PCA results")

            if pca_perc < 1.0:
                # Find cut off point on cumulative sum
                cutoff_index = np.where(
                    pca.explained_variance_ratio_.cumsum() > pca_perc)[0][0]
            else:
                cutoff_index = scaled.shape[1] - 1

            print(
                "After applying PCA with a cumulative variance cutoff of {0},"
                " using features 1 to {1}.".format(
                    pca_perc, cutoff_index + 1))

            print("Old shape {0}".format(scaled.shape))

            scaled = scaled[:, :cutoff_index + 1]
            pca_feature_names = pca_feature_names[0: cutoff_index + 1]

            print("New shape {0}".format(scaled.shape))

            scaled = scaler.fit_transform(scaled)

            print("\nInspecting data after final scaler applied!")
            self.__inspect_feature_matrix(matrix=scaled,
                                          feature_names=pca_feature_names,
                                          sub_dir="PCA",
                                          filename="Applied final scaler to process.")

            self.__second_scaler = copy.deepcopy(scaler)

            self.__scaled = scaled
            self.__cutoff_index = cutoff_index

        # Assume PCA has already been applied; use the passed matrix as-is
        else:
            self.__scaled = df.values

        # Save objects to directory structure
        if self.__pca:
            pipeline_path = create_dir_structure(self.folder_path,
                                                 "Data Cluster Pipeline")

            # Pickle data pipeline objects
            pickle_object_to_file(self.__pca,
                                  pipeline_path,
                                  "PCA")

            pickle_object_to_file(self.__first_scaler,
                                  pipeline_path,
                                  "First Scaler")

            pickle_object_to_file(self.__second_scaler,
                                  pipeline_path,
                                  "Second Scaler")

            pickle_object_to_file(self.__pca_perc,
                                  pipeline_path,
                                  "PCA Percentage")

            # Save Dimensions and Cutoff Index
            write_object_text_to_file(self.__cutoff_index,
                                      pipeline_path,
                                      "Cutoff Index")

            write_object_text_to_file(self.__cutoff_index + 1,
                                      pipeline_path,
                                      "Dimensions")
Example #6
    def add(self, segment_name, pipeline_segment_obj):
        """
        segment_name (str):
            An alias used to refer to this segment.

        pipeline_segment_obj (child of DataPipelineSegment):
            A child object of type DataPipelineSegment.

        Returns:
            Nothing; attempts to add the pipeline segment object to the
            object's queue and to update its related json file.
        """

        # pipeline_segment_obj = copy.deepcopy(pipeline_segment_obj)

        # Type check
        if not isinstance(pipeline_segment_obj, DataPipelineSegment):

            raise UnsatisfiedRequirments(
                f"Expected a 'DataPipelineSegment' object; received '{type(pipeline_segment_obj)}'"
            )

        # Check if alias has already been used
        if segment_name in self.__pipeline_segment_names:
            raise PipelineError(
                f"The '{segment_name}' pipeline segment is already in this pipeline. Please choose a different segment name."
            )

        try:
            # Check if the pipeline segment has already been used
            segment_path_id = pipeline_segment_obj.relative_folder_path + pipeline_segment_obj.file_name
        except AttributeError:
            raise UnsatisfiedRequirments(
                "The given pipeline segment didn't perform any functions.")

        if segment_path_id in self.__pipeline_segment_path_id:
            raise PipelineError(
                "This segment has already been added to this pipeline. "
                f"Segment path id: '{segment_path_id}'.\n"
                "This can be resolved by:"
                "\n\t* Creating a completely new segment object "
                "and adding it to the pipeline with the 'add'"
                " method."
                "\n\t* Referring to a different segment path id.")
        else:
            # Queue has yet to have data pushed; set up the output directory for new data
            if len(self.__pipeline_segment_deque) == 0:
                FileOutput.__init__(
                    self,
                    f'_Extras/Pipeline Structure/Data Pipeline/{self.__pipeline_name}'
                )

                if os.path.exists(self.folder_path + "df_features.json"):
                    self.__df_features.init_on_json_file(self.folder_path +
                                                         "df_features.json")
                else:
                    self.__df_features.create_json_file_representation(
                        self.folder_path, "df_features.json")

        # Update tracking sets used for error checking
        self.__pipeline_segment_names.add(segment_name)
        self.__pipeline_segment_path_id.add(segment_path_id)

        # Main component of the project
        self.__pipeline_segment_deque.append(
            (segment_name, segment_path_id, pipeline_segment_obj))

        # Lock down the object to prevent users from continuing to interact with it after adding it to the pipeline
        pipeline_segment_obj._DataPipelineSegment__lock_interaction = True

        # Update/Create the json file
        self.__create_json_pipeline_file()
Example #7
    def make_dummies(self,
                     df,
                     df_features,
                     qualitative_features=[],
                     _feature_values_dict=None,
                     _add_to_que=True):
        """

            Creates dummy features based on qualitative feature data and
            removes the original feature.

            Note:
                _feature_values_dict does not need to be initialized; it is
                used as a backend resource.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            qualitative_features: collection of strings
                Feature names to convert the feature data into dummy features.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.
        """
        # Convert to the correct types
        if isinstance(qualitative_features, str):
            qualitative_features = [qualitative_features]

        if not _feature_values_dict:
            _feature_values_dict = dict()

        pd.set_option('mode.chained_assignment', None)

        for cat_feature in qualitative_features:

            if cat_feature not in df_features.string_features(
            ) | df_features.categorical_features():
                raise UnsatisfiedRequirments(
                    f"No feature named '{cat_feature}' in categorical or string features."
                )

            if cat_feature not in _feature_values_dict:
                _feature_values_dict[cat_feature] = df[cat_feature].dropna(
                ).unique()
                _feature_values_dict[cat_feature].sort()
                _feature_values_dict[cat_feature] = _feature_values_dict[
                    cat_feature].tolist()

            dummy_features = []
            for feature_value in _feature_values_dict[cat_feature]:
                new_feature = cat_feature + f"_{feature_value}"
                bool_array = df[cat_feature] == feature_value
                df[new_feature] = copy.deepcopy(bool_array)
                dummy_features.append(new_feature)

            # # Make dummies and remove original feature
            # dummies_df = pd.get_dummies(_feature_values_dict[cat_feature],
            #                             prefix=cat_feature)

            df.drop(columns=[cat_feature], inplace=True)
            df_features.remove_feature(cat_feature)
            df_features.set_feature_to_dummy_encoded(cat_feature,
                                                     dummy_features)

            # # Apply to dataframe
            # for feature_name in dummies_df.columns:
            #     df[feature_name] = dummies_df[feature_name]

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.make_dummies)
            self._DataPipelineSegment__add_function_to_que(
                "make_dummies", parameters, params_dict)