コード例 #1
0
 def test_df_not_meeting_req_columns(self):
     """A one-column frame of 200 generated names must be rejected by validate_pd_data.

     The frame holds a single 'name' column ('0name', '1name', ...). The
     acceptance thresholds live in DataEnsurer and are not visible here; this
     test only pins that such a frame is reported as invalid.
     """
     names = []
     for number in range(200):
         names.append(f"{number}name")
     single_column_frame = pd.DataFrame.from_dict({'name': names})
     # the validator is expected to answer False for this frame
     self.assertFalse(DataEnsurer.validate_pd_data(single_column_frame))
コード例 #2
0
    def _first_iteration(self, x: DataFrame, y: NpArray, model: Any, score_type: str, n_folds_validation: int) -> tuple:
        """Score each column of ``x`` on its own and select the best-scoring one.

        Every column is evaluated individually through ``self._cv_score.get_score``.
        When several columns share the maximum score, the first one (in column
        order) is selected.

        Returns:
            A 3-tuple ``(x[new_feature], x without new_feature, max_score)``.
        """
        single_feature_scores = []
        for column_name in x.columns:
            print("Temp column name: ", column_name)
            # double brackets keep the candidate as a one-column dataframe
            candidate = x[[column_name]]
            candidate_score = self._cv_score.get_score(candidate, y, model, score_type, n_folds_validation)
            print("Score with last temp column: ", candidate_score)
            single_feature_scores.append(candidate_score)

        max_score = max(single_feature_scores)
        # positions of every column that reached the maximum score
        tied_positions = [position for position, value in enumerate(single_feature_scores) if value == max_score]
        # NOTE(review): tied_positions is always built as a list, so the fallback
        # looks unreachable unless validate_py_data does more than a type check.
        top_position = tied_positions[0] if DataEnsurer.validate_py_data(tied_positions, list) else tied_positions
        new_feature = x.columns[top_position]
        # winning column, the frame without it, and the score it achieved
        return x[new_feature], x.drop(new_feature, axis=1), max_score
コード例 #3
0
 def test_data_from_json_is_tuple_negative_index(self):
     """Indexing a WelcomeMessage with a negative index must yield a tuple."""
     message_source = WelcomeMessage(
         file_path="..\\resources\\json_info\\welcome_message.json",
         data_type=list,
     )
     # goes through WelcomeMessage.__getitem__ with index -5
     fifth_from_end = message_source[-5]
     self.assertTrue(DataEnsurer.validate_py_data(fifth_from_end, tuple))
コード例 #4
0
 def test_json_is_list(self):
     """The loader created for welcome_message.json must hand back a list."""
     json_path = ".\\..\\resources\\json_info\\welcome_message.json"
     loader = self._loader_creator.create_loader(json_path, "json")
     loaded_content = loader.get_file_transformed()
     # the test expects the transformed file content to validate as a list
     self.assertTrue(DataEnsurer.validate_py_data(loaded_content, list))
コード例 #5
0
 def test_messenger_message_is_str(self):
     """Converting a WelcomeMessenger with str() must produce a string."""
     messenger = WelcomeMessenger(file_path="..\\resources\\json_info\\welcome_message.json")
     rendered = str(messenger)
     # the rendered message is checked through the project's type validator
     self.assertTrue(DataEnsurer.validate_py_data(rendered, str))
コード例 #6
0
 def test_data_type_is_list(self):
     """The ``data`` attribute of a list-typed WelcomeMessage must be a list."""
     message_source = WelcomeMessage(file_path="..\\resources\\json_info\\welcome_message.json", data_type=list)
     # read the loaded content through the ``data`` attribute and validate its type
     self.assertTrue(DataEnsurer.validate_py_data(message_source.data, list))
コード例 #7
0
 def test_loader_path_is_str(self):
     """The csv loader must expose its ``file_path`` as a string."""
     # builds ".\..\datasets\diabetes.csv" (Windows-style separators, as in the original)
     dataset_path = "\\".join([".", "..", "datasets", "diabetes.csv"])
     loader = self._loader_creator.create_loader(dataset_path, "csv")
     self.assertTrue(DataEnsurer.validate_py_data(loader.file_path, str))
コード例 #8
0
 def create_model(feature_selection: bool,
                  parameter_search: bool) -> SBSMachineLearning:
     """Return the model from ``ModelPossibilities.case`` matching the two flags.

     Case codes requested:
         feature_selection=False, parameter_search=False -> "SM"
         feature_selection=True,  parameter_search=False -> "FSM"
         feature_selection=False, parameter_search=True  -> "PSM"
         feature_selection=True,  parameter_search=True  -> "AM"

     Raises:
         TypeError: if either flag fails the DataEnsurer bool validation.
     """
     flags_are_valid = (DataEnsurer.validate_py_data(feature_selection, bool)
                        and DataEnsurer.validate_py_data(parameter_search, bool))
     if not flags_are_valid:
         raise TypeError("Both parameters should be Boolean type")
     if feature_selection:
         case_code = "AM" if parameter_search else "FSM"
     else:
         case_code = "PSM" if parameter_search else "SM"
     return ModelPossibilities.case(case_code)
コード例 #9
0
 def split_x_y_from_df(df: DataFrame, ravel_data: bool = True) -> tuple:
     """Split ``df`` into x and y using a NormalSplitter.

     Args:
         df: dataframe to split; it must pass ``DataEnsurer.validate_pd_data``.
         ravel_data: forwarded to the splitter as its ``ravel`` argument.

     Returns:
         Whatever ``split_data_into_x_and_y`` returns (annotated as a tuple).

     Raises:
         TypeError: if ``df`` fails the DataEnsurer validation.
     """
     # the splitter is created before validation, exactly as in the original flow
     splitter: DataSplitter = NormalSplitter()
     if not DataEnsurer.validate_pd_data(df):
         raise TypeError(
             "The dataframe does no have enough samples or features to split them into x and y"
         )
     return splitter.split_data_into_x_and_y(df, ravel=ravel_data)
コード例 #10
0
 def test_test_messenger_message_is_str_message_len_is_greater_than_zero(self):
     """len() of a WelcomeMessenger must validate as an int and be greater than zero."""
     messenger = WelcomeMessenger(file_path="..\\resources\\json_info\\welcome_message.json")
     # goes through WelcomeMessenger.__len__
     length = len(messenger)
     # both conditions must hold: the value validates as int AND it is positive
     is_positive_int = bool(DataEnsurer.validate_py_data(length, int) and length > 0)
     self.assertTrue(is_positive_int)
コード例 #11
0
 def test_data_is_df(self):
     """The content loaded from diabetes.csv must pass validate_pd_data."""
     # builds ".\..\datasets\diabetes.csv" (Windows-style separators, as in the original)
     dataset_path = "\\".join([".", "..", "datasets", "diabetes.csv"])
     loader = self._loader_creator.create_loader(dataset_path, "csv")
     loaded_frame = loader.get_file_transformed()
     # the validator is expected to accept the loaded data
     self.assertTrue(DataEnsurer.validate_pd_data(loaded_frame))
コード例 #12
0
 def select_features(self, x: DataFrame, y: NpArray, model: Any, score_type: str, n_folds_validation: int) -> tuple:
     """Run the selection that starts from one feature and grows the feature set.

     ``_first_iteration`` picks the starting feature and ``_else_iteration``
     keeps adding features; the result is always handed back as a dataframe.

     Returns:
         ``(best_x, best_score)`` where ``best_x`` is a dataframe.

     Raises:
         ValueError: if ``x`` has one column or fewer.
         TypeError: if the selected data validates neither as Series nor as DataFrame.
     """
     _, column_count = x.shape
     if column_count <= 1:
         raise ValueError("Not enough columns to start feature selection")

     print("Feature selection process started")
     seed_x, remaining_x, seed_score = self._first_iteration(x, y, model, score_type, n_folds_validation)
     best_x, best_score = self._else_iteration(seed_x, remaining_x, y, model, seed_score, score_type,
                                               n_folds_validation)
     print("Feature selection process finished")

     # a single selected feature may arrive as a Series; promote it to a frame
     if DataEnsurer.validate_py_data(best_x, pd.Series):
         return best_x.to_frame(), best_score
     if DataEnsurer.validate_py_data(best_x, DataFrame):
         return best_x, best_score
     raise TypeError("Output is not a dataframe")
コード例 #13
0
 def __getitem__(self, key: int) -> tuple:
     """Return the ``(author, quote)`` pair stored at position ``key`` of ``self.data``.

     Negative indexes are accepted as long as they stay within the data length.

     Raises:
         TypeError: if ``key`` fails the DataEnsurer int validation.
         IndexError: if ``key`` lies outside ``[-len(data), len(data))``.
     """
     if not DataEnsurer.validate_py_data(key, int):
         raise TypeError("Index is not integer")
     size = len(self.data)
     if not -size <= key < size:
         raise IndexError("Index is out of boundaries")
     entry = self.data[key]
     return entry["Author"], entry["Quote"]
コード例 #14
0
 def select_features(self, x: DataFrame, y: NpArray, model: Any, score_type: str, n_folds_validation: int) -> tuple:
     """Run the selection that starts from all features and drops them one at a time.

     The full-feature score is computed first and stored, together with ``x``,
     on the instance so ``_iteration`` can fall back to them.

     Returns:
         ``(best_x, best_score)`` where ``best_x`` is a dataframe.

     Raises:
         ValueError: if ``x`` has one column or fewer.
         TypeError: if the selected data validates neither as Series nor as DataFrame.
     """
     self._initial_x = x
     _, column_count = x.shape
     if column_count <= 1:
         raise ValueError("Not enough columns to start feature selection")

     # baseline: score obtained with every feature present
     baseline_score = self._cv_score.get_score(x, y, model, score_type,
                                               n_folds_validation)
     self._initial_score = baseline_score

     print("Feature selection process started")
     best_x, best_score = self._iteration(x, y, model, baseline_score, score_type, n_folds_validation)
     print("Feature selection process finished")

     # a single remaining feature may arrive as a Series; promote it to a frame
     if DataEnsurer.validate_py_data(best_x, pd.Series):
         return best_x.to_frame(), best_score
     if DataEnsurer.validate_py_data(best_x, DataFrame):
         return best_x, best_score
     raise TypeError("Output is not a dataframe")
コード例 #15
0
 def test_single_split_returns_a_tuple(self):
     """Splitting the diabetes dataframe with SplitterReturner must yield a tuple."""
     # builds ".\..\datasets\diabetes.csv" (Windows-style separators, as in the original)
     dataset_path = "\\".join([".", "..", "datasets", "diabetes.csv"])
     loader = self._loader_creator.create_loader(dataset_path, "CSV")
     frame = loader.get_file_transformed()
     # split the loaded frame into x and y and check the container type
     split_result = SplitterReturner().split_x_y_from_df(frame)
     self.assertTrue(DataEnsurer.validate_py_data(split_result, tuple))
コード例 #16
0
 def __getitem__(self, key: str) -> tuple:
     """Return ``(title, body, example, url)`` for the entry stored under ``key``.

     Raises:
         TypeError: if ``key`` fails the DataEnsurer str validation.
         KeyError: if ``key`` is not among the keys of ``self.data``.
     """
     if not DataEnsurer.validate_py_data(key, str):
         raise TypeError("Key is not string")
     if key not in self.data.keys():
         raise KeyError("Key does not exist")
     entry = self.data[key]
     # fields are read in the same order they are returned
     return tuple(entry[field] for field in ("Title", "Body", "Example", "Url"))
コード例 #17
0
    def _else_iteration(self, best_x: DataFrame, x: DataFrame, y: NpArray, model: Any, actual_score: float,
                        score_type: str, n_folds_validation: int) -> tuple:
        """Recursively try to extend ``best_x`` with one more column taken from ``x``.

        Each remaining column of ``x`` is appended to ``best_x`` in turn and scored
        with ``self._cv_score.get_score``. If ``is_fewer_than`` reports the best
        candidate score as worse than ``actual_score``, the recursion stops;
        otherwise the winning column moves from ``x`` to ``best_x`` and the method
        calls itself with the new score.

        Returns:
            ``(best_x, score)`` - the selected features and their score.
        """
        remaining_columns = len(x.columns)
        print("X n of columns: ", remaining_columns)
        if remaining_columns <= 0:
            # no candidates left: every column of x has already been moved into best_x
            return best_x, actual_score

        trial_scores = []
        for candidate_name in x.columns:
            print("Temp column name: ", candidate_name)
            # score the current best set plus this single candidate column
            trial_x = pd.concat([best_x, x[[candidate_name]]], axis=1, ignore_index=True)
            trial_score = self._cv_score.get_score(trial_x, y, model, score_type, n_folds_validation)
            print("Score with last temp column: ", trial_score)
            trial_scores.append(trial_score)

        max_score = max(trial_scores)
        print("Max score from this iteration: ", max_score)
        if is_fewer_than(max_score, actual_score, score_type):
            print("There was not an improvement in this iteration")
            return best_x, actual_score

        # positions of every candidate that reached the maximum score; first one wins
        tied_positions = [position for position, value in enumerate(trial_scores) if value == max_score]
        # NOTE(review): tied_positions is always built as a list, so the fallback
        # looks unreachable unless validate_py_data does more than a type check.
        top_position = tied_positions[0] if DataEnsurer.validate_py_data(tied_positions, list) else tied_positions
        new_feature = x.columns[top_position]
        # move the winner from the candidate pool into the best set
        grown_best_x = pd.concat([best_x, x[new_feature]], axis=1)
        shrunk_x = x.drop(new_feature, axis=1)
        print("There was an improvement in this iteration")
        return self._else_iteration(grown_best_x, shrunk_x, y, model, max_score, score_type, n_folds_validation)
コード例 #18
0
    def _iteration(self, x: DataFrame, y: NpArray, model: Any, actual_score: float, score_type: str,
                   n_folds_validation: int) -> tuple:
        """Recursively drop one column of ``x`` at a time while doing so helps the score.

        Each column is removed in turn and the reduced frame is scored with
        ``self._cv_score.get_score``. If ``is_greater_than`` reports
        ``actual_score`` as better than the best reduced score, the current ``x``
        is returned; otherwise the column whose removal scored best is dropped and
        the method calls itself with the new score.

        Returns:
            ``(x, score)`` for the kept features, or the stored initial data and
            score when ``x`` has one column or fewer.
        """
        column_count = len(x.columns)
        if column_count <= 1:
            # NOTE(review): this is reached once x is down to a single column, i.e.
            # after every previous drop was accepted, yet the initial frame and
            # score are returned instead of the reduced ones - confirm this is intended.
            return self._initial_x, self._initial_score

        drop_scores = []
        for dropped_name in x.columns:
            print("Temp column name to drop:", dropped_name)
            reduced_x = x.drop([dropped_name], axis=1)
            reduced_score = self._cv_score.get_score(reduced_x, y, model, score_type, n_folds_validation)
            print("Score with last temp column dropped:", reduced_score)
            drop_scores.append(reduced_score)

        max_score = max(drop_scores)
        print("Max score from this iteration:", max_score)
        if is_greater_than(actual_score, max_score, score_type):
            print("There was not an improvement in this iteration")
            return x, actual_score

        # positions of every drop that reached the maximum score; first one wins
        tied_positions = [position for position, value in enumerate(drop_scores) if value == max_score]
        # NOTE(review): tied_positions is always built as a list, so the fallback
        # looks unreachable unless validate_py_data does more than a type check.
        top_position = tied_positions[0] if DataEnsurer.validate_py_data(tied_positions, list) else tied_positions
        # remove the column whose absence produced the best score
        column_to_remove = x.columns[top_position]
        trimmed_x = x.drop([column_to_remove], axis=1)
        print("There was an improvement in this iteration")
        return self._iteration(trimmed_x, y, model, max_score, score_type, n_folds_validation)
コード例 #19
0
 def get_file_transformed(self) -> DataFrame:
     """Load the semicolon-separated file at ``self.file_path`` into a dataframe.

     On success the dataframe is stored in ``self.data`` and returned.

     Every failure is re-raised with a descriptive message and is explicitly
     chained to the underlying error (``raise ... from err``), so the root
     cause stays available through ``__cause__`` and in the traceback.

     Raises:
         FileNotFoundError: the path does not exist.
         TypeError: the loaded data fails ``DataEnsurer.validate_pd_data``
             (or a TypeError occurred while loading).
         ValueError: a ValueError occurred while opening or parsing the file.
         OSError: the file could not be opened for another OS-level reason.
         Exception: any other error, re-raised with the original message.
     """
     # fields in an scsv file are separated by ";" (not ",")
     separator = ";"
     try:
         with open(self.file_path, 'r', encoding="utf-8") as f:
             temp = pd.read_csv(f, sep=separator)
             if DataEnsurer.validate_pd_data(temp):
                 self.data = temp
                 return self.data
             # signal the validation failure to the TypeError handler below
             raise TypeError
     except FileNotFoundError as err:
         raise FileNotFoundError("Path to SCSV file was not found") from err
     except TypeError as err:
         raise TypeError(
             "Data does not meet sample or column requirements to train a model"
         ) from err
     except ValueError as err:
         raise ValueError(
             "Data does not meet requirements to be considered a scsv file") from err
     except OSError as err:
         raise OSError("Invalid file. It needs a text extension") from err
     except Exception as e:
         raise Exception(str(e)) from e
コード例 #20
0
 def test_data_is_not_df(self):
     """A plain dict must be rejected by validate_pd_data."""
     plain_dict = dict(name='notch', job='dev')
     # the input is a dict rather than a dataframe, so the validator should answer False
     self.assertFalse(DataEnsurer.validate_pd_data(plain_dict))