def test_df_not_meeting_req_columns(self):
    dict_test = {'name': [str(i) + "name" for i in range(200)]}
    df = pd.DataFrame.from_dict(dict_test)
    # is {'name': ['0name', '1name', '2name', ...]} a dataframe after pd.DataFrame.from_dict?
    ensurer_bol = DataEnsurer.validate_pd_data(df)
    # it should be false, since the dataframe doesn't have enough features (columns)
    self.assertFalse(ensurer_bol)
def _first_iteration(self, x: DataFrame, y: NpArray, model: Any, score_type: str,
                     n_folds_validation: int) -> tuple:
    score_lst = []  # empty list to store score values
    # iterate over all features
    for i in range(len(x.columns)):
        # in each iteration get the column name
        temp_col_name = x.columns[i]
        print("Temp column name: ", temp_col_name)
        # create a temp dataframe with only the selected column
        temp_x = x[[temp_col_name]]
        # get its cross-validation score and append that value to score_lst
        score = self._cv_score.get_score(temp_x, y, model, score_type, n_folds_validation)
        print("Score with last temp column: ", score)
        score_lst.append(score)
    # get the max score from score_lst
    max_score = max(score_lst)
    # get the indexes of all scores equal to the max score
    max_score_index = [i for i, j in enumerate(score_lst) if j == max_score]
    # keep the first index in case of ties
    if DataEnsurer.validate_py_data(max_score_index, list):
        top_score_index = max_score_index[0]
    else:
        top_score_index = max_score_index
    # get the feature name using top_score_index from the original x dataframe
    new_feature = x.columns[top_score_index]
    # select the winning feature as a Series
    new_best_x = x[new_feature]
    # drop the winning feature from the original dataframe and store the result in a new variable
    new_x = x.drop(new_feature, axis=1)
    # finally return the best feature, the new x without that feature and the max score of this iteration
    return new_best_x, new_x, max_score
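# Illustrative sketch (not part of the module): the same "score every feature on
# its own" pass as _first_iteration, written against sklearn's cross_val_score
# directly instead of the project's _cv_score helper. Dataset and model are
# placeholders used only for demonstration.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

demo_x, demo_y = load_iris(return_X_y=True, as_frame=True)
demo_model = LogisticRegression(max_iter=1000)
single_scores = {col: cross_val_score(demo_model, demo_x[[col]], demo_y, cv=5).mean()
                 for col in demo_x.columns}
best_first_feature = max(single_scores, key=single_scores.get)  # plays the role of new_feature above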
def test_data_from_json_is_tuple_negative_index(self):
    # initialization of welcome_message with its path
    welcome_message = WelcomeMessage(file_path="..\\resources\\json_info\\welcome_message.json",
                                     data_type=list)
    # gets author and quote as a tuple by index using the class method __getitem__
    this_is_a_tuple = welcome_message[-5]
    # is the output a tuple?
    bol_answer = DataEnsurer.validate_py_data(this_is_a_tuple, tuple)
    self.assertTrue(bol_answer)
def test_json_is_list(self):
    json_type = self._loader_creator.create_loader(".\\..\\resources\\json_info\\welcome_message.json",
                                                   "json")
    file = json_type.get_file_transformed()
    # is the file a deserialized json list?
    ensurer_bol = DataEnsurer.validate_py_data(file, list)
    # this should be true, since welcome_message.json has list format
    self.assertTrue(ensurer_bol)
def test_messenger_message_is_str(self):
    # use of WelcomeMessenger with a WelcomeMessage implementation
    welcome_messenger = WelcomeMessenger(file_path="..\\resources\\json_info\\welcome_message.json")
    message = str(welcome_messenger)
    # is the returned message a string?
    bol_answer = DataEnsurer.validate_py_data(message, str)
    self.assertTrue(bol_answer)
def test_data_type_is_list(self):
    # initialization of welcome_message with its path
    welcome_message = WelcomeMessage(file_path="..\\resources\\json_info\\welcome_message.json",
                                     data_type=list)
    data = welcome_message.data  # get data value using its property
    # is data a list?
    bol_answer = DataEnsurer.validate_py_data(data, list)
    self.assertTrue(bol_answer)
def test_loader_path_is_str(self):
    # load diabetes.csv from disk
    folder_name = "datasets"
    file_name = "diabetes.csv"
    test_full_path = ".\\..\\" + folder_name + "\\" + file_name
    csv_file = self._loader_creator.create_loader(test_full_path, "csv")
    path = csv_file.file_path
    bol_answer = DataEnsurer.validate_py_data(path, str)
    self.assertTrue(bol_answer)
def create_model(feature_selection: bool, parameter_search: bool) -> SBSMachineLearning:
    if DataEnsurer.validate_py_data(feature_selection, bool) and \
            DataEnsurer.validate_py_data(parameter_search, bool):
        if not feature_selection and not parameter_search:
            # plain model: neither feature selection nor parameter search
            simple_model = ModelPossibilities.case("SM")
            return simple_model
        elif feature_selection and not parameter_search:
            only_feature_selection_model = ModelPossibilities.case("FSM")
            return only_feature_selection_model
        elif not feature_selection and parameter_search:
            only_parameter_search_model = ModelPossibilities.case("PSM")
            return only_parameter_search_model
        else:
            # both flags set: feature selection and parameter search
            all_model = ModelPossibilities.case("AM")
            return all_model
    raise TypeError("Both parameters should be of Boolean type")
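# Usage sketch, assuming ModelPossibilities.case resolves the codes "SM", "FSM",
# "PSM" and "AM" to concrete SBSMachineLearning implementations as the branches
# above imply:
#
# model = create_model(feature_selection=True, parameter_search=False)  # -> "FSM" variant
# create_model(1, 0)  # raises TypeError, assuming validate_py_data rejects non-bool values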
def split_x_y_from_df(df: DataFrame, ravel_data: bool = True) -> tuple:
    data_splitter: DataSplitter = NormalSplitter()
    if DataEnsurer.validate_pd_data(df):
        tuple_answer = data_splitter.split_data_into_x_and_y(df, ravel=ravel_data)
        return tuple_answer
    raise TypeError("The dataframe does not have enough samples or features to split it into x and y")
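# Minimal sketch of the split itself, assuming the common convention that every
# column but the last is a predictor and the last column is the target
# (NormalSplitter may use a different rule):
import pandas as pd

demo_df = pd.DataFrame({"age": [25, 32, 47], "bmi": [21.0, 28.5, 30.1], "outcome": [0, 1, 1]})
demo_x = demo_df.iloc[:, :-1]            # all columns except the last
demo_y = demo_df.iloc[:, -1].to_numpy()  # last column, raveled into a 1-D array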
def test_messenger_message_len_is_greater_than_zero(self):
    # use of WelcomeMessenger with a WelcomeMessage implementation
    welcome_messenger = WelcomeMessenger(file_path="..\\resources\\json_info\\welcome_message.json")
    message_len = len(welcome_messenger)  # gets the message len using the class method __len__
    bol_answer = DataEnsurer.validate_py_data(message_len, int)
    if bol_answer and message_len > 0:
        bol_answer = True  # it is indeed an integer and is greater than zero
    else:
        bol_answer = False  # it is either not an integer or not greater than zero
    self.assertTrue(bol_answer)
def test_data_is_df(self):
    # load diabetes.csv from disk
    folder_name = "datasets"
    file_name = "diabetes.csv"
    test_full_path = ".\\..\\" + folder_name + "\\" + file_name
    csv_file = self._loader_creator.create_loader(test_full_path, "csv")
    # get the dataframe from the data_returner
    this_is_a_df = csv_file.get_file_transformed()
    # use DataEnsurer to check if it is a dataframe with enough samples and features
    ensurer_bol = DataEnsurer.validate_pd_data(this_is_a_df)
    self.assertTrue(ensurer_bol)
def select_features(self, x: DataFrame, y: NpArray, model: Any, score_type: str,
                    n_folds_validation: int) -> tuple:
    _, initial_num_columns = x.shape  # original column count for evaluation
    # if x has more than one column, start the forward selection process
    if initial_num_columns > 1:
        print("Feature selection process started")
        f_best_x, new_x, f_score = self._first_iteration(x, y, model, score_type, n_folds_validation)
        # call the recursive function and then return the best x
        best_x, best_score = self._else_iteration(f_best_x, new_x, y, model, f_score, score_type,
                                                  n_folds_validation)
        print("Feature selection process finished")
        # best_x might come back as a single-column Series; always return a DataFrame
        if DataEnsurer.validate_py_data(best_x, pd.Series):
            return best_x.to_frame(), best_score
        elif DataEnsurer.validate_py_data(best_x, DataFrame):
            return best_x, best_score
        else:
            raise TypeError("Output is not a dataframe")
    else:
        raise ValueError("Not enough columns to start feature selection")
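# Usage sketch. The enclosing class is not shown here, so "ForwardFeatureSelection"
# is a placeholder name for whatever class hosts this select_features:
#
# selector = ForwardFeatureSelection()
# best_x, best_score = selector.select_features(x, y, LogisticRegression(max_iter=1000),
#                                               score_type="accuracy", n_folds_validation=5)
# best_x is always a DataFrame on return, even when a single winning feature
# comes back from the recursion as a Series.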
def __getitem__(self, key: int) -> tuple:
    if DataEnsurer.validate_py_data(key, int):
        # make sure the index is not out of bounds
        if (key < len(self.data)) and (key >= -len(self.data)):
            # get the author and quote stored under this index
            author = self.data[key]["Author"]
            quote = self.data[key]["Quote"]
            return author, quote
        # index is out of bounds. Raise IndexError
        raise IndexError("Index is out of bounds")
    # index is not an integer. Raise TypeError
    raise TypeError("Index is not an integer")
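# Usage sketch: indexing delegates to the underlying list of records, so both
# positive and negative indices work, mirroring Python list semantics.
#
# welcome_message = WelcomeMessage(file_path="..\\resources\\json_info\\welcome_message.json",
#                                  data_type=list)
# author, quote = welcome_message[0]    # first record
# author, quote = welcome_message[-1]   # last record
# welcome_message["0"]                  # raises TypeError("Index is not an integer")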
def select_features(self, x: DataFrame, y: NpArray, model: Any, score_type: str,
                    n_folds_validation: int) -> tuple:
    self._initial_x = x
    _, initial_num_columns = x.shape  # original column count for evaluation
    if initial_num_columns > 1:
        # initial score with all features included
        initial_score = self._cv_score.get_score(x, y, model, score_type, n_folds_validation)
        self._initial_score = initial_score
        # call the recursive function and then return the best x
        print("Feature selection process started")
        best_x, best_score = self._iteration(x, y, model, initial_score, score_type, n_folds_validation)
        print("Feature selection process finished")
        # best_x might come back as a single-column Series; always return a DataFrame
        if DataEnsurer.validate_py_data(best_x, pd.Series):
            return best_x.to_frame(), best_score
        elif DataEnsurer.validate_py_data(best_x, DataFrame):
            return best_x, best_score
        else:
            raise TypeError("Output is not a dataframe")
    else:
        raise ValueError("Not enough columns to start feature selection")
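# Design note: x and its full-feature score are cached in _initial_x and
# _initial_score before the recursion starts, so _iteration can fall back to
# the untouched dataframe when it runs out of columns to drop.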
def test_single_split_returns_a_tuple(self):
    # load diabetes.csv from disk
    folder_name = "datasets"
    file_name = "diabetes.csv"
    test_full_path = ".\\..\\" + folder_name + "\\" + file_name
    csv_type = self._loader_creator.create_loader(test_full_path, "csv")
    df = csv_type.get_file_transformed()
    # use of SplitterReturner with a NormalSplitter implementation
    splitter = SplitterReturner()
    # split the dataframe into x and y
    data = splitter.split_x_y_from_df(df)
    result = DataEnsurer.validate_py_data(data, tuple)
    self.assertTrue(result)
def __getitem__(self, key: str) -> tuple:
    if DataEnsurer.validate_py_data(key, str):
        if key in self.data:
            actual_data = self.data[key]
            title = actual_data["Title"]
            body = actual_data["Body"]
            example = actual_data["Example"]
            url = actual_data["Url"]
            return title, body, example, url
        # key does not exist. Raise KeyError
        raise KeyError("Key does not exist")
    # key is not a string. Raise TypeError
    raise TypeError("Key is not a string")
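# Usage sketch. The concrete object and its keys depend on the backing JSON
# file; "Help" is a hypothetical key used only for illustration:
#
# title, body, example, url = help_message["Help"]
# help_message["missing_key"]  # raises KeyError("Key does not exist")
# help_message[0]              # raises TypeError("Key is not a string")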
def _else_iteration(self, best_x: DataFrame, x: DataFrame, y: NpArray, model: Any, actual_score: float,
                    score_type: str, n_folds_validation: int) -> tuple:
    # first check the x length. This variable becomes smaller with every recursive call
    new_x_length = len(x.columns)
    print("X n of columns: ", new_x_length)
    # if there are columns left in the x dataframe then do the following process
    if new_x_length > 0:
        score_lst = []  # empty list to store score values
        # iterate over all remaining features of the x dataframe
        for i in range(new_x_length):
            # in each iteration get the column name
            temp_col_name = x.columns[i]
            print("Temp column name: ", temp_col_name)
            # create a temp dataframe with only the selected column
            temp_x = x[[temp_col_name]]
            temp_new_x = pd.concat([best_x, temp_x], axis=1, ignore_index=True)
            # get its cross-validation score and append that value to score_lst
            score = self._cv_score.get_score(temp_new_x, y, model, score_type, n_folds_validation)
            print("Score with last temp column: ", score)
            score_lst.append(score)
        # get the max score from score_lst once the for loop has ended
        max_score = max(score_lst)  # best score
        print("Max score from this iteration: ", max_score)
        # check whether this iteration improved the score. If not, stop the recursion
        if is_fewer_than(max_score, actual_score, score_type):
            print("There was not an improvement in this iteration")
            return best_x, actual_score
        # get the indexes of all scores equal to the max score
        max_score_index = [i for i, j in enumerate(score_lst) if j == max_score]
        # keep the first index in case of ties
        if DataEnsurer.validate_py_data(max_score_index, list):
            top_score_index = max_score_index[0]
        else:
            top_score_index = max_score_index
        # get the feature name using top_score_index from the x dataframe
        new_feature = x.columns[top_score_index]
        # select the winning feature
        temp_x = x[new_feature]
        # store the new best feature group: the previous best features plus the winner
        new_best_x = pd.concat([best_x, temp_x], axis=1)
        # drop the winning feature from the x dataframe and store the result in a new variable
        new_x = x.drop(new_feature, axis=1)
        # call the recursive function all over again until the stop condition is met
        print("There was an improvement in this iteration")
        return self._else_iteration(new_best_x, new_x, y, model, max_score, score_type,
                                    n_folds_validation)
    # the x dataframe is now empty: every feature improved the score and was added,
    # so return the full best_x dataframe and its score
    return best_x, actual_score
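# Design note: each recursive call moves exactly one column from x into best_x,
# so the recursion depth is bounded by the original column count; Python's
# default recursion limit (1000) only matters for dataframes with on the order
# of a thousand features.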
def _iteration(self, x: DataFrame, y: NpArray, model: Any, actual_score: float, score_type: str,
               n_folds_validation: int) -> tuple:
    # first check the x length. This variable becomes smaller with every recursive call
    new_x_length = len(x.columns)
    # if there is more than one column left in the x dataframe then do the following process
    if new_x_length > 1:
        score_lst = []  # empty list to store score values
        # iterate over all remaining features of the x dataframe
        for i in range(new_x_length):
            # drop the feature by index and store the result in a temp variable
            temp_col_name = x.columns[i]
            print("Temp column name to drop:", temp_col_name)
            temp_x = x.drop([temp_col_name], axis=1)
            # get its cross-validation score and append that value to score_lst
            score = self._cv_score.get_score(temp_x, y, model, score_type, n_folds_validation)
            print("Score with last temp column dropped:", score)
            score_lst.append(score)
        # get the max score from score_lst
        max_score = max(score_lst)
        print("Max score from this iteration:", max_score)
        # check whether this iteration improved the score. If not, stop the recursion
        if is_greater_than(actual_score, max_score, score_type):
            print("There was not an improvement in this iteration")
            return x, actual_score
        # get the indexes of all scores equal to the max score
        max_score_index = [i for i, j in enumerate(score_lst) if j == max_score]
        # keep the first index in case of ties
        if DataEnsurer.validate_py_data(max_score_index, list):
            top_score_index = max_score_index[0]
        else:
            top_score_index = max_score_index
        # drop the feature whose removal gave the best score. Without it the model improves
        temp_col_name = x.columns[top_score_index]
        new_best_x = x.drop([temp_col_name], axis=1)
        # call the recursive function all over again until the stop condition is met
        print("There was an improvement in this iteration")
        return self._iteration(new_best_x, y, model, max_score, score_type, n_folds_validation)
    # only one column is left after dropping all the others, so fall back to the
    # initial x dataframe and its score
    return self._initial_x, self._initial_score
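# Illustrative sketch (not part of the module): a single backward-elimination
# step with sklearn directly -- drop each column in turn and keep the drop that
# scores best, mirroring one level of the _iteration recursion. Dataset and
# model are placeholders used only for demonstration.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

demo_x, demo_y = load_iris(return_X_y=True, as_frame=True)
demo_model = LogisticRegression(max_iter=1000)
drop_scores = {col: cross_val_score(demo_model, demo_x.drop(columns=col), demo_y, cv=5).mean()
               for col in demo_x.columns}
best_column_to_drop = max(drop_scores, key=drop_scores.get)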
def get_file_transformed(self) -> DataFrame:
    # scsv files use ";" as separator
    separator = ";"
    # try to load the file; re-raise each failure with a descriptive message
    try:
        with open(self.file_path, 'r', encoding="utf-8") as f:
            temp = pd.read_csv(f, sep=separator)
            if DataEnsurer.validate_pd_data(temp):
                self.data = temp
                return self.data
            raise TypeError
    except FileNotFoundError:
        raise FileNotFoundError("Path to SCSV file was not found")
    except TypeError:
        raise TypeError("Data does not meet sample or column requirements to train a model")
    except ValueError:
        raise ValueError("Data does not meet requirements to be considered a scsv file")
    except OSError:
        raise OSError("Invalid file. It needs a text extension")
    except Exception as e:
        raise Exception(str(e))
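# Minimal sketch of the parsing step in isolation: sep=";" is what distinguishes
# this SCSV loader from a plain CSV one. Inline text stands in for a real file.
import io
import pandas as pd

scsv_text = "name;age\nnotch;40\nalex;35\n"
demo_frame = pd.read_csv(io.StringIO(scsv_text), sep=";")
# demo_frame now has the columns 'name' and 'age' with two rows each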
def test_data_is_not_df(self):
    not_a_df = {'name': 'notch', 'job': 'dev'}
    # is {'name': 'notch', 'job': 'dev'} a dataframe?
    ensurer_bol = DataEnsurer.validate_pd_data(not_a_df)
    # it should be false, since the input is a dict
    self.assertFalse(ensurer_bol)