def get_worksheets(self, excelFileName=None):
    """
    Return the names of the worksheets in an Excel workbook.

    :param excelFileName: path of the workbook. When truthy, it is stored in self.filename first;
        otherwise the value already held in self.filename is used.
    :return: the workbook's sheet names, or None (after logging an error) when the file cannot be found.
    """
    if excelFileName:
        self.filename = excelFileName
    fu = FileUtil()
    if not fu.file_exists(self.filename):
        logger.error(f'Cannot find Excel file {self.filename}.')
        return None
    # Open the workbook as a context manager so its file handle is released as soon as the
    # names are read; an unclosed ExcelFile keeps the file open (and locked on Windows).
    with pd.ExcelFile(self.filename) as xl:
        return xl.sheet_names
class Test_FileUtil(TestCase):
    """
    Unit tests for FileUtil.

    Scratch files are written to c:\\temp on Windows and /tmp elsewhere, and are removed again
    in tearDownClass.
    """
    path_no_drive = 'temp'
    fn = 'test.csv'
    yaml = 'example.yaml'
    text_fn = 'test.txt'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path = r'c:\temp' if platform.system() == 'Windows' else r'/tmp'
        self._fu = FileUtil()
        self._du = DateUtil()
        self.features_dict = {
            'book': "Hitchhiker's Guide",
            'characters': {
                'answer': 42,
                'name': 'Dent. Arthur Dent.'
            }
        }

    @classmethod
    def tearDownClass(cls) -> None:
        """Delete the scratch files (yaml, csv, text) that the tests may have created."""
        fu = FileUtil()
        path = r'c:\temp' if platform.system() == 'Windows' else r'/tmp'
        fu.delete_file(fu.qualified_path(path, cls.yaml))
        fu.delete_file(fu.qualified_path(path, cls.fn))
        fu.delete_file(fu.qualified_path(path, cls.text_fn))

    @property
    def path(self):
        return self._path

    @path.setter
    def path(self, p):
        self._path = p

    def create_csv(self):
        """Write a small three-line CSV file to self.path / self.fn."""
        lines = [
            ',col1,col2',
            '0,1,3',
            '1,2,4',
        ]
        filename = self._fu.qualified_path(self.path, self.fn)
        self._fu.write_text_file(filename, lines)
        logger.debug(f'create_csv to {self.path}{sep}{self.fn}.')

    def create_yaml(self, keys: list, vals: list):
        """
        Write 'key: value' lines to self.path / self.yaml.

        :param keys: yaml keys
        :param vals: values, paired positionally with keys
        """
        writeMe = [f'{key}: {val}' for key, val in zip(keys, vals)]
        qualifiedPath = self._fu.qualified_path(self.path, self.yaml)
        self._fu.write_text_file(filename=qualifiedPath, lines=writeMe)

    def generate_text_lines(self, how_many: int = 10, width: int = None) -> List[str]:
        """
        Build the lines used by the text-file tests.

        :param how_many: number of lines to generate
        :param width: if given (and non-zero), each line is the line number centered in a field of
            this width and padded with '*'; otherwise each line is 'Line <n>'.
        :return: list of generated lines
        """
        if width:
            return ['{0:*^{width}}'.format(i, width=width) for i in range(how_many)]
        return [f'Line {i}' for i in range(how_many)]

    def create_text_file(self, filename: str, how_many: int = 10, width: int = None):
        """Write the output of generate_text_lines(how_many, width) to filename."""
        lines = self.generate_text_lines(how_many, width)
        self._fu.write_text_file(filename, lines)

    @logit()
    def test_is_windows(self):
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Linux'
            mocked_fu = FileUtil()
            test1 = mocked_fu.is_Windows
            self.assertFalse(test1)
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            self.assertTrue(mocked_fu.is_Windows)

    @logit()
    def test_dump_yaml(self):
        yaml_fn = self._fu.qualified_path(self.path, self.yaml)
        self._fu.dump_yaml(yaml_fn, self.features_dict)
        self.assertTrue(self._fu.file_exists(yaml_fn))
        actual = self._fu.read_yaml(yaml_fn)
        self.assertDictEqual(self.features_dict, actual)

    @logit()
    def test_current_directory(self):
        logger.debug(
            f'current working dir is really {self._fu.current_directory()}')
        my_mock_dir = r'\synthesys\testing'
        with mock.patch('FileUtil.getcwd', return_value=my_mock_dir):
            actual = self._fu.current_directory()
            self.assertEqual(actual, my_mock_dir)

    def test_read_text_file(self):
        filename = self._fu.qualified_path(self.path, self.text_fn)
        how_many_lines = randrange(10) + 2
        self.create_text_file(filename, how_many_lines)
        expected = self.generate_text_lines(how_many_lines)
        # must remove newline chars
        actual = [x.rstrip() for x in self._fu.read_text_file(filename)]
        self.assertListEqual(expected, actual)

    @logit()
    def test_read_text_file_err(self):
        # test an IO error
        # NOTE(review): nothing is asserted here; the test only checks that read_text_file
        # does not propagate the mocked IOError.
        filename = self._fu.qualified_path(self.path, self.text_fn)
        with mock.patch('FileUtil.open', create=True) as mocked_open:
            mocked_open.side_effect = IOError()
            self._fu.read_text_file(filename)

    @logit()
    def test_read_yaml(self):
        keys = ['firstname', 'lastname', 'zip']
        vals = ['Rajah', 'Chacko', 28269]
        self.create_yaml(keys, vals)
        qualifiedPath = self._fu.qualified_path(self.path, self.yaml)
        d = self._fu.read_yaml(yamlFile=qualifiedPath)
        logger.debug(f'Contents of yaml: {d}')
        self.assertEqual(list(d.keys()), keys)
        self.assertEqual(vals[0], d[keys[0]])

    @logit()
    @mock.patch('FileUtil.safe_load')
    def test_read_yaml_err(self, mock_obj):
        yaml_fn = self._fu.qualified_path(self.path, self.yaml)
        self.create_text_file(yaml_fn)
        mock_obj.side_effect = YAMLError('mock error')
        actual = self._fu.read_yaml(yamlFile=yaml_fn)
        self.assertIsNone(actual)

    @logit()
    def test_qualified_path(self):
        # Test 1. Normal case.
        expected = self.path + sep + self.fn
        actual = self._fu.qualified_path(self.path, self.fn)
        self.assertEqual(actual, expected, "Test 1 fail")
        # Test 2. Using an array and a Windows mock.
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            # should be C:\dir\to\path for Windows
            dir_to_path = mocked_fu.separator.join(['C:', 'dir', 'to', 'path'])
            pathArray = dir_to_path.split(mocked_fu.separator)
            expected = dir_to_path + mocked_fu.separator + self.fn
            self.assertEqual(
                expected,
                mocked_fu.fully_qualified_path(pathArray,
                                               self.fn,
                                               dir_path_is_array=True),
                "Test 2 fail")
        # Test 3, using a windows path with a drive
        exp3 = r'c:\temp\subdir\subsubdir'
        exp3_array = exp3.split(_BACKSLASH)
        test3_with_fn = deepcopy(exp3_array)
        test3_with_fn.append(self.fn)
        test3 = _BACKSLASH.join(test3_with_fn)
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            actual = mocked_fu.qualified_path(dirPath=exp3_array,
                                              filename=self.fn,
                                              dir_path_is_array=True)
            self.assertEqual(test3, actual, "Test 3 fail")

    @logit()
    def test_fully_qualified_path(self):
        # Test 1, Windows (should be unchanged)
        path1 = r'c:\temp\subdir\subsubdir'
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            exp1 = path1 + mocked_fu.separator + self.fn
            self.assertEqual(
                exp1,
                mocked_fu.fully_qualified_path(dirPath=path1,
                                               filename=self.fn),
                'Test 1 fail')
        # Test 2, Linux without the leading /
        test2 = r'dir/to/path'
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Linux'
            mocked_fu = FileUtil()
            exp2 = mocked_fu.separator + test2 + mocked_fu.separator + self.fn
            self.assertEqual(
                exp2,
                mocked_fu.fully_qualified_path(dirPath=test2,
                                               filename=self.fn,
                                               dir_path_is_array=False),
                "Test 2 fail")
            # Test 3, Linux with the leading / (should be unchanged)
            test3 = mocked_fu.separator + test2
            exp3 = test3 + mocked_fu.separator + self.fn
            self.assertEqual(
                exp3,
                mocked_fu.fully_qualified_path(dirPath=test3,
                                               filename=self.fn,
                                               dir_path_is_array=False),
                "Test 3 fail")

    @logit()
    def test_split_qualified_path(self):
        fn = 'test.txt'
        qpath = self._fu.qualified_path(self.path, fn)
        # Test 1. c:\temp for Windows or /tmp for Linux.
        which_test = 1
        splitpath, splitfn = self._fu.split_qualified_path(qpath,
                                                           makeArray=False)
        self.assertEqual(splitpath, self.path,
                         f'Test {which_test}. Paths should be equal.')
        self.assertEqual(splitfn, fn,
                         f'Test {which_test}. File names should be equal.')
        # Test 2. Split paths into arrays.
        which_test = 2
        pathArray, splitfn = self._fu.split_qualified_path(qpath,
                                                           makeArray=True)
        expected = self.path.split(sep)
        self.assertEqual(pathArray, expected,
                         f'Test {which_test}. Paths should be equal.')
        self.assertEqual(splitfn, fn,
                         f'Test {which_test}. File names should be equal.')
        # Test 3. Try a more complex path.
        which_test = 3
        complex_path = r'C:\Users\Owners\Documents\Tickers.csv' if platform.system(
        ) == 'Windows' else r'/tmp/parent/child/Tickers.csv'
        pathArray, splitfn = self._fu.split_qualified_path(complex_path,
                                                           makeArray=True)
        expected = complex_path.split(sep)
        expected.pop()  # Pop off the last el, which is the file name.
        self.assertEqual(pathArray, expected,
                         f'Test {which_test}. Paths should be equal.')
        self.assertEqual(splitfn, 'Tickers.csv',
                         f'Test {which_test}. File names should be equal.')

    @logit()
    def test_split_file_name(self):
        expected_file = "file"
        expected_ext = ".ext"
        expected_fn = expected_file + expected_ext
        # First test with just file.ext
        actual_file, actual_ext = self._fu.split_file_name(expected_fn)
        self.assertEqual(actual_file, expected_file)
        self.assertEqual(actual_ext, expected_ext)
        # Another test with path/file.ext
        qpath = self._fu.qualified_path(self.path, expected_fn)
        actual_file, actual_ext = self._fu.split_file_name(qpath)
        self.assertEqual(actual_file, expected_file)
        self.assertEqual(actual_ext, expected_ext)

    @logit()
    def test_file_exists(self):
        self.create_csv()
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        self.assertTrue(self._fu.file_exists(qualifiedPath))
        qualifiedPath = self._fu.qualified_path(self.path, 'noSuchFile.xxd')
        self.assertFalse(self._fu.file_exists(qualifiedPath))

    @logit()
    def test_ensure_dir(self):
        self._fu.ensure_dir(self.path)
        self.assertTrue(self._fu.dir_exists(self.path))

    @logit()
    def test_delete_file(self):
        self.create_csv()
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        # delete_file should return True the first time
        self.assertTrue(self._fu.delete_file(qualifiedPath))
        # but return false the second time.
        self.assertFalse(self._fu.delete_file(qualifiedPath))

    @logit()
    @mock.patch('FileUtil.remove')
    def test_delete_file_err(self, mock_obj):
        self.create_csv()
        expected_log_message = 'delete_file mocktest'
        mock_obj.side_effect = OSError(expected_log_message)
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        with self.assertLogs(FileUtil.__name__, level='DEBUG') as cm:
            ans = self._fu.delete_file(qualifiedPath)
        self.assertFalse(ans)
        # The mocked OSError text must show up in at least one captured log line.
        self.assertTrue(
            any(expected_log_message in line for line in cm.output))

    @logit()
    def test_copy_file(self):
        self.create_csv()
        copy_fn = self.fn + '.copy'
        copied_file = self._fu.qualified_path(self.path, copy_fn)
        source_path = self._fu.qualified_path(self.path, self.fn)
        self._fu.copy_file(source_path, copied_file)
        self.assertTrue(self._fu.file_exists(source_path))
        self.assertTrue(self._fu.file_exists(copied_file))
        self._fu.delete_file(copied_file)

    @logit()
    @mock.patch('FileUtil.copy2')
    def test_copy_file_err(self, mock_obj):
        tmp_path = self._fu.qualified_path(self.path, 'tmp')
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        expected_log_message = 'copy_file mocktest'
        mock_obj.side_effect = IOError(expected_log_message)
        with self.assertLogs(FileUtil.__name__, level='DEBUG') as cm:
            _ = self._fu.copy_file(qualifiedPath, tmp_path)
        self.assertTrue(
            any(expected_log_message in line for line in cm.output))

    @logit()
    def test_getList(self):
        # Use the platform-appropriate scratch directory rather than a hard-coded Windows path.
        dir_name = self.path
        flist = self._fu.getList(dir_name)
        logger.debug(f'All list is: {flist}')

    def isFile_side_effect(*args, **kwargs) -> bool:
        """
        Side effect for the mocked isfile in test_get_files and test_cull_existing_files.
        Delegates the decision to mock_is_file. Not great, but ok for mocking.
        :param args: (self, path) -- there is no explicit self parameter, so the bound instance
            arrives as args[0] and the path under test as args[1].
        :param kwargs: ignored
        :return: whatever mock_is_file returns for the path
        """
        return mock_is_file(args[1])

    def isDir_side_effect(*args) -> bool:
        """
        Side effect for the mocked isdir in test_get_dirs. Delegates to mock_is_dir.
        :param args: (self, path); see isFile_side_effect.
        :return: whatever mock_is_dir returns for the path
        """
        return mock_is_dir(args[1])

    @logit()
    @mock.patch('FileUtil.isfile')
    @mock.patch('FileUtil.listdir')
    def test_get_files(self, mock_listdir, mock_isfile):
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt', 'somedir']
        mock_listdir.return_value = file_list
        mock_isfile.side_effect = self.isFile_side_effect
        actual = self._fu.get_files(dir_name)
        # Condition must match isFile_side_effect
        expected = [f for f in file_list if mock_is_file(f)]
        self.assertListEqual(expected, actual)

    @logit()
    @mock.patch('FileUtil.isdir')
    @mock.patch('FileUtil.listdir')
    def test_get_dirs(self, mock_listdir, mock_isdir):
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt', 'somedir']
        mock_listdir.return_value = file_list
        mock_isdir.side_effect = self.isDir_side_effect
        actual = self._fu.get_dirs(dir_name)
        # Condition must match isDir_side_effect
        expected = [f for f in file_list if mock_is_dir(f)]
        self.assertListEqual(expected, actual)

    @logit()
    def test_getRecursiveList(self):
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt']
        actual = self._fu.getRecursiveList(dir_name)
        self.assertListEqual(actual, [])  # Since no such dir, should be empty list
        eu = ExecUtil()
        exec_file = eu.exec_file_path()
        dir_name, _ = self._fu.split_qualified_path(exec_file)
        logger.debug(f'dir name is: {dir_name}')
        with mock.patch('FileUtil.listdir', return_value=file_list):
            actual = self._fu.getRecursiveList(dir_name)
            expected = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list
            ]
            self.assertListEqual(expected, actual)

    @logit()
    def test_load_logs_and_subdir_names(self):
        no_such_dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.csv', 'otherfile.txt']
        actual = self._fu.load_logs_and_subdir_names(no_such_dir_name)
        self.assertListEqual(actual, [])  # Since no such dir, should be empty list
        eu = ExecUtil()
        dir_name = eu.executing_directory()  # ensures that dir_name is real
        with mock.patch('FileUtil.listdir', return_value=file_list):
            # Test with neither prefix nor suffix
            actual = self._fu.load_logs_and_subdir_names(dir_name)
            expected = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list
            ]
            self.assertListEqual(expected, actual)
            # Test for suffixes ending in .txt
            suffix = '.txt'
            actual = self._fu.load_logs_and_subdir_names(dir_name,
                                                         requiredSuffix=suffix)
            txt_only = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list if f.endswith(suffix)
            ]
            self.assertListEqual(txt_only, actual)
            # Test for prefixes starting with 'file'
            prefix = 'file'
            actual = self._fu.load_logs_and_subdir_names(dir_name,
                                                         requiredPrefix=prefix)
            file_only = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list if f.startswith(prefix)
            ]
            self.assertListEqual(file_only, actual)

    @logit()
    @mock.patch('FileUtil.isfile')
    @mock.patch('FileUtil.listdir')
    def test_cull_existing_files(self, mock_listdir, mock_isfile):
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt', 'somedir']
        mock_listdir.return_value = file_list
        mock_isfile.side_effect = self.isFile_side_effect
        qualified_file_list = [
            self._fu.qualified_path(dirPath=dir_name, filename=f)
            for f in file_list
        ]
        actual = self._fu.cull_existing_files(qualified_file_list)
        # Condition must match isFile_side_effect
        expected = [f for f in qualified_file_list if mock_is_file(f)]
        self.assertListEqual(expected, actual)

    @logit()
    def test_read_generator(self):
        filename = self._fu.qualified_path(self.path, self.text_fn)
        how_many_lines = 5
        self.create_text_file(filename, how_many_lines)
        lines_read_in = 0
        for i, line in enumerate(self._fu.read_generator(filename)):
            logger.debug(f'Read in line {i}, which contains <{line}>.')
            lines_read_in += 1
        self.assertEqual(how_many_lines, lines_read_in)

    @logit()
    @mock.patch('FileUtil.open')
    def test_read_generator_err(self, mock_open):
        expected_log_message = 'mocked error'
        mock_open.side_effect = IOError(expected_log_message)
        filename = self._fu.qualified_path(self.path, self.text_fn)
        with self.assertLogs(FileUtil.__name__, level='DEBUG') as cm:
            for i, line in enumerate(self._fu.read_generator(filename)):
                x = line
                logger.debug(f'Read in line {i}, which contains <{x}>.')
                self.assertIsNone(x)
            logger.debug(f'Caught exception message: {cm.output}')
            self.assertTrue(
                any(expected_log_message in line for line in cm.output))

    @logit()
    def test_file_modify_time(self):
        start_time = self._du.as_timestamp()
        keys = [
            'greeting',
            'farewell',
        ]
        vals = [
            'Hello',
            'Goodbye',
        ]
        self.create_yaml(keys, vals)
        qualifiedPath = self._fu.qualified_path(self.path, self.yaml)
        mod_time = self._fu.file_modify_time(qualifiedPath)
        mod_timestamp = self._du.as_timestamp(dt=mod_time)
        logger.debug(
            f'mod_time is {mod_timestamp}. start_time is {start_time}.')
        # asserting a difference of < 0.1 seconds.
        # NOTE(review): the difference is signed, so any modify time later than start_time
        # passes regardless of how much later it is -- confirm whether abs() was intended.
        self.assertTrue((start_time - mod_timestamp) < .1)

    @logit()
    def test_file_modify_time2(self):
        start_time = self._du.as_timestamp()
        keys = [
            'greeting',
            'farewell',
        ]
        vals = [
            'Hello',
            'Goodbye',
        ]
        self.create_yaml(keys, vals)
        qualifiedPath = self._fu.qualified_path(self.path, self.yaml)
        mod_time = self._fu.file_modify_time2(qualifiedPath)
        mod_timestamp = self._du.as_timestamp(dt=mod_time)
        # asserting a difference of < 0.1 seconds.
        # NOTE(review): signed difference; see test_file_modify_time.
        self.assertTrue((start_time - mod_timestamp) < .1)

    @logit()
    def test_file_size(self):
        filename = self._fu.qualified_path(self.path, self.text_fn)
        width = 20
        how_many_lines = randrange(10) + 2
        self.create_text_file(filename, how_many_lines, width)
        # NOTE(review): presumably the two-character Windows line ending (CR LF);
        # verify against write_text_file on other platforms.
        eol_len = 2
        actual = self._fu.file_size(filename)
        self.assertEqual((width + eol_len) * how_many_lines, actual)

    # An earlier method that was also named test_list_modules used to sit here. The definition
    # below replaced it in the class namespace, so it never ran; it has been removed.
    @logit()
    def test_list_modules(self):
        doc = self._fu.list_module_attributes('itertools', True)
        logger.debug('{}'.format(doc))
        mods = list(self._fu.list_modules(module_name='itertools'))
        self.assertIn('__doc__', mods)
        self.assertIn('__name__', mods)
class PandasUtil: _EMPTY_DF = pd.DataFrame() def __init__(self): self.filename = None self.worksheetName = None self._df = None self._fu = FileUtil() # make the df display look better: https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns-of-a-pandas-dataframe pd.set_option('display.max_rows', 100) pd.set_option('display.max_columns', 50) pd.set_option('display.width', 800) # Getters and setters for filename, worksheetname, and df @property def filename(self): return self._filename # Setter for filename. @filename.setter def filename(self, fn: str): self._filename = fn @property def worksheetName(self): return self._worksheetName @worksheetName.setter def worksheetName(self, wks: str): self._worksheetName = wks @property def df(self): return self._df @df.setter def df(self, myDf: pd.DataFrame): self._df = myDf @classmethod def empty_df(cls) -> pd.DataFrame: return pd.DataFrame() def pandas_version(self): """ Return the panas version as three ints :return: maj, minor, sub """ v = pd.__version__ majMinSub = [int(x) for x in v.split('.')] return majMinSub[0], majMinSub[1], majMinSub[2] def write_df_to_excel(self, df: pd.DataFrame = None, excelFileName: str = None, excelWorksheet: str = None, write_index=False) -> bool: """ Write the given df to the excel file name and worksheet (unless they have already been provided and then are optional). Caller is responsible to catch any I/O errors. :param df: :param excelFileName: :param excelWorksheet: :return: True if Excel file written, False if df is empty. 
""" if not df.empty: self._df = df else: logger.warning('Empty dataframe will not be written.') return False fn = excelFileName or self.filename wks = excelWorksheet or self.worksheetname writer = pd.ExcelWriter(fn) self._df.to_excel(writer, wks, index=write_index) writer.save() logger.debug(f'Successfully wrote to {fn}.') return True def write_df_to_csv(self, df: pd.DataFrame = None, csv_file_name: str = None, write_header: bool = True, write_index: bool = False, enc: str = 'utf-8') -> bool: """ Write the given df to the file name and worksheet (unless they have already been provided and then are optional). Caller is responsible to catch any I/O errors. :param df: :param csv_file_name: :param write_header: :param write_index: :param enc: :return: True if Excel file written, False if df is empty. """ if not df.empty: self._df = df else: logger.warning('Empty dataframe will not be written.') return False df.to_csv(csv_file_name, header=write_header, index=write_index, encoding=enc) logger.debug(f'Successfully wrote to {csv_file_name}.') return True def read_df_from_excel(self, excelFileName: str = None, excelWorksheet: str = 'Sheet1', header: int = 0, index_col: int = -1) -> pd.DataFrame: """ Read an Excel file. :param excelFileName: :param excelWorksheet: :param header: 0-offset location of header (0=row 1 in Excel) :param index_col: :return: dataframe result """ param_dict = {'header': header} if excelFileName: self.filename = excelFileName logger.debug(f'Will read from the Excel file: {self.filename}.') param_dict['io'] = self.filename if self._fu.file_exists(self.filename): if excelWorksheet: self.worksheetName = excelWorksheet wks = self.worksheetName major, minor, _ = self.pandas_version() logger.debug( f'Will read from the worksheet: {wks}. Pandas minor version is {minor}.' ) if wks not in self.get_worksheets(excelFileName): logger.warning( f'Cannot find Excel worksheet: {self.worksheetName}. Returning empty df.' 
) return PandasUtil.empty_df() if ((major == 0) & (minor > 21)) | (major >= 1): param_dict['sheet_name'] = wks else: param_dict['sheetname'] = wks if index_col >= 0: param_dict['index_col'] = index_col self._df = pd.read_excel(**param_dict) logger.debug(f'Read in {len(self.df)} records.') return self._df else: logger.error( f'Cannot find Excel file: {self.filename}. Returning empty df.' ) return PandasUtil.empty_df() def read_df_from_csv(self, csv_file_name: str = None, header: int = 0, enc: str = 'utf-8', index_col: int = None, sep: str = None) -> pd.DataFrame: """ Write the given df to the file name and worksheet (unless they have already been provided and then are optional). :param df: :param csv_file_name: :param header: Where the headers live (0 means first line of the file) :param enc: try 'latin-1' or 'ISO-8859-1' if you are getting encoding errors :return: """ param_dict = { 'filepath_or_buffer': csv_file_name, 'header': header, 'encoding': enc, } if sep: param_dict['sep'] = sep if index_col is not None: param_dict['index_col'] = index_col ans = pd.read_csv(**param_dict) return ans def get_df_headers(self, df: pd.DataFrame = _EMPTY_DF) -> list: """ Get a list of the headers. This provides a list of the column NAMES. :param df: :param self: :return: list of headers """ if not self.is_empty(df): self.df = df return list(self.df.columns) else: logger.warning('df is empty. Returning None for headers') return None def set_df_headers(self, df: pd.DataFrame, new_headers: list): """ This sets the column NAMES. :param df: :param new_headers: list of new headers) :return: None (but side effect of changed df) """ df.columns = new_headers def get_rowCount_colCount(self, df: pd.DataFrame): """ Return the row and column_name count of the df. 
:param df: :return: row count, col count """ rows, cols = df.shape logger.debug(f'df has {rows} rows and {cols} columns.') return rows, cols def get_basic_data_analysis(self, df: pd.DataFrame) -> str: buffer = StringIO() df.info(buf=buffer) ans = buffer.getvalue() logger.info(f'info:\n{ans}') return ans def get_quartiles(self, df: pd.DataFrame, percentiles: list = [.25, .50, .75]) -> pd.DataFrame: """ Return basic statistics about the dataframe. :param df: :param percentiles: list of %-tiles as fractions between 0 and 1, e.g. [.2, .4, .6, .8] for quintiles :return: basic description df """ ans = df.describe(percentiles=percentiles) logger.info(f'info:\n{ans.head(10)}') return ans @logit(showRetVal=True) def get_worksheets(self, excelFileName=None): if excelFileName: self.filename = excelFileName fu = FileUtil() if fu.file_exists(self.filename): xl = pd.ExcelFile(self.filename) return xl.sheet_names else: logger.error(f'Cannot find Excel file {self.filename}.') return None def duplicate_rows(self, df: pd.DataFrame, fieldList: list = None, keep: str = 'first') -> pd.DataFrame: """ Return a dataframe with the duplicates as specified by the columns in fieldList. If fieldList is missing or None, then return the exactly duplicated rows. :param df: dataframe to scan for duplicates :param fieldList: fields in df to examine for duplicates. :param keep: 'first' or 'last' to keep the first dupe or the last. :return: df of the duplicates """ if fieldList: ans = df[df.duplicated(fieldList, keep=keep)] else: ans = df[df.duplicated(keep=keep)] return ans def drop_duplicates(self, df: pd.DataFrame, fieldList: list = None, keep: str = 'first') -> pd.DataFrame: """ Drop the duplicates as specified by the columns in fieldList. If fieldList is missing or None, then return the exactly duplicated rows. :param df: dataframe to scan for duplicates :param fieldList: fields in df to examine for duplicates. :param keep: 'first' or 'last' to keep the first dupe or the last. 
:return: df without the duplicates """ param_dict = {'keep': keep, 'inplace': False} if fieldList: param_dict['subset'] = fieldList return df.drop_duplicates(**param_dict) def convert_dict_to_dataframe(self, list_of_dicts: list) -> pd.DataFrame: """ Convert a list of dictionaries to a dataframe. :param list_of_dicts: :return: """ return pd.DataFrame(list_of_dicts) def convert_list_to_dataframe(self, lists: list, column_names: List = None) -> pd.DataFrame: """ Convert a list of lists to a dataframe. If provided, add the column names. If not, provide default col names. :param lists: a list of lists, like [[1,2,3], ['a', 'b', 'c']] :param column_names: Column names to use. Defaults to col00, col01, col22, .. col99 :return: """ if column_names: return pd.DataFrame(data=lists, columns=column_names) # Use the default column names: col00, col01... ans = pd.DataFrame(data=lists) self.replace_col_names_by_pattern(ans) return ans def convert_matrix_to_dataframe(self, lists: list) -> pd.DataFrame: """ convert a list of lists to a dataframe. :param lists: :return: """ return pd.DataFrame(data=lists) def convert_dataframe_to_matrix(self, df: pd.DataFrame) -> np.ndarray: """ Convert all of the values to a numpy ndarray. :param df: :return: """ return df.to_numpy() def convert_dataframe_to_vector(self, df: pd.DataFrame) -> np.ndarray: """ Convert the dataframe to a numpy vector. :param df: :return: """ cols = self.get_df_headers(df) if len(cols) == 1: return df.to_numpy().reshape(-1, ) logger.warning( f'Dataframe should have exactly one column, but contains {len(cols)}. Returning None.' ) return None def convert_dataframe_col_to_list(self, df: pd.DataFrame, column_name: str) -> list: """ Convert the given dataframe column to a list. 
:param df: :param column_name: a column name, like 'age' :return: a list of that column """ return df[column_name].values.tolist() def without_null_rows(self, df: pd.DataFrame, column_name: str) -> pd.DataFrame: """ Return a DataFrame without the rows that are null in the given column_name. :param df: source DataFrame :param column_name: Column name to remove. :return: new DataFrame """ try: mask = pd.notnull(df[column_name]) return df[mask] except KeyError: logger.error( f'Unable to find column_name name: {column_name}. Returning empty df.' ) return PandasUtil.empty_df() def select(self, df: pd.DataFrame, column_name: str, match_me: Union[str, int]) -> pd.DataFrame: """ Return a DataFrame that selects on the column_name that is equal to match_me. Similar to a SELECT * WHERE clause in SQL. :param df: :param column_name: :param match_me: :return: df with the column_name matching the selected clause (possibly empty) """ return df.loc[df[column_name] == match_me] def mask_blanks(self, df: pd.DataFrame, column_name: str) -> list: """ Return a boolean list with a True in the rows that have a blank column_name. :param df: :param column_name: :return: """ # ans = df.loc[df[column_name] == ''] ans = df[column_name] == '' return ans def select_blanks(self, df: pd.DataFrame, column_name: str) -> list: return df[self.mask_blanks(df, column_name)] def mask_non_blanks(self, df: pd.DataFrame, column_name: str) -> list: """ Return a boolean list with a True in the rows that have a nonblank column_name. :param df: :param column_name: :return: """ blanks = self.mask_blanks(df, column_name) non_blanks_mask = [not x for x in blanks] return non_blanks_mask def select_non_blanks(self, df: pd.DataFrame, column_name: str) -> list: return df[self.mask_non_blanks(df, column_name)] def unique_values(self, df: pd.DataFrame, column_name: str) -> list: """ Return a list of the unique values in column_name. 
:param df: :param column_name: :return: """ return self.drop_duplicates(df=df[column_name]).tolist() def count_by_column(self, df: pd.DataFrame, column_name: str = None) -> pd.DataFrame: """ Return a count by value of the given column. :param df: :param column_name: :return: """ return df[column_name].value_counts() def add_new_col_with_func(self, df: pd.DataFrame, column_name: str, func: Callable[[], list]) -> pd.DataFrame: """ Call the func with no args to assign a new column_name to the dataframe. func should return a list comprehension. Here's an example of what the function should do. def my_func(self) -> list: df = self.pu.df col_of_interest = df['number'] return [self.my_f(x) for x in col_of_interest] It gets called with: df = self.pu.add_new_col_with_func(df, 'new_col_name', self.my_func) :param df: :param column_name: :param func: func (usually no args) :return: """ self.df = df df[column_name] = func() return df def add_new_col_from_array(self, df: pd.DataFrame, column_name: str, new_col: np.array) -> pd.DataFrame: """ Use the values in new_col to create a new column. Limitations: this is not as sophisticated as https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas . The length of new_col must be the same as the length of df. :param df: :param column_name: :param new_col: If this really is a Series, it will try to match indexes with the existing df (probably a good thing). :return: """ df[column_name] = new_col return df def mark_rows_by_func(self, df: pd.DataFrame, column_name: str, func: Callable[[], list]) -> Bools: """ Return a list of bools depending on the func. Here's a func (which takes a list as a parameter): def is_adult(self, age:list): return age >= 21 Here's how to invoke it: mark = self.pu.mark_rows_by_func(df, 'Age', self.is_adult) :param df: dataframe under scrutiny :param column_name: name of the column_name :param func: function that is to be invoked. 
Takes a list and returns a list of booleans. :return: """ mask = func(df[column_name]) return mask def mark_rows_by_criterion(self, df: pd.DataFrame, column_name: str, criterion: Union[str, int, float]) -> Bools: """ Return a list of bools when column_name meets the criterion. :param df: :param column_name: :param criterion: :return: """ mask = df[column_name] == criterion return mask def mark_isnull(self, df: pd.DataFrame, column_name: str) -> Bools: mask = df[column_name].isnull() return mask def masked_df(self, df: pd.DataFrame, mask: Bools, invert_mask: bool = False): if not invert_mask: return df[mask] else: my_mask = [not x for x in mask] return df[my_mask] def slice_df(self, df: pd.DataFrame, start_index: int = 0, end_index: int = None, step: int = 1): """ Slice the df by the given start, end, and step. NOTE: this does row slicing only. :param df: :param start_index: 0-based first index to use. Defaults to 0 (the first el) :param end_index: end of list index. Defaults to None (which means the end of the list). :param step: how many to skip. 2 means skip every other. Default of 1 means don't skip. :return: """ end_idx = end_index or len(df) ans = df.iloc[start_index:end_idx:step] return ans def set_index(self, df: pd.DataFrame, columns: Union[Strings, str], is_in_place: bool = True) -> pd.DataFrame: """ Set the index of df. :param df: Dataframe under scrutiny. :param columns: Can be a str (=single column_name) or a List of strings. :param is_in_place: True to add the index in place / False to create a new df :return: df or None (if is_in_place is true) """ return df.set_index(columns, inplace=is_in_place) def reset_index(self, df: pd.DataFrame, is_in_place: bool = True, is_dropped: bool = False) -> pd.DataFrame: """ Reset the index. 
:param df: :param is_in_place: :param is_dropped: :return: """ return df.reset_index(drop=is_dropped, inplace=is_in_place) def drop_index(self, df: pd.DataFrame, is_in_place: bool = True) -> pd.DataFrame: """ Drop the index :param df: :param is_in_place: :param is_dropped: :return: """ return self.reset_index(df=df, is_in_place=is_in_place, is_dropped=True) def drop_col(self, df: pd.DataFrame, columns: Union[Strings, str], is_in_place: bool = True) -> pd.DataFrame: """ Drop the given column_name. :param df: :param columns: Can be a str (=single column_name) or a List of strings. :param is_in_place: if true, column_name is dropped from df in place. Otherwise, a new df is returned. :return: None if is_in_place is True. Else df with the column_name dropped. """ major, minor, _ = self.pandas_version() if (major == 0) & (minor < 21): logger.warning( f'Unable to drop column, as Pandas version is {minor}. Returning unchanged df.' ) return df return df.drop(columns=columns, inplace=is_in_place) @logit() def drop_col_keeping(self, df: pd.DataFrame, cols_to_keep: Union[Strings, str], is_in_place: bool = True) -> pd.DataFrame: """ Keep the given columns and drop the rest. :param df: :param cols_to_keep: :param is_in_place: :return: """ headers_to_drop = self.get_df_headers(df) logger.debug( f'I have these headers: {headers_to_drop}. But I will keep {cols_to_keep}' ) exceptions = cols_to_keep if isinstance(cols_to_keep, str): exceptions = [cols_to_keep] for col in exceptions: headers_to_drop.remove(col) return self.drop_col(df=df, columns=headers_to_drop, is_in_place=is_in_place) def drop_row_by_criterion(self, df: pd.DataFrame, column_name: str, criterion: Union[int, str], is_in_place: bool = True) -> pd.DataFrame: """ Drop the rows that have criterion in the given column. 
:param df: :param column_name: :param criterion: :param is_in_place: :return: """ return df.drop(df[df[column_name] == criterion].index, inplace=is_in_place) def drop_row_if_nan(self, df: pd.DataFrame, column_names: Strings = None, is_in_place: bool = True) -> pd.DataFrame: """ Drop a row if the given column name is NaN. :param df: :param column_names: Drop the rows based in this array of column names. If None, drop every row with all NaNs. :param is_in_place: :return: """ if column_names: return df.dropna(axis='index', subset=column_names, inplace=is_in_place) return df.dropna(axis='index', inplace=is_in_place, how='all') def reorder_cols(self, df: pd.DataFrame, columns: Strings) -> pd.DataFrame: """ Using the columns, return a new df. :param df: :param columns: list of strings, like ['colD', 'colA', 'colB', 'colC'] :return: """ return df[columns] def replace_col(self, df: pd.DataFrame, column: str, replace_dict: dict) -> pd.DataFrame: """ Replace the values of column_name using replace_dict. This will will replace the column VALUES. :param df: :param column: :param replace_dict: {'origA':'replA', 'origB':'replB'} :return: df with column_name replaced """ try: df[column] = df[column].map(replace_dict) except KeyError: logger.warning( f'Value found outside of: {replace_dict.keys()} or column_name {column} not found. Returning empty df.' ) return self.empty_df() return df def replace_col_using_func(self, df: pd.DataFrame, column_name: str, func: Callable[[], list]) -> pd.DataFrame: """ Replace the column contents by each element's value, as determined by func. This will will replace the column VALUES. :param df: Dataframe under scrutiny. :param column_name: (single column_name) name :param func: Function operates on whatever element it is presented, and returns the changed element. 
:return: df """ df[column_name] = df[column_name].apply(func) return df def replace_col_using_mult_cols(self, df: pd.DataFrame, column_to_replace: str, cols: Strings, func: Callable[[], list]) -> pd.DataFrame: """ Replace column_to_replace, using the given func. This will will replace the column VALUES. :param df: Dataframe under scrutiny. :param column_to_replace: (single column_name) name :param cols: list of columns used for the following func :param func: Pointer to a local function. :return: df with replaced column """ df[column_to_replace] = df[cols].apply(func, axis=1) return df def replace_col_with_scalar(self, df: pd.DataFrame, column_name: str, replace_with: Union[str, int], mask: Bools = None) -> pd.DataFrame: """ Replace the all column_name with replace_with. If a mask of bools is used, only replace those elements with a True. Helpful reference at https://kanoki.org/2019/07/17/pandas-how-to-replace-values-based-on-conditions/ :param df: :param column_name: :param replace_with: :param mask: :return: """ if mask is None: df[column_name] = replace_with elif isinstance(mask, pd.Series): df[column_name].mask(mask.tolist(), replace_with, inplace=True) elif isinstance(mask, list): # df[column_name].mask(mask, replace_with, inplace=True) # Method 1 and works df.loc[mask, column_name] = replace_with # Method 2 at kanoki. else: logger.warning( f'mask must be None, a series, or a list, but it is: {type(mask)}' ) return self.empty_df() def join_two_dfs_on_index(self, df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame: """ return a column-wise join of these two dataframes on their mutual index. :param df1: :param df2: :return: """ return pd.concat([df1, df2], axis=1, ignore_index=False) def join_dfs_by_column(self, dfs: Dataframes) -> pd.DataFrame: """ Return a column-wise join of these dataframes. 
:param dfs: :return: """ return pd.concat(dfs, axis='columns') def join_dfs_by_row(self, dfs: Dataframes) -> pd.DataFrame: """ Return a row-wise join of these dataframes. Note: all the dfs should have the same column names, so you might call it in this way: headers = pu.get_df_headers(big_df) pu.set_df_headers(new_df, headers) df2 = pu.join_dfs_by_row([new_df, big_df]) :param dfs: :return: """ return pd.concat(dfs, axis='rows', ignore_index=True) def dummy_var_df(self, df: pd.DataFrame, columns: Union[Strings, str], drop_first: bool = True) -> pd.DataFrame: """ Do a one-hot encoding. Create a dummy variable based on the given column. :param df: :param columns: a single column name or a list of column names. :return: """ if isinstance(columns, str): my_columns = [columns] else: my_columns = columns df = pd.get_dummies(data=df, columns=my_columns, drop_first=drop_first) return df def replace_col_names(self, df: pd.DataFrame, replace_dict: dict, is_in_place: bool = True) -> pd.DataFrame: """ :param replace_dict: {'origColA':'replColA', 'origColB':'replColB'} """ return df.rename(columns=replace_dict, inplace=is_in_place) def replace_col_names_by_pattern(self, df: pd.DataFrame, prefix: str = "col", is_in_place: bool = True) -> pd.DataFrame: """ Replace the column names with col1, col2.... :param df: :param prefix: string prefix, such as "col" :param is_in_place: :return: """ cur_names = self.get_df_headers(df) gen = generate_col_names(prefix) replacement_dict = {k: next(gen) for k in cur_names} return self.replace_col_names(df, replacement_dict, is_in_place) def coerce_to_string(self, df: pd.DataFrame, columns: Union[Strings, str]) -> pd.DataFrame: """ Coerce the given column_name name to a string. :param df: :param column_name: :return: new df with column_name coerced to str. """ if isinstance(columns, str): # Make the single str columns into a list with just that one element. 
cols_as_list = [columns] else: cols_as_list = columns for col in cols_as_list: df[col] = df[col].apply(str) return df def coerce_to_numeric(self, df: pd.DataFrame, columns: Union[Strings, str]) -> pd.DataFrame: """ Coerce the given column_name name to ints or floats. :param df: :param columns: a column name (or list of names) to coerce :return: df with columns coerced to a numeric in place. """ if isinstance(columns, str): # Make the single str columns into a list with just that one element. cols_as_list = [columns] else: cols_as_list = columns df[cols_as_list] = df[cols_as_list].apply(pd.to_numeric) return df def coerece_to_int(self, df: pd.DataFrame, columns: Union[Strings, str]) -> pd.DataFrame: """ Coerce the given column name(s) to an int. :param df: :param columns: a column name (or list of names) to coerce :return: df with columns coerced to a numeric in place. """ df[columns] = df[columns].astype(int) return df def round(self, df: pd.DataFrame, rounding_dict: dict) -> pd.DataFrame: """ Round the columns given in rounding_dict to the given number of decimal places. Unexpected result found in testing: python function round(4.55, 2) yields 4.5 BUT this function returns 4.6 :param df: :param rounding_dict: {'A': 2, 'B':3} :return: df rounded to the specified number of places. """ return df.round(rounding_dict) def replace_vals(self, df: pd.DataFrame, replace_me: str, new_val: str, is_in_place: bool = True) -> pd.DataFrame: """ Replace the values of replace_me with the new_val. :param df: Dataframe under scrutiny. 
:param :param is_in_place: True to replace values in place / False to create a new df :return: df or None (if is_in_place is true) """ return df.replace(to_replace=replace_me, value=new_val, inplace=is_in_place) def replace_vals_by_mask(self, df: pd.DataFrame, mask: Bools, col_to_change: str, new_val: Union[str, int, float]): """ Replace the values in the col_to_change with the new_val :param df: :param mask: :param col_to_change: Column Name whose rows you want to change :param new_val: :return: the changed df (also changed in place) """ ans = df.loc[mask, col_to_change] = new_val return ans def is_empty(self, df: pd.DataFrame) -> bool: """ Return true if the df is empty. :param df: Dataframe to inspect :return: True IFF it is empty """ return df.empty def aggregates(self, df: pd.DataFrame, group_by: Strings, col: str) -> pd.DataFrame: """ Return the average, min, max, and sum of the dataframe when grouped by the given strings. Reference: https://jamesrledoux.com/code/group-by-aggregate-pandas . :param df: :param group_by: :return: """ grouped_multiple = df.groupby(group_by).agg( {col: ['mean', 'min', 'max', 'sum']}) grouped_multiple.columns = ['mean', 'min', 'max', 'sum'] self.reset_index(grouped_multiple, is_in_place=True) return grouped_multiple def stats(self, df: pd.DataFrame, xlabel_col_name: str, ylabel_col_name: str): """ Calculate the main statistics. :param df: dataframe under scrutiny :param xlabel_col_name: x column label :param ylabel_col_name: y column label :return: slope, intercept, and r (correlation) """ slope, intercept, r, p, epsilon = linregress(df[xlabel_col_name], df[ylabel_col_name]) logger.info('Main equation: y = %.3f x + %.3f' % (slope, intercept)) logger.info('r^2 = %.4f' % (r * r)) logger.info('p = %.4f' % (p)) logger.info('std err: %.4f' % (epsilon)) return slope, intercept, r def head(self, df: pd.DataFrame, how_many_rows: int = 10) -> pd.DataFrame: """ Return the first how_many_rows. 
This works well if called as the last line of an immediate, as in: pu.head(df) :param df: :param how_many_rows: :return: """ self.df = df return self.df.head(how_many_rows) def head_as_string(self, df: pd.DataFrame, how_many_rows: int = 10) -> str: """ Return the first how_many_rows as a string, separated by \n. :param df: :param how_many_rows: :return: """ ans = str(self.head(df, how_many_rows)) logger.debug(f'First {how_many_rows} are:\n{ans}') return ans def tail_as_string(self, df: pd.DataFrame, how_many_rows: int = 10) -> str: """ Return the last how_many_rows as a string, separated by \n. :param df: :param how_many_rows: :return: """ ans = str(self.tail(df, how_many_rows)) logger.debug(f'Last {how_many_rows} are:\n{ans}') return ans def tail(self, df: pd.DataFrame, how_many_rows: int = 10) -> pd.DataFrame: """ Return the last how_many_rows. This works well if called as the last line of an immediate, as in: pu.tail(df) :param df: :param how_many_rows: :return: """ self.df = df return self.df.tail(how_many_rows) def sort(self, df: pd.DataFrame, columns: Union[Strings, str], is_in_place: bool = True, is_asc: bool = True): """ Sort the given dataFrame by the given column(s). :param df: :param columns: :param is_in_place: :param is_asc: :return: """ return df.sort_values(columns, ascending=is_asc, inplace=is_in_place, kind='quicksort', na_position='last') def largest_index(self, df: pd.DataFrame) -> Tuple[int, int]: """ Return the largest index and its value (usually an int and an int). :return: :param df: :return: (index, value of index) """ return df.index.argmax(), df.index.max() def smallest_index(self, df: pd.DataFrame) -> Tuple[int, int]: """ Return the smallest index and its value (usually an int and an int). :return: :param df: :return: (index, value of index) """ return df.index.argmin(), df.index.min()