コード例 #1
 def get_worksheets(self, excelFileName=None):
     """
     Return the worksheet names of an Excel workbook.

     :param excelFileName: optional path; when given it replaces self.filename.
     :return: list of sheet names, or None when the file cannot be found.
     """
     if excelFileName:
         self.filename = excelFileName
     fu = FileUtil()
     if not fu.file_exists(self.filename):
         # These two lines used to sit after the return and could never run.
         logger.error(f'Cannot find Excel file {self.filename}.')
         return None
     # The context manager releases the workbook handle once the names are read.
     with pd.ExcelFile(self.filename) as xl:
         return xl.sheet_names
コード例 #2
class Test_FileUtil(TestCase):
    path_no_drive = 'temp'
    fn = 'test.csv'
    yaml = 'example.yaml'
    text_fn = 'test.txt'

    def __init__(self, *args, **kwargs):
        """Create the shared helpers and the fixture data used by the tests."""
        super().__init__(*args, **kwargs)
        self.path = r'c:\temp' if platform.system() == 'Windows' else r'/tmp'
        self._fu = FileUtil()
        self._du = DateUtil()
        # Nested fixture written and read back by the YAML round-trip test.
        self.features_dict = {
            'book': "Hitchhiker's Guide",
            'characters': {
                'answer': 42,
                'name': 'Dent. Arthur Dent.',
            },
        }

    @classmethod
    def tearDownClass(cls) -> None:
        """
        Delete the scratch files the tests leave in the temp directory.

        unittest calls this hook on the class itself, so it must be a
        classmethod.
        """
        fu = FileUtil()
        path = r'c:\temp' if platform.system() == 'Windows' else r'/tmp'
        for name in (cls.yaml, cls.fn, cls.text_fn):
            fu.delete_file(fu.qualified_path(path, name))

    @property
    def path(self):
        """Directory in which the tests create their scratch files."""
        return self._path

    @path.setter
    def path(self, p):
        # Without the property decorators the second def simply replaced the first.
        self._path = p

    def create_csv(self):
        # Writes `lines` to self.fn under self.path via FileUtil.write_text_file
        # and logs the destination.
        # NOTE(review): the list literal opened on the next line is never closed
        # and its rows are absent from this copy — restore them before running.
        lines = [
        filename = self._fu.qualified_path(self.path, self.fn)
        self._fu.write_text_file(filename, lines)
        logger.debug(f'create_csv to {self.path}{sep}{self.fn}.')

    def create_yaml(self, keys: list, vals: list):
        """
        Write 'key: value' lines, one per entry of keys, to the YAML test file.

        :param keys: names to write.
        :param vals: values, looked up by the same position as the key.
        """
        yaml_lines = [f'{key}: {vals[pos]}' for pos, key in enumerate(keys)]
        target = self._fu.qualified_path(self.path, self.yaml)
        self._fu.write_text_file(filename=target, lines=yaml_lines)

    def generate_text_lines(self,
                            how_many: int = 10,
                            width: int = None) -> List[str]:
        """
        Build the lines used as text-file fixtures.

        :param how_many: number of lines to produce.
        :param width: when truthy, each line is the line number centred in a
            field of this width padded with '*'; otherwise lines read 'Line N'.
        :return: list of how_many strings.
        """
        if width:
            return [
                '{0:*^{width}}'.format(i, width=width) for i in range(how_many)
            ]
        return [f'Line {i}' for i in range(how_many)]

    def create_text_file(self,
                         filename: str,
                         how_many: int = 10,
                         width: int = None):
        """
        Generate fixture lines and write them to filename.

        :param filename: file to write.
        :param how_many: number of lines, passed to generate_text_lines.
        :param width: optional field width, passed to generate_text_lines.
        """
        self._fu.write_text_file(
            filename, self.generate_text_lines(how_many, width))

    def test_is_windows(self):
        """is_Windows should follow whatever platform.system() reports."""
        # NOTE(review): assumes is_Windows is a property/attribute, as the
        # original un-called access suggests — confirm against FileUtil.
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Linux'
            mocked_fu = FileUtil()
            self.assertFalse(mocked_fu.is_Windows)

        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            self.assertTrue(mocked_fu.is_Windows)

    def test_dump_yaml(self):
        """Dump the fixture dict to YAML and check it reads back unchanged."""
        target = self._fu.qualified_path(self.path, self.yaml)
        self._fu.dump_yaml(target, self.features_dict)
        self.assertDictEqual(self.features_dict, self._fu.read_yaml(target))

    def test_current_directory(self):
        """current_directory should return whatever the patched getcwd returns."""
        logger.debug(
            f'current working dir is really {self._fu.current_directory()}')
        my_mock_dir = r'\synthesys\testing'
        with mock.patch('FileUtil.getcwd', return_value=my_mock_dir):
            actual = self._fu.current_directory()
            self.assertEqual(actual, my_mock_dir)

    def test_read_text_file(self):
        """Write a random number of lines, read them back, and compare."""
        target = self._fu.qualified_path(self.path, self.text_fn)
        line_count = randrange(10) + 2
        self.create_text_file(target, line_count)
        wanted = self.generate_text_lines(line_count)
        # must remove newline chars
        stripped = []
        for row in self._fu.read_text_file(target):
            stripped.append(row.rstrip())
        self.assertListEqual(wanted, stripped)

    def test_read_text_file_err(self):
        # Patches FileUtil.open so that opening any file raises IOError.
        # NOTE(review): nothing is called or asserted after the side effect is
        # installed in this copy — the read_text_file call and its expected
        # result still need to be added.
        # test an IO error
        filename = self._fu.qualified_path(self.path, self.text_fn)
        with mock.patch('FileUtil.open', create=True) as mocked_open:
            mocked_open.side_effect = IOError()

    def test_read_yaml(self):
        """Write three key/value pairs as YAML and check they read back."""
        names = ['firstname', 'lastname', 'zip']
        values = ['Rajah', 'Chacko', 28269]
        self.create_yaml(names, values)

        loaded = self._fu.read_yaml(
            yamlFile=self._fu.qualified_path(self.path, self.yaml))
        logger.debug(f'Contents of yaml: {loaded}')
        self.assertEqual(list(loaded.keys()), names)
        first_key, first_value = names[0], values[0]
        self.assertEqual(first_value, loaded[first_key])

    def test_read_yaml_err(self, mock_obj):
        # Makes the injected mock raise YAMLError and then calls read_yaml.
        # NOTE(review): mock_obj has to be supplied by a mock.patch decorator
        # that is not present in this copy, and the result is never asserted.
        yaml_fn = self._fu.qualified_path(self.path, self.yaml)
        mock_obj.side_effect = YAMLError('mock error')
        actual = self._fu.read_yaml(yamlFile=yaml_fn)

    def test_qualified_path(self):
        # Checks FileUtil.qualified_path for a plain directory string, for a
        # path array under a mocked Windows platform, and for a drive path.
        # NOTE(review): the assertion that ends with "Test 2 fail" has lost its
        # opening line and the qualified_path call in Test 3 is never closed —
        # both must be restored before this method parses.
        # Test 1. Normal case.
        expected = self.path + sep + self.fn
        actual = self._fu.qualified_path(self.path, self.fn)
        self.assertEqual(actual, expected, "Test 1 fail")
        # Test 2. Using an array and a Linux mock.

        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            dir_to_path = mocked_fu.separator.join(
                ['C:', 'dir', 'to',
                 'path'])  # should be C:\dir\to\path for Windows
            pathArray = dir_to_path.split(mocked_fu.separator)
            expected = dir_to_path + mocked_fu.separator + self.fn
                "Test 2 fail")

        # Test 3, using a windows path with a drive
        exp3 = r'c:\temp\subdir\subsubdir'
        exp3_array = exp3.split(_BACKSLASH)
        test3_with_fn = deepcopy(exp3_array)
        test3 = _BACKSLASH.join(test3_with_fn)

        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            actual = mocked_fu.qualified_path(dirPath=exp3_array,
            self.assertEqual(test3, actual, "Test 3 fail")

    def test_fully_qualified_path(self):
        # Builds the expected fully qualified paths under mocked Windows and
        # Linux platforms.
        # NOTE(review): the three assertions ('Test 1 fail', "Test 2 fail",
        # "Test 3 fail") have lost their opening lines in this copy; only
        # their trailing message arguments remain.
        # Test 1, Windows (should be unchanged)
        path1 = r'c:\temp\subdir\subsubdir'
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Windows'
            mocked_fu = FileUtil()
            exp1 = path1 + mocked_fu.separator + self.fn
                'Test 1 fail')
        # Test 2, Linux without the leading /
        test2 = r'dir/to/path'

        # Test 3, Linux with the leading / (should be unchanged)
        with mock.patch('platform.system') as mocked_platform:
            mocked_platform.return_value = 'Linux'
            mocked_fu = FileUtil()
            exp2 = mocked_fu.separator + test2 + mocked_fu.separator + self.fn
                "Test 2 fail")
            test3 = mocked_fu.separator + test2
            exp3 = test3 + mocked_fu.separator + self.fn
                "Test 3 fail")

    def test_split_qualified_path(self):
        # Splits qualified paths back into directory and file name, first as a
        # string and then as an array of path components.
        # NOTE(review): each split_qualified_path call below ends with a comma
        # and its final argument line is missing from this copy, so the
        # string-vs-array switch cannot be read from here.
        fn = 'test.txt'
        qpath = self._fu.qualified_path(self.path, fn)
        # Test 1. c:\temp for Windows or /tmp for Linux.
        which_test = 1
        splitpath, splitfn = self._fu.split_qualified_path(qpath,
        self.assertEqual(splitpath, self.path,
                         f'Test {which_test}. Paths should be equal.')
        self.assertEqual(splitfn, fn,
                         f'Test {which_test}. File names should be equal.')
        # Test 2. Split paths into arrays.
        which_test = 2
        pathArray, splitfn = self._fu.split_qualified_path(qpath,
        expected = self.path.split(sep)
        self.assertEqual(pathArray, expected,
                         f'Test {which_test}. Paths should be equal.')
        self.assertEqual(splitfn, fn,
                         f'Test {which_test}. File names should be equal.')
        # Test 3. Try a more complex path.
        which_test = 3
        complex_path = r'C:\Users\Owners\Documents\Tickers.csv' if platform.system(
        ) == 'Windows' else r'/tmp/parent/child/Tickers.csv'
        pathArray, splitfn = self._fu.split_qualified_path(complex_path,
        expected = complex_path.split(sep)
        expected.pop()  # Pop off the last el, which is the file name.
        self.assertEqual(pathArray, expected,
                         f'Test {which_test}. Paths should be equal.')
        self.assertEqual(splitfn, 'Tickers.csv',
                         f'Test {which_test}. File names should be equal.')

    def test_split_file_name(self):
        """split_file_name should return (stem, extension) with or without a directory."""
        stem, ext = "file", ".ext"
        bare_name = stem + ext
        # First the bare file.ext, then the same name behind a directory.
        got_stem, got_ext = self._fu.split_file_name(bare_name)
        self.assertEqual(got_stem, stem)
        self.assertEqual(got_ext, ext)
        with_dir = self._fu.qualified_path(self.path, bare_name)
        got_stem, got_ext = self._fu.split_file_name(with_dir)
        self.assertEqual(got_stem, stem)
        self.assertEqual(got_ext, ext)

    def test_file_exists(self):
        """file_exists is true for a file just written and false for a missing one."""
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        # Create the file here so the test does not depend on test ordering.
        self.create_text_file(qualifiedPath)
        self.assertTrue(self._fu.file_exists(qualifiedPath))
        qualifiedPath = self._fu.qualified_path(self.path, 'noSuchFile.xxd')
        self.assertFalse(self._fu.file_exists(qualifiedPath))

    def test_ensure_dir(self):
        """
        Placeholder for the ensure_dir test.

        NOTE(review): no body is present in this copy, so nothing is
        exercised or asserted here.
        """

    def test_delete_file(self):
        """Deleting an existing file succeeds once and fails the second time."""
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        # Make sure there is something to delete regardless of test ordering.
        self.create_text_file(qualifiedPath)
        # delete_file should return True the first time
        self.assertTrue(self._fu.delete_file(qualifiedPath))
        # but return false the second time.
        self.assertFalse(self._fu.delete_file(qualifiedPath))

    def test_delete_file_err(self, mock_obj):
        # Makes the injected mock raise OSError and captures FileUtil's log
        # output while delete_file runs.
        # NOTE(review): the mock.patch decorator that supplies mock_obj is not
        # present in this copy, and the final assertion has lost its opening
        # and closing lines (only the generator clause remains).
        expected_log_message = 'delete_file mocktest'
        mock_obj.side_effect = OSError(expected_log_message)
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        with self.assertLogs(FileUtil.__name__, level='DEBUG') as cm:
            ans = self._fu.delete_file(qualifiedPath)
                      for line in cm.output if expected_log_message in line),

    def test_copy_file(self):
        """Copying a file should leave a copy at the destination path."""
        copy_fn = self.fn + '.copy'
        copied_file = self._fu.qualified_path(self.path, copy_fn)
        source_path = self._fu.qualified_path(self.path, self.fn)
        # Create the source here so the test does not depend on test ordering.
        self.create_text_file(source_path)
        self._fu.copy_file(source_path, copied_file)
        self.assertTrue(self._fu.file_exists(copied_file))
        # The copy is not covered by tearDownClass, so remove it here.
        self._fu.delete_file(copied_file)

    def test_copy_file_err(self, mock_obj):
        # Makes the injected mock raise IOError and captures FileUtil's log
        # output while copy_file runs.
        # NOTE(review): the mock.patch decorator that supplies mock_obj is not
        # present in this copy, and the final assertion has lost its opening
        # and closing lines (only the generator clause remains).
        tmp_path = self._fu.qualified_path(self.path, 'tmp')
        qualifiedPath = self._fu.qualified_path(self.path, self.fn)
        expected_log_message = 'copy_file mocktest'
        mock_obj.side_effect = IOError(expected_log_message)
        with self.assertLogs(FileUtil.__name__, level='DEBUG') as cm:
            _ = self._fu.copy_file(qualifiedPath, tmp_path)
                      for line in cm.output if expected_log_message in line),

    def test_getList(self):
        """List the temp directory and log what getList returns."""
        # Use the platform-aware temp dir chosen in __init__ instead of a
        # hard-coded Windows path, which does not exist on other systems.
        dir_name = self.path
        flist = self._fu.getList(dir_name)
        logger.debug(f'All list is: {flist}')

    def isFile_side_effect(*args, **kwargs) -> bool:
        """
        Side effect for mocking test_get_files.

        Returns True if there is a .txt in the filename. Not great, but ok for mocking.
        :param args: args[0] is the test instance (the function is reached
            through self); args[1] is the path being tested.
        :param kwargs: ignored.
        :return: the answer mock_is_file gives for args[1].
        """
        return mock_is_file(args[1])

    def isDir_side_effect(*args) -> bool:
        """
        Side effect for mocking test_get_dirs.

        :param args: args[1] is the path being tested.
        :return: the answer mock_is_dir gives for that path.
        """
        candidate = args[1]
        return mock_is_dir(candidate)

    # NOTE(review): patch targets are inferred from the 'FileUtil.listdir'
    # patch used elsewhere in this class — confirm FileUtil imports isfile by name.
    @mock.patch('FileUtil.isfile')
    @mock.patch('FileUtil.listdir')
    def test_get_files(self, mock_listdir, mock_isfile):
        """get_files should return only the directory entries that are files."""
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt', 'somedir']
        mock_listdir.return_value = file_list
        mock_isfile.side_effect = self.isFile_side_effect
        actual = self._fu.get_files(dir_name)
        # Condition must match isFile_side_effect
        expected = [f for f in file_list if mock_is_file(f)]
        self.assertListEqual(expected, actual)

    # NOTE(review): patch targets are inferred from the 'FileUtil.listdir'
    # patch used elsewhere in this class — confirm FileUtil imports isdir by name.
    @mock.patch('FileUtil.isdir')
    @mock.patch('FileUtil.listdir')
    def test_get_dirs(self, mock_listdir, mock_isdir):
        """get_dirs should return only the directory entries that are directories."""
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt', 'somedir']
        mock_listdir.return_value = file_list
        mock_isdir.side_effect = self.isDir_side_effect
        actual = self._fu.get_dirs(dir_name)
        # Condition must match isDir_side_effect
        expected = [f for f in file_list if mock_is_dir(f)]
        self.assertListEqual(expected, actual)

    def test_getRecursiveList(self):
        """getRecursiveList is empty for a missing dir and fully qualifies found files."""
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt']
        actual = self._fu.getRecursiveList(dir_name)
        # Since no such dir, should be empty list
        self.assertListEqual(actual, [])
        eu = ExecUtil()
        exec_file = eu.exec_file_path()
        dir_name, _ = self._fu.split_qualified_path(exec_file)
        logger.debug(f'dir name is: {dir_name}')

        with mock.patch('FileUtil.listdir', return_value=file_list):
            actual = self._fu.getRecursiveList(dir_name)
            expected = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list
            ]
            self.assertListEqual(expected, actual)

    def test_load_logs_and_subdir_names(self):
        # Exercises load_logs_and_subdir_names against a missing directory and
        # then, with listdir patched, with no filter, a suffix and a prefix.
        # NOTE(review): this copy is missing the opening of the first
        # assertion, the closing brackets of the three list comprehensions, and
        # the keyword arguments of the suffix/prefix calls — the parameter
        # names cannot be recovered from here.
        no_such_dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.csv', 'otherfile.txt']
        actual = self._fu.load_logs_and_subdir_names(no_such_dir_name)
                             [])  # Since no such dir, should be empty list

        eu = ExecUtil()
        dir_name = eu.executing_directory()  # ensures that dir_name is real

        with mock.patch('FileUtil.listdir', return_value=file_list):
            # Test with neither prefix nor suffix
            actual = self._fu.load_logs_and_subdir_names(dir_name)
            expected = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list
            self.assertListEqual(expected, actual)
            # Test for suffixes ending in .txt
            suffix = '.txt'
            actual = self._fu.load_logs_and_subdir_names(dir_name,
            txt_only = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list if f.endswith(suffix)
            self.assertListEqual(txt_only, actual)
            # Test for prefixes starting with 'file'
            prefix = 'file'
            actual = self._fu.load_logs_and_subdir_names(dir_name,
            file_only = [
                self._fu.fully_qualified_path(dirPath=dir_name, filename=f)
                for f in file_list if f.startswith(prefix)
            self.assertListEqual(file_only, actual)

    # NOTE(review): patch targets are inferred from the 'FileUtil.listdir'
    # patch used elsewhere in this class — confirm FileUtil imports isfile by name.
    @mock.patch('FileUtil.isfile')
    @mock.patch('FileUtil.listdir')
    def test_cull_existing_files(self, mock_listdir, mock_isfile):
        """cull_existing_files should keep only the paths that isfile accepts."""
        dir_name = r'\nosuchdir'
        file_list = ['filea.txt', 'fileb.txt', 'filec.txt', 'somedir']
        mock_listdir.return_value = file_list
        mock_isfile.side_effect = self.isFile_side_effect
        qualified_file_list = [
            self._fu.qualified_path(dirPath=dir_name, filename=f)
            for f in file_list
        ]
        actual = self._fu.cull_existing_files(qualified_file_list)
        # Condition must match isFile_side_effect
        expected = [f for f in qualified_file_list if mock_is_file(f)]
        self.assertListEqual(expected, actual)

    def test_read_generator(self):
        """read_generator should yield exactly as many lines as were written."""
        wanted_count = 5
        target = self._fu.qualified_path(self.path, self.text_fn)
        self.create_text_file(target, wanted_count)
        seen = 0
        for idx, text in enumerate(self._fu.read_generator(target)):
            logger.debug(f'Read in line {idx}, which contains <{text}>.')
            seen = idx + 1
        self.assertEqual(wanted_count, seen)

    # Same patch target the read_text_file error test in this class uses.
    @mock.patch('FileUtil.open', create=True)
    def test_read_generator_err(self, mock_open):
        """An IOError raised by open should show up in FileUtil's log output."""
        expected_log_message = 'mocked error'
        mock_open.side_effect = IOError(expected_log_message)
        filename = self._fu.qualified_path(self.path, self.text_fn)
        with self.assertLogs(FileUtil.__name__, level='DEBUG') as cm:
            for i, line in enumerate(self._fu.read_generator(filename)):
                x = line
                logger.debug(f'Read in line {i}, which contains <{x}>.')
            logger.debug(f'Caught exception message: {cm.output}')
            # At least one captured record must carry the mocked message.
            self.assertTrue(
                any(expected_log_message in line for line in cm.output))
    def test_file_modify_time(self):
        # Writes the YAML fixture and checks that file_modify_time reports a
        # timestamp within 0.1 s of the moment the test started.
        # NOTE(review): the contents of keys and vals are missing (both list
        # literals are left open) and the f-string below has lost the call
        # that wrapped it.
        start_time = self._du.as_timestamp()
        keys = [
        vals = [
        self.create_yaml(keys, vals)
        qualifiedPath = self._fu.qualified_path(self.path, self.yaml)
        mod_time = self._fu.file_modify_time(qualifiedPath)
        mod_timestamp = self._du.as_timestamp(dt=mod_time)
            f'mod_time is {mod_timestamp}. start_time is {start_time}.')
        self.assertTrue((start_time - mod_timestamp) <
                        .1)  # asserting a difference of < 0.1 seconds.

    def test_file_modify_time2(self):
        # Same check as test_file_modify_time, but through file_modify_time2.
        # NOTE(review): the contents of keys and vals are missing — both list
        # literals are left open in this copy.
        start_time = self._du.as_timestamp()
        keys = [
        vals = [
        self.create_yaml(keys, vals)
        qualifiedPath = self._fu.qualified_path(self.path, self.yaml)
        mod_time = self._fu.file_modify_time2(qualifiedPath)
        mod_timestamp = self._du.as_timestamp(dt=mod_time)
        self.assertTrue((start_time - mod_timestamp) <
                        .1)  # asserting a difference of < 0.1 seconds.

    def test_file_size(self):
        """file_size should equal (line width + line ending) times the line count."""
        target = self._fu.qualified_path(self.path, self.text_fn)
        line_width = 20
        line_count = randrange(10) + 2
        self.create_text_file(target, line_count, line_width)
        # NOTE(review): assumes every line ends with two bytes — confirm this
        # holds for write_text_file on platforms other than Windows.
        eol_len = 2
        wanted_bytes = (line_width + eol_len) * line_count
        self.assertEqual(wanted_bytes, self._fu.file_size(target))

    def test_list_modules(self):
        """
        Collect the names yielded by list_module_contents for itertools.

        NOTE(review): a later method in this class reuses the name
        test_list_modules and replaces this one, so this body does not run
        as written; '__docs__' also looks like a typo for '__doc__' — confirm
        before renaming or enabling it.
        """
        mods = []
        # The loop had no body; collecting the names is what the assertion needs.
        for mod_name in self._fu.list_module_contents(module_name='itertools'):
            mods.append(mod_name)

        self.assertTrue('__docs__' in mods)

    def test_list_modules(self):
        """list_modules for itertools should include the __doc__ and __name__ entries."""
        doc = self._fu.list_module_attributes('itertools', True)
        mods = []
        # The loop had no body; collecting the names is what the assertions need.
        for mod_name in self._fu.list_modules(module_name='itertools'):
            mods.append(mod_name)

        self.assertTrue('__doc__' in mods)
        self.assertTrue('__name__' in mods)
        self.assertTrue('__name__' in mods)
コード例 #3
class PandasUtil:
    _EMPTY_DF = pd.DataFrame()

    def __init__(self):
        """Start with no file, worksheet or frame and widen pandas' display limits."""
        self.filename = None
        self.worksheetName = None
        self._df = None
        self._fu = FileUtil()
        # make the df display look better: https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns-of-a-pandas-dataframe
        display_options = (
            ('display.max_rows', 100),
            ('display.max_columns', 50),
            ('display.width', 800),
        )
        for option_name, option_value in display_options:
            pd.set_option(option_name, option_value)

    # Getters and setters for filename, worksheetname, and df
    @property
    def filename(self):
        """Path of the file this helper reads from or writes to."""
        return self._filename

    # Setter for filename.
    @filename.setter
    def filename(self, fn: str):
        self._filename = fn

    @property
    def worksheetName(self):
        """Name of the Excel worksheet this helper reads from or writes to."""
        return self._worksheetName

    @worksheetName.setter
    def worksheetName(self, wks: str):
        self._worksheetName = wks

    @property
    def df(self):
        """The dataframe most recently stored on this helper."""
        return self._df

    @df.setter
    def df(self, myDf: pd.DataFrame):
        self._df = myDf

    def empty_df(cls) -> pd.DataFrame:
        return pd.DataFrame()

    def pandas_version(self):
        """
        Return the pandas version as three ints.

        Only the leading digits of each dotted component are used, so
        versions such as '2.0.0rc1' or '1.5.3+dfsg' do not raise; missing
        or non-numeric components count as 0.
        :return: maj, minor, sub
        """
        import re

        numbers = []
        for piece in pd.__version__.split('.')[:3]:
            digits = re.match(r'\d+', piece)
            numbers.append(int(digits.group()) if digits else 0)
        # Pad short versions such as '2.1' so three values always come back.
        numbers += [0] * (3 - len(numbers))
        return numbers[0], numbers[1], numbers[2]

    def write_df_to_excel(self,
                          df: pd.DataFrame = None,
                          excelFileName: str = None,
                          excelWorksheet: str = None,
                          write_index=False) -> bool:
        Write the given df to the excel file name and worksheet (unless
        they have already been provided and then are optional).
        Caller is responsible to catch any I/O errors.
        :param df:
        :param excelFileName:
        :param excelWorksheet:
        :return: True if Excel file written, False if df is empty.
        if not df.empty:
            self._df = df
            logger.warning('Empty dataframe will not be written.')
            return False
        fn = excelFileName or self.filename
        wks = excelWorksheet or self.worksheetname
        writer = pd.ExcelWriter(fn)
        self._df.to_excel(writer, wks, index=write_index)
        logger.debug(f'Successfully wrote to {fn}.')
        return True

    def write_df_to_csv(self,
                        df: pd.DataFrame = None,
                        csv_file_name: str = None,
                        write_header: bool = True,
                        write_index: bool = False,
                        enc: str = 'utf-8') -> bool:
        Write the given df to the file name and worksheet (unless
        they have already been provided and then are optional).
        Caller is responsible to catch any I/O errors.
        :param df:
        :param csv_file_name:
        :param write_header:
        :param write_index:
        :param enc:
        :return: True if Excel file written, False if df is empty.
        if not df.empty:
            self._df = df
            logger.warning('Empty dataframe will not be written.')
            return False
        logger.debug(f'Successfully wrote to {csv_file_name}.')
        return True

    def read_df_from_excel(self,
                           excelFileName: str = None,
                           excelWorksheet: str = 'Sheet1',
                           header: int = 0,
                           index_col: int = -1) -> pd.DataFrame:
        """
        Read an Excel file.
        :param excelFileName: workbook to read; when omitted self.filename is used.
        :param excelWorksheet: sheet to read; when falsy self.worksheetName is used.
        :param header: 0-offset location of header (0=row 1 in Excel)
        :param index_col: 0-offset column to use as the index; negative means none.
        :return: dataframe result (an empty one if the file or sheet is missing)
        """
        if excelFileName:
            self.filename = excelFileName
        logger.debug(f'Will read from the Excel file: {self.filename}.')
        if not self._fu.file_exists(self.filename):
            logger.error(
                f'Cannot find Excel file: {self.filename}. Returning empty df.'
            )
            return pd.DataFrame()

        if excelWorksheet:
            self.worksheetName = excelWorksheet
        wks = self.worksheetName
        major, minor, _ = self.pandas_version()
        logger.debug(
            f'Will read from the worksheet: {wks}. Pandas minor version is {minor}.'
        )
        # get_worksheets may return None; treat that as "no sheets".
        if wks not in (self.get_worksheets(excelFileName) or []):
            logger.warning(
                f'Cannot find Excel worksheet: {self.worksheetName}. Returning empty df.'
            )
            return pd.DataFrame()

        param_dict = {'io': self.filename, 'header': header}
        # Only one spelling of the sheet keyword may be passed; the else was
        # missing, so both keys used to be set.
        if major >= 1 or (major == 0 and minor > 21):
            param_dict['sheet_name'] = wks
        else:
            param_dict['sheetname'] = wks
        if index_col >= 0:
            param_dict['index_col'] = index_col
        self._df = pd.read_excel(**param_dict)
        logger.debug(f'Read in {len(self._df)} records.')
        return self._df

    def read_df_from_csv(self,
                         csv_file_name: str = None,
                         header: int = 0,
                         enc: str = 'utf-8',
                         index_col: int = None,
                         sep: str = None) -> pd.DataFrame:
        """
        Read a CSV file into a dataframe.
        :param csv_file_name: path of the CSV file to read.
        :param header: Where the headers live (0 means first line of the file)
        :param enc: try 'latin-1' or 'ISO-8859-1' if you are getting encoding errors
        :param index_col: column to use as the index; None leaves the default index.
        :param sep: field separator; when falsy pandas' default is used.
        :return: the dataframe that was read.
        """
        param_dict = {
            'filepath_or_buffer': csv_file_name,
            'header': header,
            'encoding': enc,
        }
        if sep:
            param_dict['sep'] = sep
        if index_col is not None:
            param_dict['index_col'] = index_col
        return pd.read_csv(**param_dict)

    def get_df_headers(self, df: pd.DataFrame = _EMPTY_DF) -> list:
        """
        Get a list of the headers. This provides a list of the column NAMES.
        :param df: frame to inspect; it is also stored on this helper.
        :return: list of headers, or None when self.is_empty(df) is true.
        """
        if self.is_empty(df):
            # These lines used to follow the return and could never run.
            logger.warning('df is empty. Returning None for headers')
            return None
        self.df = df
        return list(df.columns)

    def set_df_headers(self, df: pd.DataFrame, new_headers: list):
        """
        This sets the column NAMES.
        :param df: frame whose columns are renamed in place.
        :param new_headers: list of new headers
        :return: None (but side effect of changed df)
        """
        df.columns = new_headers

    def get_rowCount_colCount(self, df: pd.DataFrame):
        """
        Return the row and column count of the df.
        :param df: frame to measure.
        :return: row count, col count
        """
        rows, cols = df.shape
        logger.debug(f'df has {rows} rows and {cols} columns.')
        return rows, cols

    def get_basic_data_analysis(self, df: pd.DataFrame) -> str:
        """
        Return the summary that DataFrame.info() produces, as a string.
        :param df: frame to summarise.
        :return: the text info() would otherwise print.
        """
        buffer = StringIO()
        # Nothing was ever written to the buffer, so '' was always returned.
        df.info(buf=buffer)
        return buffer.getvalue()

    def get_quartiles(self,
                      df: pd.DataFrame,
                      percentiles: list = None) -> pd.DataFrame:
        """
        Return basic statistics about the dataframe.
        :param df: frame to describe.
        :param percentiles: list of %-tiles as fractions between 0 and 1, e.g. [.2, .4, .6, .8] for quintiles.
            Defaults to the quartiles [.25, .50, .75].
        :return: basic description df
        """
        # A None default avoids sharing one mutable list between calls.
        if percentiles is None:
            percentiles = [.25, .50, .75]
        return df.describe(percentiles=percentiles)

    def get_worksheets(self, excelFileName=None):
        """
        Return the worksheet names of an Excel workbook.
        :param excelFileName: optional path; when given it replaces self.filename.
        :return: list of sheet names, or None when the file cannot be found.
        """
        if excelFileName:
            self.filename = excelFileName
        fu = FileUtil()
        if not fu.file_exists(self.filename):
            # These two lines used to sit after the return and could never run.
            logger.error(f'Cannot find Excel file {self.filename}.')
            return None
        # The context manager releases the workbook handle once the names are read.
        with pd.ExcelFile(self.filename) as xl:
            return xl.sheet_names

    def duplicate_rows(self,
                       df: pd.DataFrame,
                       fieldList: list = None,
                       keep: str = 'first') -> pd.DataFrame:
        """
        Return a dataframe with the duplicates as specified by the columns in fieldList.
        If fieldList is missing or None, then return the exactly duplicated rows.
        :param df: dataframe to scan for duplicates
        :param fieldList: fields in df to examine for duplicates.
        :param keep: 'first' or 'last' to keep the first dupe or the last.
        :return: df of the duplicates
        """
        # The else was missing, so the fieldList result was always overwritten.
        if fieldList:
            return df[df.duplicated(fieldList, keep=keep)]
        return df[df.duplicated(keep=keep)]

    def drop_duplicates(self,
                        df: pd.DataFrame,
                        fieldList: list = None,
                        keep: str = 'first') -> pd.DataFrame:
        """
        Drop the duplicates as specified by the columns in fieldList.
        If fieldList is missing or None, rows are compared on all columns.
        :param df: dataframe to scan for duplicates
        :param fieldList: fields in df to examine for duplicates.
        :param keep: 'first' or 'last' to keep the first dupe or the last.
        :return: df without the duplicates (the input is not modified)
        """
        param_dict = {'keep': keep, 'inplace': False}
        if fieldList:
            param_dict['subset'] = fieldList
        return df.drop_duplicates(**param_dict)

    def convert_dict_to_dataframe(self, list_of_dicts: list) -> pd.DataFrame:
        """
        Convert a list of dictionaries to a dataframe.
        :param list_of_dicts: the dictionaries, passed straight to pd.DataFrame.
        :return: the resulting dataframe
        """
        return pd.DataFrame(list_of_dicts)

    def convert_list_to_dataframe(self,
                                  lists: list,
                                  column_names: List = None) -> pd.DataFrame:
        """
        Convert a list of lists to a dataframe. If provided, add the column names.
        :param lists: a list of lists, like [[1,2,3], ['a', 'b', 'c']]
        :param column_names: Column names to use. When omitted or empty the
            frame is built without explicit names.
        :return: the resulting dataframe
        """
        if column_names:
            return pd.DataFrame(data=lists, columns=column_names)
        # NOTE(review): the original comments promise default names col00,
        # col01..., but no renaming step is visible here — confirm whether
        # one was lost.
        return pd.DataFrame(data=lists)

    def convert_matrix_to_dataframe(self, lists: list) -> pd.DataFrame:
        """
        Convert a list of lists to a dataframe.
        :param lists: the rows, passed to pd.DataFrame as its data.
        :return: the resulting dataframe
        """
        return pd.DataFrame(data=lists)

    def convert_dataframe_to_matrix(self, df: pd.DataFrame) -> np.ndarray:
        """
        Convert all of the values to a numpy ndarray.

        :param df: frame to convert.
        :return: the result of df.to_numpy()
        """
        return df.to_numpy()

    def convert_dataframe_to_vector(self, df: pd.DataFrame) -> np.ndarray:
        """
        Convert the dataframe to a numpy vector.
        :param df: frame that must have exactly one column.
        :return: 1-d array of that column, or None for any other column count.
        """
        # Count the columns directly: get_df_headers can return None for an
        # empty frame, which made len() raise.
        col_count = len(df.columns)
        if col_count == 1:
            return df.to_numpy().reshape(-1, )
        logger.warning(
            f'Dataframe should have exactly one column, but contains {col_count}. Returning None.'
        )
        return None

    def convert_dataframe_col_to_list(self, df: pd.DataFrame,
                                      column_name: str) -> list:
        """
        Convert the given dataframe column to a list.
        :param df: frame holding the column.
        :param column_name: a column name, like 'age'
        :return: a list of that column
        """
        return df[column_name].values.tolist()

    def without_null_rows(self, df: pd.DataFrame,
                          column_name: str) -> pd.DataFrame:
        """
        Return a DataFrame without the rows that are null in the given column_name.
        :param df: source DataFrame
        :param column_name: column whose null rows are dropped.
        :return: new DataFrame; an empty one when the column does not exist.
        """
        try:
            mask = pd.notnull(df[column_name])
        except KeyError:
            logger.warning(
                f'Unable to find column_name name: {column_name}. Returning empty df.'
            )
            return pd.DataFrame()
        return df[mask]

    def select(self, df: pd.DataFrame, column_name: str,
               match_me: Union[str, int]) -> pd.DataFrame:
        """
        Return a DataFrame that selects on the column_name that is equal to match_me.
        Similar to a SELECT * WHERE clause in SQL.
        :param df: frame to filter.
        :param column_name: column compared against match_me.
        :param match_me: value the column must equal.
        :return: df with the column_name matching the selected clause (possibly empty)
        """
        return df.loc[df[column_name] == match_me]

    def mask_blanks(self, df: pd.DataFrame, column_name: str) -> list:
        """
        Return a boolean mask with a True in the rows that have a blank column_name.
        :param df: frame to inspect.
        :param column_name: column compared against the empty string.
        :return: the element-wise result of df[column_name] == ''
        """
        return df[column_name] == ''

    def select_blanks(self, df: pd.DataFrame, column_name: str) -> list:
        return df[self.mask_blanks(df, column_name)]

    def mask_non_blanks(self, df: pd.DataFrame, column_name: str) -> list:
        Return a boolean list with a True in the rows that have a nonblank column_name.
        :param df:
        :param column_name:
        blanks = self.mask_blanks(df, column_name)
        non_blanks_mask = [not x for x in blanks]
        return non_blanks_mask

    def select_non_blanks(self, df: pd.DataFrame, column_name: str) -> list:
        return df[self.mask_non_blanks(df, column_name)]

    def unique_values(self, df: pd.DataFrame, column_name: str) -> list:
        Return a list of the unique values in column_name.
        :param df:
        :param column_name:
        return self.drop_duplicates(df=df[column_name]).tolist()

    def count_by_column(self,
                        df: pd.DataFrame,
                        column_name: str = None) -> pd.DataFrame:
        """
        Return a count by value of the given column.
        :param df: frame holding the column.
        :param column_name: column whose values are counted.
        :return: the result of value_counts() on that column.
        """
        return df[column_name].value_counts()

    def add_new_col_with_func(self, df: pd.DataFrame, column_name: str,
                              func: Callable[[], list]) -> pd.DataFrame:
        """
        Call the func with no args to assign a new column_name to the dataframe.

        func should return a list (typically built by a list comprehension).
        Here's an example of what the function should do.
            def my_func(self) -> list:
                df = self.pu.df
                col_of_interest = df['number']
                return [self.my_f(x) for x in col_of_interest]

        It gets called with:
            df = self.pu.add_new_col_with_func(df, 'new_col_name', self.my_func)

        :param df: dataframe to extend (modified in place)
        :param column_name: name of the column to create or overwrite
        :param func: func (usually no args)
        :return: the same df, with column_name assigned
        """
        # Store df on the instance first so func can reach it (see the example above).
        self.df = df
        df[column_name] = func()
        return df

    def add_new_col_from_array(self, df: pd.DataFrame, column_name: str,
                               new_col: np.ndarray) -> pd.DataFrame:
        """
        Use the values in new_col to create a new column.

        Limitations: this is not as sophisticated as https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas .
        The length of new_col must be the same as the length of df.

        :param df: dataframe to extend (modified in place)
        :param column_name: name of the column to create or overwrite
        :param new_col: If this really is a Series, it will try to match indexes with the existing df (probably a good thing).
        :return: the same df, with column_name assigned
        """
        df[column_name] = new_col
        return df

    def mark_rows_by_func(self, df: pd.DataFrame, column_name: str,
                          func: Callable[[pd.Series], Bools]) -> Bools:
        """
        Return a list of bools depending on the func.

        Here's a func (which takes a list as a parameter):
            def is_adult(self, age:list):
                return age >= 21
        Here's how to invoke it:
            mark = self.pu.mark_rows_by_func(df, 'Age', self.is_adult)

        :param df: dataframe under scrutiny
        :param column_name: name of the column_name
        :param func: function that is to be invoked. It is handed the whole column df[column_name] and returns the booleans.
        :return: whatever func returns for the column
        """
        return func(df[column_name])

    def mark_rows_by_criterion(self, df: pd.DataFrame, column_name: str,
                               criterion: Union[str, int, float]) -> Bools:
        """
        Return a mask of bools that is True where column_name equals the criterion.

        :param df: dataframe under scrutiny
        :param column_name: name of the column to compare
        :param criterion: value each row is compared against with ==
        :return: result of df[column_name] == criterion
        """
        return df[column_name] == criterion

    def mark_isnull(self, df: pd.DataFrame, column_name: str) -> Bools:
        """
        Return a mask that is True in the rows where column_name is null.

        :param df: dataframe under scrutiny
        :param column_name: name of the column to test
        :return: result of isnull() on that column
        """
        column = df[column_name]
        return column.isnull()

    def masked_df(self,
                  df: pd.DataFrame,
                  mask: Bools,
                  invert_mask: bool = False):
        """
        Return the rows of df selected by mask (or by its negation).

        :param df: dataframe under scrutiny
        :param mask: one bool per row of df
        :param invert_mask: False selects rows where mask is True; True selects rows where mask is False
        :return: the selected rows of df
        """
        if not invert_mask:
            return df[mask]
        # Negate element by element; this was unreachable when the else branch was lost.
        inverted = [not flag for flag in mask]
        return df[inverted]

    def slice_df(self,
                 df: pd.DataFrame,
                 start_index: int = 0,
                 end_index: int = None,
                 step: int = 1):
        """
        Slice the df by the given start, end, and step.

        NOTE: this does row slicing only.

        :param df: dataframe under scrutiny
        :param start_index: 0-based first index to use. Defaults to 0 (the first el)
        :param end_index: end of list index. Defaults to None (which means the end of the list).
        :param step: how many to skip. 2 means skip every other. Default of 1 means don't skip.
        :return: the positional row slice df.iloc[start_index:end_index:step]
        """
        # Test against None explicitly: an end_index of 0 is a legitimate
        # (empty) slice bound and must not fall back to len(df).
        end_idx = len(df) if end_index is None else end_index
        return df.iloc[start_index:end_idx:step]

    def set_index(self,
                  df: pd.DataFrame,
                  columns: Union[Strings, str],
                  is_in_place: bool = True) -> pd.DataFrame:
        """
        Set the index of df.

        :param df: Dataframe under scrutiny.
        :param columns: Can be a str (=single column_name) or a List of strings.
        :param is_in_place: True to add the index in place / False to create a new df
        :return: df or None (if is_in_place is true)
        """
        return df.set_index(columns, inplace=is_in_place)

    def reset_index(self,
                    df: pd.DataFrame,
                    is_in_place: bool = True,
                    is_dropped: bool = False) -> pd.DataFrame:
        """
        Reset the index.

        :param df: dataframe under scrutiny
        :param is_in_place: True to reset the index in place / False to create a new df
        :param is_dropped: passed to pandas as drop=; True discards the old index instead of keeping it as a column
        :return: new df, or None when is_in_place is True
        """
        return df.reset_index(drop=is_dropped, inplace=is_in_place)

    def drop_index(self,
                   df: pd.DataFrame,
                   is_in_place: bool = True) -> pd.DataFrame:
        """
        Drop the index.

        Delegates to self.reset_index with is_dropped=True, so the old index
        is discarded rather than kept as a column.

        :param df: dataframe under scrutiny
        :param is_in_place: True to drop the index in place / False to create a new df
        :return: whatever self.reset_index returns
        """
        return self.reset_index(df=df,
                                is_in_place=is_in_place,
                                is_dropped=True)

    def drop_col(self,
                 df: pd.DataFrame,
                 columns: Union[Strings, str],
                 is_in_place: bool = True) -> pd.DataFrame:
        """
        Drop the given column_name.

        :param df: dataframe under scrutiny
        :param columns: Can be a str (=single column_name) or a List of strings.
        :param is_in_place: if true, column_name is dropped from df in place. Otherwise, a new df is returned.
        :return: None if is_in_place is True. Else df with the column_name dropped.
            When self.pandas_version() reports a version below 0.21, the
            unchanged df is returned instead.
        """
        major, minor, _ = self.pandas_version()
        # Plain boolean logic: 'and' short-circuits and states the intent,
        # unlike the bitwise '&'.
        if major == 0 and minor < 21:
            logger.warning(
                f'Unable to drop column, as Pandas version is {minor}. Returning unchanged df.'
            )
            return df

        return df.drop(columns=columns, inplace=is_in_place)

    def drop_col_keeping(self,
                         df: pd.DataFrame,
                         cols_to_keep: Union[Strings, str],
                         is_in_place: bool = True) -> pd.DataFrame:
        """
        Keep the given columns and drop the rest.

        NOTE(review): the loop body and the tail of the final call were
        missing from this block; they are reconstructed here from the
        docstring and from drop_col's signature. Confirm against the
        original implementation.

        :param df: dataframe under scrutiny
        :param cols_to_keep: a single column name or a list of column names to retain
        :param is_in_place: passed through to self.drop_col
        :return: whatever self.drop_col returns
        """
        headers = self.get_df_headers(df)
        logger.debug(
            f'I have these headers: {headers}. But I will keep {cols_to_keep}'
        )
        # Normalize a single name to a one-element list.
        exceptions = [cols_to_keep] if isinstance(cols_to_keep, str) else cols_to_keep
        headers_to_drop = [col for col in headers if col not in exceptions]
        return self.drop_col(df=df,
                             columns=headers_to_drop,
                             is_in_place=is_in_place)

    def drop_row_by_criterion(self,
                              df: pd.DataFrame,
                              column_name: str,
                              criterion: Union[int, str],
                              is_in_place: bool = True) -> pd.DataFrame:
        """
        Drop the rows that have criterion in the given column.

        :param df: dataframe under scrutiny
        :param column_name: name of the column to compare
        :param criterion: rows where column_name == criterion are dropped
        :param is_in_place: True to drop in place / False to create a new df
        :return: new df, or None when is_in_place is True
        """
        matching_rows = df[df[column_name] == criterion].index
        return df.drop(matching_rows, inplace=is_in_place)

    def drop_row_if_nan(self,
                        df: pd.DataFrame,
                        column_names: Strings = None,
                        is_in_place: bool = True) -> pd.DataFrame:
        """
        Drop a row if the given column name is NaN.

        :param df: dataframe under scrutiny
        :param column_names: Drop the rows based in this array of column names. If None, drop every row with all NaNs.
        :param is_in_place: True to drop in place / False to create a new df
        :return: new df, or None when is_in_place is True
        """
        if column_names:
            # Only the named columns are inspected for NaN.
            return df.dropna(axis='index',
                             subset=column_names,
                             inplace=is_in_place)
        return df.dropna(axis='index', inplace=is_in_place, how='all')

    def reorder_cols(self, df: pd.DataFrame, columns: Strings) -> pd.DataFrame:
        """
        Using the columns, return a new df.

        :param df: dataframe under scrutiny
        :param columns: list of strings, like ['colD', 'colA', 'colB', 'colC']
        :return: df[columns], i.e. the listed columns in the listed order
        """
        return df[columns]

    def replace_col(self, df: pd.DataFrame, column: str,
                    replace_dict: dict) -> pd.DataFrame:
        """
        Replace the values of column_name using replace_dict.

        This will replace the column VALUES.

        :param df: dataframe to change (modified in place)
        :param column: name of the column whose values are mapped
        :param replace_dict: {'origA':'replA', 'origB':'replB'}
        :return: df with column_name replaced; self.empty_df() if a KeyError is raised
        """
        try:
            df[column] = df[column].map(replace_dict)
        except KeyError:
            logger.warning(
                f'Value found outside of: {replace_dict.keys()} or column_name {column} not found. Returning empty df.'
            )
            return self.empty_df()
        return df

    def replace_col_using_func(self, df: pd.DataFrame, column_name: str,
                               func: Callable) -> pd.DataFrame:
        """
        Replace the column contents by each element's value, as determined by func.

        This will replace the column VALUES.

        :param df: Dataframe under scrutiny (modified in place).
        :param column_name: (single column_name) name
        :param func: Function operates on whatever element it is presented, and returns the changed element.
        :return: df
        """
        df[column_name] = df[column_name].apply(func)
        return df

    def replace_col_using_mult_cols(self, df: pd.DataFrame,
                                    column_to_replace: str, cols: Strings,
                                    func: Callable) -> pd.DataFrame:
        """
        Replace column_to_replace, using the given func.

        This will replace the column VALUES.

        :param df: Dataframe under scrutiny (modified in place).
        :param column_to_replace: (single column_name) name
        :param cols: list of columns used for the following func
        :param func: Function applied with axis=1, i.e. once per row of df[cols]; its result becomes that row's new value.
        :return: df with replaced column
        """
        df[column_to_replace] = df[cols].apply(func, axis=1)
        return df

    def replace_col_with_scalar(self,
                                df: pd.DataFrame,
                                column_name: str,
                                replace_with: Union[str, int],
                                mask: Bools = None) -> pd.DataFrame:
        """
        Replace the all column_name with replace_with. If a mask of bools is used, only replace those elements with a True.

        Helpful reference at https://kanoki.org/2019/07/17/pandas-how-to-replace-values-based-on-conditions/

        :param df: dataframe to change (modified in place)
        :param column_name: name of the column to overwrite
        :param replace_with: scalar written into the selected rows
        :param mask: None (replace every row), or a Series / list of bools with one entry per row
        :return: the changed df; self.empty_df() when mask has an unsupported type
        """
        if mask is None:
            df[column_name] = replace_with
        elif isinstance(mask, (pd.Series, list)):
            # A Series mask is flattened to a list so it is applied by
            # position, as before. Assigning through df.loc avoids the
            # chained df[col].mask(..., inplace=True), which is not
            # guaranteed to write back into df.
            row_flags = mask.tolist() if isinstance(mask, pd.Series) else mask
            df.loc[row_flags, column_name] = replace_with
        else:
            logger.warning(
                f'mask must be None, a series, or a list, but it is: {type(mask)}'
            )
            return self.empty_df()
        return df

    def join_two_dfs_on_index(self, df1: pd.DataFrame,
                              df2: pd.DataFrame) -> pd.DataFrame:
        """
        Return a column-wise join of these two dataframes on their mutual index.

        :param df1: left dataframe
        :param df2: right dataframe
        :return: pd.concat of [df1, df2] along axis=1, keeping the original column labels
        """
        return pd.concat([df1, df2], axis=1, ignore_index=False)

    def join_dfs_by_column(self, dfs: Dataframes) -> pd.DataFrame:
        """
        Return a column-wise join of these dataframes.

        :param dfs: the dataframes to place side by side
        :return: pd.concat of dfs along the columns axis
        """
        return pd.concat(dfs, axis='columns')

    def join_dfs_by_row(self, dfs: Dataframes) -> pd.DataFrame:
        """
        Return a row-wise join of these dataframes.

        Note: all the dfs should have the same column names, so you might call it in this way:
          headers = pu.get_df_headers(big_df)
          pu.set_df_headers(new_df, headers)
          df2 = pu.join_dfs_by_row([new_df, big_df])

        :param dfs: the dataframes to stack
        :return: pd.concat of dfs along the index axis, with a fresh index (ignore_index=True)
        """
        # 'index' is the documented name for axis 0 in pd.concat.
        return pd.concat(dfs, axis='index', ignore_index=True)

    def dummy_var_df(self,
                     df: pd.DataFrame,
                     columns: Union[Strings, str],
                     drop_first: bool = True) -> pd.DataFrame:
        """
        Do a one-hot encoding.

        Create a dummy variable based on the given column.

        :param df: dataframe under scrutiny
        :param columns: a single column name or a list of column names.
        :param drop_first: passed straight to pd.get_dummies as drop_first
        :return: the df returned by pd.get_dummies
        """
        # A single name is wrapped in a list; without the else branch the
        # wrapped value was immediately overwritten by the bare string.
        my_columns = [columns] if isinstance(columns, str) else columns
        return pd.get_dummies(data=df, columns=my_columns, drop_first=drop_first)

    def replace_col_names(self,
                          df: pd.DataFrame,
                          replace_dict: dict,
                          is_in_place: bool = True) -> pd.DataFrame:
        """
        Rename the columns of df according to replace_dict.

        :param df: dataframe under scrutiny
        :param replace_dict: {'origColA':'replColA', 'origColB':'replColB'}
        :param is_in_place: True to rename in place / False to create a new df
        :return: new df, or None when is_in_place is True
        """
        return df.rename(columns=replace_dict, inplace=is_in_place)

    def replace_col_names_by_pattern(self,
                                     df: pd.DataFrame,
                                     prefix: str = "col",
                                     is_in_place: bool = True) -> pd.DataFrame:
        """
        Replace the column names with col1, col2....

        The new names are taken, one per existing header, from
        generate_col_names(prefix). NOTE(review): the exact numbering is
        decided by that generator, which is defined elsewhere.

        :param df: dataframe under scrutiny
        :param prefix: string prefix, such as "col"
        :param is_in_place: passed through to self.replace_col_names
        :return: whatever self.replace_col_names returns
        """
        name_source = generate_col_names(prefix)
        replacement_dict = {
            old_name: next(name_source)
            for old_name in self.get_df_headers(df)
        }
        return self.replace_col_names(df, replacement_dict, is_in_place)

    def coerce_to_string(self, df: pd.DataFrame,
                         columns: Union[Strings, str]) -> pd.DataFrame:
        """
        Coerce the given column(s) to strings.

        :param df: dataframe to change (modified in place)
        :param columns: a column name (or list of names) to coerce
        :return: the same df with each listed column converted with str()
        """
        # Make a single str into a one-element list; otherwise the loop below
        # would iterate over the characters of the name.
        cols_as_list = [columns] if isinstance(columns, str) else columns
        for col in cols_as_list:
            df[col] = df[col].apply(str)
        return df

    def coerce_to_numeric(self, df: pd.DataFrame,
                          columns: Union[Strings, str]) -> pd.DataFrame:
        """
        Coerce the given column_name name to ints or floats.

        :param df: dataframe to change (modified in place)
        :param columns: a column name (or list of names) to coerce
        :return: df with columns coerced to a numeric in place.
        """
        # Make a single str into a one-element list so df[...] always selects
        # a frame and pd.to_numeric is applied column by column.
        cols_as_list = [columns] if isinstance(columns, str) else columns
        df[cols_as_list] = df[cols_as_list].apply(pd.to_numeric)
        return df

    def coerece_to_int(self, df: pd.DataFrame,
                       columns: Union[Strings, str]) -> pd.DataFrame:
        """
        Coerce the given column name(s) to an int.

        NOTE(review): the method name is misspelled ("coerece"); it is kept
        as-is so that existing callers continue to work.

        :param df: dataframe to change (modified in place)
        :param columns: a column name (or list of names) to coerce
        :return: df with columns converted via astype(int) in place.
        """
        df[columns] = df[columns].astype(int)
        return df

    def round(self, df: pd.DataFrame, rounding_dict: dict) -> pd.DataFrame:
        """
        Round the columns given in rounding_dict to the given number of decimal places.

        Note from the original author's testing: for values on a rounding
        boundary the result can differ from Python's builtin round().

        :param df: dataframe under scrutiny
        :param rounding_dict: {'A': 2, 'B':3}
        :return: df rounded to the specified number of places.
        """
        return df.round(rounding_dict)

    def replace_vals(self,
                     df: pd.DataFrame,
                     replace_me: str,
                     new_val: str,
                     is_in_place: bool = True) -> pd.DataFrame:
        """
        Replace the values of replace_me with the new_val.

        :param df: Dataframe under scrutiny.
        :param replace_me: value to look for anywhere in df
        :param new_val: value written in its place
        :param is_in_place: True to replace values in place / False to create a new df
        :return: df or None (if is_in_place is true)
        """
        return df.replace(to_replace=replace_me,
                          value=new_val,
                          inplace=is_in_place)

    def replace_vals_by_mask(self, df: pd.DataFrame, mask: Bools,
                             col_to_change: str,
                             new_val: Union[str, int, float]) -> pd.DataFrame:
        """
        Replace the values in the col_to_change with the new_val.

        :param df: dataframe to change (modified in place)
        :param mask: bools selecting the rows to change
        :param col_to_change: Column Name whose rows you want to change
        :param new_val: value written into the selected rows
        :return: the changed df (also changed in place)
        """
        df.loc[mask, col_to_change] = new_val
        # Return df itself; the earlier chained assignment handed back
        # new_val rather than the dataframe.
        return df

    def is_empty(self, df: pd.DataFrame) -> bool:
        """
        Return true if the df is empty.

        :param df: Dataframe to inspect
        :return: True IFF it is empty
        """
        return df.empty

    def aggregates(self, df: pd.DataFrame, group_by: Strings,
                   col: str) -> pd.DataFrame:
        """
        Return the average, min, max, and sum of the dataframe when grouped by the given strings.

        Reference: https://jamesrledoux.com/code/group-by-aggregate-pandas .

        :param df: dataframe under scrutiny
        :param group_by: column name(s) to group on
        :param col: name of the column that is aggregated
        :return: df with the group_by column(s) plus 'mean', 'min', 'max' and 'sum' columns
        """
        grouped_multiple = df.groupby(group_by).agg(
            {col: ['mean', 'min', 'max', 'sum']})
        # Flatten the two-level (col, statistic) header to the statistic names.
        grouped_multiple.columns = ['mean', 'min', 'max', 'sum']
        # Turn the group keys back into ordinary columns.
        self.reset_index(grouped_multiple, is_in_place=True)
        return grouped_multiple

    def stats(self, df: pd.DataFrame, xlabel_col_name: str,
              ylabel_col_name: str):
        """
        Calculate the main statistics.

        Runs linregress on the two columns and logs the fitted equation,
        r squared, p and the standard error at INFO level.

        :param df: dataframe under scrutiny
        :param xlabel_col_name: x column label
        :param ylabel_col_name: y column label
        :return: slope, intercept, and r (correlation)
        """
        slope, intercept, r, p, epsilon = linregress(df[xlabel_col_name],
                                                     df[ylabel_col_name])
        logger.info('Main equation: y = %.3f x + %.3f', slope, intercept)
        logger.info('r^2 = %.4f', r * r)
        logger.info('p = %.4f', p)
        logger.info('std err: %.4f', epsilon)
        return slope, intercept, r

    def head(self, df: pd.DataFrame, how_many_rows: int = 10) -> pd.DataFrame:
        """
        Return the first how_many_rows.

        This works well if called as the last line of an immediate
        (interactive) session. The df is also stored on self.df.

        :param df: dataframe under scrutiny
        :param how_many_rows: number of leading rows to return
        :return: df.head(how_many_rows)
        """
        self.df = df
        return self.df.head(how_many_rows)

    def head_as_string(self, df: pd.DataFrame, how_many_rows: int = 10) -> str:
        """
        Return the first how_many_rows as a string, one row per line.

        The string is also logged at DEBUG level.

        :param df: dataframe under scrutiny
        :param how_many_rows: number of leading rows to render
        :return: str() of self.head(df, how_many_rows)
        """
        ans = str(self.head(df, how_many_rows))
        logger.debug(f'First {how_many_rows} are:\n{ans}')
        return ans

    def tail_as_string(self, df: pd.DataFrame, how_many_rows: int = 10) -> str:
        """
        Return the last how_many_rows as a string, one row per line.

        The string is also logged at DEBUG level.

        :param df: dataframe under scrutiny
        :param how_many_rows: number of trailing rows to render
        :return: str() of self.tail(df, how_many_rows)
        """
        ans = str(self.tail(df, how_many_rows))
        logger.debug(f'Last {how_many_rows} are:\n{ans}')
        return ans

    def tail(self, df: pd.DataFrame, how_many_rows: int = 10) -> pd.DataFrame:
        """
        Return the last how_many_rows.

        This works well if called as the last line of an immediate
        (interactive) session. The df is also stored on self.df.

        :param df: dataframe under scrutiny
        :param how_many_rows: number of trailing rows to return
        :return: df.tail(how_many_rows)
        """
        self.df = df
        return self.df.tail(how_many_rows)

    def sort(self,
             df: pd.DataFrame,
             columns: Union[Strings, str],
             is_in_place: bool = True,
             is_asc: bool = True):
        """
        Sort the given dataFrame by the given column(s).

        :param df: dataframe under scrutiny
        :param columns: a column name (or list of names) to sort by
        :param is_in_place: True to sort in place / False to create a new df
        :param is_asc: True for ascending order, False for descending
        :return: new df, or None when is_in_place is True
        """
        return df.sort_values(columns,
                              ascending=is_asc,
                              inplace=is_in_place)

    def largest_index(self, df: pd.DataFrame) -> Tuple[int, int]:
        """
        Return the largest index and its value (usually an int and an int).

        :param df: dataframe under scrutiny
        :return: (position of the largest index label, the largest index label)
        """
        return df.index.argmax(), df.index.max()

    def smallest_index(self, df: pd.DataFrame) -> Tuple[int, int]:
        """
        Return the smallest index and its value (usually an int and an int).

        :param df: dataframe under scrutiny
        :return: (position of the smallest index label, the smallest index label)
        """
        return df.index.argmin(), df.index.min()