Exemple #1
0
    def test_get_target_files_ok_file_exists(self):
        """Check get_target_files against existing files, with and without tree search.

        Two empty CSV files are created: one directly under the data
        directory and one under the sub directory. With ``tree=True`` both
        files are expected (top-level file first); with ``tree=False`` only
        the top-level file is expected.

        The temporary directories are removed in a ``finally`` clause so a
        failing assertion does not leave fixtures behind for later tests.
        """
        try:
            # create test files
            os.makedirs(self.__data_dir, exist_ok=True)
            os.makedirs(self.__data_subdir, exist_ok=True)

            test_file = os.path.join(self.__data_dir, "test.csv")
            with open(test_file, "w"):
                pass

            test_file_sub = os.path.join(self.__data_subdir, "test_sub.csv")
            with open(test_file_sub, "w"):
                pass

            # execute1: recursive search picks up the sub directory as well
            target_files = File().get_target_files(
                self.__data_dir, "test(.*).csv", tree=True
            )
            assert len(target_files) == 2
            assert target_files[0] == test_file
            # NOTE(review): assumes the sub directory is "<data_dir>/sub" - confirm
            assert target_files[1] == os.path.join(
                self.__data_dir, "sub", "test_sub.csv"
            )

            # execute2: non-recursive search only sees the top-level file
            target_files = File().get_target_files(
                self.__data_dir, "test(.*).csv", tree=False
            )
            assert len(target_files) == 1
            assert target_files[0] == test_file
        finally:
            # ignore_errors: never mask the real test failure with a cleanup error
            shutil.rmtree(self.__data_subdir, ignore_errors=True)
            shutil.rmtree(self.__data_dir, ignore_errors=True)
Exemple #2
0
    def execute(self, *args):
        """Convert the character encoding of every file matching the source pattern.

        When ``self._dest_dir`` is set, each converted file is written there
        under its original base name and the source is left untouched.
        Otherwise the file is converted in place: the result is written to a
        hidden, randomly named temporary file next to the source, which then
        replaces the source.

        Returns ``None``. If no file matches, an info message is logged and
        nothing else happens. Any exception raised by the parameter check or
        by the conversion propagates to the caller.
        """
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._encoding_from, self._encoding_to],
        )
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if not files:
            self._logger.info("No files are found. Nothing to do.")
            return

        for file in files:
            basename = os.path.basename(file)

            if self._dest_dir:
                File().convert_encoding(
                    file,
                    os.path.join(self._dest_dir, basename),
                    self._encoding_from,
                    self._encoding_to,
                )
            else:
                tmpfile = os.path.join(
                    os.path.dirname(file),
                    "." + StringUtil().random_str(10) + "." + basename,
                )
                try:
                    File().convert_encoding(
                        file, tmpfile, self._encoding_from, self._encoding_to
                    )
                except Exception:
                    # do not leave a partially written temp file behind
                    if os.path.exists(tmpfile):
                        os.remove(tmpfile)
                    raise
                # os.replace overwrites the destination in one step, so there
                # is no moment where the source is deleted but the converted
                # file is not yet in place (as with remove + rename).
                os.replace(tmpfile, file)

            self._logger.info("Encoded file %s" % basename)
Exemple #3
0
    def execute(self, *args):
        """Merge exactly two CSV files from the source directory into one CSV.

        Each of ``self._src1_pattern`` and ``self._src2_pattern`` must match
        exactly one file in ``self._src_dir``. Both files are read with every
        column as ``str`` and combined with ``pandas.merge`` using its
        defaults. The result is written, without the index, to
        ``self._dest_dir/self._dest_pattern``.

        Raises:
            InvalidCount: if either pattern matches no file or more than one.
            Whatever the essential-parameter check raises when a required
            parameter is missing.
        """
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [
                self._src_dir,
                self._src1_pattern,
                self._src2_pattern,
                self._dest_dir,
                self._dest_pattern,
            ],
        )
        valid()

        if self._dest_pattern:
            self._logger.warning(
                "'dest_pattern' will be unavailable in the near future. "
                "'dest_pattern' will change to 'dest_name'."
            )

        target1_files = File().get_target_files(self._src_dir, self._src1_pattern)
        target2_files = File().get_target_files(self._src_dir, self._src2_pattern)
        if len(target1_files) == 0:
            raise InvalidCount(
                "An input file %s does not exist."
                % os.path.join(self._src_dir, self._src1_pattern)
            )
        elif len(target2_files) == 0:
            raise InvalidCount(
                "An input file %s does not exist."
                % os.path.join(self._src_dir, self._src2_pattern)
            )
        elif len(target1_files) > 1:
            self._logger.error("Hit target files %s" % target1_files)
            raise InvalidCount("Input files must be only one.")
        elif len(target2_files) > 1:
            self._logger.error("Hit target files %s" % target2_files)
            raise InvalidCount("Input files must be only one.")

        src1_path = target1_files[0]
        src2_path = target2_files[0]
        self._logger.info("Merge %s and %s." % (src1_path, src2_path))

        # The matched paths already include the source directory, so they are
        # used as-is; joining src_dir again would duplicate a relative prefix.
        df1 = pandas.read_csv(src1_path, dtype=str, encoding=self._encoding)
        df2 = pandas.read_csv(src2_path, dtype=str, encoding=self._encoding)

        df = pandas.merge(df1, df2)
        # "Unnamed: 0" is a column label (a previously saved index), so it has
        # to be looked up in the columns, not in the row index.
        if "Unnamed: 0" in df.columns:
            del df["Unnamed: 0"]
        df.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
Exemple #4
0
    def execute(self, *args):
        """Merge exactly two CSV files from the source directory into one CSV.

        All instance attributes are first written to the debug log. Each of
        the two private source patterns must match exactly one file in
        ``self._src_dir``. Both files are read with every column as ``str``
        and combined with ``pandas.merge`` using its defaults. The result is
        written, without the index, to ``self._dest_dir/self._dest_pattern``.

        Raises:
            InvalidCount: if either pattern matches no file or more than one.
            Whatever the essential-parameter check raises when a required
            parameter is missing.
        """
        for k, v in self.__dict__.items():
            self._logger.debug("%s : %s" % (k, v))

        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [
                self._src_dir,
                self.__src1_pattern,
                self.__src2_pattern,
                self._dest_dir,
                self._dest_pattern,
            ],
        )
        valid()

        target1_files = File().get_target_files(self._src_dir,
                                                self.__src1_pattern)
        target2_files = File().get_target_files(self._src_dir,
                                                self.__src2_pattern)
        if len(target1_files) == 0:
            raise InvalidCount(
                "An input file %s does not exist." %
                os.path.join(self._src_dir, self.__src1_pattern))
        elif len(target2_files) == 0:
            raise InvalidCount(
                "An input file %s does not exist." %
                os.path.join(self._src_dir, self.__src2_pattern))
        elif len(target1_files) > 1:
            self._logger.error("Hit target files %s" % target1_files)
            raise InvalidCount("Input files must be only one.")
        elif len(target2_files) > 1:
            self._logger.error("Hit target files %s" % target2_files)
            raise InvalidCount("Input files must be only one.")

        src1_path = target1_files[0]
        src2_path = target2_files[0]
        self._logger.info("Merge %s and %s." % (src1_path, src2_path))

        # The matched paths already include the source directory, so they are
        # used as-is; joining src_dir again would duplicate a relative prefix.
        df1 = pandas.read_csv(src1_path, dtype=str, encoding=self._encoding)
        df2 = pandas.read_csv(src2_path, dtype=str, encoding=self._encoding)

        df = pandas.merge(df1, df2)
        # "Unnamed: 0" is a column label (a previously saved index), so it has
        # to be looked up in the columns, not in the row index.
        if "Unnamed: 0" in df.columns:
            del df["Unnamed: 0"]
        df.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
Exemple #5
0
    def execute(self, *args):
        """Concatenate several CSV files row-wise into a single CSV file.

        Input files come either from ``self._src_pattern`` (searched in
        ``self._src_dir``) or from the explicit ``self._src_filenames`` list
        (each joined onto ``self._src_dir``); exactly one of the two must be
        given. Every file is read with all columns as ``str``. The result is
        written, without the index, to ``self._dest_dir/self._dest_pattern``.

        A single input file only triggers a warning; it is still written out.

        Raises:
            InvalidParameter: if neither or both of ``src_pattern`` and
                ``src_filenames`` are specified.
            FileNotFound: if no input file is found.
            Whatever the essential-parameter check raises when a required
            parameter is missing.
        """
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._dest_dir, self._dest_pattern],
        )
        valid()

        if self._dest_pattern:
            self._logger.warning(
                "'dest_pattern' will be unavailable in the near future. "
                "'dest_pattern' will change to 'dest_name'."
            )

        if not self._src_pattern and not self._src_filenames:
            raise InvalidParameter(
                "Specifying either 'src_pattern' or 'src_filenames' is essential."
            )
        if self._src_pattern and self._src_filenames:
            raise InvalidParameter(
                "Cannot specify both 'src_pattern' and 'src_filenames'."
            )

        if self._src_pattern:
            files = File().get_target_files(self._src_dir, self._src_pattern)
        else:
            files = [os.path.join(self._src_dir, name) for name in self._src_filenames]

        if len(files) == 0:
            raise FileNotFound("No files are found.")
        elif len(files) == 1:
            self._logger.warning("Two or more input files are required.")

        # Read everything first and concatenate once: concatenating inside the
        # loop re-copies the accumulated rows for every additional file.
        frames = [
            pandas.read_csv(path, dtype=str, encoding=self._encoding)
            for path in files
        ]
        merged = pandas.concat(frames)

        merged.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
Exemple #6
0
    def execute(self, *args):
        """Concatenate two or more CSV files row-wise into a single CSV file.

        Input files come either from ``self._src_pattern`` (searched in
        ``self._src_dir``) or from the explicit ``self._src_filenames`` list
        (each joined onto ``self._src_dir``); exactly one of the two must be
        given. Every file is read with all columns as ``str``. The result is
        written, without the index, to ``self._dest_dir/self._dest_pattern``.

        Raises:
            InvalidParameter: if neither or both of ``src_pattern`` and
                ``src_filenames`` are specified.
            InvalidCount: if fewer than two input files are available.
            Whatever the essential-parameter check raises when a required
            parameter is missing.
        """
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._dest_dir, self._dest_pattern],
        )
        valid()

        if not self._src_pattern and not self._src_filenames:
            raise InvalidParameter(
                "Specifying either 'src_pattern' or 'src_filenames' is essential."
            )
        if self._src_pattern and self._src_filenames:
            raise InvalidParameter(
                "Cannot specify both 'src_pattern' and 'src_filenames'.")

        if self._src_pattern:
            files = File().get_target_files(self._src_dir, self._src_pattern)
        else:
            files = [os.path.join(self._src_dir, name) for name in self._src_filenames]

        if len(files) < 2:
            raise InvalidCount("Two or more input files are required.")

        # Read everything first and concatenate once: concatenating inside the
        # loop re-copies the accumulated rows for every additional file.
        frames = [
            pandas.read_csv(path, dtype=str, encoding=self._encoding)
            for path in files
        ]
        merged = pandas.concat(frames)

        merged.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
Exemple #7
0
    def test_get_target_files_ok_no_files(self):
        """Check that get_target_files returns an empty list for an empty directory.

        The directory is removed in a ``finally`` clause so it is cleaned up
        even when the call under test raises.
        """
        # create an empty directory to search in
        os.makedirs(self.__data_dir, exist_ok=True)

        try:
            # execute
            target_files = File().get_target_files(self.__data_dir, "test(.*).csv")
        finally:
            shutil.rmtree(self.__data_dir)

        assert target_files == []
Exemple #8
0
 def test_remove_csv_col(self):
     """Smoke-test File.remove_csv_col on a small two-column CSV file.

     Writes a header row and one data row to the input file, then asks
     remove_csv_col to drop the "key" column into the output file. There is
     no assertion on the output; the test passes when the call does not raise.
     """
     test_input_csv = os.path.join(self._data_dir, "test_input.csv")
     test_output_csv = os.path.join(self._data_dir, "test_output.csv")
     test_csv_data = [["key", "data"], ["1", "spam"]]
     # newline="" lets the csv module control line endings itself, as the
     # csv documentation requires; the with-block flushes and closes the file.
     with open(test_input_csv, "w", newline="") as t:
         writer = csv.writer(t)
         writer.writerows(test_csv_data)
     File().remove_csv_col(test_input_csv, test_output_csv, ["key"])
Exemple #9
0
    def execute(self, *args):
        """Rewrite the header row of a single CSV file and copy its data rows.

        All instance attributes are first written to the debug log.
        ``self._src_pattern`` must match exactly one file in
        ``self._src_dir``. The first row of that file is passed to
        ``self.__replace_headers`` and the returned row is written as the new
        header; all remaining rows are copied unchanged. Output goes to
        ``self._dest_dir/self._dest_pattern`` with every field quoted.

        Raises:
            InvalidCount: if the pattern matches no file or more than one.
            Whatever the essential-parameter check raises when a required
            parameter is missing.
        """
        for k, v in self.__dict__.items():
            self._logger.debug("%s : %s" % (k, v))

        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [
                self._src_dir,
                self._src_pattern,
                self._dest_dir,
                self._dest_pattern,
                self.__headers,
            ],
        )
        valid()

        target_files = File().get_target_files(self._src_dir,
                                               self._src_pattern)
        if len(target_files) == 0:
            raise InvalidCount("An input file %s does not exist." %
                               os.path.join(self._src_dir, self._src_pattern))
        elif len(target_files) > 1:
            self._logger.error("Hit target files %s" % target_files)
            raise InvalidCount("Input files must be only one.")
        src_path = target_files[0]
        # The placeholder previously had no argument and was logged literally.
        self._logger.info("A target file to be converted: %s" % src_path)

        dest_path = os.path.join(self._dest_dir, self._dest_pattern)
        self._logger.info("Convert header of %s. An output file is %s." %
                          (src_path, dest_path))
        # newline="" lets the csv module handle line endings and embedded
        # newlines itself, as the csv documentation requires.
        with open(src_path, "r", encoding=self._encoding, newline="") as s, open(
                dest_path, "w", encoding=self._encoding, newline="") as d:
            reader = csv.reader(s)
            writer = csv.writer(d, quoting=csv.QUOTE_ALL)
            # NOTE(review): headers is None for an empty input file - confirm
            # that __replace_headers copes with that.
            headers = next(reader, None)
            new_headers = self.__replace_headers(headers)
            writer.writerow(new_headers)
            writer.writerows(reader)
Exemple #10
0
    def execute(self, *args):
        """Convert a single Excel file to a CSV file.

        All instance attributes are first written to the debug log.
        ``self._src_pattern`` must match exactly one file in
        ``self._src_dir``. The destination name ``self._dest_pattern`` must
        end in ``.csv``. The workbook is read with ``pandas.read_excel`` and
        written with ``DataFrame.to_csv`` to
        ``self._dest_dir/self._dest_pattern`` using ``self._encoding``.

        Raises:
            InvalidCount: if the pattern matches no file or more than one.
            InvalidFormat: if the destination extension is not ``.csv``.
            Whatever the essential-parameter check raises when a required
            parameter is missing.
        """
        for k, v in self.__dict__.items():
            self._logger.debug("%s : %s" % (k, v))

        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [
                self._src_dir, self._src_pattern, self._dest_dir,
                self._dest_pattern
            ],
        )
        valid()

        # get a target file
        target_files = File().get_target_files(self._src_dir,
                                               self._src_pattern)
        if len(target_files) == 0:
            raise InvalidCount("An input file %s does not exist." %
                               os.path.join(self._src_dir, self._src_pattern))
        elif len(target_files) > 1:
            self._logger.error("Hit target files %s" % target_files)
            raise InvalidCount("Input files must be only one.")
        src_path = target_files[0]
        self._logger.info("A target file to be converted: %s" % src_path)

        # convert
        _, dest_ext = os.path.splitext(self._dest_pattern)
        if dest_ext != ".csv":
            raise InvalidFormat(
                "%s is not supported format in %s. The supported format is .csv"
                % (dest_ext, self._dest_pattern))

        # Excel workbooks are binary, so no text encoding applies when reading;
        # read_excel has no "encoding" parameter in current pandas, so it is
        # no longer passed. The encoding is only used for the CSV output.
        df = pandas.read_excel(src_path)
        dest_path = os.path.join(self._dest_dir, self._dest_pattern)
        self._logger.info("Convert %s to %s" % (src_path, dest_path))
        df.to_csv(dest_path, encoding=self._encoding)
Exemple #11
0
    def execute(self, *args):
        """Remove the configured columns from the file produced by the parent step.

        The parent ``execute`` runs first and its return value is used as the
        input file. ``self._columns`` is then validated as an essential
        parameter, and the file is written to ``self._dest_path`` via
        ``File.remove_columns``.
        """
        # the parent step supplies the file to work on
        src_file = super().execute()

        # essential parameters check
        check_params = EssentialParameters(
            self.__class__.__name__,
            [self._columns],
        )
        check_params()

        file_util = File()
        file_util.remove_columns(src_file, self._dest_path, self._columns)
Exemple #12
0
 def get_target_files(self, src_dir, src_pattern):
     """
     Look up files in src_dir whose names match src_pattern.

     Delegates to File.get_target_files and returns its result unchanged.
     """
     matched = File().get_target_files(src_dir, src_pattern)
     return matched