Example #1
    def execute(self, *args):
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._dest_dir],
        )
        valid()

        if not self._columns and not self._column_numbers:
            raise InvalidParameter(
                "Specifying either 'columns' or 'column_numbers' is essential.")
        if self._columns and self._column_numbers:
            raise InvalidParameter(
                "Cannot specify both 'columns' and 'column_numbers'.")

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound("The specified csv file was not found.")

        for f in files:
            _, filename = os.path.split(f)
            dest_path = os.path.join(self._dest_dir, filename)
            if self._columns:
                Csv.extract_columns_with_names(f, dest_path, self._columns)
            elif self._column_numbers:
                if isinstance(self._column_numbers, int):
                    remain_column_numbers = [self._column_numbers]
                else:
                    column_numbers = self._column_numbers.split(",")
                    remain_column_numbers = [int(n) for n in column_numbers]
                Csv.extract_columns_with_numbers(f, dest_path,
                                                 remain_column_numbers)
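For reference, here is a minimal sketch of what Csv.extract_columns_with_numbers presumably does, assuming 1-based column numbers (the test in Example #2 passes [1, 3]); the actual cliboa helper may differ:

import csv

def extract_columns_with_numbers(src, dest, remain_column_numbers):
    # Keep only the given column numbers (1-based), in the order given.
    indexes = [n - 1 for n in remain_column_numbers]
    with open(src, mode="rt", newline="") as i, open(dest, mode="wt", newline="") as o:
        reader = csv.reader(i)
        writer = csv.writer(o)
        for row in reader:
            writer.writerow([row[idx] for idx in indexes])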
Example #2
 def test_extract_columns_with_numbers_with_no_headers(self):
     # create test csv
     os.makedirs(self._data_dir, exist_ok=True)
     test_csv = os.path.join(self._data_dir, "test.csv")
     test_csv_data = [
         ["1", "spam1", "hoge1"],
         ["2", "spam2", "hoge2"],
         ["3", "spam3", "hoge3"],
     ]
     with open(test_csv, "w") as t:
         writer = csv.writer(t)
         writer.writerows(test_csv_data)
         t.flush()
     try:
         output_file = os.path.join(self._data_dir, "output.csv")
         remain_column_numbers = [1, 3]
         Csv.extract_columns_with_numbers(test_csv, output_file,
                                          remain_column_numbers)
         with open(output_file, "r") as o:
             reader = csv.reader(o)
             for r in reader:
                 assert r[0] in [
                     test_csv_data[0][0],
                     test_csv_data[1][0],
                     test_csv_data[2][0],
                 ]
                 assert r[1] in [
                     test_csv_data[0][2],
                     test_csv_data[1][2],
                     test_csv_data[2][2],
                 ]
     finally:
         shutil.rmtree(self._data_dir)
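Since extracting columns [1, 3] should keep only the first and third source columns, the membership assertions above could also be written as an exact comparison of the output rows. A stricter variant, assuming the helper preserves row order:

expected = [["1", "hoge1"], ["2", "hoge2"], ["3", "hoge3"]]
with open(output_file, "r", newline="") as o:
    assert list(csv.reader(o)) == expected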
Example #3
    def execute(self, *args):
        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) != 1:
            raise Exception("There must be exactly one input file.")
        self._logger.info("Files found %s" % files)

        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [
                self._before_format,
                self._before_enc,
                self._after_format,
                self._after_enc,
                self._dest_dir,
                self._dest_pattern,
            ],
        )
        valid()

        if self._dest_pattern:
            self._logger.warning(
                "'dest_pattern' will be unavailable in the near future. "
                "Basically, every class that extends FileBaseTransform will accept "
                "plural input files, and output files will have the same names as "
                "the input files.\n"
                "At that time, if 'dest_dir' is given, transformed files will be "
                "created in the given directory.\n"
                "If not, the original files will be replaced by the transformed files."
            )

        with open(files[0], mode="rt", encoding=self._before_enc) as i:
            reader = csv.reader(
                i, delimiter=Csv.delimiter_convert(self._before_format)
            )
            with open(
                os.path.join(self._dest_dir, self._dest_pattern),
                mode="wt",
                newline="",
                encoding=self._after_enc,
            ) as o:
                writer = csv.writer(
                    o,
                    delimiter=Csv.delimiter_convert(self._after_format),
                    quoting=Csv.quote_convert(self._quote),
                    lineterminator=Csv.newline_convert(self._after_nl),
                )
                for line in reader:
                    writer.writerow(line)
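Csv.delimiter_convert, Csv.quote_convert and Csv.newline_convert are not shown in these examples. Judging from how they are used, they presumably map configuration strings to csv-module values roughly as follows (a sketch of the assumed mapping, not the actual cliboa code):

import csv

def delimiter_convert(fmt):
    # Assumed mapping: "csv" -> comma, "tsv" -> tab
    return {"csv": ",", "tsv": "\t"}.get(fmt, ",")

def quote_convert(quote):
    # Assumed mapping: e.g. "QUOTE_ALL" -> csv.QUOTE_ALL
    return getattr(csv, quote.upper(), csv.QUOTE_MINIMAL)

def newline_convert(nl):
    # Assumed mapping: e.g. "LF" -> "\n", "CRLF" -> "\r\n"
    return {"LF": "\n", "CR": "\r", "CRLF": "\r\n"}.get(nl, "\r\n")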
Example #4
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [
                self._before_format,
                self._before_enc,
            ],
        )
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)

        if self._after_format is None:
            self._after_format = self._before_format
        if self._after_enc is None:
            self._after_enc = self._before_enc

        for file in files:
            new_file = self._new_file(file)
            with open(file, mode="rt", encoding=self._before_enc) as i:
                reader = csv.reader(
                    i, delimiter=Csv.delimiter_convert(self._before_format)
                )
                with open(
                    new_file, mode="wt", newline="", encoding=self._after_enc
                ) as o:
                    writer = csv.writer(
                        o,
                        delimiter=Csv.delimiter_convert(self._after_format),
                        quoting=Csv.quote_convert(self._quote),
                        lineterminator=Csv.newline_convert(self._after_nl),
                    )

                    # The first row is the header row; replace the names there.
                    for idx, line in enumerate(reader):
                        if idx == 0:
                            writer.writerow(self._replace_headers(line))
                        else:
                            writer.writerow(line)

            # Swap in the converted file; new_file[:-5] presumably strips the
            # temporary suffix added by self._new_file.
            os.remove(file)
            os.rename(new_file, new_file[:-5])
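self._new_file and self._replace_headers are helpers of this step and are not shown here. A minimal sketch of what _replace_headers might look like, assuming the step keeps a hypothetical self._headers dict mapping old header names to new ones:

    def _replace_headers(self, old_headers):
        # Rename any header found in the mapping; keep the others unchanged.
        return [self._headers.get(h, h) for h in old_headers]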
Example #5
 def test_extract_columns_with_names(self):
     # create test csv
     os.makedirs(self._data_dir, exist_ok=True)
     test_csv = os.path.join(self._data_dir, "test.csv")
     test_csv_data = [["key", "data"], ["1", "spam"]]
     with open(test_csv, "w") as t:
         writer = csv.writer(t)
         writer.writerows(test_csv_data)
         t.flush()
     output_file = os.path.join(self._data_dir, "output.csv")
     try:
         remain_columns = ["key"]
         Csv.extract_columns_with_names(test_csv, output_file,
                                        remain_columns)
         with open(output_file, "r") as o:
             reader = csv.DictReader(o)
             for r in reader:
                 assert r["key"] == test_csv_data[1][0]
     finally:
         shutil.rmtree(self._data_dir)
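As with the sketch after Example #1, this is roughly what Csv.extract_columns_with_names presumably does, here written with csv.DictReader and csv.DictWriter (the real implementation may differ):

import csv

def extract_columns_with_names(src, dest, remain_columns):
    # Keep only the named columns; the first row of src is treated as the header.
    with open(src, mode="rt", newline="") as i, open(dest, mode="wt", newline="") as o:
        reader = csv.DictReader(i)
        writer = csv.DictWriter(o, fieldnames=remain_columns, extrasaction="ignore")
        writer.writeheader()
        for row in reader:
            writer.writerow(row)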
Example #6
        def func():
            # Find csv columns from all csv files
            csv_columns = []
            for file in files:
                csv_columns.extend(Csv.get_column_names(file))
            csv_columns = sorted(set(csv_columns), key=csv_columns.index)

            if self._refresh is True:
                # Drop the table in advance if refresh is True
                self._sqlite_adptr.drop_table(self._tblname)
                self._sqlite_adptr.create_table(self._tblname, csv_columns,
                                                self._primary_key)
            else:
                self._sqlite_adptr.create_table(self._tblname, csv_columns,
                                                self._primary_key)

                if self._force_insert is True:
                    # Add csv columns that do not yet exist in the table
                    db_columns = self._sqlite_adptr.get_column_names(
                        self._tblname)
                    result = list(set(csv_columns) - set(db_columns))
                    self._sqlite_adptr.add_columns(self._tblname, result)
                else:
                    # Make sure the csv columns and the db table columns are
                    # exactly the same
                    db_columns = self._sqlite_adptr.get_column_names(
                        self._tblname)
                    escaped_csv = self._sqlite_adptr.escape_columns(csv_columns)
                    escaped_db = self._sqlite_adptr.escape_columns(db_columns)
                    if escaped_csv != escaped_db:
                        raise CliboaException(
                            "Csv columns %s do not match table columns %s."
                            % (csv_columns, db_columns))

            for file in files:
                self._sqlite_adptr.import_table(file,
                                                self._tblname,
                                                refresh=False,
                                                encoding=self._encoding)

            if self._index:
                # Create the index last: adding it after the inserts performs
                # better when the amount of inserted data is large.
                self._sqlite_adptr.add_index(self._tblname, self._index)
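self._sqlite_adptr is a cliboa adapter whose implementation is not shown. For orientation, a rough sketch of the kind of bulk insert import_table presumably performs, written against the stdlib sqlite3 and csv modules only (assumed behaviour, not the adapter's actual code):

import csv
import sqlite3

def import_table(dbpath, csv_file, tblname, encoding="utf-8"):
    # Read the csv header for the column list, then bulk-insert every data row.
    with open(csv_file, mode="rt", encoding=encoding, newline="") as f:
        reader = csv.reader(f)
        columns = next(reader)
        quoted = ", ".join('"%s"' % c for c in columns)
        placeholders = ", ".join("?" for _ in columns)
        sql = 'INSERT INTO "%s" (%s) VALUES (%s)' % (tblname, quoted, placeholders)
        with sqlite3.connect(dbpath) as conn:
            conn.executemany(sql, reader)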