def execute(self, *args):
    """Extract a subset of columns from every matched csv file.

    Exactly one of 'column' (names) or 'column_numbers' (1-based positions,
    int or comma-separated string) must be configured. Each input file is
    written to self._dest_dir under its original file name.

    Raises:
        InvalidParameter: neither or both of column/column_numbers given.
        FileNotFound: no file matched src_dir/src_pattern.
    """
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._dest_dir],
    )
    valid()

    if not self._columns and not self._column_numbers:
        raise InvalidParameter(
            "Specifying either 'column' or 'column_numbers' is essential.")
    if self._columns and self._column_numbers:
        raise InvalidParameter(
            "Cannot specify both 'column' and 'column_numbers'.")

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound("The specified csv file not found.")

    for f in files:
        _, filename = os.path.split(f)
        dest_path = os.path.join(self._dest_dir, filename)
        if self._columns:
            Csv.extract_columns_with_names(f, dest_path, self._columns)
        elif self._column_numbers:
            # A single int means one column; otherwise parse the
            # comma-separated list of 1-based column numbers.
            if isinstance(self._column_numbers, int):
                remain_column_numbers = [self._column_numbers]
            else:
                remain_column_numbers = [
                    int(n) for n in self._column_numbers.split(",")
                ]
            Csv.extract_columns_with_numbers(f, dest_path, remain_column_numbers)
def test_extract_columns_with_numbers_with_no_headers(self):
    """extract_columns_with_numbers keeps only columns 1 and 3 (1-based)."""
    # create test csv
    os.makedirs(self._data_dir, exist_ok=True)
    test_csv = os.path.join(self._data_dir, "test.csv")
    test_csv_data = [
        ["1", "spam1", "hoge1"],
        ["2", "spam2", "hoge2"],
        ["3", "spam3", "hoge3"],
    ]
    with open(test_csv, "w") as t:
        writer = csv.writer(t)
        writer.writerows(test_csv_data)
        t.flush()
    try:
        output_file = os.path.join(self._data_dir, "output.csv")
        remain_column_numbers = [1, 3]
        Csv.extract_columns_with_numbers(test_csv, output_file, remain_column_numbers)
        # BUG FIX: verify the produced output file, not the input file.
        # The test previously re-opened test_csv, so the extraction
        # result was never checked at all.
        with open(output_file, "r") as o:
            reader = csv.reader(o)
            for r in reader:
                assert r[0] in [
                    test_csv_data[0][0],
                    test_csv_data[1][0],
                    test_csv_data[2][0],
                ]
                assert r[1] in [
                    test_csv_data[0][2],
                    test_csv_data[1][2],
                    test_csv_data[2][2],
                ]
    finally:
        shutil.rmtree(self._data_dir)
def execute(self, *args):
    """Convert one csv/tsv file's delimiter, encoding and newline style.

    Exactly one input file must match src_dir/src_pattern; the converted
    result is written to dest_dir/dest_pattern.
    """
    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) != 1:
        raise Exception("Input file must be only one.")
    self._logger.info("Files found %s" % files)

    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [
            self._before_format,
            self._before_enc,
            self._after_format,
            self._after_enc,
            self._dest_dir,
            self._dest_pattern,
        ],
    )
    valid()

    if self._dest_pattern:
        self._logger.warning(
            "'dest_pattern' will be unavailable in the near future."
            + "Basically every classes which extends FileBaseTransform will be allowed"
            + " plural input files, and output files will be the same name with input"
            + " file names.\n"
            "At that time, if 'dest_dir' is given, transformed files will be created in the given directory.\n"  # noqa
            + "If not, original files will be updated by transformed files."
        )

    src_path = files[0]
    dest_path = os.path.join(self._dest_dir, self._dest_pattern)
    with open(src_path, mode="rt", encoding=self._before_enc) as fin:
        csv_in = csv.reader(
            fin, delimiter=Csv.delimiter_convert(self._before_format)
        )
        with open(
            dest_path, mode="wt", newline="", encoding=self._after_enc
        ) as fout:
            csv_out = csv.writer(
                fout,
                delimiter=Csv.delimiter_convert(self._after_format),
                quoting=Csv.quote_convert(self._quote),
                lineterminator=Csv.newline_convert(self._after_nl),
            )
            for row in csv_in:
                csv_out.writerow(row)
def execute(self, *args):
    """Rewrite the header row of every matched csv file.

    Each file is streamed through self._replace_headers() into a temp
    file created by self._new_file(), then the original is replaced by
    the converted file. Output format/encoding default to the input
    ones when not configured.
    """
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [
            self._before_format,
            self._before_enc,
        ],
    )
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)

    # Fall back to the input format/encoding when no output ones given.
    if self._after_format is None:
        self._after_format = self._before_format
    if self._after_enc is None:
        self._after_enc = self._before_enc

    for file in files:
        new_file = self._new_file(file)
        with open(file, mode="rt", encoding=self._before_enc) as src:
            reader = csv.reader(
                src, delimiter=Csv.delimiter_convert(self._before_format)
            )
            with open(
                new_file, mode="wt", newline="", encoding=self._after_enc
            ) as dest:
                writer = csv.writer(
                    dest,
                    delimiter=Csv.delimiter_convert(self._after_format),
                    quoting=Csv.quote_convert(self._quote),
                    lineterminator=Csv.newline_convert(self._after_nl),
                )
                # FIX: loop index was named 'i', shadowing the input
                # file handle opened above ('as i'); renamed to row_no.
                for row_no, line in enumerate(reader):
                    if row_no == 0:
                        writer.writerow(self._replace_headers(line))
                    else:
                        writer.writerow(line)
        # Replace the original file with the converted one.
        # NOTE(review): new_file[:-5] assumes _new_file() appends a
        # 5-character suffix — confirm against _new_file's definition.
        os.remove(file)
        os.rename(new_file, new_file[:-5])
def test_extract_columns_with_names(self):
    """extract_columns_with_names keeps only the 'key' column."""
    # create test csv
    os.makedirs(self._data_dir, exist_ok=True)
    test_csv = os.path.join(self._data_dir, "test.csv")
    test_csv_data = [["key", "data"], ["1", "spam"]]
    with open(test_csv, "w") as t:
        writer = csv.writer(t)
        writer.writerows(test_csv_data)
        t.flush()
    output_file = os.path.join(self._data_dir, "output.csv")
    try:
        remain_columns = ["key"]
        Csv.extract_columns_with_names(test_csv, output_file, remain_columns)
        # BUG FIX: verify the produced output file, not the input file.
        # Reading test_csv made the assertion pass trivially without
        # exercising the extraction result.
        with open(output_file, "r") as o:
            reader = csv.DictReader(o)
            for r in reader:
                assert r["key"] == test_csv_data[1][0]
    finally:
        shutil.rmtree(self._data_dir)
def func():
    """Create/refresh the sqlite table and bulk-load all csv files.

    Closure: reads `files` and `self` from the enclosing scope.
    """
    # Find csv columns from all csv files, de-duplicated while
    # preserving first-seen order.
    csv_columns = []
    for file in files:
        csv_columns.extend(Csv.get_column_names(file))
    csv_columns = sorted(set(csv_columns), key=csv_columns.index)

    if self._refresh is True:
        # Drop table in advance, If refresh is True
        self._sqlite_adptr.drop_table(self._tblname)
        self._sqlite_adptr.create_table(
            self._tblname, csv_columns, self._primary_key
        )
    else:
        self._sqlite_adptr.create_table(
            self._tblname, csv_columns, self._primary_key
        )
        if self._force_insert is True:
            # Add any columns present in the csv but missing from the
            # existing table.
            db_columns = self._sqlite_adptr.get_column_names(self._tblname)
            missing_columns = list(set(csv_columns) - set(db_columns))
            self._sqlite_adptr.add_columns(self._tblname, missing_columns)
        else:
            # Make sure if csv columns and db table names are exactly the same
            db_columns = self._sqlite_adptr.get_column_names(self._tblname)
            escaped_csv = self._sqlite_adptr.escape_columns(csv_columns)
            escaped_db = self._sqlite_adptr.escape_columns(db_columns)
            if escaped_csv != escaped_db:
                raise CliboaException(
                    "Csv columns %s were not matched to table column %s."
                    % (csv_columns, db_columns)
                )

    for file in files:
        self._sqlite_adptr.import_table(
            file, self._tblname, refresh=False, encoding=self._encoding
        )

    if self._index and len(self._index) > 0:
        """
        Create index (Add the index at the end for better performance
        when insert data is large)
        """
        self._sqlite_adptr.add_index(self._tblname, self._index)