def test_get_target_files_ok_file_exists(self):
    """
    get_target_files returns matching files, with and without tree search.

    With tree=True both the file in the data directory and the one in the
    sub directory are expected; with tree=False only the former.
    """
    # create test files
    os.makedirs(self.__data_dir, exist_ok=True)
    os.makedirs(self.__data_subdir, exist_ok=True)
    try:
        test_file = os.path.join(self.__data_dir, "test.csv")
        open(test_file, "w").close()
        test_file_sub = os.path.join(self.__data_subdir, "test_sub.csv")
        open(test_file_sub, "w").close()

        # execute1
        target_files = File().get_target_files(
            self.__data_dir, "test(.*).csv", tree=True
        )

        assert len(target_files) == 2
        assert target_files[0] == os.path.join(self.__data_dir, "test.csv")
        # NOTE(review): presumably __data_subdir is "<__data_dir>/sub" - confirm in setup
        assert target_files[1] == os.path.join(
            self.__data_dir, "sub", "test_sub.csv"
        )

        # execute2
        target_files = File().get_target_files(
            self.__data_dir, "test(.*).csv", tree=False
        )

        assert len(target_files) == 1
        assert target_files[0] == os.path.join(self.__data_dir, "test.csv")
    finally:
        # Clean up even when an assertion fails, so leftover fixture files
        # cannot influence other tests.
        shutil.rmtree(self.__data_subdir)
        shutil.rmtree(self.__data_dir)
def execute(self, *args):
    """
    Convert the character encoding of every file in src_dir matching src_pattern.

    When dest_dir is set, each converted file is written there under the same
    base name. Otherwise the file is converted in place: the result is written
    to a hidden temporary file next to the source, which then replaces it.
    Returns without doing anything when no file matches.
    """
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._encoding_from, self._encoding_to],
    )
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if not files:
        self._logger.info("No files are found. Nothing to do.")
        return

    for file in files:
        basename = os.path.basename(file)

        if self._dest_dir:
            File().convert_encoding(
                file,
                os.path.join(self._dest_dir, basename),
                self._encoding_from,
                self._encoding_to,
            )
        else:
            tmpfile = os.path.join(
                os.path.dirname(file),
                "." + StringUtil().random_str(10) + "." + basename,
            )
            try:
                File().convert_encoding(
                    file, tmpfile, self._encoding_from, self._encoding_to
                )
            except Exception:
                # Do not leave a partially written temporary file behind.
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)
                raise
            # os.replace overwrites the destination in a single step, so there
            # is no moment at which the original has been removed but the
            # converted file is not yet in place.
            os.replace(tmpfile, file)

        self._logger.info("Encoded file %s" % basename)
def execute(self, *args):
    """
    Merge the two csv files matched by src1_pattern and src2_pattern.

    Each pattern must match exactly one file in src_dir. Both files are read
    with every column as str, merged with pandas.merge (default arguments)
    and written to dest_dir/dest_pattern without the index.

    Raises:
        InvalidCount: a pattern matches no file, or more than one file.
    """
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [
            self._src_dir,
            self._src1_pattern,
            self._src2_pattern,
            self._dest_dir,
            self._dest_pattern,
        ],
    )
    valid()

    if self._dest_pattern:
        self._logger.warning(
            "'dest_pattern' will be unavailable in the near future."
            + "'dest_pattern' will change to 'dest_name'."
        )

    target1_files = File().get_target_files(self._src_dir, self._src1_pattern)
    target2_files = File().get_target_files(self._src_dir, self._src2_pattern)
    if len(target1_files) == 0:
        raise InvalidCount(
            "An input file %s does not exist."
            % os.path.join(self._src_dir, self._src1_pattern)
        )
    elif len(target2_files) == 0:
        raise InvalidCount(
            "An input file %s does not exist."
            % os.path.join(self._src_dir, self._src2_pattern)
        )
    elif len(target1_files) > 1:
        self._logger.error("Hit target files %s" % target1_files)
        raise InvalidCount("Input files must be only one.")
    elif len(target2_files) > 1:
        self._logger.error("Hit target files %s" % target2_files)
        raise InvalidCount("Input files must be only one.")

    self._logger.info("Merge %s and %s." % (target1_files[0], target2_files[0]))
    # The matched entries are used as paths directly. Joining them with
    # src_dir again would duplicate the directory for a relative src_dir.
    # NOTE(review): assumes get_target_files returns paths that already
    # include src_dir - confirm against File.get_target_files.
    df1 = pandas.read_csv(
        target1_files[0],
        dtype=str,
        encoding=self._encoding,
    )
    df2 = pandas.read_csv(
        target2_files[0],
        dtype=str,
        encoding=self._encoding,
    )
    df = pandas.merge(df1, df2)
    # "Unnamed: 0" is a column label, so it has to be looked up in
    # df.columns; df.index holds the row labels and never contains it.
    if "Unnamed: 0" in df.columns:
        del df["Unnamed: 0"]
    df.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
def execute(self, *args):
    """
    Merge the two csv files matched by the two source patterns.

    Each pattern must match exactly one file in src_dir. Both files are read
    with every column as str, merged with pandas.merge (default arguments)
    and written to dest_dir/dest_pattern without the index.

    Raises:
        InvalidCount: a pattern matches no file, or more than one file.
    """
    # dump every instance attribute for debugging
    for k, v in self.__dict__.items():
        self._logger.debug("%s : %s" % (k, v))

    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [
            self._src_dir,
            self.__src1_pattern,
            self.__src2_pattern,
            self._dest_dir,
            self._dest_pattern,
        ],
    )
    valid()

    target1_files = File().get_target_files(self._src_dir, self.__src1_pattern)
    target2_files = File().get_target_files(self._src_dir, self.__src2_pattern)
    if len(target1_files) == 0:
        raise InvalidCount(
            "An input file %s does not exist."
            % os.path.join(self._src_dir, self.__src1_pattern)
        )
    elif len(target2_files) == 0:
        raise InvalidCount(
            "An input file %s does not exist."
            % os.path.join(self._src_dir, self.__src2_pattern)
        )
    elif len(target1_files) > 1:
        self._logger.error("Hit target files %s" % target1_files)
        raise InvalidCount("Input files must be only one.")
    elif len(target2_files) > 1:
        self._logger.error("Hit target files %s" % target2_files)
        raise InvalidCount("Input files must be only one.")

    self._logger.info("Merge %s and %s." % (target1_files[0], target2_files[0]))
    # The matched entries are used as paths directly. Joining them with
    # src_dir again would duplicate the directory for a relative src_dir.
    # NOTE(review): assumes get_target_files returns paths that already
    # include src_dir - confirm against File.get_target_files.
    df1 = pandas.read_csv(
        target1_files[0],
        dtype=str,
        encoding=self._encoding,
    )
    df2 = pandas.read_csv(
        target2_files[0],
        dtype=str,
        encoding=self._encoding,
    )
    df = pandas.merge(df1, df2)
    # "Unnamed: 0" is a column label, so it has to be looked up in
    # df.columns; df.index holds the row labels and never contains it.
    if "Unnamed: 0" in df.columns:
        del df["Unnamed: 0"]
    df.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
def execute(self, *args):
    """
    Concatenate csv files into a single csv file.

    The input files come either from src_pattern (looked up in src_dir) or
    from src_filenames (joined with src_dir); exactly one of the two must be
    given. All files are read with every column as str, concatenated in list
    order and written to dest_dir/dest_pattern without the index. A single
    input file only triggers a warning and is written out as is.

    Raises:
        InvalidParameter: neither or both of src_pattern and src_filenames
            are specified.
        FileNotFound: no input file is found.
    """
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._dest_dir, self._dest_pattern],
    )
    valid()

    if self._dest_pattern:
        self._logger.warning(
            "'dest_pattern' will be unavailable in the near future."
            + "'dest_pattern' will change to 'dest_name'."
        )

    if not self._src_pattern and not self._src_filenames:
        raise InvalidParameter(
            "Specifying either 'src_pattern' or 'src_filenames' is essential."
        )
    if self._src_pattern and self._src_filenames:
        raise InvalidParameter(
            "Cannot specify both 'src_pattern' and 'src_filenames'."
        )

    if self._src_pattern:
        files = File().get_target_files(self._src_dir, self._src_pattern)
    else:
        files = [os.path.join(self._src_dir, name) for name in self._src_filenames]

    if len(files) == 0:
        raise FileNotFound("No files are found.")
    elif len(files) == 1:
        self._logger.warning("Two or more input files are required.")

    # Read everything first and concatenate once: calling pandas.concat
    # inside the loop copies the accumulated frame again for every file.
    frames = [
        pandas.read_csv(path, dtype=str, encoding=self._encoding) for path in files
    ]
    merged = pandas.concat(frames)

    merged.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
def execute(self, *args):
    """
    Concatenate two or more csv files into a single csv file.

    The input files come either from src_pattern (looked up in src_dir) or
    from src_filenames (joined with src_dir); exactly one of the two must be
    given. All files are read with every column as str, concatenated in list
    order and written to dest_dir/dest_pattern without the index.

    Raises:
        InvalidParameter: neither or both of src_pattern and src_filenames
            are specified.
        InvalidCount: fewer than two input files are available.
    """
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._dest_dir, self._dest_pattern],
    )
    valid()

    if not self._src_pattern and not self._src_filenames:
        raise InvalidParameter(
            "Specifying either 'src_pattern' or 'src_filenames' is essential."
        )
    if self._src_pattern and self._src_filenames:
        raise InvalidParameter(
            "Cannot specify both 'src_pattern' and 'src_filenames'."
        )

    if self._src_pattern:
        files = File().get_target_files(self._src_dir, self._src_pattern)
    else:
        files = [os.path.join(self._src_dir, name) for name in self._src_filenames]

    if len(files) < 2:
        raise InvalidCount("Two or more input files are required.")

    # Read everything first and concatenate once: calling pandas.concat
    # inside the loop copies the accumulated frame again for every file.
    frames = [
        pandas.read_csv(path, dtype=str, encoding=self._encoding) for path in files
    ]
    merged = pandas.concat(frames)

    merged.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
def test_get_target_files_ok_no_files(self):
    """
    get_target_files returns an empty list when no file matches the pattern.
    """
    # make sure the data directory exists; no test file is put into it
    os.makedirs(self.__data_dir, exist_ok=True)

    # execute
    found = File().get_target_files(self.__data_dir, "test(.*).csv")

    # the directory is removed before asserting
    shutil.rmtree(self.__data_dir)

    assert found == []
def test_remove_csv_col(self):
    """
    Write a two-column csv file and run File.remove_csv_col on its "key" column.
    """
    test_input_csv = os.path.join(self._data_dir, "test_input.csv")
    test_output_csv = os.path.join(self._data_dir, "test_output.csv")
    test_csv_data = [["key", "data"], ["1", "spam"]]

    # newline="" is what the csv module requires for files handed to
    # csv.writer; without it every row ends in "\r\r\n" on Windows.
    with open(test_input_csv, "w", newline="") as t:
        writer = csv.writer(t)
        writer.writerows(test_csv_data)

    # The input file is closed at this point, so its content is fully
    # written before it is read back.
    File().remove_csv_col(test_input_csv, test_output_csv, ["key"])
def execute(self, *args):
    """
    Rewrite the header row of a csv file and copy the remaining rows unchanged.

    src_pattern must match exactly one file in src_dir. Its first row is
    passed through __replace_headers, and the result plus all following rows
    are written to dest_dir/dest_pattern with every field quoted.

    Raises:
        InvalidCount: src_pattern matches no file, or more than one file.
    """
    # dump every instance attribute for debugging
    for k, v in self.__dict__.items():
        self._logger.debug("%s : %s" % (k, v))

    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [
            self._src_dir,
            self._src_pattern,
            self._dest_dir,
            self._dest_pattern,
            self.__headers,
        ],
    )
    valid()

    target_files = File().get_target_files(self._src_dir, self._src_pattern)
    if len(target_files) == 0:
        raise InvalidCount(
            "An input file %s does not exist."
            % os.path.join(self._src_dir, self._src_pattern)
        )
    elif len(target_files) > 1:
        self._logger.error("Hit target files %s" % target_files)
        raise InvalidCount("Input files must be only one.")

    # The placeholder used to be logged literally because no argument was
    # supplied for it.
    self._logger.info("A target file to be converted: %s" % target_files[0])

    dest_path = os.path.join(self._dest_dir, self._dest_pattern)
    self._logger.info(
        "Convert header of %s. An output file is %s." % (target_files[0], dest_path)
    )

    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for both readers and writers.
    with open(
        target_files[0], "r", encoding=self._encoding, newline=""
    ) as s, open(dest_path, "w", encoding=self._encoding, newline="") as d:
        reader = csv.reader(s)
        writer = csv.writer(d, quoting=csv.QUOTE_ALL)

        # headers is None when the source file has no rows at all
        headers = next(reader, None)
        new_headers = self.__replace_headers(headers)
        writer.writerow(new_headers)

        # copy all remaining rows as they are
        writer.writerows(reader)
def execute(self, *args):
    """
    Convert an excel file to a csv file.

    src_pattern must match exactly one file in src_dir, and dest_pattern must
    end with ".csv". The file is read with pandas.read_excel and written to
    dest_dir/dest_pattern using the configured encoding.

    Raises:
        InvalidCount: src_pattern matches no file, or more than one file.
        InvalidFormat: dest_pattern does not have the .csv extension.
    """
    # dump every instance attribute for debugging
    for k, v in self.__dict__.items():
        self._logger.debug("%s : %s" % (k, v))

    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._dest_dir, self._dest_pattern],
    )
    valid()

    # get a target file
    target_files = File().get_target_files(self._src_dir, self._src_pattern)
    if len(target_files) == 0:
        raise InvalidCount(
            "An input file %s does not exist."
            % os.path.join(self._src_dir, self._src_pattern)
        )
    elif len(target_files) > 1:
        self._logger.error("Hit target files %s" % target_files)
        raise InvalidCount("Input files must be only one.")
    self._logger.info("A target file to be converted: %s" % target_files[0])

    # convert
    _, dest_ext = os.path.splitext(self._dest_pattern)
    if dest_ext != ".csv":
        raise InvalidFormat(
            "%s is not supported format in %s. The supported format is .csv"
            % (dest_ext, self._dest_pattern)
        )

    # read_excel is called without an encoding argument: current pandas
    # releases reject that keyword with a TypeError. The encoding is applied
    # when the csv file is written.
    df = pandas.read_excel(target_files[0])
    dest_path = os.path.join(self._dest_dir, self._dest_pattern)
    self._logger.info("Convert %s to %s" % (target_files[0], dest_path))
    df.to_csv(dest_path, encoding=self._encoding)
def execute(self, *args):
    """
    Run the parent step, then remove the configured columns from its result.

    The value returned by the parent execute() is handed to
    File.remove_columns together with the destination path and the columns.
    """
    source_file = super().execute()

    # essential parameters check
    EssentialParameters(self.__class__.__name__, [self._columns])()

    File().remove_columns(source_file, self._dest_path, self._columns)
def get_target_files(self, src_dir, src_pattern):
    """
    Look up the files in src_dir which match src_pattern.

    The search itself is delegated to File.get_target_files and its result
    is returned as is.
    """
    file_util = File()
    return file_util.get_target_files(src_dir, src_pattern)