def execute(self, *args):
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._dest_dir],
    )
    valid()

    if not self._columns and not self._column_numbers:
        raise InvalidParameter(
            "Specifying either 'columns' or 'column_numbers' is essential."
        )
    if self._columns and self._column_numbers:
        raise InvalidParameter(
            "Cannot specify both 'columns' and 'column_numbers'."
        )

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound("The specified csv file was not found.")

    for f in files:
        _, filename = os.path.split(f)
        dest_path = os.path.join(self._dest_dir, filename)
        if self._columns:
            Csv.extract_columns_with_names(f, dest_path, self._columns)
        elif self._column_numbers:
            if isinstance(self._column_numbers, int):
                remain_column_numbers = [self._column_numbers]
            else:
                column_numbers = self._column_numbers.split(",")
                remain_column_numbers = [int(n) for n in column_numbers]
            Csv.extract_columns_with_numbers(f, dest_path, remain_column_numbers)
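# The Csv.extract_columns_with_numbers helper is not shown here. As a rough,
# hypothetical sketch only (the function name and the 1-based indexing are
# assumptions, not the actual cliboa implementation), extracting columns by
# number with pandas could look like this:
def _extract_columns_with_numbers_sketch(src_path, dest_path, column_numbers, encoding="utf-8"):
    import pandas

    df = pandas.read_csv(src_path, dtype=str, encoding=encoding)
    # Convert 1-based column numbers to 0-based positional indexes.
    positions = [n - 1 for n in column_numbers]
    df.iloc[:, positions].to_csv(dest_path, encoding=encoding, index=False)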
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._dest_dir, self._dest_pattern],
    )
    valid()

    if self._dest_pattern:
        self._logger.warning(
            "'dest_pattern' will be unavailable in the near future. "
            "'dest_pattern' will change to 'dest_name'."
        )

    if not self._src_pattern and not self._src_filenames:
        raise InvalidParameter(
            "Specifying either 'src_pattern' or 'src_filenames' is essential."
        )
    if self._src_pattern and self._src_filenames:
        raise InvalidParameter(
            "Cannot specify both 'src_pattern' and 'src_filenames'."
        )

    if self._src_pattern:
        files = File().get_target_files(self._src_dir, self._src_pattern)
    else:
        files = [os.path.join(self._src_dir, file) for file in self._src_filenames]

    if len(files) == 0:
        raise FileNotFound("No files are found.")
    elif len(files) == 1:
        self._logger.warning("Two or more input files are required.")

    file = files.pop(0)
    df1 = pandas.read_csv(
        file,
        dtype=str,
        encoding=self._encoding,
    )

    for file in files:
        df2 = pandas.read_csv(
            file,
            dtype=str,
            encoding=self._encoding,
        )
        df1 = pandas.concat([df1, df2])

    df1.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
def execute(self, *args):
    super().execute()

    if not self._key and not self._bucket:
        raise InvalidParameter("Specifying either 'key' or 'bucket' is essential.")
    if self._key and self._bucket:
        raise InvalidParameter("Cannot specify both 'key' and 'bucket'.")

    # Fetch records and keep them in memory, or save them as files via GCS.
    if self._key:
        valid = EssentialParameters(self.__class__.__name__, [self._tblname])
        valid()
        self._save_to_cache()
    elif self._bucket:
        valid = EssentialParameters(self.__class__.__name__, [self._dest_dir])
        valid()
        self._save_as_file_via_gcs()
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._dest_dir, self._dest_pattern],
    )
    valid()

    if not self._src_pattern and not self._src_filenames:
        raise InvalidParameter(
            "Specifying either 'src_pattern' or 'src_filenames' is essential."
        )
    if self._src_pattern and self._src_filenames:
        raise InvalidParameter(
            "Cannot specify both 'src_pattern' and 'src_filenames'."
        )

    if self._src_pattern:
        files = File().get_target_files(self._src_dir, self._src_pattern)
    else:
        files = [os.path.join(self._src_dir, file) for file in self._src_filenames]

    if len(files) < 2:
        raise InvalidCount("Two or more input files are required.")

    file = files.pop(0)
    df1 = pandas.read_csv(
        file,
        dtype=str,
        encoding=self._encoding,
    )

    for file in files:
        df2 = pandas.read_csv(
            file,
            dtype=str,
            encoding=self._encoding,
        )
        df1 = pandas.concat([df1, df2])

    df1.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
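# Reading with dtype=str, as execute() does above, keeps values such as zero-padded
# IDs intact when the concatenated frame is written back out. A minimal standalone
# sketch (sample data is made up, not part of cliboa):
def _concat_dtype_str_sketch():
    import io

    import pandas

    df_a = pandas.read_csv(io.StringIO("id,val\n001,x\n"), dtype=str)
    df_b = pandas.read_csv(io.StringIO("id,val\n002,y\n"), dtype=str)
    merged = pandas.concat([df_a, df_b])
    assert merged["id"].tolist() == ["001", "002"]  # the leading zeros survive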
def execute(self, *args):
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._dest_pattern],
    )
    valid()

    if self._dest_pattern:
        self._logger.warning(
            "'dest_pattern' will be unavailable in the near future. "
            "Basically, every class that extends FileBaseTransform will accept "
            "plural input files, and output files will keep the same names as the "
            "input files.\n"
            "At that time, if 'dest_dir' is given, transformed files will be created in the given directory.\n"  # noqa
            "If not, the original files will be overwritten by the transformed files."
        )

    files = super().get_target_files(self._src_dir, self._src_pattern)
    self._logger.info("Files found %s" % files)
    if len(files) == 0:
        raise InvalidCount("No files are found.")

    dir = self._dest_dir if self._dest_dir is not None else self._src_dir
    dest_path = os.path.join(dir, self._dest_pattern + ".%s" % self._format)

    if self._format == "tar":
        with tarfile.open(dest_path, "w") as tar:
            for file in files:
                arcname = (
                    os.path.join(self._dest_pattern, os.path.basename(file))
                    if self._create_dir
                    else os.path.basename(file)
                )
                tar.add(file, arcname=arcname)
    elif self._format == "zip":
        with zipfile.ZipFile(dest_path, "w") as zp:
            for file in files:
                arcname = (
                    os.path.join(self._dest_pattern, os.path.basename(file))
                    if self._create_dir
                    else os.path.basename(file)
                )
                zp.write(file, arcname=arcname)
    else:
        raise InvalidParameter(
            "'format' must be one of the following: [tar, zip]"
        )
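# A minimal standalone illustration (paths and names are hypothetical) of how the
# 'create_dir' option above changes archive member names: with create_dir the members
# are nested under a directory named after dest_pattern, otherwise they sit at the
# archive root.
def _zip_member_names_sketch(files, dest_path, dest_pattern, create_dir):
    with zipfile.ZipFile(dest_path, "w") as zp:
        for file in files:
            arcname = (
                os.path.join(dest_pattern, os.path.basename(file))
                if create_dir
                else os.path.basename(file)
            )
            zp.write(file, arcname=arcname)
    with zipfile.ZipFile(dest_path) as zp:
        # e.g. ["out/a.csv"] with create_dir, ["a.csv"] without
        return zp.namelist()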
def _source_path_reader(self, src, encoding="utf-8"):
    """
    If src is a dict, returns the path to a temporary file containing the content
    specified in src; otherwise returns src as-is.
    """
    if src is None:
        return src
    if isinstance(src, dict) and "content" in src:
        with tempfile.NamedTemporaryFile(
            mode="w", encoding=encoding, delete=False
        ) as fp:
            fp.write(src["content"])
            return fp.name
    elif isinstance(src, dict) and "file" in src:
        if not os.path.exists(src["file"]):
            raise FileNotFound(src)
        return src["file"]
    else:
        raise InvalidParameter("The parameter is invalid.")
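# A minimal, standalone illustration (not cliboa's public API) of the "content"
# branch above: because NamedTemporaryFile is created with delete=False, the file
# survives the with-block, so its name can be handed to clients that expect a
# key-file path. The caller is responsible for removing it afterwards.
def _inline_content_to_path_sketch(content, encoding="utf-8"):
    with tempfile.NamedTemporaryFile(mode="w", encoding=encoding, delete=False) as fp:
        fp.write(content)
        return fp.name

# _inline_content_to_path_sketch('{"type": "service_account"}') returns a real path
# whose file contains the given JSON string.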
def __call__(self):
    for p in self.__param_list:
        if not p:
            raise InvalidParameter(
                "An essential parameter is not specified in %s." % self.__cls_name
            )
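# Usage sketch, taken from how the execute() methods above call this class: the
# validator is constructed with the caller's class name and the list of values that
# must be set, then invoked. Any falsy entry raises InvalidParameter.
#
#   valid = EssentialParameters(self.__class__.__name__, [self._src_dir, self._src_pattern])
#   valid()  # raises InvalidParameter if src_dir or src_pattern is empty or None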
def _save_as_file_via_gcs(self):
    self._logger.info("Save data as a file via GCS")
    os.makedirs(self._dest_dir, exist_ok=True)
    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % (StringUtil().random_str(self._RANDOM_STR_LENGTH), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    if isinstance(self._credentials, str):
        self._logger.warning(
            "DeprecationWarning: "
            "In the near future, "
            "the `credentials` will be changed to accept only dictionary types. "
            "Please see more information "
            "https://github.com/BrainPad/cliboa/blob/master/docs/modules/bigquery_read.md"
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    gbq_client = BigQuery.get_bigquery_client(key_filepath)

    if self._dataset and self._tblname:
        table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
    elif self._dataset and not self._tblname:
        tmp_tbl = (
            "tmp_" + StringUtil().random_str(self._RANDOM_STR_LENGTH) + "_" + ymd_hms
        )
        table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)
    gcs_client = Gcs.get_gcs_client(key_filepath)
    gcs_bucket = gcs_client.bucket(self._bucket)

    # extract job config settings
    ext_job_config = BigQuery.get_extract_job_config()
    ext_job_config.compression = BigQuery.get_compression_type()
    ext = ".csv"
    if self._filename:
        _, ext = os.path.splitext(self._filename)
        support_ext = [".csv", ".json"]
        if ext not in support_ext:
            raise InvalidParameter("%s is not supported as filename." % ext)
    ext_job_config.destination_format = BigQuery.get_destination_format(ext)

    comp_format_and_ext = {"GZIP": ".gz"}
    comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
    if self._filename:
        dest_gcs = "gs://%s/%s/%s%s" % (
            self._bucket,
            prefix,
            self._filename,
            comp_ext,
        )
    else:
        dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext, comp_ext)

    # Execute query.
    if self._query:
        query_job_config = BigQuery.get_query_job_config()
        query_job_config.destination = table_ref
        query_job_config.write_disposition = BigQuery.get_write_disposition()
        query_job = gbq_client.query(
            self._query, location=self._location, job_config=query_job_config
        )
        query_job.result()

    # Extract to GCS
    extract_job = gbq_client.extract_table(
        table_ref, dest_gcs, job_config=ext_job_config, location=self._location
    )
    extract_job.result()

    # Download from gcs
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary table
    if self._query:
        gbq_client.delete_table(table_ref)

    # Cleanup temporary files
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        blob.delete()
def _save_as_file_via_gcs(self):
    self._logger.info("Save data as a file via GCS")
    os.makedirs(self._dest_dir, exist_ok=True)
    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % (StringUtil().random_str(self._RANDOM_STR_LENGTH), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    gbq_client = BigQuery.get_bigquery_client(self._credentials)
    if self._dataset and self._tblname:
        table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
    elif self._dataset and not self._tblname:
        tmp_tbl = (
            "tmp_" + StringUtil().random_str(self._RANDOM_STR_LENGTH) + "_" + ymd_hms
        )
        table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)
    gcs_client = Gcs.get_gcs_client(self._credentials)
    gcs_bucket = gcs_client.get_bucket(self._bucket)

    # extract job config settings
    ext_job_config = BigQuery.get_extract_job_config()
    ext_job_config.compression = BigQuery.get_compression_type()
    ext = ".csv"
    if self._filename:
        _, ext = os.path.splitext(self._filename)
        support_ext = [".csv", ".json"]
        if ext not in support_ext:
            raise InvalidParameter("%s is not supported as filename." % ext)
    ext_job_config.destination_format = BigQuery.get_destination_format(ext)

    comp_format_and_ext = {"GZIP": ".gz"}
    comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
    if self._filename:
        dest_gcs = "gs://%s/%s/%s%s" % (
            self._bucket,
            prefix,
            self._filename,
            comp_ext,
        )
    else:
        dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext, comp_ext)

    # Execute query.
    if self._query:
        query_job_config = BigQuery.get_query_job_config()
        query_job_config.destination = table_ref
        query_job_config.write_disposition = BigQuery.get_write_disposition()
        query_job = gbq_client.query(
            self._query, location=self._location, job_config=query_job_config
        )
        query_job.result()

    # Extract to GCS
    extract_job = gbq_client.extract_table(
        table_ref, dest_gcs, job_config=ext_job_config, location=self._location
    )
    extract_job.result()

    # Download from gcs
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary table
    if self._query:
        gbq_client.delete_table(table_ref)

    # Cleanup temporary files
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        blob.delete()