def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._src_pattern, self._dest_dir]
    )
    valid()
    service = BlobServiceAdapter().get_client(
        account_url=self._account_url,
        account_access_key=self._account_access_key,
        connection_string=self._connection_string,
    )
    container_client = service.get_container_client(self._container_name)
    blobs = container_client.list_blobs(name_starts_with=self._prefix)
    for blob in blobs:
        filename = blob.name
        rec = re.compile(self._src_pattern)
        if not rec.fullmatch(filename):
            continue
        dest_path = os.path.join(self._dest_dir, os.path.basename(filename))
        blob_client = service.get_blob_client(
            container=self._container_name, blob=filename
        )
        with open(dest_path, "wb") as local_blob:
            blob_data = blob_client.download_blob()
            blob_data.readinto(local_blob)
def execute(self, *args):
    super().execute()
    input_valid = IOInput(self._io)
    input_valid()

    for k, v in self.__dict__.items():
        self._logger.info("%s : %s" % (k, v))

    param_valid = EssentialParameters(self.__class__.__name__, [self._tblname])
    param_valid()

    tbl_valid = SqliteTableExistence(self._dbname, self._tblname)
    tbl_valid()

    def dict_factory(cursor, row):
        d = {}
        for i, col in enumerate(cursor.description):
            d[col[0]] = row[i]
        return d

    self._sqlite_adptr.connect(self._dbname)
    cur = self._sqlite_adptr.fetch(sql=self.__get_query(), row_factory=dict_factory)
    for r in cur:
        self._s.save(r)
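# Illustrative sketch, not part of the original module: how a (cursor, row) row
# factory like dict_factory above works with the standard sqlite3 API, which the
# adapter's fetch() presumably wraps. Uses only the standard library; the table
# and column names are made up for the example.
import sqlite3


def _dict_factory(cursor, row):
    # cursor.description yields one 7-tuple per column; index 0 is the column name.
    return {col[0]: row[i] for i, col in enumerate(cursor.description)}


if __name__ == "__main__":
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE t (id INTEGER, name TEXT)")
    conn.execute("INSERT INTO t VALUES (1, 'foo')")
    conn.row_factory = _dict_factory  # cursors created after this return dicts
    for rec in conn.execute("SELECT * FROM t"):
        print(rec)  # {'id': 1, 'name': 'foo'}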
def execute(self, *args):
    if self.__form_auth is True:
        valid = EssentialParameters(
            self.__class__.__name__,
            [self.__form_url, self.__form_id, self.__form_password],
        )
        valid()
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(self.__class__.__name__, [self._src_pattern])
    valid()
    adapter = (
        S3Adapter(self._access_key, self._secret_key)
        if self._access_key and self._secret_key
        else S3Adapter()
    )
    client = adapter.get_client()
    p = client.get_paginator("list_objects")
    for page in p.paginate(
        Bucket=self._bucket, Delimiter=self._delimiter, Prefix=self._prefix
    ):
        for c in page.get("Contents", []):
            path = c.get("Key")
            filename = os.path.basename(path)
            rec = re.compile(self._src_pattern)
            if not rec.fullmatch(filename):
                continue
            dest_path = os.path.join(self._dest_dir, filename)
            client.download_file(self._bucket, path, dest_path)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(self.__class__.__name__, [self._src_pattern])
    valid()
    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
                "Please see more information "
                "https://github.com/BrainPad/cliboa/blob/master/docs/modules/gcs_download.md"
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    client = Gcs.get_gcs_client(key_filepath)
    bucket = client.bucket(self._bucket)
    dl_files = []
    for blob in client.list_blobs(
        bucket, prefix=self._prefix, delimiter=self._delimiter
    ):
        r = re.compile(self._src_pattern)
        if not r.fullmatch(blob.name):
            continue
        dl_files.append(blob.name)
        blob.download_to_filename(
            os.path.join(self._dest_dir, os.path.basename(blob.name))
        )
    ObjectStore.put(self._step, dl_files)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._src_dir, self._src_pattern]
    )
    valid()
    resource = self._s3_resource()
    bucket = resource.Bucket(self._bucket)
    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) > 0:
        for f in files:
            bucket.upload_file(
                Key=os.path.join(self._key, os.path.basename(f)), Filename=f
            )
    else:
        self._logger.info(
            "Files to upload do not exist. File pattern: {}".format(
                os.path.join(self._src_dir, self._src_pattern)
            )
        )
        if self._quit is True:
            return StepStatus.SUCCESSFUL_TERMINATION
def execute(self, *args):
    super().execute()
    if not self._key and not self._bucket:
        raise InvalidParameter("Specifying either 'key' or 'bucket' is essential.")
    if self._key and self._bucket:
        raise InvalidParameter("Cannot specify both 'key' and 'bucket'.")

    # fetch records and save to on-memory
    if self._key:
        valid = EssentialParameters(self.__class__.__name__, [self._tblname])
        valid()
        self._save_to_cache()
    elif self._bucket:
        valid = EssentialParameters(self.__class__.__name__, [self._dest_dir])
        valid()
        self._save_as_file_via_gcs()
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._tblname, self._bucket, self._dest_dir]
    )
    valid()
    os.makedirs(self._dest_dir, exist_ok=True)
    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    gbq_client = BigQuery.get_bigquery_client(key_filepath)
    gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)
    gcs_client = Gcs.get_gcs_client(key_filepath)
    gcs_bucket = gcs_client.bucket(self._bucket)
    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)
    """
    gcs dir -> gs://{bucket_name}
                  /{dataset_name}/{table_name}
                  /{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
    """
    if self._filename:
        dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
    else:
        dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

    # job config settings
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV

    # Execute query.
    job = gbq_client.extract_table(
        gbq_ref, dest_gcs, job_config=job_config, location=self._location
    )
    job.result()

    # Download from gcs
    for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary files
    for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
        blob.delete()
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._host, self._user, self._src_dir, self._src_pattern],
    )
    valid()

    if isinstance(self._key, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `key` will be changed to accept only dictionary types. "
                "Please see more information "
                "https://github.com/BrainPad/cliboa/blob/master/docs/modules/sftp_delete.md"
            )
        )
        key_filepath = self._key
    else:
        key_filepath = self._source_path_reader(self._key)

    # remove src
    sftp = Sftp(
        self._host,
        self._user,
        self._password,
        key_filepath,
        self._passphrase,
        self._timeout,
        self._retry_count,
        self._port,
    )
    sftp.clear_files(self._src_dir, re.compile(self._src_pattern))
def execute(self, *args): for k, v in self.__dict__.items(): self._logger.info("%s : %s" % (k, v)) super().execute() valid = EssentialParameters(self.__class__.__name__, [self._src_dir, self._src_pattern]) valid() session = None if self._access_key and self._secret_key: session = Session(self._access_key, self._secret_key, self._region) s3 = session.resource("s3") if session else boto3.resource("s3") bucket = s3.Bucket(self._bucket) files = super().get_target_files(self._src_dir, self._src_pattern) if len(files) == 0: raise FileNotFound( "Files matching to the specified pattern %s is not found." % os.path.join(self._src_dir, self._src_pattern)) else: for f in files: bucket.upload_file(Key=os.path.join(self._key, os.path.basename(f)), Filename=f)
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._host, self._user, self._src_dir, self._src_pattern],
    )
    valid()
    os.makedirs(self._dest_dir, exist_ok=True)

    # fetch src
    sftp = Sftp(
        self._host,
        self._user,
        self._password,
        self._key,
        self._timeout,
        self._retry_count,
        self._port,
    )
    files = sftp.list_files(
        self._src_dir, self._dest_dir, re.compile(self._src_pattern)
    )

    if self._quit is True and len(files) == 0:
        self._logger.info("No file was found. Subsequent processing will be skipped.")
        return StepStatus.SUCCESSFUL_TERMINATION

    self._logger.info("Files downloaded %s" % files)

    # cache downloaded file names
    ObjectStore.put(self._step, files)
def execute(self, *args):
    valid = EssentialParameters(
        self.__class__.__name__, [self._user, self._password]
    )
    valid()
    super().execute()
def execute(self, *args): for k, v in self.__dict__.items(): self._logger.info("%s : %s" % (k, v)) # essential parameters check valid = EssentialParameters( self.__class__.__name__, [self._host, self._user, self._src_dir, self._src_pattern], ) valid() os.makedirs(self._dest_dir, exist_ok=True) # fetch src sftp = Sftp( self._host, self._user, self._password, self._key, self._timeout, self._retry_count, self._port, ) files = sftp.list_files(self._src_dir, self._dest_dir, re.compile(self._src_pattern)) if self.__quit is True and len(files) == 0: self._logger.info( "No file was found. After process will not be processed") return 0 # cache downloaded file names ObjectStore.put(self._step, files)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._collection, self._document, self._dest_dir]
    )
    valid()
    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
                "Please see more information "
                "https://github.com/BrainPad/cliboa/blob/master/docs/modules/firestore_document_download.md"  # noqa
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    firestore_client = Firestore.get_firestore_client(key_filepath)
    ref = firestore_client.document(self._collection, self._document)
    doc = ref.get()
    with open(os.path.join(self._dest_dir, doc.id), mode="wt") as f:
        f.write(json.dumps(doc.to_dict()))
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._src_dir, self._src_pattern, self._dest_dir]
    )
    valid()
    service = BlobServiceAdapter().get_client(
        account_url=self._account_url,
        account_access_key=self._account_access_key,
        connection_string=self._connection_string,
    )
    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) > 0:
        for f in files:
            path = os.path.join(self._dest_dir, os.path.basename(f))
            blob_client = service.get_blob_client(
                container=self._container_name, blob=path
            )
            with open(f, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)
    else:
        self._logger.info(
            "Files to upload do not exist. File pattern: {}".format(
                os.path.join(self._src_dir, self._src_pattern)
            )
        )
        if self._quit is True:
            return StepStatus.SUCCESSFUL_TERMINATION
def test_essential_parameters_ng(self):
    """
    EssentialParameters invalid case
    """
    with pytest.raises(CliboaException) as excinfo:
        valid = EssentialParameters("DummyClass", [""])
        valid()
    assert "is not specified" in str(excinfo.value)
def execute(self, *args):
    param_valid = EssentialParameters(self.__class__.__name__, [self._raw_query])
    param_valid()

    def func():
        self._sqlite_adptr.execute(self._raw_query)

    super().execute(func)
def execute(self, *args): super().execute() valid = EssentialParameters(self.__class__.__name__, [self._key]) valid() df = pandas.read_gbq( query=self._get_query(), dialect="standard", location=self._location, project_id=self._project_id, credentials=self._auth(), ) ObjectStore.put(self._key, df)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._tblname, self._bucket, self._dest_dir]
    )
    valid()
    os.makedirs(self._dest_dir, exist_ok=True)

    gbq_client = bigquery.Client.from_service_account_json(self._credentials)
    gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

    gcs_client = storage.Client.from_service_account_json(self._credentials)
    gcs_bucket = gcs_client.get_bucket(self._bucket)

    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    # gcs dir -> gs://{bucket_name}/{dataset_name}/{table_name}/{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
    if self._filename:
        dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
    else:
        dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

    # job config settings
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV

    # Execute query.
    job = gbq_client.extract_table(
        gbq_ref, dest_gcs, job_config=job_config, location=self._location
    )
    job.result()

    # Download from gcs
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary files
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        blob.delete()
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__, [self._collection, self._document, self._dest_dir]
    )
    valid()
    firestore_client = self._firestore_client()
    ref = firestore_client.document(self._collection, self._document)
    doc = ref.get()
    with open(os.path.join(self._dest_dir, doc.id), mode="wt") as f:
        f.write(json.dumps(doc.to_dict()))
def execute(self, *args):
    super().execute()
    param_valid = EssentialParameters(self.__class__.__name__, [self._tblname])
    param_valid()

    tbl_valid = SqliteTableExistence(self._dbname, self._tblname)
    tbl_valid()

    output_valid = IOOutput(self._io)
    output_valid()

    # get table column definition
    self._sqlite_adptr.connect(self._dbname)
    column_def = self.__get_column_def()

    if self._refresh is True:
        self.__refresh_table(column_def)

    # database transaction
    def insert():
        self._logger.info("Start to insert")
        insert_rows = []
        with open(self._s.cache_file, "r", encoding="utf-8") as f:
            for i, l_str in enumerate(f, 1):
                l_dict = ast.literal_eval(l_str)
                insert_rows.append(l_dict)

                # Check only once
                if i == 1:
                    self.__valid_column_def(column_def, l_dict)

                # execute bulk insert
                if i % self._insert_cnt == 0:
                    self._sqlite_adptr.execute_many_insert(
                        self._tblname, column_def, insert_rows, self._replace_into
                    )
                    insert_rows.clear()

        if len(insert_rows) > 0:
            self._sqlite_adptr.execute_many_insert(
                self._tblname, column_def, insert_rows, self._replace_into
            )
            insert_rows.clear()

        self._logger.info("Finish to insert")

    super().execute(insert)
    self._s.remove()
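# Illustrative sketch, not part of the original module: the chunked bulk-insert
# pattern used by insert() above, written against the standard sqlite3 module on
# the assumption that execute_many_insert() wraps executemany(). The function
# name, arguments, and chunk size here are made up for illustration.
import sqlite3


def bulk_insert(conn, tblname, columns, rows, chunk_size=1000):
    placeholders = ", ".join("?" for _ in columns)
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (
        tblname, ", ".join(columns), placeholders
    )
    buf = []
    for row in rows:
        buf.append(tuple(row[c] for c in columns))
        if len(buf) >= chunk_size:
            conn.executemany(sql, buf)  # flush one chunk per round trip
            buf.clear()
    if buf:
        conn.executemany(sql, buf)  # flush the remainder
    conn.commit()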
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._host, self._user, self._src_dir, self._src_pattern],
    )
    valid()
    os.makedirs(self._dest_dir, exist_ok=True)

    if isinstance(self._key, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `key` will be changed to accept only dictionary types. "
                "Please see more information "
                "https://github.com/BrainPad/cliboa/blob/master/docs/modules/sftp_download.md"
            )
        )
        key_filepath = self._key
    else:
        key_filepath = self._source_path_reader(self._key)

    # fetch src
    sftp = Sftp(
        self._host,
        self._user,
        self._password,
        key_filepath,
        self._passphrase,
        self._timeout,
        self._retry_count,
        self._port,
    )
    files = sftp.list_files(
        self._src_dir,
        self._dest_dir,
        re.compile(self._src_pattern),
        self._endfile_suffix,
        self._ignore_empty_file,
    )

    if self._quit is True and len(files) == 0:
        self._logger.info("No file was found. Subsequent processing will be skipped.")
        return StepStatus.SUCCESSFUL_TERMINATION

    self._logger.info("Files downloaded %s" % files)

    # cache downloaded file names
    ObjectStore.put(self._step, files)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(self.__class__.__name__, [self._dest_path])
    valid()
    self._sqlite_adptr.connect(self._dbname)
    try:
        self._sqlite_adptr.export_table(
            self._tblname,
            self._dest_path,
            encoding=self._encoding,
            order=self._order,
            no_duplicate=self._no_duplicate,
        )
    finally:
        self._close_database()
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._host, self._user, self._src_dir, self._src_pattern],
    )
    valid()

    # remove src
    sftp = Sftp(
        self._host,
        self._user,
        self._password,
        self._key,
        self._timeout,
        self._retry_count,
        self._port,
    )
    sftp.clear_files(self._src_dir, re.compile(self._src_pattern))
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(self.__class__.__name__, [self._src_pattern])
    valid()
    client = self._s3_client()
    p = client.get_paginator("list_objects")
    for page in p.paginate(
        Bucket=self._bucket, Delimiter=self._delimiter, Prefix=self._prefix
    ):
        for c in page.get("Contents", []):
            filename = c.get("Key")
            rec = re.compile(self._src_pattern)
            if not rec.fullmatch(filename):
                continue
            dest_path = os.path.join(self._dest_dir, os.path.basename(filename))
            client.download_file(self._bucket, filename, dest_path)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(self.__class__.__name__, [self._src_pattern])
    valid()
    client = self._gcs_client()
    bucket = client.get_bucket(self._bucket)
    dl_files = []
    for blob in bucket.list_blobs(prefix=self._prefix, delimiter=self._delimiter):
        r = re.compile(self._src_pattern)
        if not r.fullmatch(blob.name):
            continue
        dl_files.append(blob.name)
        blob.download_to_filename(
            os.path.join(self._dest_dir, os.path.basename(blob.name))
        )
    ObjectStore.put(self._step, dl_files)
def execute(self, *args): for k, v in self.__dict__.items(): self._logger.info("%s : %s" % (k, v)) # essential parameters check valid = EssentialParameters( self.__class__.__name__, [self._host, self._user, self._src_dir, self._src_pattern], ) valid() # remove src sftp = Sftp( self._host, self._user, self._password, self._key, self._timeout, self._retry_count, self._port, ) sftp.clear_files(self._src_dir, re.compile(self._src_pattern))
def execute(self, *args): for k, v in self.__dict__.items(): self._logger.info("%s : %s" % (k, v)) super().execute() valid = EssentialParameters(self.__class__.__name__, [self._src_pattern]) valid() c = storage.Client(self._project_id, credentials=ServiceAccount.auth(self._credentials)) bucket = c.get_bucket(self._bucket) dl_files = [] for blob in bucket.list_blobs(prefix=self._prefix, delimiter=self._delimiter): r = re.compile(self._src_pattern) if not r.fullmatch(blob.name): continue dl_files.append(blob.name) blob.download_to_filename( os.path.join(self._dest_dir, os.path.basename(blob.name))) ObjectStore.put(self._step, dl_files)
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(self.__class__.__name__, [self._key])
    valid()
    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    df = pandas.read_gbq(
        query=self._get_query(),
        dialect="standard",
        location=self._location,
        project_id=self._project_id,
        credentials=ServiceAccount.auth(key_filepath),
    )
    ObjectStore.put(self._key, df)
def execute(self, *args):
    # essential parameters check
    param_valid = EssentialParameters(self.__class__.__name__, [self._dbname])
    param_valid()