def should_run():
    storage = CloudStorage.factory(project_id)
    bucket = storage.get_bucket(bucket_name)
    blobs = storage.list_blobs(bucket)
    files = [b.name for b in blobs]
    # Branch to the Dataflow scheduling task only if a CSV has landed in the bucket.
    if any('.csv' in file for file in files):
        return 'schedule_df_wrench_to_lake'
    return 'finish'
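# A minimal sketch of how should_run might be wired up as a branch task; only
# the 'schedule_df_wrench_to_lake' and 'finish' task ids come from the function
# above, and the DAG id, start_date, and other task ids are illustrative
# assumptions (Airflow 1.x import paths, matching the rest of this repo).
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

with DAG('wrench_to_lake_example',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None,
         catchup=False) as example_dag:
    check_for_csv_task = BranchPythonOperator(
        task_id='check_for_csv', python_callable=should_run)
    schedule_df_task = DummyOperator(task_id='schedule_df_wrench_to_lake')
    finish_task = DummyOperator(task_id='finish')
    # The branch callable returns the task_id to follow; the other path is skipped.
    check_for_csv_task >> [schedule_df_task, finish_task]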
def process(self, unused_el):
    # Resolve any runtime ValueProviders before use.
    if isinstance(self._files_startwith, ValueProvider):
        self._files_startwith = self._files_startwith.get()
    if isinstance(self._files_ext, ValueProvider):
        self._files_ext = self._files_ext.get()
    if isinstance(self._sort_key, ValueProvider):
        self._sort_key = self._sort_key.get()
    if isinstance(self._env, ValueProvider):
        self._env = self._env.get()
    if isinstance(self._bucket, ValueProvider):
        self._bucket = self._bucket.get()

    project_id = GCLOUD.project(self._env)
    blobs = CloudStorage.factory(project_id).list_blobs(
        self._bucket, self._files_startwith)

    # Keep only files at the root of the bucket that match the extension.
    paths = [
        f'gs://{b.bucket.name}/{b.name}' for b in blobs
        if '/' not in b.name and self._files_ext in b.name
    ]

    # The sort key may arrive as a hex-encoded, dill-serialized callable.
    if isinstance(self._sort_key, str):
        self._sort_key = dill.loads(bytes.fromhex(self._sort_key))
    if len(paths) > 1:
        paths.sort(key=self._sort_key)

    for file in paths:
        yield file
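# The ValueProvider checks above suggest this DoFn runs as a templated Dataflow
# job. A minimal sketch of declaring those runtime parameters on a custom
# PipelineOptions class; the class name and argument names mirror the self._*
# fields read in process() and are assumptions, not the repo's actual options.
from apache_beam.options.pipeline_options import PipelineOptions


class GcsToLakeOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
        # Each value arrives as a RuntimeValueProvider when the template runs,
        # which is why process() unwraps them with .get().
        parser.add_value_provider_argument('--files_startwith', type=str)
        parser.add_value_provider_argument('--files_ext', type=str)
        parser.add_value_provider_argument('--sort_key', type=str)
        parser.add_value_provider_argument('--env', type=str)
        parser.add_value_provider_argument('--bucket', type=str)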
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='0 * * * *',
        catchup=False)

    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        storage = CloudStorage.factory(project_id)
        cdc_imports_bucket = storage.get_bucket(bucket)
        cdc_imports_processed_bucket = storage.get_bucket(processed_bucket)

        for files_startwith, table in table_map.items():
            pusher_task_id = f'schedule_df_gcs_to_lake_{table}'

            continue_if_file_task = BranchPythonOperator(
                task_id=f'continue_if_file_{files_startwith}',
                python_callable=should_continue,
                op_args=[files_startwith, cdc_imports_bucket, table])

            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project_id,
                template_name='load_cdc_from_gcs_to_lake',
                job_name=f'gcs-to-lake-{table}',
                job_parameters={
                    'files_startwith': files_startwith,
                    'dest': f'{project_id}:lake.{table}'
                },
                provide_context=True)

            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['cdc_from_gcs_to_lake'][
                    'poke_interval'],
                timeout=airflow_vars['dags']['cdc_from_gcs_to_lake'][
                    'poke_timeout'],
                dag=dag)

            move_files_task = PythonOperator(
                task_id=f'move_processed_files_{files_startwith}',
                python_callable=storage.move_files,
                op_args=[
                    files_startwith, cdc_imports_bucket,
                    cdc_imports_processed_bucket
                ],
            )

            (start_task >> continue_if_file_task >> schedule_df_task >>
             monitor_df_job_task >> move_files_task >> finish_task)

    return dag
def mv_to_s3(gcp_bucket, table, aws_access_key_id, aws_secret_access_key,
             aws_s3_bucket):
    def _is_cloud_storage_dir(object_name):
        return object_name.endswith('/')

    aws_s3_client = boto3.client('s3',
                                 aws_access_key_id=aws_access_key_id,
                                 aws_secret_access_key=aws_secret_access_key)

    # This directory should be removed once the operation is complete,
    # because of GDPR. Stu M. 11/29/19
    with tempdir() as tmp:
        bucket = CloudStorage.factory(project).get_bucket(gcp_bucket)
        blobs = bucket.list_blobs(prefix=table)
        bucket_dirs_marked_for_deletion = []

        for blob in blobs:
            key = blob.name
            file_or_dir = '{}/{}'.format(tmp, key)
            if _is_cloud_storage_dir(key):
                os.makedirs(file_or_dir, exist_ok=True)
                bucket_dirs_marked_for_deletion.append(key)
            else:
                # Give the uploaded object a unique name by prefixing the base
                # file name with a UUID.
                splits = key.split('/')
                splits[-1] = str(uuid.uuid4()) + '-' + splits[-1]
                key = '/'.join(splits)
                dirname = os.path.dirname(file_or_dir)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname, exist_ok=True)
                blob.download_to_filename(file_or_dir)
                aws_s3_client.upload_file(file_or_dir, aws_s3_bucket, key)
                blob.delete()

        # Cleanup here because folders in GCS are not deleted. Stu. M. 2/29/20
        for key in bucket_dirs_marked_for_deletion:
            blob = bucket.blob(key)
            blob.delete()
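# A minimal sketch of scheduling mv_to_s3 from a DAG; the DAG id, task id,
# bucket names, and table value are illustrative assumptions, and the AWS
# credentials would realistically come from an Airflow connection or secrets
# backend rather than module-level placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# Placeholder credentials for illustration only.
aws_access_key_id = 'AWS_ACCESS_KEY_ID'
aws_secret_access_key = 'AWS_SECRET_ACCESS_KEY'

with DAG('mv_to_s3_example',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as example_dag:
    mv_leads_to_s3_task = PythonOperator(
        task_id='mv_leads_to_s3',
        python_callable=mv_to_s3,
        op_args=[
            'cdc-exports',          # gcp_bucket
            'leads',                # table
            aws_access_key_id,
            aws_secret_access_key,
            'partner-drop-bucket',  # aws_s3_bucket
        ])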
def should_continue(prefix=None, bucket=None, table=None):
    if CloudStorage.factory(project_id).has_file(bucket=bucket, prefix=prefix):
        return f'schedule_df_gcs_to_lake_{table}'
    return 'finish'
def list_blobs(bucket, files_startswith):
    return CloudStorage.factory(project_id).list_blobs(bucket, files_startswith)
def delete_db_import_file():
    CloudStorage.factory(project_id).delete_blob(bucket, import_file_name)
def should_run():
    if CloudStorage.factory(project_id).blob_exists(bucket, import_file_name):
        return 'start_sql_instance'
    return 'finish'
def move_files():
    storage = CloudStorage.factory(project_id)
    bucket = storage.get_bucket(bucket_name)
    blobs = storage.list_blobs(bucket)
    for b in blobs:
        b.bucket.rename_blob(b, f'{processed_file_dir}/{b.name}')
def execute(self, context):
    return CloudStorage.factory(self.project_id).blob_exists(
        self.bucket, self.file_name)
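# A minimal sketch of the custom operator that execute() above presumably
# belongs to; the class name GcsFileExistsOperator and its constructor are
# assumptions, shown only to make the self.* attributes concrete. The
# execute() method above would live on this class.
from airflow.models import BaseOperator


class GcsFileExistsOperator(BaseOperator):

    def __init__(self, project_id, bucket, file_name, **kwargs):
        super().__init__(**kwargs)
        self.project_id = project_id
        self.bucket = bucket
        self.file_name = file_name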
def clear_gcs_bucket_by_table(env, table):
    bucket = CloudStorage.factory(project).get_bucket(gcs_bucket)
    blobs = bucket.list_blobs(prefix=table)
    for blob in blobs:
        blob.delete()