def execute(self, context):
    if self.sa360_hook is None:
        self.sa360_hook = GoogleSearchAds360Hook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)
    if self.gcs_hook is None:
        self.gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)

    request = self.sa360_hook.get_service().reports().get(
        reportId=self.report_id)
    response = request.execute()

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        self._download_report(self.report_id, temp_file,
                              len(response['files']))
        destination_object_name = self._get_destination_uri(
            self.destination_object, temp_file)
        self.gcs_hook.upload(bucket=self.destination_bucket,
                             object=destination_object_name,
                             filename=temp_file.name,
                             multipart=True)
        context['task_instance'].xcom_push('destination_bucket',
                                           self.destination_bucket)
        context['task_instance'].xcom_push('destination_object',
                                           destination_object_name)
    finally:
        temp_file.close()
        os.unlink(temp_file.name)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.gcs_schema_object:
        gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(
            gcs_hook.download(gcs_bucket, gcs_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_empty_table(project_id=self.project_id,
                              dataset_id=self.dataset_id,
                              table_id=self.table_id,
                              schema_fields=schema_fields,
                              time_partitioning=self.time_partitioning,
                              labels=self.labels)
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
    )

    hook.insert_object_acl(bucket_name=self.bucket,
                           object_name=self.object_name,
                           entity=self.entity,
                           role=self.role,
                           generation=self.generation,
                           user_project=self.user_project)
def execute(self, context):
    logging.info('Exporting data to Cloud Storage bucket ' + self.bucket)

    if self.overwrite_existing and self.namespace:
        gcs_hook = GoogleCloudStorageHook(self.cloud_storage_conn_id)
        objects = gcs_hook.list(self.bucket, prefix=self.namespace)
        for o in objects:
            gcs_hook.delete(self.bucket, o)

    ds_hook = DatastoreHook(self.datastore_conn_id, self.delegate_to)
    result = ds_hook.export_to_storage_bucket(
        bucket=self.bucket,
        namespace=self.namespace,
        entity_filter=self.entity_filter,
        labels=self.labels)
    operation_name = result['name']
    result = ds_hook.poll_operation_until_done(
        operation_name, self.polling_interval_in_seconds)

    state = result['metadata']['common']['state']
    if state != 'SUCCESSFUL':
        raise AirflowException(
            'Operation failed: result={}'.format(result))

    if self.xcom_push:
        return result
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_external_table(
        external_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        compression=self.compression,
        skip_leading_rows=self.skip_leading_rows,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        src_fmt_configs=self.src_fmt_configs,
        labels=self.labels
    )
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    # Use the explicit schema if one was given, otherwise read it from GCS.
    schema_fields = self.schema_fields if self.schema_fields \
        else json.loads(gcs_hook.download(self.bucket, self.schema_object))

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]

    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        logging.info('Loaded BQ data with max {}.{}={}'.format(
            self.destination_project_dataset_table, self.max_id_key, max_id))
        return max_id
def copy_docs_to_gcs(self, bucket: str, bucket_path: str, project_path: str):
    """
    Copy doc files generated with dbt docs generate to GCS

    :param bucket: Bucket where the doc files will be copied
    :param bucket_path: Path in the bucket
    :param project_path: Local project folder
    """
    hook = GoogleCloudStorageHook()
    for doc_file in DBT_DOC_FILES:
        doc_file_path = f"{project_path}/{DBT_DOC_FOLDER}/{doc_file}"
        if os.path.exists(doc_file_path):
            logging.info(
                f"{doc_file} found. Copying to gs://{bucket}/{bucket_path}"
            )
            hook.upload(
                bucket,
                object=f"{bucket_path}/{doc_file}" if bucket_path else doc_file,
                filename=doc_file_path,
                mime_type="text/html" if doc_file.endswith(".html")
                else "application/json",
            )
        else:
            logging.warning(f"{doc_file} not found. Skipping")
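# A minimal sketch (not part of the source) of the module-level names that
# copy_docs_to_gcs above assumes. The folder and file list are assumptions
# based on what `dbt docs generate` typically writes to a project's target/
# directory; adjust them to the actual project layout.
import logging
import os

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

DBT_DOC_FOLDER = "target"  # assumed dbt docs output folder
DBT_DOC_FILES = ["index.html", "manifest.json", "catalog.json"]  # assumed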
def data_to_GCS(csv_name: str, folder_name: str,
                bucket_name="task_ts_data", **kwargs):
    hook = GoogleCloudStorageHook()
    data = load_data()
    df = pd.DataFrame(data=data)
    df.to_csv('corona_data.csv', index=False)

    columns_to_consider_for_uniqueness = ['country', 'region', 'sub_region']
    unique_column_name = 'full_county'
    minimum_datapoints_threshold = 60

    # Split the data-frame based on state or county.
    unique_df_list = []
    for col in columns_to_consider_for_uniqueness:
        df[col] = df[col].fillna('').apply(lambda x: x.replace(" ", "_"))
    df[unique_column_name] = df[columns_to_consider_for_uniqueness[0]].str.cat(
        df[columns_to_consider_for_uniqueness[1:]], sep="__")

    for i, g in df.groupby('full_county'):
        df_code = g.copy()
        ts_count = len(df_code)
        if ts_count > minimum_datapoints_threshold:
            df_code.reset_index(
                drop=True).loc[:, ~df.columns.str.contains('^Unnamed')].to_csv(
                    '{}.csv'.format(i), index=False)
            hook.upload(bucket_name,
                        object='{}/{}.csv'.format(folder_name, i),
                        filename='{}.csv'.format(i),
                        mime_type='text/csv')


"""
Function for full data pull
def poke(self, context):
    self.log.info('Sensor checks existence of objects: %s, %s',
                  self.bucket, self.prefixes)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)

    objects = []
    for prefix in self.prefixes:
        objects.extend(list(hook.list(self.bucket, prefix=prefix)))
    self.log.info(f'Objects list: {objects}')

    names, files, objects = names_match(objects)
    if names:
        ti = context['ti']
        self.__download(hook, objects, files, ti)
        ti.xcom_push(key='names', value=names)
        ti.xcom_push(key='files', value=files)
        ti.xcom_push(key='objects', value=objects)
        for name, fil in zip(names, files):
            ti.xcom_push(key=f'{name}', value=fil)
        self.log.info(
            f'names: {names}\nfiles: {files}\nobjects: {objects}')
        data_timestamp = current_datetime().isoformat()
        ti.xcom_push(key='data_timestamp', value=data_timestamp)
        return True
    return False
def execute(self, context):
    """
    1. Prepare data from the Dimension table, clean & store in CSV on local
    2. Upload the CSV to GCS
    """
    # depending on the flow name, execute the task
    if self.flow_name == 'dimension_currency':
        # prepare dimension data
        dimension_currency_to_csv(self.raw_data_filepath,
                                  self.clean_filepath)
    elif self.flow_name == 'exchange_rate_history':
        # prepare exchange rate history data
        exchange_rate_history_to_csv(self.raw_data_filepath,
                                     self.clean_filepath)
    else:
        raise AirflowException("Incorrect Flow name")

    # upload file to GCS
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)
    hook.upload(bucket=self.gcs_bucket,
                object=self.gcs_filepath,
                filename=self.clean_filepath)
    logging.info("File uploaded to GCS")

    # remove local file
    if os.path.exists(self.clean_filepath):
        os.remove(self.clean_filepath)
        logging.info(f"{self.clean_filepath} : File deleted from local")
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id,
                     verify=self.dest_verify)

    if not gcs_hook.exists(self.gcs_source_bucket, self.gcs_source_uri):
        self.log.error('Skip object not found: gs://%s/%s',
                       self.gcs_source_bucket, self.gcs_source_uri)
        raise AirflowException(
            'Skip object not found: gs://{}/{}'.format(
                self.gcs_source_bucket, self.gcs_source_uri))

    tmp = tempfile.NamedTemporaryFile()

    self.log.info('Download gs://%s/%s',
                  self.gcs_source_bucket, self.gcs_source_uri)
    gcs_hook.download(
        bucket=self.gcs_source_bucket,
        object=self.gcs_source_uri,
        filename=tmp.name,
    )

    self.log.info('Upload s3://%s/%s',
                  self.s3_destination_bucket, self.s3_destination_uri)
    s3_hook.load_file(
        filename=tmp.name,
        bucket_name=self.s3_destination_bucket,
        key=self.s3_destination_uri,
        replace=True,
        acl_policy=self.s3_acl_policy
    )

    tmp.close()
def outputManager(self, context, output, key, bucket):
    if output is None or len(output) == 0:
        if self.total_output_files == 0:
            logging.info("No records pulled from Hubspot.")

            downstream_tasks = context['task'].get_flat_relatives(
                upstream=False)

            logging.info('Skipping downstream tasks...')
            logging.debug("Downstream task_ids %s", downstream_tasks)

            if downstream_tasks:
                self.skip(context['dag_run'],
                          context['ti'].execution_date,
                          downstream_tasks)
    else:
        logging.info('Logging {0} to GCS...'.format(key))

        # flatten() and boa.constrict() are helpers assumed to be imported
        # elsewhere: they flatten nested records and snake_case the keys
        # before serializing each record as a JSON line.
        output = [flatten(e) for e in output]
        output = '\n'.join([json.dumps({boa.constrict(k): v
                                        for k, v in i.items()})
                            for i in output])

        gcs = GoogleCloudStorageHook(self.gcs_conn_id)

        with open("__temp__", "w") as fid:
            fid.write(output)

        gcs.upload(self.gcs_bucket, self.gcs_object, "__temp__")

        self.total_output_files += 1
def execute(self, context):
    self.log.info('Executing copy - Source_Bucket: %s, Source_directory: %s, '
                  'Destination_bucket: %s, Destination_directory: %s',
                  self.source_bucket, self.source_object,
                  self.destination_bucket or self.source_bucket,
                  self.destination_directory or self.source_object)

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    self.log.info('Getting list of the files to copy. '
                  'Source Bucket: %s; Source Object: %s',
                  self.source_bucket, self.source_object)

    # Create a list of objects to copy from the source bucket. The function
    # uses the prefix keyword to pass the name of the object to copy.
    self.files_to_copy = hook.list(bucket=self.source_bucket,
                                   prefix=self.source_object,
                                   delimiter=self.source_files_delimiter)

    # Log the names of all objects to be copied
    self.log.info('Files to copy: %s', self.files_to_copy)

    if self.files_to_copy is not None:
        for file_to_copy in self.files_to_copy:
            self.log.info('Source_Bucket: %s, Source_Object: %s, '
                          'Destination_bucket: %s, Destination_Directory: %s',
                          self.source_bucket, file_to_copy,
                          self.destination_bucket or self.source_bucket,
                          self.destination_directory + file_to_copy)
            hook.copy(self.source_bucket, file_to_copy,
                      self.destination_bucket,
                      self.destination_directory + file_to_copy)
    else:
        self.log.info('No files to copy.')
def gcs_to_psql_import(**kwargs):
    fd, tmp_filename = tempfile.mkstemp(text=True)

    # download file locally
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=kwargs['gcp_conn_id'])
    gcs_hook.download(bucket=kwargs['bucket'],
                      object=kwargs['object'],
                      filename=tmp_filename)
    del gcs_hook

    # load the file into postgres
    pg_hook = PostgresHook(postgres_conn_id=kwargs['postgres_conn_id'],
                           schema=kwargs['database'])
    pg_hook.bulk_load(
        '{schema}.{table}'.format(schema=kwargs['schema'],
                                  table=kwargs['table']),
        tmp_filename)

    # output errors
    for output in pg_hook.conn.notices:
        print(output)

    # remove temp file
    os.close(fd)
    os.unlink(tmp_filename)
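# Hedged usage sketch (not from the source): one way the gcs_to_psql_import
# callable above could be wired into an Airflow 1.x DAG via a PythonOperator.
# All connection ids, bucket/object names, and table names are placeholder
# assumptions.
from airflow.operators.python_operator import PythonOperator

load_csv_into_postgres = PythonOperator(
    task_id="gcs_to_psql_import",
    python_callable=gcs_to_psql_import,
    op_kwargs={
        "gcp_conn_id": "google_cloud_default",   # assumed connection id
        "bucket": "example-bucket",              # assumed source bucket
        "object": "exports/data.tsv",            # assumed object (tab-separated for bulk_load)
        "postgres_conn_id": "postgres_default",  # assumed connection id
        "database": "analytics",                 # assumed database
        "schema": "public",                      # assumed schema
        "table": "imported_data",                # assumed table
    },
    dag=dag,  # assumes a `dag` object defined elsewhere
)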
def do_copy_model_to_final(**kwargs):
    gcs = GoogleCloudStorageHook()

    # Returns all the objects within the bucket. All sub-buckets are
    # considered as prefix of the leaves. List does not differentiate
    # files from subbuckets.
    all_jobs_files = gcs.list(
        bucket=COMPOSER_BUCKET_NAME,
        prefix='{}/export/estimate'.format(PREFIX_JOBS_EXPORT))

    # Extract the latest model bucket, parent of variables/ and
    # saved_model.pbtxt. The max() string contains the latest model folder
    # in 1234567; we extract that using a regex.
    # ex: jobs/clv-composer/export/estimate/1234567890/variables/variables.index
    # returns /1234567890/
    latest_model_bucket = re.findall(r'/\d+/', max(all_jobs_files))[0]

    # List all the files that need to be copied (only files in the latest
    # bucket, skipping the ones that are not files but sub buckets).
    for c in [
        f for f in all_jobs_files
        if latest_model_bucket in f and f[-1] != '/'
    ]:
        # The model used for training is saved into a 'final' sub bucket of
        # the export bucket.
        dest_object = c.split(latest_model_bucket)[1]
        dest_object = '{}/{}'.format(PREFIX_FINAL_MODEL, dest_object)

        logging.info("Copying {} to {} ...".format(dest_object,
                                                   COMPOSER_BUCKET_NAME))

        gcs.copy(source_bucket=COMPOSER_BUCKET_NAME,
                 source_object=c,
                 destination_object=dest_object)
def execute(self, context):
    if self.gcs_hook is None:
        self.gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)
    if self.cm_hook is None:
        self.cm_hook = GoogleCampaignManagerHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        report_file_name = self._download_report(self.report_id,
                                                 self.file_id,
                                                 temp_file,
                                                 self.chunk_size)
        destination_object_name = self._get_destination_uri(
            self.destination_object, report_file_name)
        self.gcs_hook.upload(bucket=self.destination_bucket,
                             object=destination_object_name,
                             filename=temp_file.name,
                             gzip=True,
                             multipart=True)

        context['task_instance'].xcom_push('destination_bucket',
                                           self.destination_bucket)
        context['task_instance'].xcom_push('destination_object',
                                           destination_object_name)
    finally:
        temp_file.close()
        os.unlink(temp_file.name)
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    if '*' in self.source_object:
        wildcard_position = self.source_object.index('*')
        objects = hook.list(
            self.source_bucket,
            prefix=self.source_object[:wildcard_position],
            delimiter=self.source_object[wildcard_position + 1:])

        for source_object in objects:
            self.log.info('Executing copy of gs://{0}/{1} to '
                          'gs://{2}/{3}/{1}'.format(
                              self.source_bucket, source_object,
                              self.destination_bucket,
                              self.destination_object))
            hook.copy(self.source_bucket, source_object,
                      self.destination_bucket,
                      "{}/{}".format(self.destination_object, source_object))

            if self.move_object:
                hook.delete(self.source_bucket, source_object)
    else:
        self.log.info('Executing copy: %s, %s, %s, %s',
                      self.source_bucket, self.source_object,
                      self.destination_bucket or self.source_bucket,
                      self.destination_object or self.source_object)
        hook.copy(self.source_bucket, self.source_object,
                  self.destination_bucket, self.destination_object)

        if self.move_object:
            hook.delete(self.source_bucket, self.source_object)
def execute(self, context):
    # use the super to list all files in a Google Cloud Storage bucket
    files = super(GoogleCloudStorageToS3Operator, self).execute(context)

    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id,
                     verify=self.dest_verify)

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, _ = S3Hook.parse_s3_url(self.dest_s3_key)
        existing_files = s3_hook.list_keys(bucket_name)
        files = set(files) - set(existing_files)

    if files:
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        for file in files:
            file_bytes = hook.download(self.bucket, file)

            dest_key = self.dest_s3_key + file
            self.log.info("Saving file to %s", dest_key)

            s3_hook.load_bytes(file_bytes,
                               key=dest_key,
                               replace=self.replace)

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
def execute(self, context):
    """
    See class definition.
    """
    # Get columns from Salesforce
    sf_cols = self.fetch_sf_columns(self.sf_conn_id, self.sf_object)
    print('this is SF data')
    print(sf_cols)
    self.xcom_push(context, key='sf_cols',
                   value=[col['sf_name'] for col in sf_cols])

    # Get columns from BigQuery
    # bq_cols = self.fetch_bq_columns(self.bq_table)
    bq_cols = self.patch_bq_cols(self.bq_table, sf_cols)

    gcs = GoogleCloudStorageHook(self.gcs_conn_id)
    with NamedTemporaryFile("w") as tmp:
        tmp.file.write(str(bq_cols).replace("'", '"'))
        tmp.file.flush()
        gcs.upload(bucket=self.gcs_bucket,
                   object=self.gcs_key,
                   filename=tmp.name)

    self.xcom_push(context, key='bq_cols', value=str(bq_cols))
def execute(self, context):
    if self.gcs_hook is None:
        self.gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)
    if self.ga_hook is None:
        self.ga_hook = GoogleAnalyticsManagementHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
        )

    try:
        temp_ga_upload_file = tempfile.NamedTemporaryFile(delete=False)
        self._get_file_from_cloud_storage(self.gcs_hook,
                                          self.storage_bucket,
                                          self.storage_name_object,
                                          temp_ga_upload_file)
        self.ga_hook.upload_file(temp_ga_upload_file.name,
                                 self.account_id,
                                 self.web_property_id,
                                 self.custom_data_source_id,
                                 self.mime_type)
    finally:
        temp_ga_upload_file.close()
        os.unlink(temp_ga_upload_file.name)
def execute(self, context):
    if self.hook is None:
        self.hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        # TODO(efolgar): Directly stream to storage instead of temp file
        self._download_report(self.report_url, temp_file, self.chunk_size)
        destination_object_name = self._get_destination_uri(
            self.destination_object, self.report_url)
        self.hook.upload(
            bucket=self.destination_bucket,
            object=destination_object_name,
            filename=temp_file.name,
            multipart=True)

        context['task_instance'].xcom_push(
            'destination_bucket', self.destination_bucket)
        context['task_instance'].xcom_push(
            'destination_object', destination_object_name)
    finally:
        temp_file.close()
        os.unlink(temp_file.name)
def execute(self, context):
    self.log.info("Fetching launch data")
    launch_hook = LaunchHook(conn_id=self._launch_conn_id)
    result = launch_hook.get_launches(
        start_date=self._start_date, end_date=self._end_date
    )
    self.log.info("Fetched data for %d launches", len(result))

    self.log.info(
        "Uploading data to gcs://%s/%s", self._output_bucket, self._output_path
    )
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self._gcp_conn_id
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "result.json")
        with open(tmp_path, "w") as file_:
            json.dump(result, file_)

        gcs_hook.upload(
            bucket=self._output_bucket,
            object=self._output_path,
            filename=tmp_path
        )
def _upload_to_gcs(self, files_to_upload):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for object, tmp_file_handle in files_to_upload.items():
        hook.upload(self.bucket, object, tmp_file_handle.name,
                    'application/json')
def _get_data_from_gcs(gcp_conn_id, bucket, input):
    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=gcp_conn_id)
    tmp_file = NamedTemporaryFile(delete=False)
    hook.download(bucket, input, tmp_file.name)
    filename = tmp_file.name
    return filename
def poke(self, context):
    self.log.info('Sensor checks existence of : %s, %s',
                  self.bucket, self.object)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return hook.exists(self.bucket, self.object)
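# Hedged usage sketch (not from the source): the poke above matches the stock
# GoogleCloudStorageObjectSensor shipped in Airflow 1.x contrib, so a task
# built on it could look like the following. Bucket, object, and task_id are
# placeholder assumptions.
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor

wait_for_object = GoogleCloudStorageObjectSensor(
    task_id="wait_for_object",                    # assumed task id
    bucket="example-bucket",                      # assumed bucket
    object="exports/data.csv",                    # assumed object path
    google_cloud_conn_id="google_cloud_default",  # default GCP connection
    poke_interval=60,
    timeout=60 * 60,
    dag=dag,  # assumes a `dag` object defined elsewhere
)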
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.gcp_conn_id)

    hook.insert_bucket_acl(bucket_name=self.bucket,
                           entity=self.entity,
                           role=self.role,
                           user_project=self.user_project)
def poke(self, context):
    self.log.info('Sensor checks existence of objects: %s, %s',
                  self.bucket, self.prefix)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return bool(hook.list(self.bucket, prefix=self.prefix))
def execute(self, context):
    logging.info('Executing download: %s, %s, %s',
                 self.bucket, self.object, self.filename)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    print(hook.download(self.bucket, self.object, self.filename))
def schema(self):
    hook = GoogleCloudStorageHook()
    objs = hook.download(
        self.config['bucket_name'],
        '{}/{}.json'.format(self.config['schemas_clean_path'], self.table))
    return json.loads(objs)
def poke(self, context):
    logging.info('Sensor checks existence of : %s, %s',
                 self.bucket, self.object)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return hook.is_updated_after(self.bucket, self.object,
                                 self.ts_func(context))