def execute(self, context):
    # use the super to list all files in an Azure Data Lake path
    files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
    g_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the ADLS path
        # and only keep those files which are present in
        # ADLS and not in Google Cloud Storage
        bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
        existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
        files = set(files) - set(existing_files)

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(bucket=dest_gcs_bucket,
                              object=dest_path,
                              filename=f.name)

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
def _upload_to_gcs(self, files_to_upload):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google cloud storage.
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for object, tmp_file_handle in files_to_upload.items():
        hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json')
def _upload_to_gcs(self, files_to_upload):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google cloud storage.
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for tmp_file in files_to_upload:
        hook.upload(self.bucket, tmp_file.get('file_name'),
                    tmp_file.get('file_handle').name,
                    mime_type=tmp_file.get('file_mime_type'))
def execute(self, context):
    """
    Uploads the file to Google cloud storage
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    hook.upload(
        bucket=self.bucket,
        object=self.dst,
        mime_type=self.mime_type,
        filename=self.src)
def _upload_to_gcs(self, files_to_upload):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google cloud storage.
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for tmp_file in files_to_upload:
        hook.upload(self.bucket, tmp_file.get('file_name'),
                    tmp_file.get('file_handle').name,
                    mime_type=tmp_file.get('file_mime_type'),
                    gzip=self.gzip if tmp_file.get('file_name') == self.schema_filename else False)
def execute(self, context):
    gcp_text_to_speech_hook = GCPTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)

    result = gcp_text_to_speech_hook.synthesize_speech(
        input_data=self.input_data,
        voice=self.voice,
        audio_config=self.audio_config,
        retry=self.retry,
        timeout=self.timeout,
    )

    with NamedTemporaryFile() as temp_file:
        temp_file.write(result.audio_content)
        # flush so the buffered audio bytes are on disk before the upload reads the file
        temp_file.flush()
        cloud_storage_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)
        cloud_storage_hook.upload(
            bucket=self.target_bucket_name,
            object=self.target_filename,
            filename=temp_file.name
        )
def execute(self, context):
    # get data from cloud function API
    httphook = HttpHook(method=self.method, http_conn_id=self.http_conn_id)
    response = httphook.run(endpoint=self.endpoint)

    # store data locally in a temp file
    with NamedTemporaryFile() as tempfile:
        tempfile.write(response.content)
        tempfile.flush()

        # upload to bucket
        gcshook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)
        gcshook.upload(bucket=self.gcs_bucket,
                       object=self.gcs_path,
                       filename=tempfile.name)
def execute(self, context):
    """
    Uploads the file to Google cloud storage
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to)

    hook.upload(
        bucket_name=self.bucket,
        object_name=self.dst,
        mime_type=self.mime_type,
        filename=self.src,
        gzip=self.gzip,
    )
def execute(self, context):
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)

    self.log.info("Calling HTTP method")
    response = http.run(self.endpoint)

    with NamedTemporaryFile() as tmp_file_handle:
        tmp_file_handle.write(response.content)
        tmp_file_handle.flush()

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcs_conn_id)
        hook.upload(bucket=self.bucket,
                    object=self.gcs_path,
                    filename=tmp_file_handle.name)
def get_weather(**kwargs):
    """
    Query openweathermap.com's API to get the weather for Jakarta, ID and
    then dump the json to the /src/data/ directory with the file name
    "<today's date>.json"
    """
    # My API key is defined in my config.py file.
    parameters = {'q': 'Jakarta, ID', 'appid': API_KEY}
    logging.info("API_KEY={}".format(API_KEY))
    result = requests.get("http://api.openweathermap.org/data/2.5/weather?", parameters)

    # If the API call was successful, get the json and dump it to a file with
    # today's date as the title.
    if result.status_code == 200:
        # Get the json data
        json_data = result.json()
        logging.info("Response from API: {}".format(json_data))

        # Save output file
        file_name = str(kwargs["execution_date"]) + '.json'
        dir_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data',
                                kwargs["dag"].dag_id, kwargs["task"].task_id)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        tot_name = os.path.join(dir_path, file_name)
        logging.info("Will write output to {}".format(tot_name))

        with open(tot_name, 'w') as outputfile:
            json.dump(json_data, outputfile)
        logging.info("Successfully wrote local output file")

        # upload to GCS
        gcs = GoogleCloudStorageHook('gcp_airflow_lab')
        gcs_dest_object = os.path.join(kwargs["dag"].dag_id,
                                       kwargs["task"].task_id, file_name)
        gcs.upload(GCS_BUCKET, gcs_dest_object, tot_name,
                   mime_type='application/octet-stream')
        logging.info(
            "Successfully wrote output file to GCS: gs://{}/{}".format(
                GCS_BUCKET, gcs_dest_object))
    else:
        raise ValueError('Error in API call.')
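# Hedged wiring sketch (not from the original source): how a callable like get_weather
# could be scheduled in an Airflow 1.x DAG. The DAG name, start date, schedule and
# connection setup below are illustrative assumptions only.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('weather_to_gcs', start_date=datetime(2019, 1, 1), schedule_interval='@daily')

get_weather_task = PythonOperator(
    task_id='get_weather',
    python_callable=get_weather,
    provide_context=True,  # passes execution_date, dag, task, etc. into **kwargs
    dag=dag,
)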
def execute(self, context):
    hook = GCPTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)

    result = hook.synthesize_speech(
        input_data=self.input_data,
        voice=self.voice,
        audio_config=self.audio_config,
        retry=self.retry,
        timeout=self.timeout,
    )

    with NamedTemporaryFile() as temp_file:
        temp_file.write(result.audio_content)
        # flush so the buffered audio bytes are on disk before the upload reads the file
        temp_file.flush()
        cloud_storage_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id)
        cloud_storage_hook.upload(bucket_name=self.target_bucket_name,
                                  object_name=self.target_filename,
                                  filename=temp_file.name)
def execute(self, context):
    response = super().execute(context)

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)

    with tempfile.NamedTemporaryFile(prefix="gcs-local") as file:
        file.write(response.encode('utf-8'))
        file.flush()
        hook.upload(
            bucket=self.bucket,
            filename=file.name,
            object=self.filename,
            mime_type=self.mime_type
        )
def simpleNumpyToGCS(csv_name: str, folder_name: str,
                     bucket_name="airflow-gcp-bucket", **kwargs):
    hook = GoogleCloudStorageHook()

    data = {'col1': [1, 2], 'col2': [3, 4]}
    df = pd.DataFrame(data=data)
    df.to_csv('example1.csv', index=False)

    hook.upload(bucket_name,
                object='{}/{}.csv'.format(folder_name, csv_name),
                filename='example1.csv',
                mime_type='text/csv')
def execute(self, context):
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)
    gchook = GoogleCloudStorageHook()

    self.log.info("Calling HTTP method")

    response = http.run(self.endpoint, self.data, self.headers, self.extra_options)
    if self.log_response:
        self.log.info(response.text)
    if self.response_check:
        if not self.response_check(response):
            raise AirflowException("Response check returned False.")

    with open("aaaa", "w") as f:
        f.write(response.text)

    gchook.upload(object="bucketie", filename="aaaa", bucket="buckster")
class HttpToGcsOperator(BaseOperator):
    """
    Calls an endpoint on an HTTP system to execute an action

    :param http_conn_id: The connection to run the operator against
    :type http_conn_id: string
    :param endpoint: The relative part of the full url. (templated)
    :type endpoint: string
    :param gcs_path: The path of the GCS to store the result
    :type gcs_path: string
    """

    template_fields = ('endpoint', 'gcs_path')
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self, endpoint, gcs_path, http_conn_id, gcs_conn_id, *args, **kwargs):
        super(HttpToGcsOperator, self).__init__(*args, **kwargs)
        self.http_conn_id = http_conn_id
        self.gcs_conn_id = gcs_conn_id
        self.gcs_path = gcs_path
        self.endpoint = endpoint

    def execute(self, context):
        # HttpHook takes the HTTP method as its first argument, so pass the
        # connection id by keyword
        self.http_hook = HttpHook(method='GET', http_conn_id=self.http_conn_id)
        self.gcs_hook = GoogleCloudStorageHook(self.gcs_conn_id)
        # _parse_gcs_url is the module-level helper from airflow.contrib.hooks.gcs_hook
        bucket, blob = _parse_gcs_url(self.gcs_path)

        # Parse the endpoint into components, extract the query part
        parsed = urlparse(self.endpoint)
        base_url = urlunparse(list(parsed[:4]) + ["", ""])

        # Create temporary file
        with tempfile.NamedTemporaryFile() as fp:
            # Get response and write its body to the temp file
            response = self.http_hook.run(base_url, data=parsed.query)
            fp.write(response.content)
            # flush (not close) so the file still exists when it is uploaded
            fp.flush()

            # Upload the file to storage
            self.gcs_hook.upload(bucket, blob, filename=fp.name)
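# Hedged usage sketch (not part of the original snippet): instantiating HttpToGcsOperator
# inside a DAG. The connection ids, endpoint and gcs_path values below are illustrative
# assumptions; both endpoint and gcs_path are templated per template_fields above.
fetch_rates = HttpToGcsOperator(
    task_id='fetch_exchange_rates',
    http_conn_id='http_default',
    gcs_conn_id='google_cloud_default',
    endpoint='/history?start_at={{ ds }}&end_at={{ tomorrow_ds }}',
    gcs_path='gs://my-example-bucket/rates/{{ ds }}.json',
    dag=dag,
)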
def execute(self, context):
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)

    self.log.info("Calling HTTP method")
    response = http.run(self.endpoint, self.data, self.headers, self.extra_options)

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    local_filename = '/tmp/' + self.filename
    with open(local_filename, "w") as f:
        f.write(response.text)

    hook.upload(bucket=self.bucket,
                object=self.filename,
                filename=local_filename,
                mime_type='application/json')
def execute(self, context):
    # build moat tile
    logging.info('Instantiate Moat Tile')
    tile = MoatTile(self.brand_id, self.level_filter, self.dimensions)
    token = Variable.get('rtf_moat_token')  # TODO: fetch the token in a better way
    logging.info('Fetch Token')

    filter_id = None
    if self.level_filter:
        filter_id = [*self.level_filter.values()][0]
        time.sleep(random.randint(1, 5))  # random jitter so concurrent requests don't hit the API at once

    filename = tile.get_data(self.s, self.e, token)

    if filename:
        logging.info('Response Saved Locally @ {}'.format(filename))
    else:
        logging.error('No Response')
        raise AirflowSkipException()

    # build the blob name from the non-empty tokens
    file_tokens = [self.brand_id, filter_id, self.suffix]
    blob_name = "_".join([str(x) for x in file_tokens if x])

    if self.prefix:
        blob_name = str(self.prefix) + blob_name + '.json'

    hook = GoogleCloudStorageHook()
    hook.upload(bucket=self.bucket,
                object=blob_name,
                filename=filename)  # docs don't match repo

    logging.info("{} uploaded to {}".format(blob_name, self.bucket))

    os.remove(filename)
    logging.info("{} deleted from local".format(filename))

    # pushed to XCom if do_xcom_push is set to True in the base class
    return (self.bucket, blob_name)
def text2speech(**kwargs):
    ti = kwargs['ti']
    data = {"message": ti.xcom_pull(task_ids="input")}
    response = requests.post(
        "https://us-central1-devops-218113.cloudfunctions.net/Text2Speech",
        json=data)

    fileName = str(uuid.uuid4())
    with open(fileName, "wb") as outfile:
        outfile.write(response.content)

    gcs = GoogleCloudStorageHook()
    gcs.upload("workflowstorage", fileName, fileName,
               mime_type='application/octet-stream')
    os.remove(fileName)
    return fileName
def compression(**kwargs):
    ti = kwargs['ti']
    fileName = ti.xcom_pull(task_ids="conversion")

    gcs = GoogleCloudStorageHook()
    gcs.download("workflowstorage", fileName, fileName)

    file = {"to_compress": open(fileName, 'rb')}
    response = requests.post(
        "https://us-central1-devops-218113.cloudfunctions.net/Compression",
        files=file)

    newFileName = str(uuid.uuid4())
    with open(newFileName, "wb") as outfile:
        outfile.write(response.content)

    gcs.upload("workflowstorage", newFileName, newFileName,
               mime_type='application/octet-stream')
    os.remove(newFileName)
    return newFileName
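# Hedged wiring sketch (assumed, not from the original source): the two callables above
# pass file names between tasks via XCom, so they would be chained as PythonOperators.
# The task_id "conversion" matches the xcom_pull(task_ids="conversion") call above;
# the dag object is assumed to exist.
conversion_task = PythonOperator(
    task_id='conversion',
    python_callable=text2speech,
    provide_context=True,
    dag=dag,
)

compression_task = PythonOperator(
    task_id='compression',
    python_callable=compression,
    provide_context=True,
    dag=dag,
)

conversion_task >> compression_task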
def execute(self, context):
    gcshook = GoogleCloudStorageHook(self.gcp_conn_id)
    self.log.info(gcshook.list("testcovidlinh"))

    # Create a temporary folder
    if not path.exists("tmp"):
        os.mkdir("tmp")

    # Track failures
    failure_count = 0

    # File names passed to the next job
    file_list = []

    # Consume the API
    for state in self.state_code:
        URL = "https://covidtracking.com/api/v1/states/" + state.lower() + "/daily.json"
        response = requests.get(URL).json()

        try:
            # If the API returned an error message, count the failure and move on
            self.log.info(response["message"])
            failure_count += 1
            continue
        except (KeyError, TypeError):
            # The response was successful
            filename = "tmp/" + state + ".json"
            with open(filename, 'w', encoding='utf-8') as f:
                dict2str = [json.dumps(i, sort_keys=True) for i in response]
                json_output = "\n".join(dict2str)
                f.write(json_output)

            object_name = 'US-' + state + "/" + "covidstat.json"
            file_list.append(object_name)
            gcshook.upload(bucket=self.gcs_bucket,
                           object=object_name,
                           filename=filename)

    self.log.info("Number of failure cases: " + str(failure_count))

    task_instance = context['task_instance']
    task_instance.xcom_push(self.xcom_task_id_key, file_list)
def execute(self, context):
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)
    response = http.run(self.endpoint, self.data, self.headers, self.extra_options)

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to,
    )

    with NamedTemporaryFile(delete=True) as tmp_file_handle:
        tmp_file_handle.write(response.content)
        tmp_file_handle.flush()
        hook.upload(self.bucket, self.filename, tmp_file_handle.name, "application/json")
def _upload_to_gcs(self, tmp_file_handles):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google cloud storage.
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    for object_name, tmp_file_handle in tmp_file_handles.items():
        # File is not empty
        if tmp_file_handle.tell() > 0:
            self.log.info(
                f'Uploading file {tmp_file_handle.name} to GCS as gs://{self.bucket}/{object_name}'
            )
            hook.upload(self.bucket, object_name, tmp_file_handle.name,
                        'application/json',
                        (self.gzip if object_name != self.schema_filename else False))
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook()

    # splitting the file path (which should be a path like gs://bucket/path/file.csv)
    # to extract the desired parts
    file_parts = self.gcs_file_path.split('/')
    # gets the bucket
    bucket = file_parts[2]
    # getting the path to the file
    file_path = '/'.join(file_parts[3:-1])
    # getting the file name
    file_name = file_parts[-1]

    # setting the local path with a "Pre" prefix and preparing a processed path for the file
    local_file_path = '/home/airflow/gcs/data/Pre_{}'.format(file_name)
    prepared_file_path = '/home/airflow/gcs/data/{}'.format(file_name)

    # obtaining the Geocode job id
    task_instance = context['task_instance']
    create_resp = task_instance.xcom_pull(task_ids=self.create_job_task)
    job_id = create_resp['resourceSets'][0]['resources'][0]['id']

    # calling the API and downloading the file
    bm_hook = BingMapsHook(bing_maps_conn_id=self.bing_maps_conn_id)
    method = '{}/output/succeeded'.format(job_id)
    bm_hook.call(method=method, api_params={}, operation='GET',
                 file_path=local_file_path)

    # processing the file and uploading it to the bucket
    with open(local_file_path, 'r') as rf:
        with open(prepared_file_path, 'w') as wf:
            for num, line in enumerate(rf, 1):
                if num == 1:
                    pass
                elif num == 2:
                    wf.write(line.replace('/', '_'))
                else:
                    wf.write(line)

    gcs_hook.upload(bucket, '{}/{}'.format(file_path, file_name), prepared_file_path)
def generateSchema(self, keyword, stagetable_flag=True):
    """
    Generate schema for bigquery
    """
    schema_json = [{
        "name": "date",
        "type": "STRING"
    }, {
        "name": "state",
        "type": "STRING"
    }]
    data_type = 'STRING'
    file_path = self.AIRFLOW_HOME + "/tmp/googletrend_schema.json"

    if stagetable_flag:
        schema_json = [{
            "name": "date",
            "type": "STRING"
        }, {
            "name": "state",
            "type": "STRING"
        }]
        data_type = 'STRING'
        file_path = self.AIRFLOW_HOME + "/tmp/googletrend_schema_stage.json"

    d = {}
    print(keyword)
    for word in keyword:
        d["name"] = word.replace(" ", "_")
        d["type"] = data_type
        schema_json.append(d)
        d = {}

    with open(file_path, "w") as f:
        json.dump(schema_json, f, indent=4)

    # Upload schema to GCS
    object_name = "googletrend_schema.json"
    gcshook = GoogleCloudStorageHook(self.gcp_conn_id)
    gcshook.upload(bucket=self.gcs_bucket, object=object_name, filename=file_path)
def execute(self, context): self.log.info("Fetching launch data") launch_hook = LaunchHook(conn_id=self._launch_conn_id) result = launch_hook.get_launches(start_date=self._start_date, end_date=self._end_date) self.log.info("Fetched data for %d launches", len(result)) self.log.info("Uploading data to gcs://%s/%s", self._output_bucket, self._output_path) gcs_hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self._gcp_conn_id) with tempfile.TemporaryDirectory() as tmp_dir: tmp_path = os.path.join(tmp_dir, "result.json") with open(tmp_path, "w") as file_: json.dump(result, file_) gcs_hook.upload(bucket=self._output_bucket, object=self._output_path, filename=tmp_path)
def load_table(**kwargs):
    """
    Processes the json data, checks the types and enters into the
    Postgres database.
    """
    pg_hook = PostgresHook(postgres_conn_id='weatherdb_postgres_conn')
    gcs = GoogleCloudStorageHook('gcp_airflow_lab')
    prev_task_id = 'transform_data'

    # Set source file
    source_file_name = str(kwargs["execution_date"]) + '.csv'
    source_dir_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data',
                                   kwargs["dag"].dag_id, prev_task_id)
    source_full_path = os.path.join(source_dir_path, source_file_name)

    # download from GCS (the local target directory must exist first)
    if not os.path.exists(source_dir_path):
        os.makedirs(source_dir_path)
    gcs_src_object = os.path.join(kwargs["dag"].dag_id, prev_task_id,
                                  source_file_name)
    gcs.download(GCS_BUCKET, gcs_src_object, source_full_path)
    logging.info("Successfully downloaded file from GCS: gs://{}/{}".format(
        GCS_BUCKET, gcs_src_object))

    # open the csv source file and read it in
    with open(source_full_path, 'r') as inputfile:
        csv_reader = csv.reader(inputfile, delimiter=',')
        for row in csv_reader:
            insert_cmd = """INSERT INTO weather
                            (city, country, latitude, longitude,
                             todays_date, humidity, pressure, min_temp,
                             max_temp, temp, weather)
                            VALUES
                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
            pg_hook.run(insert_cmd, parameters=row)
            logging.info(
                "Successfully inserted into database using command: {}".format(
                    insert_cmd))
def write_str_to_gcp(string: str, gcp_path: str, conn_id: str = 'google_cloud_default'):
    """Dump a string into a file in google bucket"""
    storage_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=conn_id)
    destination_uri = urlparse(gcp_path)

    with tempfile.TemporaryDirectory() as tmp_folder:
        temp_path_abs = os.path.join(tmp_folder, 'config_file')
        with open(temp_path_abs, 'w') as f:
            f.write(string)

        if destination_uri.path.startswith('/'):
            destination_path = destination_uri.path[1:]
        else:
            destination_path = destination_uri.path

        storage_hook.upload(
            destination_uri.netloc,
            destination_path,
            temp_path_abs
        )
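# Hedged usage example (assumed bucket, path and payload, not from the original source):
# the helper above parses gcp_path with urlparse, so a gs:// URI works as the destination.
write_str_to_gcp(
    string='{"run_date": "2020-01-01", "mode": "full"}',
    gcp_path='gs://my-example-bucket/configs/run_config.json',
    conn_id='google_cloud_default',
)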
def upload_to_gcs(**kwargs):
    """
    Generates a CSV that is then uploaded to Google Cloud Storage using the
    GoogleCloudStorageHook. This is meant to imitate the first step of a
    traditional ETL DAG: ingesting data from some external source.
    """
    df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                      columns=['col_a', 'col_b', 'col_c', 'col_d'])
    df.to_csv('test_data.csv', index=False)

    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id='astro_gcs')

    hook.upload(bucket='psl-poc-viraj',
                object='test_data.csv',
                filename='test_data.csv',
                mime_type='text/plain')
def download_and_transform_erf(self, partner_id=None):
    """Load and Transform ERF files to Newline Delimited JSON.
    Then upload this file to the project GCS.

    Args:
      self: The operator this is being used in.
      partner_id: A string of the DCM id of the partner.

    Returns:
      entity_read_file_ndj: The filename for the converted entity read file.
    """
    if partner_id:
        self.erf_bucket = 'gdbm-%s' % partner_id
    else:
        self.erf_bucket = 'gdbm-public'

    gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)
    entity_read_file = tempfile.NamedTemporaryFile(delete=False)
    gcs_hook.download(self.erf_bucket, self.erf_object, entity_read_file.name)
    temp_file = None
    # Creating temp file. Not using the delete-on-close functionality
    # as opening the file for reading while still open for writing
    # will not work on all platforms
    # https://docs.python.org/2/library/tempfile.html#tempfile.NamedTemporaryFile
    try:
        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        temp_file.writelines(json_to_jsonlines(entity_read_file.name))
        temp_file.close()
        # Random here used as a nonce for writing multiple files at once.
        filename = '%s_%s_%d.json' % (randint(1, 1000000), self.entity_type,
                                      time.time() * 1e+9)
        gcs_hook.upload(self.gcs_bucket, filename, temp_file.name)
    finally:
        if temp_file:
            temp_file.close()
            os.unlink(temp_file.name)

    return filename
def execute(self, context):
    # use the super to list all files in an Azure Data Lake path
    files = super().execute(context)
    g_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the ADLS path
        # and only keep those files which are present in
        # ADLS and not in Google Cloud Storage
        bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
        existing_files = g_hook.list(bucket_name=bucket_name, prefix=prefix)
        files = set(files) - set(existing_files)

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(
                    bucket_name=dest_gcs_bucket,
                    object_name=dest_path,
                    filename=f.name,
                    gzip=self.gzip
                )

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
def execute(self, context):
    facebook_conn = FacebookAdsHook(self.facebook_conn_id)
    gcs_conn = GoogleCloudStorageHook(self.gcs_conn_id)

    time_range = {
        "since": datetime.strptime(self.since, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d"),
        "until": datetime.strptime(self.until, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d"),
    }

    file_name = "/tmp/{key}.jsonl".format(key=self.gcs_key)
    with open(file_name, "w") as insight_file:
        for account_id in self.account_ids:
            insights = facebook_conn.get_insights_for_account_id(
                account_id,
                self.insight_fields,
                self.breakdowns,
                time_range,
                self.time_increment,
                self.level,
                self.limit,
            )
            if len(insights) > 0:
                for insight in insights[:-1]:
                    insight_file.write(json.dumps(insight) + "\n")
                insight_file.write(json.dumps(insights[-1:][0]))
            else:
                return

    gcs_conn.upload(filename=file_name,
                    bucket=self.gcs_bucket,
                    object=self.gcs_key,
                    gzip=True)
    os.remove(file_name)
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
    logging.info('Extracting data from Hive')
    logging.info(self.hql)
    data = hive.get_pandas_df(self.hql, schema=self.schema)

    gcp_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)

    logging.info('Inserting rows onto google cloud storage')

    # open the temp file in text mode so the json strings can be written to it
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', prefix='tmp') as tmp_file:
        data = data.to_json(orient='records')
        recs = json.loads(data)
        for record in recs:
            tmp_file.write(json.dumps(record))
            tmp_file.write("\n")
        tmp_file.flush()

        remote_file_name = self.file_pattern.format('aa')
        remote_name = os.path.join(self.subdir, remote_file_name)
        gcp_hook.upload(self.bucket, remote_name, tmp_file.name)

    logging.info('Done.')
def execute(self, context):
    ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id)
    gcs_conn = GoogleCloudStorageHook(self.gcs_conn_id)
    try:
        since_formatted = datetime.strptime(self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except (TypeError, ValueError):
        since_formatted = str(self.since)
    try:
        until_formatted = datetime.strptime(self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except (TypeError, ValueError):
        until_formatted = str(self.until)
    report = ga_conn.get_analytics_report(self.view_id,
                                          since_formatted,
                                          until_formatted,
                                          self.sampling_level,
                                          self.dimensions,
                                          self.metrics,
                                          self.page_size,
                                          self.include_empty_rows)

    columnHeader = report.get('columnHeader', {})
    # Right now all dimensions are hardcoded to varchar(255); a map will be needed
    # if any non-varchar dimensions are used in the future.
    # Unfortunately the API does not send back types for Dimensions like it does
    # for Metrics (yet..)
    dimensionHeaders = [
        {'name': header.replace('ga:', ''), 'type': 'varchar(255)'}
        for header in columnHeader.get('dimensions', [])
    ]
    metricHeaders = [
        {'name': entry.get('name').replace('ga:', ''),
         'type': self.metricMap.get(entry.get('type'), 'varchar(255)')}
        for entry in columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    ]

    with NamedTemporaryFile("w") as ga_file:
        rows = report.get('data', {}).get('rows', [])

        for row_counter, row in enumerate(rows):
            root_data_obj = {}
            dimensions = row.get('dimensions', [])
            metrics = row.get('metrics', [])

            for index, dimension in enumerate(dimensions):
                header = dimensionHeaders[index].get('name').lower()
                root_data_obj[header] = dimension

            for metric in metrics:
                data = {}
                data.update(root_data_obj)

                for index, value in enumerate(metric.get('values', [])):
                    header = metricHeaders[index].get('name').lower()
                    data[header] = value

                data['viewid'] = self.view_id
                data['timestamp'] = self.since

                # only add a newline separator between rows, not after the last one
                ga_file.write(json.dumps(data) +
                              ('' if row_counter == len(rows) - 1 else '\n'))

        # flush buffered rows to disk before the upload reads the file
        ga_file.flush()
        gcs_conn.upload(self.gcs_bucket, self.gcs_object, ga_file.name)
def _upload_to_gcs(self, files_to_upload):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for object, tmp_file_handle in files_to_upload.items():
        hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json')
def execute(self, context):
    # use the super method to list all the files in an S3 bucket/key
    files = super(S3ToGoogleCloudStorageOperator, self).execute(context)

    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.dest_gcs_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
        existing_files_prefixed = gcs_hook.list(bucket_name, prefix=object_prefix)

        existing_files = []

        if existing_files_prefixed:
            # Remove the object prefix itself, an empty directory was found
            if object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(object_prefix)

            # Remove the object prefix from all object string paths
            for f in existing_files_prefixed:
                if f.startswith(object_prefix):
                    existing_files.append(f[len(object_prefix):])
                else:
                    existing_files.append(f)

        files = list(set(files) - set(existing_files))
        if len(files) > 0:
            self.log.info('{0} files are going to be synced: {1}.'.format(
                len(files), files))
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                    self.dest_gcs)
                # There will always be a '/' before file because it is
                # enforced at instantiation time
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs too much
                # so skip this for now
                # self.log.info(
                #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                #     ' as object {3}'.format(file, self.bucket,
                #                             dest_gcs_bucket,
                #                             dest_gcs_object))

                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

        self.log.info(
            "All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        self.log.info(
            'In sync, no files needed to be uploaded to Google Cloud '
            'Storage')

    return files
class GoogleCampaignManagerDownloadReportOperator(
        GoogleMarketingPlatformBaseOperator):
    """Downloads a Campaign Manager report into Google Cloud Storage.

    Attributes:
      report_id: The DCM report ID with which the report file is associated.
          (templated)
      file_id: The DCM file ID of the report file to download. (templated)
      destination_bucket: The destination Google cloud storage bucket where the
          report should be written to. (templated)
      destination_object: The destination name of the object in the destination
          Google cloud storage bucket. (templated)
          If the destination points to an existing folder, the report will be
          written under the specified folder.
      gcp_conn_id: The connection ID to use when fetching connection info.
      delegate_to: The account to impersonate, if any.

    XComs:
      destination_bucket: The Google cloud storage bucket the report was written
          to.
      destination_object: The Google cloud storage URI for the report.
    """

    template_fields = [
        'report_id', 'file_id', 'destination_bucket', 'destination_object'
    ]

    def __init__(self,
                 report_id,
                 file_id,
                 destination_bucket,
                 destination_object=None,
                 gcp_conn_id='google_cloud_default',
                 chunk_size=5 * 1024 * 1024,
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(GoogleCampaignManagerDownloadReportOperator, self).__init__(*args, **kwargs)
        self.file_id = file_id
        self.report_id = report_id
        self.destination_bucket = destination_bucket
        self.destination_object = destination_object
        self.chunk_size = chunk_size
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.gcs_hook = None
        self.cm_hook = None

    def _download_report(self, report_id, file_id, destination_file, chunk_size):
        file_metadata = self.cm_hook.get_service().files().get(
            reportId=report_id, fileId=file_id).execute()

        if file_metadata['status'] != 'REPORT_AVAILABLE':
            msg = 'File with ID = %s and Report ID = %s not available, status = %s.' % (
                file_id, report_id, file_metadata['status'])
            raise Exception(msg)

        request = self.cm_hook.get_service().files().get_media(
            reportId=report_id, fileId=file_id)

        downloader = http.MediaIoBaseDownload(destination_file, request,
                                              chunksize=chunk_size)

        download_finished = False
        while not download_finished:
            _, download_finished = downloader.next_chunk()

        return file_metadata['fileName']

    def _get_destination_uri(self, destination_object, report_file_name):
        report_file_name = '%s.csv.gz' % report_file_name

        if destination_object is None:
            return report_file_name

        if destination_object.endswith('/'):
            return destination_object + report_file_name

        return destination_object

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to)
        if self.cm_hook is None:
            self.cm_hook = GoogleCampaignManagerHook(
                gcp_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to)

        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            report_file_name = self._download_report(self.report_id, self.file_id,
                                                     temp_file, self.chunk_size)

            destination_object_name = self._get_destination_uri(
                self.destination_object, report_file_name)

            self.gcs_hook.upload(bucket=self.destination_bucket,
                                 object=destination_object_name,
                                 filename=temp_file.name,
                                 gzip=True,
                                 multipart=True)

            context['task_instance'].xcom_push('destination_bucket',
                                               self.destination_bucket)
            context['task_instance'].xcom_push('destination_object',
                                               destination_object_name)
        finally:
            temp_file.close()
            os.unlink(temp_file.name)
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS.
    Requires airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """

    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=False):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool
        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except:
                # raise/return error if we get here
                logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)
def execute(self, context):
    # use the super method to list all the files in an S3 bucket/key
    files = super().execute(context)

    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.dest_gcs_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
        existing_files_prefixed = gcs_hook.list(
            bucket_name, prefix=object_prefix)

        existing_files = []

        if existing_files_prefixed:
            # Remove the object prefix itself, an empty directory was found
            if object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(object_prefix)

            # Remove the object prefix from all object string paths
            for f in existing_files_prefixed:
                if f.startswith(object_prefix):
                    existing_files.append(f[len(object_prefix):])
                else:
                    existing_files.append(f)

        files = list(set(files) - set(existing_files))
        if len(files) > 0:
            self.log.info(
                '%s files are going to be synced: %s.', len(files), files
            )
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                    self.dest_gcs)
                # There will always be a '/' before file because it is
                # enforced at instantiation time
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs too much
                # so skip this for now
                # self.log.info(
                #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                #     ' as object {3}'.format(file, self.bucket,
                #                             dest_gcs_bucket,
                #                             dest_gcs_object))

                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

        self.log.info(
            "All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        self.log.info(
            'In sync, no files needed to be uploaded to Google Cloud '
            'Storage')

    return files