def setUp(self):
    """Build a mocked requests session and the HTTP hooks used by the tests."""
    mock_session = requests.Session()
    mock_adapter = requests_mock.Adapter()
    mock_session.mount('mock', mock_adapter)
    self.get_hook = HttpHook(method='GET')
    self.get_lowercase_hook = HttpHook(method='get')
    self.post_hook = HttpHook(method='POST')
    configuration.load_test_config()
def execute(self, context):
    """Launch an async HTTP job, then poll its callback endpoint until done.

    Submits the job via ``self.endpoint``, reads back an ETA / retry interval /
    job id from the response, sleeps for the ETA, then polls
    ``self.endpointAsync/<job>`` until the status leaves ``PENDING``.

    Returns the final callback body when ``self.xcom_push_flag`` is set,
    otherwise ``None``.  Raises ``AirflowException`` on a failed response
    check or a non-200 callback status.
    """
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)
    self.log.info("data :" + self.jobId)
    self.log.info("api_key : " + self.api_key)
    self.log.info("Calling HTTP method")
    # Launch request; api_key is passed as a query parameter.
    response = http.run(self.endpoint+"?api_key=" + self.api_key, self.data, self.headers, None)
    self.log.info("Launch job call status : " + str(response.status_code))
    self.log.info("Response : " + str(response.text))
    if self.response_check:
        if not self.response_check(response):
            raise AirflowException("Response check returned False with code : " + str(response.status_code) + " and message : " + response.text)
    # Parse the async launch response: eta (seconds to first poll),
    # retryInterval (seconds between polls) and the job id to poll.
    # NOTE(review): assumes RespAsync exposes eta/retryInterval/jobId —
    # confirm against its definition.
    rJson = RespAsync(response.text)
    eta = rJson.eta
    retryInterval = rJson.retryInterval
    job = rJson.jobId
    # timestamp = int(time.time())
    #timestamp = datetime.timestamp(now)
    httpCallback = HttpHook('GET', http_conn_id=self.http_conn_id)
    # Wait the advertised ETA before the first status poll.
    time.sleep(eta)
    while True:
        respCallback = httpCallback.run(self.endpointAsync+"/"+job+"?api_key=" + self.api_key, None, self.headers, self.extra_options)
        if respCallback.status_code != 200:
            raise AirflowException("Error while calling callback method check returned False with code : " + str(respCallback.status_code) + " and message : " + respCallback.text)
        retryResp = RespAsyncRetry(respCallback.text)
        self.log.info("Retry call with status : " + retryResp.status)
        if retryResp.status == "PENDING":
            # Job still running server-side; wait and poll again.
            self.log.info("waiting ...")
            time.sleep(retryInterval)
            continue
        else:
            # Terminal state: optionally validate, then push result to XCom.
            if self.response_check_callback:
                if not self.response_check_callback(retryResp):
                    raise AirflowException("Response check returned False with code : " + str(retryResp.status) + " message : " + retryResp.message)
            if self.xcom_push_flag:
                return respCallback.text
            break
def test(token_hook_conn_id, api_hook_conn_id):
    """Smoke-test the job API: log in, start a job, then query its status."""
    auth_hook = HttpHook(http_conn_id=token_hook_conn_id)
    job_hook = HttpHook(http_conn_id=api_hook_conn_id)
    token = login(auth_hook)
    logging.debug("登录获取的token:%s", token)
    start_resp = run_job('fund_cn', 'tt_fund_bank_list', job_hook, token)
    job_id = start_resp['data']
    logging.info("启动作用,id为%s", job_id)
    status_resp = query_job_status(job_id, job_hook, token)
    logging.info("查询作业状态:%s", status_resp)
def execute(self, context):
    """Trigger a CML job run through the v2 REST API and wait for success.

    Resolves the project name to an id, then the job name to an id, POSTs a
    new run, and polls the run every POLL_INTERVAL seconds until it leaves
    the running states.  Raises ``AirflowException`` at any failed step or
    if the run does not end in a success state.
    """
    job_label = '({}/{})'.format(self.project, self.job)
    get_hook = HttpHook(http_conn_id='cml_rest_api', method='GET')
    post_hook = HttpHook(http_conn_id='cml_rest_api', method='POST')
    projects_url = 'api/v2/projects'
    r = get_hook.run(endpoint=projects_url)
    # Map project name -> project id; None if the listing request failed.
    projects = {p['name']: p['id'] for p in r.json()['projects']} if r.ok else None
    if projects and self.project in projects.keys():
        jobs_url = '{}/{}/jobs'.format(projects_url, projects[self.project])
        r = get_hook.run(endpoint=jobs_url)
        # Map job name -> job id within the resolved project.
        jobs = {j['name']: j['id'] for j in r.json()['jobs']} if r.ok else None
        if jobs and self.job in jobs.keys():
            runs_url = '{}/{}/runs'.format(jobs_url, jobs[self.job])
            # POST to /runs starts a new run of the job.
            r = post_hook.run(endpoint=runs_url)
            run = r.json() if r.ok else None
            if run:
                status = run['status']
                RUNNING_STATES = [
                    'ENGINE_SCHEDULING', 'ENGINE_STARTING', 'ENGINE_RUNNING'
                ]
                SUCCESS_STATES = ['ENGINE_SUCCEEDED']
                POLL_INTERVAL = 10  # seconds between status polls
                # Poll until the run leaves all running states (a failed
                # status request sets status=None and also exits the loop).
                while status and status in RUNNING_STATES:
                    run_id_url = '{}/{}'.format(runs_url, run['id'])
                    r = get_hook.run(endpoint=run_id_url)
                    status = r.json()['status'] if r.ok else None
                    time.sleep(POLL_INTERVAL)
                if status not in SUCCESS_STATES:
                    raise AirflowException(
                        'Error while waiting for CML job ({}) to complete'.
                        format(job_label))
            else:
                raise AirflowException(
                    'Problem triggering CML job ({})'.format(job_label))
        else:
            raise AirflowException(
                'Problem finding the CML job ID ({})'.format(self.job))
    else:
        raise AirflowException(
            'Problem finding the CML project ID ({})'.format(self.project))
def insert_rows():
    """Download unemployment records over HTTP and load them into Postgres."""
    pg_hook = PostgresHook(postgres_conn_id='postgres_default')
    sql_insert = f"""INSERT INTO {table_variables['name']} VALUES (%s, %s, %s, %s, %s, %s ,%s, %s, %s, %s)"""
    http_hook = HttpHook(http_conn_id=table_variables['http_conn_id'],
                         method='GET')
    res = http_hook.run(endpoint=table_variables['endpoint'],
                        data={'resource_id': table_variables['resource_id'],
                              'limit': '10000000'})
    http_hook.check_response(response=res)
    wanted_columns = ['_id', 'Any', 'Mes', 'Codi_Districte', 'Nom_Districte',
                      'Codi_Barri', 'Nom_Barri', 'Durada_atur', 'Nombre']
    unemployment_df = pd.DataFrame(res.json()['result']['records'])[wanted_columns]
    # Normalise sentinel strings to NaN before inserting.
    unemployment_df.replace({'NA': np.nan, '-Inf': np.nan, 'Inf': np.nan},
                            inplace=True)
    insert_ts = datetime.utcnow()
    for row in unemployment_df.itertuples(index=False):
        pg_hook.run(sql_insert, parameters=(*row, insert_ts))
def copy_brazil_data_file(origin_host, origin_filepath, dest_bucket, dest_key):
    """Copy Brazil data file to a local bucket.

    Copy the source file which contains detailed data about Brazil to an
    AWS S3 bucket to make it available to AWS EMR.

    args:
        origin_host (str): host where the source file is in
        origin_filepath (str): full path to the file in the host
        dest_bucket (str): name of the bucket to store the file
        dest_key (str): prefix/name of the file in the destination bucket
    """
    logging.info('Copying Brazil data file ' \
                 f'FROM: http://{origin_host}/{origin_filepath} ' \
                 f'TO: s3://{dest_bucket}/{dest_key}')
    # Register an HTTP connection for the source server so the HttpHook
    # below can resolve it by conn_id.
    # NOTE(review): this inserts a new Connection row on every invocation
    # without checking for an existing 'http_conn_brasilio' entry —
    # duplicates presumably accumulate in the metadata DB; verify.
    conn = Connection(conn_id='http_conn_brasilio',
                      conn_type='http',
                      host=origin_host,
                      port=80)  # create a connection object
    session = settings.Session()  # get the session
    session.add(conn)
    session.commit()
    # Get the data file (the whole body is buffered in memory via .content).
    http_hook = HttpHook(method='GET', http_conn_id='http_conn_brasilio')
    response_br_data = http_hook.run(origin_filepath)
    # Store data file into s3 bucket
    s3_hook = S3Hook(aws_conn_id='aws_default')
    s3_hook.load_bytes(response_br_data.content,
                       dest_key,
                       bucket_name=dest_bucket,
                       replace=True)
    logging.info('Data copy finished.')
def check_spark_app_status(self, app_id):
    """Verify every job of a Spark application reached SUCCEEDED status."""
    logging.info(
        "Getting app status (id={app_id}) from Spark REST API...".format(
            app_id=app_id))
    endpoint = "{SPARK_ENDPOINT}/{app_id}/jobs".format(
        SPARK_ENDPOINT=SPARK_ENDPOINT, app_id=app_id)
    hook = HttpHook(method="GET", http_conn_id=self.http_conn_id_spark)
    response = hook.run(endpoint)
    expected_status = "SUCCEEDED"
    try:
        for job in json.loads(response.content):
            job_id = job["jobId"]
            job_status = job["status"]
            logging.info(
                "Job id {job_id} associated with application '{app_id}' "
                "is '{job_status}'".format(job_id=job_id, app_id=app_id,
                                           job_status=job_status))
            # AirflowException is not in the except tuple below, so a
            # wrong status propagates to the caller as intended.
            if job_status != expected_status:
                raise AirflowException(
                    "Job id '{job_id}' associated with application '{app_id}' "
                    "is '{job_status}', expected status is '{expected_status}'"
                    .format(job_id=job_id, app_id=app_id,
                            job_status=job_status,
                            expected_status=expected_status))
    except (JSONDecodeError, LookupError, TypeError) as ex:
        log_response_error("$.jobId, $.status", response)
        raise AirflowBadRequest(ex)
def _fetch_headers(self, force_refresh=False):
    """Return request headers, refreshing the OIDC bearer token if needed.

    :param force_refresh: fetch a new token even if the cached one has not
        expired yet.
    :return: dict of headers; includes ``Authorization`` when protected.
    :raises AirflowException: when the token endpoint stays unreachable
        after three attempts.
    """
    headers = {"Content-Type": "application/x-ndjson"}
    if not self.protected:
        return headers
    # Refresh when there is no token yet, the token is about to expire
    # (within token_expires_margin seconds), or the caller forces it.
    if (
        self.access_token is None
        or time.time() + self.token_expires_margin > self.token_expires_time
        or force_refresh
    ):
        form_params = dict(
            grant_type="client_credentials",
            client_id=OIDC_CLIENT_ID,
            client_secret=OIDC_CLIENT_SECRET,
        )
        http = HttpHook(http_conn_id="oidc_server", method="POST")
        last_error = None
        for _ in range(3):
            try:
                response = http.run(OIDC_TOKEN_ENDPOINT, data=form_params)
            except AirflowException as exc:
                last_error = exc
                self.log.exception("Keycloak unreachable")
                time.sleep(1)
            else:
                break
        else:
            # BUG FIX: the original used a bare ``raise`` here, but the
            # except block has already exited, so there is no active
            # exception and Python raises "RuntimeError: No active
            # exception to re-raise".  Re-raise the last failure instead.
            raise last_error
        token_info = response.json()
        self.access_token = token_info["access_token"]
        self.token_expires_time = time.time() + token_info["expires_in"]
    headers["Authorization"] = f"Bearer {self.access_token}"
    return headers
def query_and_extract(**context):
    """Query archive file names for the schedule window and cache them in Redis.

    Builds an ADQL query bounded by the previous/next execution dates, runs
    it through the sync TAP endpoint, and pushes each returned file name onto
    a Redis list.

    :return: the Redis key holding the file names, or ``None`` when the
        query returned no rows.
    """
    http_conn = HttpHook('GET', http_conn_id)
    redis_conn = RedisHook(redis_conn_id)
    prev_exec_date = context.get('prev_execution_date')
    next_exec_date = context.get('next_execution_date')
    query_meta = "SELECT fileName FROM archive_files WHERE archiveName = '{}'" \
        " AND ingestDate > '{}' and ingestDate <= '{}' ORDER BY ingestDate".format(
            collection,
            prev_exec_date.strftime(datetime_format),
            next_exec_date.strftime(datetime_format))
    logging.info('Query: {}'.format(query_meta))
    data = {
        'QUERY': query_meta,
        'LANG': 'ADQL',
        'FORMAT': '{}'.format(output_format)
    }
    with http_conn.run('/ad/auth-sync?{}'.format(
            parse.urlencode(data))) as response:
        # First token is the column header; the rest are file names.
        artifact_files_list = response.text.split()[1:]
        if artifact_files_list:
            redis_key = '{}_{}_{}.{}'.format(collection,
                                             _to_milliseconds(prev_exec_date),
                                             _to_milliseconds(next_exec_date),
                                             output_format)
            # BUG FIX: rpush was called with the bare list, which stores the
            # list's repr as a single Redis entry (or raises DataError on
            # newer redis-py).  Unpack so each file name becomes its own
            # list element, matching the snapshot() task elsewhere.
            redis_conn.get_conn().rpush(redis_key, *artifact_files_list)
            return redis_key
def get_data_zomato_api(*args, **kwargs):
    """Page through the Zomato search API and persist the merged results.

    Fetches five pages of 20 restaurants (offsets 0..80), merges them with
    a jsonmerge append strategy on ``restaurants``, and writes the combined
    document to a local JSON file.
    """
    api_hook = HttpHook(http_conn_id="zomato_api", method='GET')
    schema = {"properties": {"restaurants": {"mergeStrategy": "append"}}}
    merger = Merger(schema)
    result = None
    for i in range(0, 100, 20):
        endpoint_url = "search?entity_id=3&entity_type=city&start={}&count=20&sort=rating".format(
            i)
        resp_url = api_hook.run(endpoint=endpoint_url)
        resp = json.loads(resp_url.content)
        # FIX: the first page previously went through a throwaway
        # ``data_dict.update(resp)`` dance; seed the accumulator directly.
        result = resp if result is None else merger.merge(result, resp)
    # FIX: dropped the redundant f.close() — the with-block closes the file.
    with open(
            "/Users/preetiyerkuntwar/documents/Zomato-test/all_restro.json",
            "w") as f:
        json.dump(result, f)
def get_spin(**kwargs):
    """Fetch Spin free-bike availability and insert one row per scooter.

    Reads the GBFS free_bike_status feed, strips the identifying fields from
    each bike record, and stores the remainder as JSON alongside a point
    geometry.

    :return: the raw HTTP response from the GBFS endpoint.
    """
    http = HttpHook('GET', http_conn_id='http_gbfs_spin')
    response = http.run("/api/gbfs/v1/detroit/free_bike_status")
    spins = json.loads(response.text)
    # SECURITY FIX: the values originate from an external API and were
    # interpolated straight into the SQL text; use bind parameters instead.
    insert = """
        insert into scooters.availability (
            vendor, device_id, timestamp, extra, geom
        ) values (
            'spin', %s, %s, %s,
            ST_SetSRID(ST_MakePoint(%s, %s), 4326)
        )
    """
    for s in spins['data']['bikes']:
        device_id = s.pop('bike_id')
        lat = s.pop('lat')
        lon = s.pop('lon')
        pg.run(insert,
               parameters=(device_id, kwargs['execution_date'],
                           json.dumps(s), lon, lat))
    return response
def poke(self, context):
    """Probe the Livy session: True when idle, False while still starting."""
    session_id = self.session_id
    logging.info("Getting session {session_id} status...".format(
        session_id=session_id))
    state_endpoint = "{ENDPOINT}/{session_id}/state".format(
        ENDPOINT=ENDPOINT, session_id=session_id)
    hook = HttpHook(method="GET", http_conn_id=self.http_conn_id)
    response = hook.run(state_endpoint)
    try:
        state = json.loads(response.content)["state"]
    except (JSONDecodeError, LookupError) as ex:
        log_response_error("$.state", response, session_id)
        raise AirflowBadRequest(ex)
    if state == "idle":
        logging.info(
            "Session {session_id} is ready to receive statements.".format(
                session_id=session_id))
        return True
    if state == "starting":
        logging.info("Session {session_id} is starting...".format(
            session_id=session_id))
        return False
    # Any other state means the session died while coming up.
    raise AirflowException(
        "Session {session_id} failed to start. "
        "State='{state}'. Expected states: 'starting' or 'idle' (ready).".
        format(session_id=session_id, state=state))
def spill_session_logs(self):
    """Page through the Livy session log endpoint and echo every line.

    Fetches LOG_PAGE_LINES lines per request until the server reports the
    final line has been delivered, logging each line as it goes.

    :raises AirflowBadRequest: when a log page lacks the expected
        ``log``/``from``/``total`` fields.
    """
    dashes = '-' * 50
    logging.info(
        "{dashes}Full log for session {session_id}{dashes}".format(
            dashes=dashes, session_id=self.session_id))
    endpoint = "{ENDPOINT}/{session_id}/log".format(
        ENDPOINT=ENDPOINT, session_id=self.session_id)
    hook = HttpHook(method="GET", http_conn_id=self.http_conn_id)
    line_from = 0
    line_to = LOG_PAGE_LINES
    while True:
        # fetch_log_page returns the parsed JSON page for [line_from, line_to).
        log_page = self.fetch_log_page(hook, endpoint, line_from, line_to)
        try:
            logs = log_page["log"]
            for log in logs:
                # Livy escapes newlines in log lines; restore them.
                logging.info(log.replace("\\n", "\n"))
            actual_line_from = log_page["from"]
            total_lines = log_page["total"]
        except LookupError as ex:
            log_response_error("$.log, $.from, $.total", log_page,
                               self.session_id)
            raise AirflowBadRequest(ex)
        actual_lines = len(logs)
        # Stop once the server has handed over the last line of the log.
        if actual_line_from + actual_lines >= total_lines:
            logging.info("{dashes}End of full log for session {session_id}"
                         "{dashes}".format(dashes=dashes,
                                           session_id=self.session_id))
            break
        line_from = actual_line_from + actual_lines
def poke(self, context):
    """Check a Livy statement's state; True once its result is available."""
    statement_id = self.statement_id
    session_id = self.session_id
    logging.info("Getting status for statement {statement_id} "
                 "in session {session_id}".format(
                     statement_id=statement_id, session_id=session_id))
    endpoint = "{ENDPOINT}/{session_id}/statements/{statement_id}"\
        .format(ENDPOINT=ENDPOINT, session_id=session_id,
                statement_id=statement_id)
    hook = HttpHook(method="GET", http_conn_id=self.http_conn_id)
    response = hook.run(endpoint)
    try:
        statement = json.loads(response.content)
        state = statement["state"]
    except (JSONDecodeError, LookupError) as ex:
        log_response_error("$.state", response, session_id, statement_id)
        raise AirflowBadRequest(ex)
    if state == "available":
        # Validate the statement's output before reporting success.
        self.__check_status(statement, response)
        return True
    if state in ["waiting", "running"]:
        logging.info("Statement {statement_id} in session {session_id} "
                     "has not finished yet (state is '{state}')".format(
                         statement_id=statement_id,
                         session_id=session_id,
                         state=state))
        return False
    raise AirflowBadRequest(
        "Statement {statement_id} in session {session_id} failed due to "
        "an unknown state: '{state}'.\nKnown states: 'waiting', 'running', "
        "'available'".format(statement_id=statement_id,
                             session_id=session_id,
                             state=state))
def print_hello():
    """Trigger a CG job over HTTP, print the response, return a greeting.

    :return: the literal string ``'Hello world!'``.
    """
    task_id = 'CG_details'
    http_conn_id = 'cg_default'
    method = 'POST'
    # BUG FIX: the original read ``data = rq_body_param,`` — the trailing
    # comma made ``data`` a one-element tuple instead of the request body.
    data = rq_body_param
    endpoint = 'xyz/test/execute/jobs'
    headers = {
        "Content-Type": "application/json",
        "accept": "application/json",
        "x-api-key": "acp_testing",
        # SECURITY: hard-coded bearer token committed to source — move it to
        # an Airflow connection/variable or a secrets backend and rotate it.
        "Authorization": "Bearer eyJ4NXUiOiJpbXNfbmExLXN0ZzEta2V5LTEuY2VyIiwiYWxnIjoiUlMyNTYifQ.eyJpZCI6IjE1MTcyMTI4MjAzMjhfNjNkMzI5NjMtOTYzYy00YjA2LTk3MjAtN2M2OTExZDI2Y2E5X3VlMSIsImNsaWVudF9pZCI6ImFjcF90ZXN0aW5nIiwidXNlcl9pZCI6ImFjcF90ZXN0aW5nQEFkb2JlSUQiLCJ0eXBlIjoiYWNjZXNzX3Rva2VuIiwiYXMiOiJpbXMtbmExLXN0ZzEiLCJwYWMiOiJhY3BfdGVzdGluZ19zdGciLCJydGlkIjoiMTUxNzIxMjgyMDMyOV84YzFhYzRhOC1lZjM0LTQ3ZWYtOWFkNi0xMmI0ZTg3MjYzNjdfdWUxIiwicnRlYSI6IjE1MTg0MjI0MjAzMjkiLCJtb2kiOiJkMjVhMzg5ZSIsImMiOiJZSFg3Rld5d2JnaDhTYy9FMW1vaWJBPT0iLCJleHBpcmVzX2luIjoiODY0MDAwMDAiLCJzY29wZSI6ImFjcC5mb3VuZGF0aW9uLmFjY2Vzc0NvbnRyb2wsYWNwLmNvcmUucGlwZWxpbmUsc3lzdGVtLG9wZW5pZCxBZG9iZUlELGFkZGl0aW9uYWxfaW5mby5yb2xlcyxhZGRpdGlvbmFsX2luZm8ucHJvamVjdGVkUHJvZHVjdENvbnRleHQsYWNwLmZvdW5kYXRpb24sYWNwLmZvdW5kYXRpb24uY2F0YWxvZyxhY3AuZGlzY292ZXJ5IiwiY3JlYXRlZF9hdCI6IjE1MTcyMTI4MjAzMjgifQ.Q0eAxwLdkQ7XEDzpVwDtoKsmwySkEN26F85wDWjgo5j8lriO_8hUDEYYTXJjvXd0xOr82OnIQnWrDe8LXGLswH2rUYmR0oC40Wfv_ZMLf6IPyghNSw5QWKMYhOKTq-4n2kFvnvSh2Dq_F3govWSo1OWR609xC-HKLGAfBgWqAvCN5WPGQzQ8e5zeqCgclBTk4noBqJIVV06hJROSiD2Gt7FyC6YNMm3B-fVaOfFb4C2WBeGprQphXsVirMSvt9lWEYKqo5pGHgOlL5U40LeWFQMcnfOcmIntDG56BE3lhdyQeeltYbZlg1_RwsVwL5OcVWCtceyB0PWj9HheqvRsvA"
    }
    extra_options = {}
    http = HttpHook(method, http_conn_id='cg_default')
    logging.info('Calling HTTP method')
    print(os.environ['PATH'])
    response = http.run(endpoint, data, headers, extra_options)
    print(response)
    print(response.text)
    print(configuration.get('testing', 'tanuj').encode('utf-8'))
    return 'Hello world!'
def store_product_catalog_with_partition(**kwargs):
    """Re-key product-catalog documents and POST each to the copy endpoint.

    Pulls the catalog dump from the upstream task's XCom, rewrites each
    document's ``_id`` to ``<type>_<subtype>:<old id>`` (dropping ``_rev``),
    and posts every document to the ``copy_product`` endpoint.
    """
    task_instance = kwargs['ti']
    get_product_catalog_db_response = task_instance.xcom_pull(
        key=None, task_ids='get_product_catalog_db')
    get_product_catalog_db_json = json.loads(get_product_catalog_db_response)
    http_hook = HttpHook(
        method='POST',
        http_conn_id='product_catalog',
    )
    # FIX: the enumerate() index was never used; iterate the rows directly.
    for product_catalog in get_product_catalog_db_json['rows']:
        product_catalog_document = product_catalog['value']
        # Drop the CouchDB revision so the target treats it as a new doc.
        product_catalog_document.pop('_rev', None)
        product_catalog_document['_id'] = (
            f"{product_catalog_document['type']}_{product_catalog_document['subtype']}"
            f":{product_catalog_document['_id']}")
        print("request", product_catalog_document)
        print("request", json.dumps(product_catalog_document).encode('utf-8'))
        print("request", json.loads(json.dumps(product_catalog_document)))
        response = http_hook.run(
            endpoint='copy_product',
            headers={"Content-Type": "application/json; charset=utf-8"},
            json=product_catalog_document,
        )
        print("response", response)
def get_rates(ds, **kwargs):
    """Fetch exchange rates from openexchangerates and store them in Postgres.

    Only currency codes in ``valid_pairs`` with a non-negative rate are
    inserted; anything else is skipped silently.
    """
    pg_hook = PostgresHook(postgres_conn_id='rates')
    api_hook = HttpHook(http_conn_id='openexchangerates', method='GET')
    # If either of these raises an exception then we'll be notified via
    # Airflow
    resp = api_hook.run('')
    resp = json.loads(resp.content)
    # These are the only valid pairs the DB supports at the moment. Anything
    # else that turns up will be ignored.
    valid_pairs = (
        'AED', 'AFN', 'ALL', 'AMD', 'ANG', 'AOA', 'ARS', 'AUD', 'AWG', 'AZN',
        'BAM', 'BBD', 'BDT', 'BGN', 'BHD', 'BIF', 'BMD', 'BND', 'BOB', 'BRL',
        'BSD', 'BTC', 'BTN', 'BWP', 'BYN', 'BYR', 'BZD', 'CAD', 'CDF', 'CHF',
        'CLF', 'CLP', 'CNY', 'COP', 'CRC', 'CUC', 'CUP', 'CVE', 'CZK', 'DJF',
        'DKK', 'DOP', 'DZD', 'EEK', 'EGP', 'ERN', 'ETB', 'EUR', 'FJD', 'FKP',
        'GBP', 'GEL', 'GGP', 'GHS', 'GIP', 'GMD', 'GNF', 'GTQ', 'GYD', 'HKD',
        'HNL', 'HRK', 'HTG', 'HUF', 'IDR', 'ILS', 'IMP', 'INR', 'IQD', 'IRR',
        'ISK', 'JEP', 'JMD', 'JOD', 'JPY', 'KES', 'KGS', 'KHR', 'KMF', 'KPW',
        'KRW', 'KWD', 'KYD', 'KZT', 'LAK', 'LBP', 'LKR', 'LRD', 'LSL', 'LTL',
        'LVL', 'LYD', 'MAD', 'MDL', 'MGA', 'MKD', 'MMK', 'MNT', 'MOP', 'MRO',
        'MTL', 'MUR', 'MVR', 'MWK', 'MXN', 'MYR', 'MZN', 'NAD', 'NGN', 'NIO',
        'NOK', 'NPR', 'NZD', 'OMR', 'PAB', 'PEN', 'PGK', 'PHP', 'PKR', 'PLN',
        'PYG', 'QAR', 'RON', 'RSD', 'RUB', 'RWF', 'SAR', 'SBD', 'SCR', 'SDG',
        'SEK', 'SGD', 'SHP', 'SLL', 'SOS', 'SRD', 'STD', 'SVC', 'SYP', 'SZL',
        'THB', 'TJS', 'TMT', 'TND', 'TOP', 'TRY', 'TTD', 'TWD', 'TZS', 'UAH',
        'UGX', 'USD', 'UYU', 'UZS', 'VEF', 'VND', 'VUV', 'WST', 'XAF', 'XAG',
        'XAU', 'XCD', 'XDR', 'XOF', 'XPD', 'XPF', 'XPT', 'YER', 'ZAR', 'ZMK',
        'ZMW', 'ZWL')
    rates_insert = """INSERT INTO rates (pair, valid_until, rate)
                      VALUES (%s, %s, %s);"""
    # If this raises an exception then we'll be notified via Airflow
    valid_until = datetime.fromtimestamp(resp['timestamp'])
    for (iso2, rate) in resp['rates'].items():
        # If converting the rate to a float fails for whatever reason then
        # just move on.
        try:
            rate = float(rate)
        # BUG FIX: the original bare ``except:`` swallowed every exception,
        # including KeyboardInterrupt/SystemExit; only conversion errors
        # should be skipped here.
        except (TypeError, ValueError):
            continue
        iso2 = iso2.upper().strip()
        if iso2 not in valid_pairs or rate < 0:
            continue
        pg_hook.run(rates_insert, parameters=(iso2, valid_until, rate))
def test_connection_without_host(self, mock_get_connection):
    """A connection without a host should leave base_url as the bare scheme."""
    conn = Connection(conn_id='http_default', conn_type='http')
    mock_get_connection.return_value = conn
    hook = HttpHook()
    hook.get_conn({})
    self.assertEqual(hook.base_url, 'http://')
def print_hello():
    """Call the IMS endpoint, print the response, return a greeting.

    :return: the literal string ``'Hello world!'``.
    """
    task_id = 'IMS_details'
    # NOTE(review): this variable says 'cg_default' but the hook below is
    # built with 'ims_default' — confirm which connection is intended.
    http_conn_id = 'cg_default'
    method = 'POST'
    # BUG FIX: the original read ``data = {},`` — the trailing comma made
    # ``data`` a one-element tuple ``({},)`` instead of an empty dict.
    data = {}
    endpoint = ''
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "accept": "application/json"
    }
    http = HttpHook(method, http_conn_id='ims_default')
    logging.info('Calling HTTP method')
    print(os.environ['PATH'])
    response = http.run(endpoint, data, headers)
    print(response)
    print(response.text)
    print(configuration.get('testing', 'tanuj').encode('utf-8'))
    return 'Hello world!'
def snapshot(**kwargs):
    """
    Query the TAP service and snapshot the OMM data.

    Runs the configured ADQL query against the sync TAP endpoint, sanitizes
    each returned artifact URI, and pushes the result onto a Redis list.

    #FIXME: The query should have some conditions to limit the data.
    """
    logging.info('Populating inputs.')
    query = Variable.get('omm_input_uri_query')
    redis = RedisHook(redis_conn_id='redis_default')
    data = {'QUERY': query, 'REQUEST': 'doQuery', 'LANG': 'ADQL',
            'FORMAT': 'csv'}
    http_connection = HttpHook(method='GET', http_conn_id='tap_service_host')
    count = -1
    # NOTE(review): the urlencoded query is passed as HttpHook.run()'s data
    # argument rather than appended to the endpoint — confirm the hook
    # forwards it as the query string for GET requests.
    with http_connection.run('/tap/sync?', parse.urlencode(data)) as response:
        arr = response.text.split('\n')
        count = len(arr)
        logging.info('Found {} items.'.format(count))
        sanitized_uris = []
        # Skip the CSV header row; every other line is an artifact URI.
        for uri in arr[1:]:
            if uri:
                # assumes URIs look like '<scheme>/<artifact>' — the token
                # after the first '/' is the artifact name; TODO confirm.
                artifact_uri = uri.split('/')[1].strip()
                # Redis-safe name: '+' -> '_' and '%' -> '__'.
                sanitized_artifact_uri = artifact_uri.replace(
                    '+', '_').replace('%', '__')
                logging.info('Output is {}'.format(sanitized_artifact_uri))
                sanitized_uris.append(sanitized_artifact_uri)
        # NOTE(review): ``redis_key`` is not defined in this function —
        # presumably a module-level constant; verify before reuse.
        redis.get_conn().rpush(redis_key, *sanitized_uris)
        redis.get_conn().persist(redis_key)
        return 'Extracted {} items'.format(len(sanitized_uris))
def get_lime(**kwargs):
    """Fetch Lime free-bike availability and insert one row per scooter.

    Reads the GBFS free_bike_status feed, strips the identifying fields from
    each bike record, and stores the remainder as JSON alongside a point
    geometry.

    :return: the raw HTTP response from the GBFS endpoint.
    """
    http = HttpHook('GET', http_conn_id='http_gbfs_lime')
    # get availability endpoint with limit = 1000
    response = http.run("/api/partners/v1/gbfs/detroit/free_bike_status.json")
    limes = json.loads(response.text)
    # SECURITY FIX: the values originate from an external API and were
    # interpolated straight into the SQL text; use bind parameters instead.
    insert = """
        insert into scooters.availability (
            vendor, device_id, timestamp, extra, geom
        ) values (
            'lime', %s, %s, %s,
            ST_SetSRID(ST_MakePoint(%s, %s), 4326)
        )
    """
    for record in limes['data']['bikes']:
        device_id = record.pop('bike_id')
        lat = record.pop('lat')
        lon = record.pop('lon')
        pg.run(insert,
               parameters=(device_id, kwargs['execution_date'],
                           json.dumps(record), lon, lat))
    return response
def _download_from_http(self):
    """GET the configured endpoint and return the response body as text."""
    hook = HttpHook("GET", http_conn_id=self.http_connection_id)
    self.log.info("Calling HTTP method")
    resp = hook.run(self.http_endpoint)
    self.log.info(resp.text)
    return resp.text
def spill_batch_logs(self):
    """Page through the Livy batch log endpoint and echo every line.

    Fetches LOG_PAGE_LINES lines per request until the server reports the
    final line has been delivered, logging each line as it goes.

    :raises AirflowBadRequest: when a log page lacks the expected
        ``log``/``from``/``total`` fields.
    """
    dashes = 50
    logging.info(
        f"{'-'*dashes}Full log for batch {self.batch_id}{'-'*dashes}")
    endpoint = f"{LIVY_ENDPOINT}/{self.batch_id}/log"
    hook = HttpHook(method="GET", http_conn_id=self.http_conn_id_livy)
    line_from = 0
    line_to = LOG_PAGE_LINES
    while True:
        # fetch_log_page returns the parsed JSON page for [line_from, line_to).
        log_page = self.fetch_log_page(hook, endpoint, line_from, line_to)
        try:
            logs = log_page["log"]
            for log in logs:
                # Livy escapes newlines in log lines; restore them.
                logging.info(log.replace("\\n", "\n"))
            actual_line_from = log_page["from"]
            total_lines = log_page["total"]
        except LookupError as ex:
            log_response_error("$.log, $.from, $.total", log_page)
            raise AirflowBadRequest(ex)
        actual_lines = len(logs)
        # Stop once the server has handed over the last line of the log.
        if actual_line_from + actual_lines >= total_lines:
            logging.info(
                f"{'-' * dashes}End of full log for batch {self.batch_id}"
                f"{'-' * dashes}")
            break
        line_from = actual_line_from + actual_lines
def send_status_msg(**kwargs):
    """PUT a QA status message, with the run's metadata, to the ingest API.

    Reads ``rslt.yml`` produced by this run and sends it along with the
    dataset id from the triggering DAG-run config.
    """
    http_conn_id = 'ingest_api_connection'
    endpoint = '/datasets/status'
    method = 'PUT'
    headers = {
        #'authorization' : 'Bearer ' + kwargs['params']['auth_tok'],
        'content-type': 'application/json'}
    # BUG FIX: extra_options was ``[]`` — HttpHook expects a dict (it calls
    # ``.get()`` on it); the list only worked by accident because an empty
    # list is falsy and got replaced inside the hook.
    extra_options = {}
    http = HttpHook(method, http_conn_id=http_conn_id)
    md_fname = os.path.join(os.environ['AIRFLOW_HOME'], 'data/temp',
                            kwargs['run_id'], 'rslt.yml')
    with open(md_fname, 'r') as f:
        md = yaml.safe_load(f)
    data = {'dataset_id': kwargs['dag_run'].conf['submission_id'],
            'status': 'QA',
            'message': 'the process ran',
            'metadata': md}
    print('data: ', data)
    print("Calling HTTP method")
    response = http.run(endpoint, json.dumps(data), headers, extra_options)
    print(response.text)
def sub_dag(child_dag_id, input_file_names, key):
    """Build a sub-DAG with one KubernetesPodOperator per input file.

    Fetches the proxy certificate for the configured service user and hands
    it, with each file name, to a pod running the collection's
    ``*_run_single`` command.

    :param child_dag_id: suffix appended to the parent dag_id.
    :param input_file_names: files to transform, one pod task each.
    :param key: disambiguating suffix for the generated task ids.
    :return: the constructed sub-DAG.
    """
    sub_dag = DAG('{}.{}'.format(dag_id, child_dag_id),
                  default_args=default_args,
                  catchup=False,
                  schedule_interval=vlass_dag.schedule_interval)
    http_conn = HttpHook('GET', http_conn_id)
    auth_conn = HttpHook.get_connection(http_conn_id)
    # Retrieve the delegated proxy certificate for the connection's user;
    # each pod uses it to authenticate against the archive services.
    with http_conn.run('/cred/auth/priv/users/{}'.format(
            auth_conn.login)) as response:
        cert = response.text
    for idx, x in enumerate(input_file_names):
        KubernetesPodOperator(dag=sub_dag,
                              namespace='default',
                              task_id='vlass-transform-{}-{}'.format(idx, key),
                              in_cluster=True,
                              get_logs=True,
                              cmds=['{}_run_single'.format(collection.lower())],
                              arguments=[x, cert],
                              name='airflow-vlass-transform-pod',
                              volumes=[volume],
                              volume_mounts=[volume_mount])
    return sub_dag
def _get_weather_data(self, lat, lon):
    """
    Gets the weather data from the specified coordinates and time

    :param lat: latitude to be used as query param.
    :param lon: longitude to be used as query param.
    :return: hourly weather data converted via _weather_date_to_datetime.
    :raises ValueError: when the API responds with a non-200 status.
    """
    open_weather = HttpHook(method="GET",
                            http_conn_id=self.open_weather_conn)
    data = {
        "lat": lat,
        "lon": lon,
        # self.date is converted to a UTC epoch for the 'dt' query param.
        "dt": calendar.timegm(self.date.timetuple()),
        "appid": self.app_id,
        "units": "metric",
    }
    response = open_weather.run("/data/2.5/onecall/timemachine", data=data)
    # FIX: the original raised a bare ``ValueError`` with no context;
    # include the status code so failures are diagnosable from logs.
    if response.status_code != 200:
        raise ValueError(
            "OpenWeather request failed with status {}".format(
                response.status_code))
    self.log.info("Weather data successfully retrived from location")
    return self._weather_date_to_datetime(response.json()["hourly"])
def insert_rows():
    """Load the regional COVID timeline from the API into Postgres."""
    insert_ts = datetime.utcnow()
    pg_hook = PostgresHook(postgres_conn_id='postgres_default')
    sql_insert = f"""INSERT INTO {table_variables['name']} VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    http_hook = HttpHook(http_conn_id=table_variables['http_conn_id'],
                         method='GET')
    res = http_hook.run(endpoint=table_variables['endpoint'],
                        data={'codigo': table_variables['codigo']})
    http_hook.check_response(response=res)
    timeline_df = pd.DataFrame(res.json()['timeline'])
    metric_columns = ['casosConfirmados', 'casosUci', 'casosFallecidos',
                      'casosHospitalizados', 'casosRecuperados',
                      'casosConfirmadosDiario', 'casosUciDiario',
                      'casosFallecidosDiario', 'casosHospitalizadosDiario',
                      'casosRecuperadosDiario']
    for record in timeline_df.itertuples(index=False):
        # Each timeline entry carries one region; take its metrics in order.
        metrics = pd.Series(record.regiones[0]['data'])[metric_columns]
        pg_hook.run(sql_insert,
                    parameters=(record.fecha, *metrics, insert_ts))
def test_host_encoded_https_connection(self, mock_get_connection):
    """A host already carrying https:// should be used verbatim as base_url."""
    conn = Connection(conn_id='http_default',
                      conn_type='http',
                      host='https://localhost')
    mock_get_connection.return_value = conn
    hook = HttpHook()
    hook.get_conn({})
    self.assertEqual(hook.base_url, 'https://localhost')
def __init__(self, http_conn_id, token, job_name, data=None, headers=None,
             method='start', daemon=True, parallelism=0, retry_times=3,
             retry_sleep_time=1, *args, **kwargs):
    """Base operator driving jobs through an HTTP job-control API.

    :param http_conn_id: Airflow connection id of the job service.
    :param token: auth token sent in the ``Token`` header of every request.
    :param job_name: name of the job (or job group) to operate on.
    :param data: optional request payload; defaults to an empty dict.
    :param headers: extra headers merged over (and overriding) the defaults.
    :param method: API action to invoke (default 'start').
    :param daemon: daemon-mode flag forwarded to the service — presumably;
        TODO confirm against the service API.
    :param parallelism: concurrency limit for job pools — assumes 0 means
        unlimited; verify against subclasses.
    :param retry_times: how many times a failed job may be retried.
    :param retry_sleep_time: seconds to wait between retries.
    """
    basic_headers = {'Content-Type': "application/json", 'Token': token}
    # Caller-supplied headers win over the defaults.
    if headers:
        basic_headers.update(headers)
    self.headers = basic_headers
    self.http_conn_id = http_conn_id
    self.job_name = job_name
    self.http = HttpHook('POST', http_conn_id=self.http_conn_id)
    self.data = data if data is not None else {}
    self.job_last_run_id = dict()  # job name -> id of its latest run
    self.job_pools = []
    self.all_jobs = None
    self.finished_jobs = []
    self.parallelism = parallelism
    self.method = method
    self.daemon = daemon
    self.retry_times = retry_times
    self.retry_sleep_time = retry_sleep_time
    self.start_run_time = time.time()
    # Per-job failure counter used for retry accounting.
    self.failed_jobs = defaultdict(int)
    super(_BaseJobOperator, self).__init__(*args, **kwargs)
def insert_rows():
    """Download Barcelona COVID indicators over HTTP and load them into Postgres."""
    pg_hook = PostgresHook(postgres_conn_id='postgres_default')
    sql_insert = f"""INSERT INTO {table_variables['name']} VALUES (%s, %s, %s, %s, %s, %s ,%s, %s, %s, %s)"""
    http_hook = HttpHook(http_conn_id=table_variables['http_conn_id'],
                         method='GET')
    res = http_hook.run(endpoint=table_variables['endpoint'],
                        data={'resource_id': table_variables['resource_id'],
                              'limit': '10000000'})
    http_hook.check_response(response=res)
    wanted_columns = ['_id', 'Data_Indicador', 'Font', 'Frequencia_Indicador',
                      'Nom_Indicador', 'Nom_Variable', 'Territori', 'Unitat',
                      'Valor']
    bcn_covid_df = pd.DataFrame(res.json()['result']['records'])[wanted_columns]
    # Normalise sentinel strings to NaN before inserting.
    bcn_covid_df.replace({'NA': np.nan, '-Inf': np.nan, 'Inf': np.nan},
                         inplace=True)
    insert_ts = datetime.utcnow()
    for row in bcn_covid_df.itertuples(index=False):
        pg_hook.run(sql_insert, parameters=(*row, insert_ts))