from datetime import date, datetime, timedelta

import dateutil.parser

from airflow.contrib.hooks.aws_hook import AwsHook


def get_metrics(self, day, aws_conn_id=DEFAULT_AWS_CONN_ID, region_name=None, metrics=None):
    """
    Get the metrics

    :param day: Date to be exported as a string in YYYY-MM-DD format
        or a date/datetime instance (default: yesterday)
    :type day: str, date or datetime
    :param aws_conn_id: AWS connection id (default: aws_default)
    :type aws_conn_id: str
    :param region_name: AWS region
    :type region_name: str
    :param metrics: Metrics to query, as a list or a comma-separated string
    :type metrics: list or str
    """
    aws_hook = AwsHook(aws_conn_id)
    region_name = region_name or aws_hook.get_session().region_name

    if not day or (isinstance(day, str) and day.lower() == 'yesterday'):
        ds = datetime.today() - timedelta(days=1)
    elif isinstance(day, date):  # datetime is a subclass of date
        ds = day
    else:
        ds = dateutil.parser.parse(day)

    # Normalize a comma-separated string into a list of metric names.
    self.metrics = metrics.split(',') if isinstance(metrics, str) else metrics
    self.log.info(
        'ds: {ds:%Y-%m-%d} aws_conn_id: {aws_conn_id} region_name: {region_name} '
        'metrics: {metrics}'.format(
            ds=ds, aws_conn_id=aws_conn_id, region_name=region_name, metrics=self.metrics))
    return self.get_metrics_perform_query(ds, self.metrics, aws_hook, region_name)
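For reference, a call might look like the following; the object name and all values are illustrative placeholders, not taken from the snippet above:

# Hypothetical usage -- names and values are placeholders.
exporter.get_metrics('2021-06-01', aws_conn_id='aws_default',
                     region_name='us-east-1', metrics=['requests', 'errors'])
# A comma-separated string is split into a list, and day=None defaults to yesterday.
exporter.get_metrics(None, metrics='requests,errors')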
def test_get_session_returns_a_boto3_session(self):
    hook = AwsHook(aws_conn_id='aws_default')
    session_from_hook = hook.get_session()
    resource_from_session = session_from_hook.resource('dynamodb')
    table = resource_from_session.create_table(
        TableName='test_airflow',
        KeySchema=[
            {'AttributeName': 'id', 'KeyType': 'HASH'},
        ],
        AttributeDefinitions=[
            {'AttributeName': 'name', 'AttributeType': 'S'}
        ],
        ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10})
    table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')
    self.assertEqual(table.item_count, 0)
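On its own this test would hit a real AWS endpoint; in practice it is run against a mocked DynamoDB, e.g. with moto. A minimal sketch, assuming moto's `mock_dynamodb2` decorator (the moto 1.x/2.x name) is installed:

import unittest

from moto import mock_dynamodb2

from airflow.contrib.hooks.aws_hook import AwsHook


class TestAwsHook(unittest.TestCase):

    @mock_dynamodb2  # intercepts boto3 DynamoDB calls; no real AWS account is touched
    def test_get_session_returns_a_boto3_session(self):
        ...  # body as above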
def execute(self, context):
    hook = AwsHook(aws_conn_id=self.aws_credentials_id)
    s3 = hook.get_session(region_name=self.region).resource('s3')
    my_bucket = s3.Bucket(self.bucket)
    logging.info("upload file from %s to %s/%s", self.src, my_bucket, self.dst)
    my_bucket.upload_file(self.src, self.dst)
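This `execute` belongs to a custom operator; a minimal wrapper class it could sit in might look like the sketch below. The class name is hypothetical, and the fields are inferred from the attributes the method uses:

from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class UploadFileToS3Operator(BaseOperator):  # hypothetical name

    @apply_defaults
    def __init__(self, aws_credentials_id, region, bucket, src, dst, *args, **kwargs):
        super(UploadFileToS3Operator, self).__init__(*args, **kwargs)
        self.aws_credentials_id = aws_credentials_id
        self.region = region
        self.bucket = bucket
        self.src = src
        self.dst = dst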
import json
import logging
import tempfile
from datetime import timedelta

from airflow.contrib.hooks.aws_hook import AwsHook


def execute(self, context):
    """Extract id list from local DB and upload id list to S3"""
    if self.only_current_date:
        start_date = context["execution_date"] + timedelta(days=-1)
        end_date = start_date + timedelta(days=1)
    else:
        start_date, end_date = self.min_max_timestamp()
    logging.info("%s, %s", start_date, end_date)

    n_date = (end_date - start_date).days + 1
    date_fmt = "%Y-%m-%d"

    hook = AwsHook(aws_conn_id=self.aws_credentials_id)
    s3 = hook.get_session(region_name=self.region).resource('s3')
    my_bucket = s3.Bucket(self.bucket)
    table_name = 'themoviedb'
    logging.info("%s dates", n_date)

    with tempfile.TemporaryDirectory() as tmp:
        for i in range(n_date):
            # Export one day at a time: [dt1, dt2) covers a single date.
            dt1 = start_date + timedelta(days=i)
            dt2 = dt1 + timedelta(days=1)
            res = self.extract_json_data(table_name, dt1, dt2)
            logging.info("result length %s", len(res))
            output_file = "{}.json".format(dt1.strftime(date_fmt))
            local_path = tmp + "/" + output_file
            remote_path = self.tmdb_data_path + "/" + output_file
            logging.info("paths %s to %s", local_path, remote_path)
            try:
                with open(local_path, "w") as f:
                    logging.info("write %s", local_path)
                    f.writelines([json.dumps(d[0]) + "\n" for d in res])
                logging.info("upload from %s to %s", local_path, remote_path)
                my_bucket.upload_file(local_path, remote_path)
            except Exception as e:  # ClientError, OSError, or anything else
                logging.error(e)
                raise ValueError("upload failed for {}".format(local_path), e)
    logging.info("upload finished")
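The helpers `min_max_timestamp` and `extract_json_data` are not part of the snippet; a plausible sketch using Airflow's PostgresHook is below. The connection id, table, and column names are assumptions:

from airflow.hooks.postgres_hook import PostgresHook


def min_max_timestamp(self):
    # Hypothetical: date bounds of the rows to export; schema names are assumed.
    hook = PostgresHook(postgres_conn_id=self.db_conn_id)
    return hook.get_first(
        "SELECT MIN(updated_at)::date, MAX(updated_at)::date FROM themoviedb")


def extract_json_data(self, table_name, dt1, dt2):
    # Hypothetical: returns one-column rows of JSON documents updated in [dt1, dt2),
    # matching the d[0] access in the upload loop above.
    hook = PostgresHook(postgres_conn_id=self.db_conn_id)
    return hook.get_records(
        "SELECT data FROM {} WHERE updated_at >= %s AND updated_at < %s".format(table_name),
        parameters=(dt1, dt2))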
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.workflow.airflow import training_config

from airflow.contrib.hooks.aws_hook import AwsHook


def get_sagemaker_role_arn(role_name, region_name):
    iam = boto3.client('iam', region_name=region_name)
    response = iam.get_role(RoleName=role_name)
    return response["Role"]["Arn"]


# =============================================================================
# setting up training, tuning and transform configuration
# =============================================================================

# read config file
config = cfg.config

# set configuration for tasks
hook = AwsHook(aws_conn_id='airflow-sagemaker')
region = config["job_level"]["region_name"]
sess = hook.get_session(region_name=region)
role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"],
                              sess.region_name)
container = get_image_uri(sess.region_name, 'factorization-machines')
hpo_enabled = is_hpo_enabled()

# create estimator
fm_estimator = Estimator(image_name=container,
                         role=role,
                         sagemaker_session=sagemaker.session.Session(sess),
                         **config["train_model"]["estimator_config"])

# train_config specifies SageMaker training configuration
train_config = training_config(estimator=fm_estimator,
                               inputs=config["train_model"]["inputs"])
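Downstream in the DAG, `train_config` is typically handed to Airflow's SageMaker training operator; a minimal sketch (the task id and DAG wiring are assumptions):

from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator

train_op = SageMakerTrainingOperator(
    task_id='model_training',        # assumed task id
    config=train_config,             # the training_config built above
    aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    dag=dag)                         # assumes a DAG object named `dag`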
def boto3_session(self, aws_creds_name: str) -> 'boto3.session.Session':
    from airflow.contrib.hooks.aws_hook import AwsHook

    aws_hook = AwsHook(aws_creds_name)
    return aws_hook.get_session()
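The returned session works with any boto3 client or resource; for example (instance and connection names are placeholders):

session = obj.boto3_session('aws_default')  # `obj`: an instance of the class above
s3 = session.client('s3')
print(s3.list_buckets()['Buckets'])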
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

from airflow.contrib.hooks.aws_hook import AwsHook


# must create a SageMaker team role that also has Glue access
# - must add this instruction in the blog
def get_sagemaker_role_arn(role_name, region_name):
    iam = boto3.client("iam", region_name=region_name)
    response = iam.get_role(RoleName=role_name)
    return response["Role"]["Arn"]


# =============================================================================
# setting up training, model creation and endpoint deployment configuration
# =============================================================================

# set configuration for tasks
hook = AwsHook(aws_conn_id="airflow-sagemaker")
# how is this session different from the SageMaker session - necessary?
sess = hook.get_session(region_name=config.REGION_NAME)
sagemaker_role = get_sagemaker_role_arn(config.SAGEMAKER_ROLE_NAME,
                                        config.REGION_NAME)
container = get_image_uri(sess.region_name, "xgboost")

# initialize training hyperparameters
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.8",
    "objective": "binary:logistic",
    "num_round": "100",
}
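These hyperparameters would then be passed to the XGBoost estimator before building the training config; a sketch assuming the v1 SageMaker Python SDK `Estimator` (the instance settings are placeholders):

import sagemaker
from sagemaker.estimator import Estimator

xgb_estimator = Estimator(
    image_name=container,               # SDK v1 keyword; v2 renamed it image_uri
    role=sagemaker_role,
    train_instance_count=1,             # placeholder sizing
    train_instance_type='ml.m5.xlarge',
    hyperparameters=hyperparameters,
    sagemaker_session=sagemaker.session.Session(sess))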